In [1]:
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
import numpy as np
import sys
import os

# Put the sibling "src" directory (resolved relative to the working directory)
# on the import path so the project-local import that follows can be resolved.
root = "."
src_dir = os.path.join(root, "..", "src")
# Guard the append: re-running this cell must not stack duplicate entries
# on sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)
from default import CONFIGPATH

In [2]:
# Folder that receives the processed outputs; created if it does not exist yet
output_dir = os.path.join(CONFIGPATH, "chembl_processed")
os.makedirs(output_dir, exist_ok=True)

# Both input tables live in the same folder under CONFIGPATH
activities_dir = os.path.join(CONFIGPATH, "chembl_activities")
# low_memory=False makes pandas read each file in one pass rather than in
# chunks, which avoids mixed-dtype columns
df1 = pd.read_csv(os.path.join(activities_dir, "compound_structures.csv"), low_memory=False)
df2 = pd.read_csv(os.path.join(activities_dir, "molecule_dictionary.csv"), low_memory=False)

In [18]:
# molregno values found in df2 (molecule_dictionary.csv) but not in df1
# (compound_structures.csv). A single set() is enough: wrapping it in a second
# set() only copied the whole set again.
diff = set(df2['molregno']).difference(df1['molregno'])
# Sanity check: does the number of df2-only keys equal the row-count gap
# between the two tables?
print(len(diff) == len(df2) - len(df1))
print(len(diff))
# Reverse direction: molregno values found in df1 but not in df2
diff2 = set(df1['molregno']).difference(df2['molregno'])
print(len(diff2))

True
23320
0


In [4]:
# Molregno to canonical SMILES: count the distinct SMILES recorded for each
# molregno and keep the molregnos that have more than one.
molregno_to_smiles = (
    df1.groupby("molregno")["canonical_smiles"]
       .nunique()
)
viol_molregno = molregno_to_smiles[molregno_to_smiles > 1]
print(viol_molregno)

# Canonical SMILES to molregno: count the distinct molregnos sharing each
# SMILES string and keep the SMILES that have more than one.
smiles_to_molregno = (
    df1.groupby("canonical_smiles")["molregno"]
       .nunique()
)
viol_smiles = smiles_to_molregno[smiles_to_molregno > 1]
# Series.max() gives the largest number of molregnos sharing one SMILES.
# Unlike the builtin max() over a temporary DataFrame column, it does not
# raise ValueError when there are no violations (it returns NaN) and it
# avoids building a throwaway DataFrame.
print(viol_smiles, viol_smiles.max())
# Drop the temporary violation tables; only the count Series are kept.
del viol_molregno, viol_smiles

Series([], Name: canonical_smiles, dtype: int64)
canonical_smiles
C/C1=C/C(=O)O[C@H](C(C)(C)C)CCC[C@H](O)CCC[C@H](O)CCC[C@H](O)CCC[C@@H](O)C[C@@H](O)C[C@H](O)CCC[C@@H](O)CCC[C@@H](O)C[C@@H](O)CCCCC1    2
C1=CC(=Nn2cccc2)C=CC1=Nn1cccc1                                                                                                          2
C1=Cc2nc1c(-c1ccccc1)c1ccc([n-]1)c(-c1ccccc1)c1nc(c(-c3ccccc3)c3ccc([n-]3)c2-c2ccccc2)C=C1.[Zn+2]                                       2
C1=Cc2nc1c(-c1ccccc1)c1ccc([nH]1)c(-c1ccccc1)c1nc(c(-c3ccccc3)c3ccc([nH]3)c2-c2ccccc2)C=C1                                              2
C1=Cc2nc1c(-c1ccncc1)c1ccc([nH]1)c(-c1ccncc1)c1nc(c(-c3ccncc3)c3ccc([nH]3)c2-c2ccncc2)C=C1                                              2
                                                                                                                                       ..
c1cc2cc(c1)C[n+]1ccc(c3ccccc31)NCc1ccc(cc1)CNc1cc[n+](c3ccccc13)C2                                        

In [5]:
# Molregno to ChEMBL ID: number of distinct chembl_id values per molregno;
# any count above 1 is reported as a violation.
molregno_to_chembl = df2.groupby("molregno")["chembl_id"].nunique()
viol_molregno = molregno_to_chembl.loc[molregno_to_chembl.gt(1)]
print(viol_molregno)

# ChEMBL ID to molregno: the same check in the opposite direction.
chembl_to_molregno = df2.groupby("chembl_id")["molregno"].nunique()
viol_chembl = chembl_to_molregno.loc[chembl_to_chembl_mask] if False else chembl_to_molregno.loc[chembl_to_molregno.gt(1)]
print(viol_chembl)


Series([], Name: chembl_id, dtype: int64)
Series([], Name: molregno, dtype: int64)


In [6]:
# Left join on molregno: every df1 row is kept and gains the chembl_id column
# taken from df2 (missing where df2 has no matching molregno).
df_merged = pd.merge(
    df1,
    df2.loc[:, ['molregno', 'chembl_id']],
    how='left',
    on='molregno',
)

In [7]:
# Number of rows in the merged table
df_merged.shape[0]

2854815