In [1]:
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
import numpy as np
import sys
import os

# Base directory for relative paths; "." is resolved against the kernel's
# current working directory.
root = "."
# Extend the import search path with <root>/../src so that modules located
# there can be imported (presumably this is where `default` lives - confirm).
# This must run before the import on the next line.
sys.path.append(os.path.join(root, "..", "src"))
# CONFIGPATH is used by later cells as the base directory for input and output data.
from default import CONFIGPATH

In [2]:
# Output location for the processed data; created if it does not exist yet
output_dir = os.path.join(CONFIGPATH, "chembl_processed")
os.makedirs(output_dir, exist_ok=True)

# Read both input tables from <CONFIGPATH>/chembl_activities:
#   df1 <- compound_structures.csv
#   df2 <- molecule_dictionary.csv
df1, df2 = (
    pd.read_csv(os.path.join(CONFIGPATH, "chembl_activities", table_file), low_memory=False)
    for table_file in ("compound_structures.csv", "molecule_dictionary.csv")
)

In [7]:
# Cross-check the molregno identifiers of the two tables in both directions.
# Each column is turned into a set exactly once; set.difference() accepts any
# iterable, so the other column can be passed directly.

# molregno values present in df2 (molecule_dictionary) but absent from df1 (compound_structures)
diff = set(df2['molregno']).difference(df1['molregno'])
print("Compounds in molecule_dictionary but NOT in compound_structures")
print(len(diff))

# molregno values present in df1 (compound_structures) but absent from df2 (molecule_dictionary)
diff2 = set(df1['molregno']).difference(df2['molregno'])
print("Compounds in compound_structures but NOT in molecule_dictionary")
print(len(diff2))

Compounds in molecule_dictionary but NOT in compound_structures
23320
Compounds in compound_structures but NOT in molecule_dictionary
0


In [13]:
# Consistency check between molregno and canonical SMILES in df1.

# Molregno to canonical SMILES: number of distinct SMILES per molregno
molregno_to_smiles = (
    df1.groupby("molregno")["canonical_smiles"]
       .nunique()
)
# Keep only the molregno associated with more than one distinct SMILES
viol_molregno = molregno_to_smiles[molregno_to_smiles > 1]
print(f"Number of molregno with >1 SMILES: {len(viol_molregno)}")

# Canonical SMILES to molregno: number of distinct molregno per SMILES
smiles_to_molregno = (
    df1.groupby("canonical_smiles")["molregno"]
       .nunique()
)
# Keep only the SMILES shared by more than one molregno
viol_smiles = smiles_to_molregno[smiles_to_molregno > 1]
print(f"Number of SMILES with >1 molregno: {len(viol_smiles)}")

# The built-in max() raises ValueError on an empty sequence, which would abort
# the cell whenever the data contain no shared SMILES. Report that case
# explicitly and use the vectorised Series.max() otherwise.
if viol_smiles.empty:
    print("Max number: n/a (no SMILES is shared by more than one molregno)")
else:
    print(f"Max number: {viol_smiles.max()}")

# The filtered series are only needed for the summary printed above
del viol_molregno, viol_smiles

Number of molregno with >1 SMILES: 0
Number of SMILES with >1 molregno: 155
Max number: 3


In [14]:
# Consistency check between molregno and ChEMBL ID in df2.

# Molregno to ChEMBL ID: distinct ChEMBL IDs counted per molregno
molregno_to_chembl = df2.groupby("molregno")["chembl_id"].nunique()
# ChEMBL ID to molregno: distinct molregno counted per ChEMBL ID
chembl_to_molregno = df2.groupby("chembl_id")["molregno"].nunique()

# Entries whose count exceeds one, i.e. keys mapped to several partners
viol_molregno = molregno_to_chembl[molregno_to_chembl.gt(1)]
viol_chembl = chembl_to_molregno[chembl_to_molregno.gt(1)]

print(f"Number of molregno with >1 ChEMBL ID: {len(viol_molregno)}")
print(f"Number of ChEMBL IDs with >1 molregno: {len(viol_chembl)}")

# The filtered series were only needed for the two counts above
del viol_chembl, viol_molregno

Number of molregno with >1 ChEMBL ID: 0
Number of ChEMBL IDs with >1 molregno: 0
