In [None]:
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
import numpy as np
import sys
import os

# Make the sibling "src" folder importable. `root` is a relative path, so the
# entry added to sys.path is resolved against the current working directory.
root = "."
sys.path.append(os.path.join(root, "..", "src"))
# This import has to follow the sys.path change above — presumably `default`
# lives in ../src (TODO confirm). CONFIGPATH is used below as the base data folder.
from default import CONFIGPATH

In [None]:
# Define output path
# Folder reserved for the processed ChEMBL data; exist_ok=True keeps this
# cell safe to re-run when the folder is already present.
output_dir = os.path.join(CONFIGPATH, "chembl_processed")
os.makedirs(output_dir, exist_ok=True)

# Load tables
# Both CSV exports sit in the same "chembl_activities" folder under CONFIGPATH.
activities_dir = os.path.join(CONFIGPATH, "chembl_activities")
# df1: compound structures table, df2: molecule dictionary table.
df1 = pd.read_csv(
    os.path.join(activities_dir, "compound_structures.csv"),
    low_memory=False,
)
df2 = pd.read_csv(
    os.path.join(activities_dir, "molecule_dictionary.csv"),
    low_memory=False,
)

In [None]:
# molregno values that occur in df2 (molecule_dictionary) but never in
# df1 (compound_structures).
diff = set(df2['molregno']).difference(df1['molregno'])
# Line 1: does the row-count gap between the two tables equal the number of
#         missing molregnos?
# Line 2: the number of missing molregnos.
print(len(df2) - len(df1) == len(diff), len(diff), sep="\n")

In [None]:
# Molregno to canonical SMILES
# Number of distinct canonical SMILES recorded for each molregno in df1.
molregno_to_smiles = df1.groupby("molregno")["canonical_smiles"].nunique()
# Keep only the molregnos tied to more than one SMILES (flagged as violations).
viol_molregno = molregno_to_smiles.loc[molregno_to_smiles > 1]
print(viol_molregno)

# Canonical SMILES to molregno
# Number of distinct molregno values recorded for each canonical SMILES in df1.
smiles_to_molregno = (
    df1.groupby("canonical_smiles")["molregno"]
       .nunique()
)
# SMILES strings shared by more than one molregno (flagged as violations).
viol_smiles = smiles_to_molregno[smiles_to_molregno > 1]
# Also report the largest number of molregnos sharing a single SMILES.
# Series.max() returns NaN for an empty Series, whereas the builtin max()
# raises ValueError when there is no violation at all (the clean-data case).
print(viol_smiles, viol_smiles.max())
# Drop the temporary violation tables; viol_molregno is reassigned by the
# ChEMBL ID check that follows.
del viol_molregno, viol_smiles

In [None]:
# Molregno to ChEMBL ID
# Number of distinct ChEMBL IDs recorded for each molregno in df2.
molregno_to_chembl = df2.groupby("molregno")["chembl_id"].nunique()
# Keep only the molregnos tied to more than one ChEMBL ID (flagged as violations).
viol_molregno = molregno_to_chembl.loc[lambda counts: counts > 1]
print(viol_molregno)

# ChEMBL ID to Molregno
# Number of distinct molregno values recorded for each ChEMBL ID in df2.
chembl_to_molregno = df2.groupby("chembl_id")["molregno"].nunique()
# Keep only the ChEMBL IDs tied to more than one molregno (flagged as violations).
viol_chembl = chembl_to_molregno.loc[chembl_to_molregno > 1]
print(viol_chembl)


In [None]:
# Attach the ChEMBL ID from df2 to every df1 row, matching on molregno.
# The left join keeps all df1 rows, including those without a match in df2.
df_merged = pd.merge(
    df1,
    df2.loc[:, ['molregno', 'chembl_id']],
    how='left',
    on='molregno',
)

In [None]:
# Row count of the merged table, shown as the cell's output.
df_merged.shape[0]