In [1]:
from chembl_structure_pipeline import standardizer
from concurrent.futures import ProcessPoolExecutor
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import Descriptors
from tqdm import tqdm
import datamol as dm
import os
import pandas as pd
import sys
RDLogger.DisableLog('rdApp.*')  # Disable RDKit warnings

def get_canonical_smiles_datamol(smiles):
    """Parse a SMILES string with datamol and return its canonical isomeric form.

    The molecule is built with ``dm.to_mol`` and then passed through
    ``dm.fix_mol`` and ``dm.sanitize_mol``; the resulting molecule is written
    back out with ``Chem.MolToSmiles``.

    Args:
        smiles: SMILES string to parse.

    Returns:
        tuple: ``(mol, canonical_smiles)`` on success, or ``(None, "")`` when
        any step raises or yields ``None`` instead of a molecule.
    """
    try:
        mol = dm.to_mol(smiles)
        # Run the clean-up steps in order; a step that hands back None instead
        # of a molecule is treated as a failure rather than being passed on.
        for step in (dm.fix_mol, dm.sanitize_mol):
            if mol is None:
                return None, ""
            mol = step(mol)
        if mol is None:
            return None, ""
        canonical = Chem.MolToSmiles(mol, canonical=True, isomericSmiles=True)
        return mol, canonical
    except Exception:
        # Only ordinary errors are swallowed: KeyboardInterrupt / SystemExit
        # must propagate so a long-running loop can still be interrupted.
        return None, ""

def get_canonical_smiles_rdkit(smiles):
    """Parse a SMILES string with plain RDKit and return its canonical isomeric form.

    Args:
        smiles: SMILES string to parse.

    Returns:
        tuple: ``(mol, canonical_smiles)`` on success, or ``(None, "")`` when
        the string cannot be parsed or RDKit raises.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        # RDKit signals an unparsable SMILES by returning None, not by raising.
        if mol is None:
            return None, ""
        return mol, Chem.MolToSmiles(mol, canonical=True, isomericSmiles=True)
    except Exception:
        # Only ordinary errors are swallowed: KeyboardInterrupt / SystemExit
        # must propagate so a long-running loop can still be interrupted.
        return None, ""

def get_canonical_smiles(smiles):
    """Canonicalize a SMILES, trying datamol first and plain RDKit second.

    The input is converted to ``str`` and stripped of surrounding whitespace
    before either backend sees it.

    Args:
        smiles: Value holding the SMILES; anything accepted by ``str()``.

    Returns:
        tuple: ``(mol, canonical_smiles)``. The datamol result is returned when
        it produced a molecule; otherwise whatever the RDKit helper returns,
        or ``(None, "")`` if that helper raises.
    """
    text = str(smiles).strip()

    # First choice: datamol. Any error here just means "fall through".
    try:
        parsed, canonical = get_canonical_smiles_datamol(text)
    except Exception:
        parsed = None
    if parsed is not None:
        return parsed, canonical

    # Fallback: plain RDKit; its result is returned as-is.
    try:
        parsed, canonical = get_canonical_smiles_rdkit(text)
    except Exception:
        return None, ""
    return parsed, canonical
    
def get_standardized_smiles(mol):
    """Reduce a molecule to its parent and standardize it with the ChEMBL pipeline.

    ``standardizer.get_parent_mol`` is applied first, then
    ``standardizer.standardize_mol``; the result is written out as canonical
    isomeric SMILES.

    Args:
        mol: Molecule object to standardize, or ``None`` when upstream parsing
            failed.

    Returns:
        tuple: ``(standardized_mol, standardized_smiles)`` on success, or
        ``(None, "")`` when ``mol`` is ``None`` or any step fails.
    """
    # A failed parse upstream arrives here as None; nothing to standardize.
    if mol is None:
        return None, ""
    try:
        parent, _ = standardizer.get_parent_mol(mol)
        standardized = standardizer.standardize_mol(parent)
        if standardized is None:
            return None, ""
        standardized_smiles = Chem.MolToSmiles(standardized, canonical=True, isomericSmiles=True)
        return standardized, standardized_smiles
    except Exception:
        # Only ordinary errors are swallowed: KeyboardInterrupt / SystemExit
        # must propagate so a long-running loop can still be interrupted.
        return None, ""

def calculate_mw(mol):
    """Compute the molecular weight of a molecule with RDKit descriptors.

    Args:
        mol: Molecule object, or ``None`` when upstream processing failed.

    Returns:
        str | None: The weight rounded to 3 decimals and formatted as a
        string, or ``None`` when ``mol`` is ``None`` or the calculation fails.
    """
    # A failed standardization upstream arrives here as None.
    if mol is None:
        return None
    try:
        return str(round(Descriptors.MolWt(mol), 3))
    except Exception:
        # Only ordinary errors are swallowed: KeyboardInterrupt / SystemExit
        # must propagate so a long-running loop can still be interrupted.
        return None


[11:49:55] Initializing Normalizer


In [2]:
# def process_one(smiles):
#     mol, canonical_smiles = get_canonical_smiles(smiles)
#     mol, standardized_smiles = get_standardized_smiles(mol)
#     mw = calculate_mw(mol)
#     return [canonical_smiles, standardized_smiles, mw]

In [3]:
# root = os.path.dirname(os.path.abspath(__file__))
root = "."  # __file__ is not defined in a notebook; use the working directory

# Make the project's src/ folder importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
src_dir = os.path.join(root, "..", "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)
from default import DATAPATH

print("Step 02")
print("Loading compound SMILES")
compounds = pd.read_csv(os.path.join(DATAPATH, "chembl_processed", "compound_info.csv"))

# Positional row window processed in this run (start inclusive, stop exclusive).
SMILES_START, SMILES_STOP = 1800000, 1820000
SMILES = compounds['canonical_smiles'].iloc[SMILES_START:SMILES_STOP].tolist()

Step 02
Loading compound SMILES


In [5]:
print("Standardizing compounds and recalculating Molecular Weight")

# One [standardized_smiles, mw] row is collected per input SMILES, in input order
OUTPUT = []
for smiles in tqdm(SMILES):
    # Parse -> standardize -> weigh; each helper reports failure through its
    # return value, so every input contributes exactly one row.
    mol, canonical_smiles = get_canonical_smiles(smiles)
    mol, standardized_smiles = get_standardized_smiles(mol)
    mw = calculate_mw(mol)
    OUTPUT.append([standardized_smiles, mw])

OUTPUT = pd.DataFrame(data=OUTPUT, columns=["standardized_smiles", 'standardized_MW'])
# OUTPUT.to_csv(os.path.join(DATAPATH, "chembl_processed", "compound_info_standardized.csv"), index=False)

Standardizing compounds and recalculating Molecular Weight


100%|██████████| 20000/20000 [01:36<00:00, 206.44it/s]


In [4]:
# n_jobs = 16
# chunksize = 50_000

# with ProcessPoolExecutor(max_workers=n_jobs) as ex:
#     OUTPUT = list(tqdm(ex.map(process_one, SMILES, chunksize=chunksize), total=len(SMILES)))

# OUTPUT = pd.DataFrame(OUTPUT, columns=["canonical_smiles", "standardized_smiles", "standardized_MW"])

In [None]:
from collections import Counter
from rdkit import Chem
import pandas as pd
import os

In [None]:
# Load the original compound table and the table of standardized results.
# NOTE(review): compound_info_standardized.csv is presumably the file written by
# the (currently commented-out) OUTPUT.to_csv call of the standardization step — confirm.
compound_info = pd.read_csv("../data/chembl_processed/compound_info.csv")
compound_info_standardized = pd.read_csv("../data/chembl_processed/compound_info_standardized.csv")

In [None]:
# Display the standardized table as the cell output for a quick visual check
compound_info_standardized

In [None]:
# The columns are copied by index alignment, so both tables must describe the
# same rows in the same order. A table of a different length would be matched
# against the wrong rows / padded with NaN without any error, so fail loudly.
assert len(compound_info) == len(compound_info_standardized), (
    "compound_info and compound_info_standardized have different numbers of rows"
)
compound_info["standardized_smiles"] = compound_info_standardized['standardized_smiles']
compound_info["standardized_MW"] = compound_info_standardized['standardized_MW']

In [None]:
# Tally (True/False) how many rows have identical original and standardized MW.
# NOTE(review): this is an exact equality test on the parsed values, so any
# rounding difference between the two columns counts as "not equal" — confirm
# that both columns use the same precision.
Counter(compound_info['MW'] == compound_info['standardized_MW'])

In [None]:
# Tally (True/False) how many rows have an original MW at least as large as the
# standardized MW. Rows where either value is NaN fall into the False bucket.
Counter(compound_info['MW'] >= compound_info['standardized_MW'])

In [None]:
# Tally (True/False) how many rows have a standardized SMILES string identical
# to the original canonical SMILES (plain string comparison).
Counter(compound_info['canonical_smiles'] == compound_info['standardized_smiles'])

In [None]:
# Original MW minus standardized MW, computed as a vectorized column
# subtraction instead of a Python-level loop over every row.
compound_info['MW_difference'] = compound_info['MW'] - compound_info['standardized_MW']

In [None]:
# The five rows with the largest MW_difference, shown as a raw value array
compound_info.sort_values(by='MW_difference', ascending=False).head(5).values

In [None]:
# Render a multi-fragment SMILES: the main compound plus two trifluoroacetic
# acid fragments (O=C(O)C(F)(F)F).
# NOTE(review): presumably one of the top MW_difference rows — confirm.
Chem.MolFromSmiles("CN(Cc1cnc2nc(N)nc(N)c2n1)c1ccc(C(=O)NC(CN)C(=O)O)cc1.O=C(O)C(F)(F)F.O=C(O)C(F)(F)F")

In [None]:
# Render a single-fragment SMILES with no trifluoroacetic acid (O=C(O)C(F)(F)F) fragments.
# NOTE(review): presumably the parent form of the salt inspected earlier — confirm.
Chem.MolFromSmiles("CN(Cc1cnc2nc(N)nc(N)c2n1)c1ccc(C(=O)NC(CN)C(=O)O)cc1")