In [1]:
import warnings
import numpy as np
import pandas as pd
from rdkit import Chem, RDLogger
from rdkit.Chem import MolToSmiles
from rdkit.Chem.inchi import MolToInchi
from pubchempy import get_compounds, Compound

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and chained-assignment
# warnings raised by later cells are hidden as well.
warnings.filterwarnings("ignore")
# Disable all RDKit loggers matching 'rdApp.*' so SMILES parse failures do not flood the output.
RDLogger.DisableLog('rdApp.*')

In [2]:
# Workbook to read; kept in one place so the path is easy to change.
SOURCE_XLSX = "4_delete_uncertain_smiles.xlsx"

# Load the sheet, report its dimensions, then preview the first rows.
data = pd.read_excel(SOURCE_XLSX)
print(data.shape)
data.head()

(7473, 11)


Unnamed: 0,chemical_name,cas_number,smiles,VDss_L_kg,SD,fu_h,reference_number,Comments_1,Comments_2,t 1/2,smiles_supplementary
0,(-)dOTC,160707-69-7,C1=CN(C2SC(OC2)CO)C(=O)N=C1N,1.18,,,R1,"PATRICK F. SMITH, ALAN FORREST, CHARLES H. BAL...",Dosed as 100 mg of racemate. (-) form called a...,,C1=CN(C2SC(OC2)CO)C(=O)N=C1N
1,(+)dOTC,160707-68-6,C1=CN(C2SC(OC2)CO)C(=O)N=C1N,0.84,,,R1,"PATRICK F. SMITH, ALAN FORREST, CHARLES H. BAL...",Dosed as 100 mg of racemate. (-) form called a...,,C1=CN(C2SC(OC2)CO)C(=O)N=C1N
2,1-Aminocyclopropanecarboxylic Acid,22059-21-8,C(=O)([O-])C([NH3+])(C1)C1,0.73,,,R1,"Cherkofsky, S.C. (1995) 1-Aminocyclopropylcarb...",,,C(=O)([O-])C([NH3+])(C1)C1
3,5-Hydroxymethyl tolterodine,207679-81-0,c1ccc(C(CC[NH+](C(C)C)C(C)C)c(c(O)c2)cc(CO)c2)cc1,2.41,,,R1,PDR reference on fesoterodine. Accessed July 1...,"After oral administration, fesoterodine is wel...",,c1ccc(C(CC[NH+](C(C)C)C(C)C)c(c(O)c2)cc(CO)c2)cc1
4,619C89,130800-90-7,Clc1c(cc(cc1c(cnc2N3CC[NH+](C)CC3)c(N)n2)Cl)Cl,15.31,,,R1,"A. J. MERCER, R. J. LAMB, HUSSEIN, S. HOBBIGER...",Data digitized from reported plot. 1 mg/kg onl...,,Clc1c(cc(cc1c(cnc2N3CC[NH+](C)CC3)c(N)n2)Cl)Cl


In [3]:
def _canonical_smiles(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, or None if it cannot be parsed."""
    if not isinstance(smiles, str):
        # NaN / None cells coming from the spreadsheet are not parseable.
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals an invalid string by returning None.
        return None if mol is None else Chem.MolToSmiles(mol)
    except Exception:
        return None


def _pubchem_smiles(identifier):
    """Look ``identifier`` up in PubChem by name; return the last hit's isomeric SMILES or None."""
    try:
        hits = list(get_compounds(identifier, 'name'))
        # The last hit is used, matching what the previous loop-based code ended up with.
        return hits[-1].isomeric_smiles if hits else None
    except Exception:
        # Network / service failures must not abort the whole run.
        return None


def rdkit_smiles(df):
    """Check and unify the SMILES in ``df['smiles_supplementary']``.

    Every row gets an ``'isomeric_smiles'`` value: the RDKit canonical SMILES of
    ``'smiles_supplementary'`` when that string parses. Otherwise the compound is
    looked up in PubChem (by ``'cas_number'`` when present, else by
    ``'chemical_name'``, both through the ``'name'`` namespace); a successful
    lookup overwrites ``'smiles_supplementary'`` and is canonicalised in turn.
    Rows that still cannot be resolved are marked with the string ``'error'``.

    Rows are processed by position, so the frame may carry any index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'smiles_supplementary'``. ``'cas_number'`` and
        ``'chemical_name'`` are only needed for the PubChem fallback.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    n_rows = len(df)
    # The identifier columns are only consulted for rows whose SMILES fails to parse.
    cas_numbers = df['cas_number'].tolist() if 'cas_number' in df.columns else [None] * n_rows
    chem_names = df['chemical_name'].tolist() if 'chemical_name' in df.columns else [None] * n_rows

    supplementary = df['smiles_supplementary'].tolist()
    canonical = []
    for pos, smiles in enumerate(supplementary):
        smi = _canonical_smiles(smiles)
        if smi is None:
            # Prefer the CAS number; fall back to the chemical name only when CAS is missing.
            cas = cas_numbers[pos]
            identifier = chem_names[pos] if pd.isnull(cas) else cas
            if not pd.isnull(identifier):
                fetched = _pubchem_smiles(identifier)
                if fetched is not None:
                    supplementary[pos] = fetched
                    smi = _canonical_smiles(fetched)
        canonical.append('error' if smi is None else smi)

    # Whole-column assignment keeps the original index and also creates
    # 'isomeric_smiles' for an empty frame.
    df['smiles_supplementary'] = supplementary
    df['isomeric_smiles'] = canonical
    return df

In [4]:
def modify_smiles(df):
    """Re-derive SMILES from PubChem using each row's chemical name.

    For every row with a non-null ``'chemical_name'`` the name is looked up in
    PubChem. When the lookup returns at least one compound and the last hit's
    isomeric SMILES parses with RDKit, ``'smiles_supplementary'`` is set to the
    fetched SMILES and ``'isomeric_smiles'`` to its RDKit canonical form.
    Rows with no name, no hit, an unparseable hit or a failed request are left
    exactly as they were, so a row never inherits the structure fetched for a
    different row.

    Rows are processed by position, so the frame may carry any index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'chemical_name'``. ``'smiles_supplementary'`` and
        ``'isomeric_smiles'`` are created (filled with NaN) when absent.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    n_rows = len(df)
    missing = float('nan')
    supplementary = (df['smiles_supplementary'].tolist()
                     if 'smiles_supplementary' in df.columns else [missing] * n_rows)
    canonical = (df['isomeric_smiles'].tolist()
                 if 'isomeric_smiles' in df.columns else [missing] * n_rows)

    for pos, chem in enumerate(df['chemical_name']):
        if pd.isnull(chem):
            continue
        try:
            hits = list(get_compounds(chem, 'name'))
            if not hits:
                # Nothing found: keep the row untouched instead of reusing an earlier result.
                continue
            # The last hit is used, as in the previous loop-based implementation.
            fetched = hits[-1].isomeric_smiles
            mol = Chem.MolFromSmiles(fetched)
            if mol is None:
                continue
            smi = Chem.MolToSmiles(mol)
        except Exception:
            # Best effort: a failed request or parse leaves this row as it was.
            continue
        supplementary[pos] = fetched
        canonical[pos] = smi

    df['smiles_supplementary'] = supplementary
    df['isomeric_smiles'] = canonical
    return df

In [5]:
if __name__ == "__main__":
    # Canonicalise every SMILES; rows that cannot be resolved are tagged 'error'.
    data = rdkit_smiles(data)

    # Collect the tagged rows into their own frame, renumbered from 0.
    failed_mask = data["isomeric_smiles"].str.contains('error')
    error = data.loc[failed_mask].reset_index(drop=True)

    # Second attempt for those rows, this time from the chemical name.
    modify_error = modify_smiles(error)

    # Put the retried rows in front of the full table. The original 'error'
    # rows are still present in `data` at this point.
    data = pd.concat([modify_error, data], ignore_index=True)

In [6]:
# Drop every row whose 'isomeric_smiles' still carries the 'error' sentinel.
# `~` is the element-wise inversion operator for boolean masks; unary minus only
# works on a boolean Series through a pandas special case.
# na=True counts a missing 'isomeric_smiles' as an error, since such a row has no
# usable structure either.
is_error = data["isomeric_smiles"].str.contains('error', na=True)
data = data.loc[~is_error].reset_index(drop=True)
# output data shape
print(data.shape)

(7460, 12)


In [7]:
# Write the cleaned table to a new workbook, leaving out the row index.
OUTPUT_XLSX = '5_delete_rdkit_smiles_error.xlsx'
data.to_excel(OUTPUT_XLSX, index=False)