In [1]:
import logging
import warnings
import pandas as pd
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem, rdmolops
from rdkit.Chem.SaltRemover import SaltRemover

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. about
# chained assignment or deprecations) -- consider narrowing it.
warnings.filterwarnings("ignore")
# Turn off RDKit log output for every logger matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

In [2]:
# Read the input spreadsheet into the working frame.
data = pd.read_excel(io="5_delete_rdkit_smiles_error.xlsx")
# Report (rows, columns), then preview the first rows as the cell output.
print(data.shape)
data.head()

(7460, 12)


Unnamed: 0,chemical_name,cas_number,smiles,VDss_L_kg,SD,fu_h,reference_number,Comments_1,Comments_2,t 1/2,smiles_supplementary,isomeric_smiles
0,Gadofosveset,742680-48-4,C(C([N+]([Gd+++]([N+]1(C2)C3)([N+](C2)(C2)C4)(...,0.15,,0.16,R5,PDR Accessed on March 15 2013. http://www.pdr3...,0.03 mmol/kg dose. VDss from MRT and clearance.,16.3,[H+].[H+].[H+].C1CC(CCC1OP(=O)([O-])OC[C@@H](C...,O=C([O-])CN(CCN(CC(=O)[O-])C[C@H](COP(=O)([O-]...
1,Gadoxetate,770677-60-6,C(C([N+]([Gd+++]([N+]1(C2)C3)([N+](C2)(C2)C4)(...,0.21,,0.9,R5,PDR accessed March 15 2013 http://www.pdr3d.co...,VDss from MRT and clearance. Ppb < 10%. CAS n...,0.95,CCOC1=CC=C(C=C1)C[C@@H](CN(CCN(CC(=O)[O-])CC(=...,CCOc1ccc(C[C@@H](CN(CCN(CC(=O)[O-])CC(=O)[O-])...
2,Oxaliplatin,61825-94-3,C1CC[C@H]2[C@H](C1)N[Pt+2]3(N2)[O-]C(=O)C(=O)[...,0.5,,0.13,R5,"Jing Liu, Eric Kraut, Jonathan Bender, Rebecca...",Eloxatin. NSC 266046. Total plasma Pt data fro...,58.92,C1CC[C@H]([C@@H](C1)[NH-])[NH-].C(=O)(C(=O)O)O...,O=C(O)C(=O)O.[NH-][C@@H]1CCCC[C@H]1[NH-].[Pt+2]
3,(-)dOTC,160707-69-7,C1=CN(C2SC(OC2)CO)C(=O)N=C1N,1.18,,,R1,"PATRICK F. SMITH, ALAN FORREST, CHARLES H. BAL...",Dosed as 100 mg of racemate. (-) form called a...,,C1=CN(C2SC(OC2)CO)C(=O)N=C1N,Nc1ccn(C2COC(CO)S2)c(=O)n1
4,(+)dOTC,160707-68-6,C1=CN(C2SC(OC2)CO)C(=O)N=C1N,0.84,,,R1,"PATRICK F. SMITH, ALAN FORREST, CHARLES H. BAL...",Dosed as 100 mg of racemate. (-) form called a...,,C1=CN(C2SC(OC2)CO)C(=O)N=C1N,Nc1ccn(C2COC(CO)S2)c(=O)n1


In [3]:
# Atomic numbers treated as metals: Li, Be, Na, Mg, Al plus the contiguous
# runs K..Ga (19-31), Rb..Sn (37-50), Cs..Bi (55-83) and Fr..Lr (87-103).
metal_atoms = (
    {3, 4, 11, 12, 13}
    | set(range(19, 32))
    | set(range(37, 51))
    | set(range(55, 84))
    | set(range(87, 104))
)

# Same runs as above, but without Li, Be, Na, Mg, Al, K and Ca
# (i.e. starting at Sc, atomic number 21).
heavy_metal_atoms = (
    set(range(21, 32))
    | set(range(37, 51))
    | set(range(55, 84))
    | set(range(87, 104))
)

In [4]:
def strip_salts(df, salts_file='salts.smi'):
    """Strip salt fragments from every SMILES in ``df['isomeric_smiles']``.

    Each SMILES is parsed, passed through an RDKit ``SaltRemover`` built from
    ``salts_file``, and written back in the form produced by
    ``Chem.MolToSmiles``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``isomeric_smiles`` column. It is modified in place
        (column overwritten, index reset to 0..n-1).
    salts_file : str, optional
        Salt definition file handed to ``SaltRemover`` as ``defnFilename``.
        Defaults to ``'salts.smi'``, the file name previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    # Build the remover once; it is reused for every row.
    remover = SaltRemover(defnFilename=salts_file)

    stripped_smiles = []
    for smi in df['isomeric_smiles']:
        mol = Chem.MolFromSmiles(smi)
        mol_new = remover.StripMol(mol)
        stripped_smiles.append(Chem.MolToSmiles(mol_new))

    # Assign the whole column at once. The previous chained form
    # ``df['isomeric_smiles'].iat[i] = ...`` writes to a temporary Series and
    # no longer updates ``df`` when pandas Copy-on-Write is active.
    df['isomeric_smiles'] = stripped_smiles

    # Renumber rows 0..n-1.
    df.reset_index(drop=True, inplace=True)

    return df

In [5]:
def keep_largest_fragment(df):
    """Reduce every SMILES in ``df['isomeric_smiles']`` to its largest fragment.

    Each SMILES is parsed and split with ``rdmolops.GetMolFrags``; the
    fragment with the most atoms (``GetNumAtoms``) is kept and written back
    in the form produced by ``Chem.MolToSmiles``. On ties the first fragment
    with the maximal atom count wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``isomeric_smiles`` column. It is modified in place
        (column overwritten, index reset to 0..n-1).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    largest_smiles = []
    for smi in df['isomeric_smiles']:
        mol = Chem.MolFromSmiles(smi)
        fragments = rdmolops.GetMolFrags(mol, asMols=True)
        # Fall back to the parsed molecule itself when no fragment is returned.
        largest = max(fragments, default=mol, key=lambda m: m.GetNumAtoms())
        largest_smiles.append(Chem.MolToSmiles(largest))

    # Assign the whole column at once. The previous chained form
    # ``df['isomeric_smiles'].iat[i] = ...`` writes to a temporary Series and
    # no longer updates ``df`` when pandas Copy-on-Write is active.
    df['isomeric_smiles'] = largest_smiles

    # Renumber rows 0..n-1.
    df.reset_index(drop=True, inplace=True)

    return df

In [6]:
def remove_metal_molecules(df):
    """Drop every row whose molecule contains a metal atom.

    Each SMILES in ``df['isomeric_smiles']`` is parsed and tested with
    ``check_heavy_atoms``; rows for which it returns a truthy value are
    removed. Rows whose SMILES cannot be parsed (``Chem.MolFromSmiles``
    returns ``None``) are kept, matching the previous best-effort behaviour.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``isomeric_smiles`` column. Any index is accepted; rows
        are selected by position, not by label.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the retained rows, with the index reset to
        0..n-1. The input frame is not modified.
    """
    kept_positions = []
    metal_count = 0

    for position, smi in enumerate(df['isomeric_smiles']):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Unparsable SMILES cannot be checked; keep the row and record it.
            logging.debug("Could not parse SMILES %r; row is kept.", smi)
            kept_positions.append(position)
        elif check_heavy_atoms(mol=mol):
            metal_count += 1
        else:
            kept_positions.append(position)

    logging.debug("In total %d metal molecules are removed.",
                  metal_count)

    # Select once by position: avoids treating the loop counter as an index
    # label and avoids copying the frame for every removed row.
    return df.iloc[kept_positions].reset_index(drop=True)

In [7]:
def neutralize(df):
    """Neutralise every SMILES in ``df['isomeric_smiles']``.

    Each SMILES is passed to ``neutralisecharges`` (with ``reactions=None``)
    and replaced by the first element of the returned pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``isomeric_smiles`` column. It is modified in place
        (column overwritten, index reset to 0..n-1).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    neutral_smiles = []
    for smi in df['isomeric_smiles']:
        # Second element of the pair (the "was changed" flag) is not needed.
        smi_new, _ = neutralisecharges(smiles=smi, reactions=None)
        neutral_smiles.append(smi_new)

    # Assign the whole column at once. The previous chained form
    # ``df['isomeric_smiles'].iat[i] = ...`` writes to a temporary Series and
    # no longer updates ``df`` when pandas Copy-on-Write is active.
    df['isomeric_smiles'] = neutral_smiles

    # Renumber rows 0..n-1.
    df.reset_index(drop=True, inplace=True)

    return df

In [8]:
def check_heavy_atoms(mol):
    """Return True when any atom of ``mol`` has an atomic number in ``metal_atoms``.

    Despite the function name, the comparison is made against the module-level
    ``metal_atoms`` set, not ``heavy_metal_atoms``.
    """
    return any(atom.GetAtomicNum() in metal_atoms for atom in mol.GetAtoms())

In [9]:
def neutralisecharges(smiles, reactions=None):
    """Neutralise charged groups in a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES of the molecule to neutralise.
    reactions : sequence of (reactant, product) pairs, optional
        Substructure pattern / replacement pairs, applied in order. When
        ``None``, the pairs built by ``_InitialiseNeutralisationReactions``
        are used; they are created on first use and cached in the module
        global ``_reactions``.

    Returns
    -------
    tuple
        ``(new_smiles, True)`` when at least one replacement was made, where
        ``new_smiles`` comes from ``Chem.MolToSmiles(mol, True)``; otherwise
        ``(smiles, False)`` with the input string returned unchanged.
    """
    global _reactions
    if reactions is None:
        # ``globals().get`` tolerates the cache never having been created, so
        # the function works without a prior ``_reactions = None`` elsewhere.
        if globals().get('_reactions') is None:
            _reactions = _InitialiseNeutralisationReactions()
        reactions = _reactions

    mol = Chem.MolFromSmiles(smiles)
    replaced = False
    for reactant, product in reactions:
        # ReplaceSubstructs is used one match at a time (first result only),
        # so repeat until the pattern no longer matches.
        while mol.HasSubstructMatch(reactant):
            replaced = True
            mol = AllChem.ReplaceSubstructs(mol, reactant, product)[0]

    if replaced:
        return Chem.MolToSmiles(mol, True), True
    return smiles, False

In [10]:
def _InitialiseNeutralisationReactions():
    """Build the list of (pattern, replacement) molecule pairs for neutralisation.

    Every SMARTS pattern is compiled with ``Chem.MolFromSmarts`` and every
    replacement is parsed with ``Chem.MolFromSmiles(..., False)``, i.e. with
    sanitization turned off. The pairs are returned in the order listed here.
    """
    # (charged-group SMARTS, neutral replacement SMILES)
    smarts_to_smiles = (
        ('[n+;H]', 'n'),                      # Imidazoles
        ('[N+;!H0]', 'N'),                    # Amines
        ('[$([O-]);!$([O-][#7])]', 'O'),      # Carboxylic acids and alcohols
        ('[S-;X1]', 'S'),                     # Thiols
        ('[$([N-;X2]S(=O)=O)]', 'N'),         # Sulfonamides
        ('[$([N-;X2][C,N]=C)]', 'N'),         # Enamines
        ('[n-]', '[nH]'),                     # Tetrazoles
        ('[$([S-]=O)]', 'S'),                 # Sulfoxides
        ('[$([N-]C=O)]', 'N'),                # Amides
    )

    reaction_pairs = []
    for smarts, replacement in smarts_to_smiles:
        pattern_mol = Chem.MolFromSmarts(smarts)
        replacement_mol = Chem.MolFromSmiles(replacement, False)
        reaction_pairs.append((pattern_mol, replacement_mol))
    return reaction_pairs

In [11]:
if __name__ == "__main__":
    # Start with an empty neutralisation-reaction cache.
    _reactions = None
    # Cleaning stages, innermost call first:
    #   1. remove molecules that contain metals
    #   2. keep the largest fragment
    #   3. remove salts
    #   4. neutralize molecules
    # NOTE(review): salts are stripped only after the largest fragment has
    # already been selected -- confirm this order is intended.
    data = neutralize(
        strip_salts(
            keep_largest_fragment(
                remove_metal_molecules(data)
            )
        )
    )

In [12]:
# Write the cleaned table to a new workbook, leaving out the index column.
data.to_excel(excel_writer="6_isomeric_strip_salt.xlsx", index=False)