# Standardize and Clean molecules 

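This notebook shows how to get a dataset of molecules ready for training a generative AI model: first detect and remove invalid SMILES, then standardize and clean the remaining molecules.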

In [4]:
# Standard library first, then third-party packages.
import sys

import pandas as pd
import rdkit
from rdkit import Chem

# Record the toolkit and interpreter versions this notebook was run with.
print(f"I am RDKit version: {rdkit.__version__}")
print(f"I am python version {sys.version}")

I am RDKit version: 2023.09.5
I am python version 3.12.2 (main, Feb 11 2024, 00:01:24) [Clang 15.0.0 (clang-1500.1.0.2.5)]


### Detect invalid SMILES:

SMILES strings generated by AI models, or obtained by any other means, often have invalid syntax. Here is some code I found useful for detecting such SMILES and filtering them out of a dataframe:

In [5]:
# Load a sample dataframe whose SMILES column contains some faulty entries.

data = pd.read_csv("data/test_molecules.csv")
# data.shape[0] is the number of rows, i.e. the number of entries before any filtering.
print(f"There are {data.shape[0]} in the dataframe, some of which are faulty")

# Preview the first three rows.
data.head(3)

There are 1971 in the dataframe, some of which are faulty


Unnamed: 0,Item Name,M.w.,Solvent,Formula,SMILES,Solubility,Form,Pathway,Target,Information,URL,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,Angiotensin II,1046.2,H2O,C50H71N13O12,CCC(C)C(C(=O)NC(CC1=CN=CN1)C(=O)N2CCCC2C(=O)NC...,"?100.2mg/mL in H2O, <2.09mg/mL in DMSO",,Cardiovascular,Angiotensin Receptor,,http://www.apexbt.com/search.php?catalog=A1042,,,,
1,Levetiracetam,170.21,DMSO,C8H14N2O2,CCC(C(=O)N)N1CCCC1=O,>8.5mg/mL in DMSO,Free Base,Membrane Transporter/Ion Channel,Calcium Channel,Antiepileptic drug,http://www.apexbt.com/search.php?catalog=A1198,,,,
2,Daptomycin,1620.67,DMSO,C72H101N17O26,CCCCCCCCCC(=O)NC(CC1=CNC2=CC=CC=C21)C(=O)NC(CC...,>81.1mg/mL in DMSO,Free Base,DNA Damage/DNA Repair,DNA Synthesis,Calcium-dependent antibiotic,http://www.apexbt.com/search.php?catalog=A1206,,,,


The question is: how do we detect and remove them?

Use the following code:

In [6]:
df_smiles = data['SMILES']

ok_smiles = []      # canonical SMILES of every entry RDKit could process
delete_smiles = []  # original strings of the faulty entries, collected for removal

for ds in df_smiles:
    try:
        # Chem.CanonSmiles() raises when RDKit cannot process the input, so a
        # successful call doubles as a validity check.
        cs = Chem.CanonSmiles(ds)
    except Exception:
        # Catch Exception instead of using a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop a long-running loop.
        delete_smiles.append(ds)
    else:
        ok_smiles.append(cs)

[17:57:25] SMILES Parse Error: syntax error while parsing: C[C@]1(O)CO[C@H](O[C@H]2[C@H](N)C[C@H](N)[C@@H](O[C@H]3O[C@@]([H])([C@@H](C)NC)CC[C@H]3N)[C@@H]2O)[C@H](O)[C@H]1NC.C[C@]4(O)CO[C@H](O[C@H]5[C@H](N)C[C@H](N)[C@@H](O[C@H]6O[C@H](CN)CC[C@H]6N)[C@@H]5O)[C@H](O)[C@H]4NC.C[C@]7(O)CO[C@H](O[C@H]8[C@H](N)C[C@H](
[17:57:25] SMILES Parse Error: Failed parsing SMILES 'C[C@]1(O)CO[C@H](O[C@H]2[C@H](N)C[C@H](N)[C@@H](O[C@H]3O[C@@]([H])([C@@H](C)NC)CC[C@H]3N)[C@@H]2O)[C@H](O)[C@H]1NC.C[C@]4(O)CO[C@H](O[C@H]5[C@H](N)C[C@H](N)[C@@H](O[C@H]6O[C@H](CN)CC[C@H]6N)[C@@H]5O)[C@H](O)[C@H]4NC.C[C@]7(O)CO[C@H](O[C@H]8[C@H](N)C[C@H](' for input: 'C[C@]1(O)CO[C@H](O[C@H]2[C@H](N)C[C@H](N)[C@@H](O[C@H]3O[C@@]([H])([C@@H](C)NC)CC[C@H]3N)[C@@H]2O)[C@H](O)[C@H]1NC.C[C@]4(O)CO[C@H](O[C@H]5[C@H](N)C[C@H](N)[C@@H](O[C@H]6O[C@H](CN)CC[C@H]6N)[C@@H]5O)[C@H](O)[C@H]4NC.C[C@]7(O)CO[C@H](O[C@H]8[C@H](N)C[C@H]('
[17:57:25] SMILES Parse Error: syntax error while parsing: CC(O)=O.CC.CC.CCccC.[R].[P].[2H].[L].[E].[P].[

In [7]:
# Report how many entries ended up in delete_smiles, i.e. could not be processed by RDKit.
print(f"Turns out there are {len(delete_smiles)} problematic smiles in the dataframe")

Turns out there are 14 problematic smiles in the dataframe


In [8]:
# Keep only the rows whose SMILES string was not collected in delete_smiles.

new_data = data.loc[~data['SMILES'].isin(delete_smiles)]

In [9]:
# Compare the row counts after (new_data) and before (data) removing the faulty entries.
print(f"Now the dataframe contains {new_data.shape[0]} smiles, instead of {data.shape[0]} smiles")

Now the dataframe contains 1957 smiles, instead of 1971 smiles


### Standardize and Clean molecules:

In [10]:
#This function is taken from: https://bitsilla.com/blog/2021/06/standardizing-a-molecule-using-rdkit/
from rdkit.Chem.MolStandardize import rdMolStandardize

# Every input that standardize() could not process is appended to this list so
# the failures can be inspected after a bulk run. It must exist before the
# function is called; re-running this cell resets it to an empty list.
failed_smiles = []


def standardize(smiles):
    """Return a standardized canonical SMILES string for ``smiles``.

    Follows the steps in
    https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
    as described (excellently, by Greg) in
    https://www.youtube.com/watch?v=eWTApNX8dJQ

    The aim is to represent molecules from different sources in a single
    standard way, for use in ML, catalogues, etc. No attempt is made at
    reionization after uncharging, nor at ionization at some pH (RDKit has no
    pKa calculator).

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule to clean.

    Returns
    -------
    str
        The SMILES of the cleaned, stereo-free molecule, or the literal string
        ``"None"`` when the input could not be processed. In the failure case
        the input is also appended to the module-level ``failed_smiles`` list.

    Notes
    -----
    The whole pipeline is wrapped in a try/except on purpose: when cleaning a
    large collection we want to continue even if some SMILES raise an error.
    Running this function on 2.37 million molecules took 1076 minutes.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; treat
            # it as a failure right here instead of relying on a later call
            # choking on the None.
            raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")

        # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
        clean_mol = rdMolStandardize.Cleanup(mol)

        # if many fragments, get the "parent" (the actual mol we are interested in)
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # try to neutralize the molecule
        uncharger = rdMolStandardize.Uncharger()  # annoying, but necessary as no convenience method exists
        uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)

        # pick the canonical tautomer
        te = rdMolStandardize.TautomerEnumerator()  # idem: no convenience method
        taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol)

        # drop stereo information (modifies the molecule in place)
        Chem.RemoveStereochemistry(taut_uncharged_parent_clean_mol)

        return Chem.MolToSmiles(taut_uncharged_parent_clean_mol)

    except Exception:
        # Best-effort by design: record the offending input and keep going.
        # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
        # SystemExit still abort a multi-hour run.
        failed_smiles.append(smiles)
        return "None"

In [11]:
# To clean a single molecule, pass its SMILES string to standardize();
# the cleaned SMILES comes back as a string.

standardize("C[P+](CC)(N)N")

[17:57:26] Initializing MetalDisconnector
[17:57:26] Running MetalDisconnector
[17:57:26] Initializing Normalizer
[17:57:26] Running Normalizer
[17:57:26] Rule applied: P+N to P=N+
[17:57:26] Initializing MetalDisconnector
[17:57:26] Running MetalDisconnector
[17:57:26] Initializing Normalizer
[17:57:26] Running Normalizer
[17:57:26] Running LargestFragmentChooser
[17:57:26] Running Uncharger
[17:57:26] Removed positive charge.


'CCP(C)(=N)N'

Try it on a dataframe:

In [12]:
import pandas as pd

# Two example inputs; each contains a second fragment ([Na+] and c1ccccc1).
smi = {
    'smiles': [
        'c1ccccc1C(=O)[O-].[Na+]',
        'O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C@@H](c2ccc(O)cc2)N1c1ccc(F)cc1.c1ccccc1',
    ]
}
da = pd.DataFrame(smi)

# Run standardize() on every row and keep the result next to the input.
da['clean_smiles'] = da['smiles'].apply(standardize)
da

[17:57:26] Initializing MetalDisconnector
[17:57:26] Running MetalDisconnector
[17:57:26] Initializing Normalizer
[17:57:26] Running Normalizer
[17:57:26] Initializing MetalDisconnector
[17:57:26] Running MetalDisconnector
[17:57:26] Initializing Normalizer
[17:57:26] Running Normalizer
[17:57:26] Running LargestFragmentChooser
[17:57:26] Fragment: O=C([O-])c1ccccc1
[17:57:26] New largest fragment: O=C([O-])c1ccccc1 (14)
[17:57:26] Fragment: [Na+]
[17:57:26] Running Uncharger
[17:57:26] Removed negative charge.
[17:57:26] Initializing MetalDisconnector
[17:57:26] Running MetalDisconnector
[17:57:26] Initializing Normalizer
[17:57:26] Running Normalizer
[17:57:26] Initializing MetalDisconnector
[17:57:26] Running MetalDisconnector
[17:57:26] Initializing Normalizer
[17:57:26] Running Normalizer
[17:57:26] Running LargestFragmentChooser
[17:57:26] Fragment: O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C@@H](c2ccc(O)cc2)N1c1ccc(F)cc1
[17:57:26] New largest fragment: O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C

Unnamed: 0,smiles,clean_smiles
0,c1ccccc1C(=O)[O-].[Na+],O=C(O)c1ccccc1
1,O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C@@H](c2ccc(O...,O=C1C(CCC(O)c2ccc(F)cc2)C(c2ccc(O)cc2)N1c1ccc(...
