# Standardize and Clean molecules 

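The following function standardizes molecules given as SMILES strings (cleanup, parent-fragment selection, neutralization, canonical tautomer, stereochemistry removal) so that a dataset is ready for training a generative AI model.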

In [11]:
from rdkit import Chem

In [12]:
#This function is taken from: https://bitsilla.com/blog/2021/06/standardizing-a-molecule-using-rdkit/
from rdkit.Chem.MolStandardize import rdMolStandardize

def standardize(smiles):
    """Return a standardized canonical SMILES string for ``smiles``.

    Follows the steps in
    https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
    as described by Greg Landrum in https://www.youtube.com/watch?v=eWTApNX8dJQ

    Pipeline, in order:

    1. parse the SMILES into an RDKit molecule;
    2. ``Cleanup``: remove Hs, disconnect metal atoms, normalize and
       reionize the molecule;
    3. if there are several fragments, keep the "parent" (the actual
       molecule we are interested in);
    4. try to neutralize the molecule;
    5. pick the canonical tautomer;
    6. remove stereochemistry;
    7. write the result back out as SMILES.

    No attempt is made at reionization after neutralizing, nor at
    ionization at some pH (RDKit has no pKa calculator). The main aim is
    to represent all molecules from different sources in a single standard
    way, for use in ML, catalogues, etc.

    The whole pipeline is wrapped in a try/except because we want to keep
    cleaning a large dataset even if some SMILES in between throw an error.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule to standardize.

    Returns
    -------
    str
        The standardized SMILES, or the literal string ``"None"`` (not the
        ``None`` object) when the input could not be processed. Every
        failing input is also appended to the module-level list
        ``failed_smiles``, which is created on first failure if it does
        not exist yet.

    Notes
    -----
    Running this function on 2.37 million molecules took 1076 minutes.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit reports an unparsable SMILES by returning None rather
            # than raising; fail explicitly instead of relying on the next
            # call choking on None.
            raise ValueError("could not parse SMILES")

        # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
        clean_mol = rdMolStandardize.Cleanup(mol)

        # if many fragments, get the "parent" (the actual mol we are interested in)
        parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

        # try to neutralize the molecule
        uncharger = rdMolStandardize.Uncharger()  # annoying, but necessary as no convenience method exists
        uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)

        te = rdMolStandardize.TautomerEnumerator()  # idem
        taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol)

        # modifies the molecule in place
        Chem.RemoveStereochemistry(taut_uncharged_parent_clean_mol)

        return Chem.MolToSmiles(taut_uncharged_parent_clean_mol)

    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit still propagate and a long-running batch can be stopped.
        # setdefault() creates the bookkeeping list on first use, so the
        # handler itself can no longer die with NameError when
        # `failed_smiles` was never defined; an existing list is reused.
        globals().setdefault("failed_smiles", []).append(smiles)
        return "None"

In [13]:
# To clean a single molecule, pass its SMILES string directly;
# the cell displays the standardized SMILES string that is returned.

standardize("C[P+](CC)(N)N")

[11:29:57] Initializing MetalDisconnector
[11:29:57] Running MetalDisconnector
[11:29:57] Initializing Normalizer
[11:29:57] Running Normalizer
[11:29:57] Rule applied: P+N to P=N+
[11:29:57] Initializing MetalDisconnector
[11:29:57] Running MetalDisconnector
[11:29:57] Initializing Normalizer
[11:29:57] Running Normalizer
[11:29:57] Running LargestFragmentChooser
[11:29:57] Running Uncharger
[11:29:57] Removed positive charge.


'CCP(C)(=N)N'

Try it on a dataframe:

In [17]:
import pandas as pd

# Two multi-fragment example inputs: a sodium salt, and a stereo-annotated
# molecule written together with a second aromatic-ring fragment.
smi = {
    'smiles': [
        'c1ccccc1C(=O)[O-].[Na+]',
        'O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C@@H](c2ccc(O)cc2)N1c1ccc(F)cc1.c1ccccc1',
    ],
}

# Build the frame and append the standardized column in one chained expression.
da = pd.DataFrame(smi).assign(
    clean_smiles=lambda frame: frame['smiles'].apply(standardize)
)
da

[11:31:18] Initializing MetalDisconnector
[11:31:18] Running MetalDisconnector
[11:31:18] Initializing Normalizer
[11:31:18] Running Normalizer
[11:31:18] Initializing MetalDisconnector
[11:31:18] Running MetalDisconnector
[11:31:18] Initializing Normalizer
[11:31:18] Running Normalizer
[11:31:18] Running LargestFragmentChooser
[11:31:18] Fragment: O=C([O-])c1ccccc1
[11:31:18] New largest fragment: O=C([O-])c1ccccc1 (14)
[11:31:18] Fragment: [Na+]
[11:31:18] Running Uncharger
[11:31:18] Removed negative charge.
[11:31:18] Initializing MetalDisconnector
[11:31:18] Running MetalDisconnector
[11:31:18] Initializing Normalizer
[11:31:18] Running Normalizer
[11:31:18] Initializing MetalDisconnector
[11:31:18] Running MetalDisconnector
[11:31:18] Initializing Normalizer
[11:31:18] Running Normalizer
[11:31:18] Running LargestFragmentChooser
[11:31:18] Fragment: O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C@@H](c2ccc(O)cc2)N1c1ccc(F)cc1
[11:31:18] New largest fragment: O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C

Unnamed: 0,smiles,clean_smiles
0,c1ccccc1C(=O)[O-].[Na+],O=C(O)c1ccccc1
1,O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C@@H](c2ccc(O...,O=C1C(CCC(O)c2ccc(F)cc2)C(c2ccc(O)cc2)N1c1ccc(...
