# Standardize and Clean molecules 

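The following function standardizes molecules given as SMILES strings (cleanup, parent-fragment selection, neutralization, and canonical tautomer) so that a dataset is ready for training a generative AI model.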

In [3]:
from rdkit import Chem

In [1]:
#This function is taken from: https://bitsilla.com/blog/2021/06/standardizing-a-molecule-using-rdkit/
from rdkit.Chem.MolStandardize import rdMolStandardize

def standardize(smiles):
    """Return a standardized SMILES string for the molecule given by ``smiles``.

    Follows the steps in
    https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb
    as described **excellently** (by Greg) in
    https://www.youtube.com/watch?v=eWTApNX8dJQ

    Pipeline: parse -> Cleanup -> FragmentParent -> Uncharger ->
    TautomerEnumerator.Canonicalize -> MolToSmiles.

    The main aim is to represent all molecules from different sources in a
    (single) standard way, for use in ML, catalogues, etc.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule to standardize.

    Returns
    -------
    str
        SMILES of the cleaned, uncharged parent fragment after tautomer
        canonicalization.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; without this
        # guard the None would be handed to Cleanup and fail with an opaque
        # argument error that does not mention the offending input.
        raise ValueError("Could not parse SMILES: {!r}".format(smiles))

    # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
    clean_mol = rdMolStandardize.Cleanup(mol)

    # if many fragments, get the "parent" (the actual mol we are interested in)
    parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)

    # try to neutralize molecule
    uncharger = rdMolStandardize.Uncharger()  # annoying, but necessary as no convenience method exists
    uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)

    # note that no attempt is made at reionization at this step,
    # nor at ionization at some pH (rdkit has no pKa calculator)

    te = rdMolStandardize.TautomerEnumerator()  # idem: no convenience method exists
    taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol)

    return Chem.MolToSmiles(taut_uncharged_parent_clean_mol)

In [5]:
# To clean a single molecule, pass its SMILES string; the standardized SMILES is returned
# (RDKit logs each standardization step it runs to the cell output).

standardize("C[P+](CC)(N)N")

[16:58:30] Initializing MetalDisconnector
[16:58:30] Running MetalDisconnector
[16:58:30] Initializing Normalizer
[16:58:30] Running Normalizer
[16:58:30] Rule applied: P+N to P=N+
[16:58:30] Initializing MetalDisconnector
[16:58:30] Running MetalDisconnector
[16:58:30] Initializing Normalizer
[16:58:30] Running Normalizer
[16:58:30] Running LargestFragmentChooser
[16:58:30] Running Uncharger
[16:58:30] Removed positive charge.


'CCP(C)(=N)N'

Try it on a dataframe:

In [7]:
import pandas as pd

# Two example inputs, each made of more than one fragment:
# a benzoate anion with a sodium counter-ion, and a stereo-annotated
# molecule written together with a separate benzene ring.
smi = {
    'smiles': [
        'c1ccccc1C(=O)[O-].[Na+]',
        'O=C1[C@H](CC[C@H](O)c2ccc(F)cc2)[C@@H](c2ccc(O)cc2)N1c1ccc(F)cc1.c1ccccc1',
    ],
}

# One-column DataFrame ('smiles') holding the example molecules.
da = pd.DataFrame(data=smi)

#### Sometimes, removing stereochemistry can be useful:

In [9]:
def removeStereo(smiles):
    """Return the SMILES for ``smiles`` after stripping its stereochemistry.

    Parameters
    ----------
    smiles : str
        SMILES string of the input molecule.

    Returns
    -------
    str
        SMILES written from the molecule after ``Chem.RemoveStereochemistry``
        has been applied to it.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    m = Chem.MolFromSmiles(smiles)
    if m is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of an opaque argument error below.
        raise ValueError("Could not parse SMILES: {!r}".format(smiles))
    # Modifies the molecule in place, so the call's return value is not used.
    Chem.RemoveStereochemistry(m)
    return Chem.MolToSmiles(m)

In [10]:
# Consider an example SMILES carrying two chirality tags ([C@H] and [C@@H]);
# the returned SMILES should no longer contain them.
removeStereo('C=CCO[C@H]1C[C@@H](C(=O)OC(C)(C)C)C1')

'C=CCOC1CC(C(=O)OC(C)(C)C)C1'