CURATION SCRIPT
by: Igor Sanches
Edited by: Francisco Feitosa

**IMPORT DEPENDENCIES AND IN-HOUSE FUNCTIONS**

In [None]:
#import libraries and dependencies

import pandas as pd
import math
import numpy as np

from rdkit import Chem
from chembl_structure_pipeline import standardizer
from rdkit.Chem.MolStandardize.metal import MetalDisconnector
import rdkit.Chem.MolStandardize.rdMolStandardize as rdMolStandardize
from rdkit.Chem import PandasTools

from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem import inchi as rd_inchi

from molvs import standardize_smiles
from molvs import Standardizer
from rdkit.Chem import Draw

In [None]:
#in-house functions
def metal_atomic_numbers(at):
    """Tell whether an atom's atomic number is one this script treats as a metal.

    Accepted atomic numbers are 13 and the inclusive ranges 21-31, 39-50,
    57-83 and 89-115.

    Args:
        at: Object exposing ``GetAtomicNum()`` (e.g. an RDKit atom).

    Returns:
        bool: True when the atomic number is in the accepted set.
    """
    atomic_num = at.GetAtomicNum()
    if atomic_num == 13:
        return True
    metal_ranges = ((21, 31), (39, 50), (57, 83), (89, 115))
    return any(low <= atomic_num <= high for low, high in metal_ranges)

def is_metal(smile):
    """Report whether the molecule encoded by a SMILES string contains a metal atom.

    An atom counts as a metal when ``metal_atomic_numbers`` returns True for it.
    Any number of metal atoms (one or more) makes the molecule metal-containing.

    Args:
        smile: SMILES string of the molecule to inspect.

    Returns:
        bool: True when at least one atom is a metal, False otherwise.

    Raises:
        ValueError: If RDKit cannot parse ``smile``.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None, not by raising.
        raise ValueError(f"cannot parse SMILES: {smile!r}")
    # Only atomic numbers are read, so no editable copy or property-cache update is needed.
    return any(metal_atomic_numbers(at) for at in mol.GetAtoms())

def smiles_preparator(smile):
    """Strip stereochemistry markers from a SMILES string.

    Every ``@``, ``/`` and backslash character is deleted; all other
    characters are kept in their original order.

    Args:
        smile: SMILES string to clean.

    Returns:
        str: The SMILES text without the three marker characters.
    """
    stripped = smile
    for marker in ('@', '/', '\\'):
        stripped = stripped.replace(marker, '')
    return str(stripped)

def salt_remover(mol):
    """Strip salt and counter-ion fragments from a molecule in successive passes.

    The first pass builds a ``SaltRemover`` with ``defnData=None``
    (presumably RDKit's built-in salt list, see
    https://github.com/rdkit/rdkit/blob/master/Data/Salts.txt - confirm).
    Each later pass uses one extra pattern from the tuple below.
    ``dontRemoveEverything=True`` is passed on every pass.

    Args:
        mol: RDKit molecule to clean.

    Returns:
        The molecule returned by the last stripping pass.
    """
    extra_definitions = (
        "[Cl,Br,I]", "[Li,Na,K,Ca,Mg]", "[O,N]", "[H]", "[Ba]", "[Al]", "[Cu]",
        "[Cs]", "[Zn]", "[Mn]", "Cl[Cr]Cl", "COS(=O)(=O)[O-]", "[Sb]", "[Cr]",
        "[Ni]", "[B]", "CCN(CC)CC", "NCCO", "O=CO", "O=S(=O)([O-])C(F)(F)F",
        "O=C(O)C(F)(F)F",
    )

    stripped = mol
    # None goes first, then the extra patterns in the order listed.
    for definition in (None, *extra_definitions):
        stripped = SaltRemover(defnData=definition).StripMol(stripped, dontRemoveEverything=True)

    return stripped



**SET PATH**

In [None]:
#choose a path to save
# Directory that receives every CSV file and the log file written by the cells below.
# NOTE(review): machine-specific absolute Windows path - adjust before running elsewhere.
savepath = r"C:\Users\LabMo\Downloads"

In [None]:
#df1 = pd.read_csv(r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Citotoxicidade\datasets\AID_1673\Raw data\AID_364_datatable.csv")
#df2 = pd.read_csv(r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Citotoxicidade\datasets\AID_1673\Raw data\AID_463_datatable.csv")
#df3 = pd.read_csv(r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Citotoxicidade\datasets\AID_1673\Raw data\AID_464_datatable.csv")
#df4 = pd.read_csv(r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Citotoxicidade\THP-1\AID_1117359_datatable.csv")

#df = pd.concat([df1, df2, df3])
#df

In [None]:
# Load the raw assay data table that the rest of the notebook curates.
# NOTE(review): machine-specific absolute Windows path - adjust before running elsewhere.
df = pd.read_csv(r"D:\OneDrive\Documentos\LabMol\IC-Citotoxicidade\datasets\AID_1345083 HEK\AID_1345083_datatable.csv")

**DATA PREPARATION**

In [None]:
# Keep only the three columns used downstream: the SMILES, the activity outcome and PUBCHEM_CID.
df0 = df.loc[:, ['PUBCHEM_EXT_DATASOURCE_SMILES', 'PUBCHEM_ACTIVITY_OUTCOME', 'PUBCHEM_CID']]
df0

In [None]:
# Give the SMILES and outcome columns the short names used by the following cells.
df0 = df0.rename(columns={'PUBCHEM_EXT_DATASOURCE_SMILES':'Molecule', 'PUBCHEM_ACTIVITY_OUTCOME':'Outcome'})
df0

In [None]:
def string_to_int(s):
    """Encode an activity-outcome label as an integer class.

    Args:
        s: Outcome label.

    Returns:
        1 for "Active", 0 for "Inactive", None for any other value.
    """
    label_codes = dict(Active=1, Inactive=0)
    return label_codes.get(s)

# Replace the text labels with 1/0; any label other than "Active"/"Inactive" maps to None.
df0['Outcome'] = df0['Outcome'].apply(string_to_int)
df0

In [None]:
# Discard rows without an Outcome value and renumber the index from 0.
df0 = df0.dropna(subset=['Outcome']).reset_index(drop=True)
df0

In [None]:
# Discard rows without a Molecule value and renumber the index from 0.
df0 = df0.dropna(subset=['Molecule']).reset_index(drop=True)
df0

In [None]:
# Number of rows per Outcome value.
df0.groupby('Outcome').size()

**STEREO REMOVAL**

In [None]:
# Strip the stereo markers from every input SMILES (each value is cast to str first).
smiles = list(map(smiles_preparator, map(str, df0['Molecule'])))
df0['SMILES_no_stereo'] = smiles

# Show the table with the new column.
df0

**REMOVE SALTS AND INVALID SMILES**

In [None]:
#remove salts
wrongSmiles = []            # one-row frames for SMILES that failed parsing or stripping
new_smiles = []             # salt-stripped SMILES of the rows that are kept
indexDropList_salts = []    # positions of every row to drop (failures and tiny leftovers)
for index, smile in enumerate(df0['SMILES_no_stereo']):
    try:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None, not by raising.
            raise ValueError(f"cannot parse SMILES: {smile!r}")
        remov = salt_remover(mol)
        if remov.GetNumAtoms() <= 2:
            # Two atoms or fewer remain after stripping: drop the row.
            indexDropList_salts.append(index)
        else:
            new_smiles.append(Chem.MolToSmiles(remov, kekuleSmiles=True))

    except Exception:
        wrongSmiles.append(df0.iloc[[index]])
        indexDropList_salts.append(index)


# Decide on the full drop list, not only on parse failures. Otherwise rows dropped
# for having too few atoms stay in df0 and the column assignment below fails
# because new_smiles is shorter than the frame.
if len(indexDropList_salts) == 0:
    print("no wrong smiles found")

else:
    #drop wrong smiles and rows left with two atoms or fewer
    df0 = df0.drop(df0.index[indexDropList_salts])

    # The count covers both kinds of dropped rows.
    print(f"{len(indexDropList_salts)} wrong smiles found")

    #save the rows that failed parsing or stripping (pd.concat rejects an empty list)
    if wrongSmiles:
        wrongsmiles = pd.concat(wrongSmiles)
        wrongsmiles.to_csv(f'{savepath}\\wrongsmiles_tryp.csv', sep=',', header=True, index=False)

df0['SMILES_no_salts'] = new_smiles

df0 = df0.reset_index(drop=True)

**REMOVE ORGANOMETALLICS**

In [None]:
# Positions of the rows whose salt-stripped SMILES is flagged by is_metal.
indexDropList_org = [
    position
    for position, smi in enumerate(df0['SMILES_no_salts'])
    if is_metal(smi)
]
# One single-row frame per flagged position, kept for the report file.
organometals = [df0.iloc[[position]] for position in indexDropList_org]

if indexDropList_org:
    # Drop the flagged rows.
    df0 = df0.drop(df0.index[indexDropList_org])

    print(f"{len(indexDropList_org)} organometallics found")

    # Save the dropped rows.
    organmetal = pd.concat(organometals)
    organmetal.to_csv(f'{savepath}\\organometallics_tryp.csv', sep=',', header=True, index=False)

else:
    print("no organometallics found")

df0 = df0.reset_index(drop=True)

**REMOVE MIXTURES**

In [None]:
# Remove mixtures: a SMILES containing '.' is treated as a mixture.
indexDropList_mix = [
    position
    for position, smi in enumerate(df0['SMILES_no_salts'])
    if '.' in smi
]
# One single-row frame per flagged position, kept for the report file.
mixtureList = [df0.iloc[[position]] for position in indexDropList_mix]

if indexDropList_mix:
    # Drop the mixture rows.
    df0 = df0.drop(df0.index[indexDropList_mix])

    print(f"{len(indexDropList_mix)} mixtures found")

    # Save the removed mixtures.
    mixtures = pd.concat(mixtureList)
    mixtures.to_csv(f'{savepath}\\mixtures_tryp.csv', sep=',', header=True, index=False)

else:
    print("no mixtures found")

df0 = df0.reset_index(drop=True)

In [None]:
# Remove molecules whose exact molecular weight is 1000 or more.

mols = list(map(Chem.MolFromSmiles, df0['SMILES_no_salts']))
indexdroplist_MW = [
    position
    for position, mol in enumerate(mols)
    if Chem.rdMolDescriptors.CalcExactMolWt(mol) >= 1000
]
# One single-row frame per flagged position, kept for the report file.
MWdroplist = [df0.iloc[[position]] for position in indexdroplist_MW]

if indexdroplist_MW:
    # Drop the heavy molecules.
    df0 = df0.drop(df0.index[indexdroplist_MW])

    print(f"{len(indexdroplist_MW)} polymer found")

    # Save the removed rows.
    MWW = pd.concat(MWdroplist)
    MWW.to_csv(f'{savepath}\\MW_tryp.csv', sep=',', header=True, index=False)

else:
    print("no molecule removed")

df0 = df0.reset_index(drop=True)

**STANDARDISE**

In [None]:
# Pipeline, one full stage at a time:
# SMILES -> RDKit mol -> molblock -> standardized molblock -> RDKit mol -> SMILES
rdMol = []
for smile in df0['SMILES_no_salts']:
    rdMol.append(Chem.MolFromSmiles(smile, sanitize=True))

molBlock = list(map(Chem.MolToMolBlock, rdMol))

stdMolBlock = list(map(standardizer.standardize_molblock, molBlock))

molFromMolBlock = list(map(Chem.MolFromMolBlock, stdMolBlock))

mol2smiles = list(map(Chem.MolToSmiles, molFromMolBlock))

# Store the standardized SMILES and renumber the index from 0.
df0 = df0.assign(final_smiles=mol2smiles).reset_index(drop=True)

**Remove Salts for the second time**

In [None]:
#remove salts second time
wrongSmiles = []            # one-row frames for SMILES that failed parsing or stripping
new_smiles = []             # salt-stripped SMILES of the rows that are kept
indexDropList_salts = []    # positions of every row to drop (failures and tiny leftovers)
for index, smile in enumerate(df0['final_smiles']):
    try:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None, not by raising.
            raise ValueError(f"cannot parse SMILES: {smile!r}")
        remov = salt_remover(mol)
        if remov.GetNumAtoms() <= 2:
            # Two atoms or fewer remain after stripping: drop the row.
            indexDropList_salts.append(index)
        else:
            new_smiles.append(Chem.MolToSmiles(remov, kekuleSmiles=True))

    except Exception:
        wrongSmiles.append(df0.iloc[[index]])
        indexDropList_salts.append(index)


# Decide on the full drop list, not only on parse failures. Otherwise rows dropped
# for having too few atoms stay in df0 and the column assignment below fails
# because new_smiles is shorter than the frame.
if len(indexDropList_salts) == 0:
    print("no wrong smiles found")

else:
    #drop wrong smiles and rows left with two atoms or fewer
    df0 = df0.drop(df0.index[indexDropList_salts])

    # The count covers both kinds of dropped rows.
    print(f"{len(indexDropList_salts)} wrong smiles found")

    #save the rows that failed parsing or stripping (pd.concat rejects an empty list)
    if wrongSmiles:
        wrongsmiles = pd.concat(wrongSmiles)
        wrongsmiles.to_csv(f'{savepath}\\wrongsmiles_after_std.csv', sep=',', header=True, index=False)

df0['final_smiles_stand'] = new_smiles

df0 = df0.reset_index(drop=True)

### DUPLICATES REMOVAL

In [None]:
# Compute an InChI string for every standardized, salt-stripped SMILES.
inchi_list = [
    Chem.inchi.MolToInchi(Chem.MolFromSmiles(smi))
    for smi in df0['final_smiles_stand']
]

# Add the InChI strings to the dataframe as a new column.
df0['InChI'] = inchi_list

In [None]:
# Outcome encoding set by string_to_int: 1 = Active, 0 = Inactive.
# Each variable name matches the class it holds.
df0_inactive = df0.query('Outcome == 0')
df0_active = df0.query('Outcome == 1')

# Concordant duplicates: within one class keep the first row per InChI.
df0_inactive = df0_inactive.drop_duplicates(subset=['InChI'])
df0_active = df0_active.drop_duplicates(subset=['InChI'])

# Outcome 0 rows come first, then Outcome 1 rows.
df_no_dup_concord = pd.concat([df0_inactive, df0_active], axis=0)

# Discordant duplicates: an InChI still present twice appears in both classes,
# so every row carrying it is dropped (keep=False).
final_drop_dup = df_no_dup_concord.drop_duplicates(subset=['InChI'], keep=False)

lastcount = final_drop_dup['InChI'].count()

df_final = final_drop_dup.reset_index(drop=True)

In [None]:
# Rows of df0 that did not reach df_final fall in two groups:
#  - repeated copies of one (InChI, Outcome) pair, where only the first row was kept;
#  - every row of an InChI that is absent from df_final.
# Filtering on InChI absence alone misses the first group, so both masks are combined.
repeated_same_outcome = df0.duplicated(subset=['InChI', 'Outcome'], keep='first')
inchi_not_kept = ~df0['InChI'].isin(df_final['InChI'])
df0_dropped = df0[repeated_same_outcome | inchi_not_kept]

# Save the removed rows with the columns relevant for inspection.
dropped_duplist = df0_dropped.loc[:, ['final_smiles_stand', 'Outcome', 'PUBCHEM_CID', 'InChI']]
dropped_duplist.to_csv(f'{savepath}\\dropped_duplicates.csv', sep=',', header=True, index=False)

countdprem = df0_dropped['InChI'].count()
print(f"{countdprem} duplicates removed")

### SAVE FINAL DATASET

In [None]:
# Number of rows per Outcome value in the deduplicated table.
df_final.groupby('Outcome').size()

In [None]:
# Expose the curated structure column under the name SMILES.
df_final = df_final.rename(columns={'final_smiles_stand': 'SMILES'})

In [None]:
# Display the table after the column rename.
df_final

In [None]:
# Keep only the columns that are exported.
df_final = df_final.loc[:, ['SMILES', 'Outcome', 'PUBCHEM_CID']]
# The backslash is escaped: '\c' is an invalid escape sequence in a non-raw string.
df_final.to_csv(f'{savepath}\\curated_binary.csv', sep=',', header=True, index=False)

In [None]:
def escrever_log(**contagens):
    """Write the removal counts to ``log.txt`` inside ``savepath``.

    The file is opened in write mode, so an existing log is overwritten. It
    gets a header line followed by one ``name: value`` line per keyword argument.

    Args:
        **contagens: Count name mapped to the value to report.
    """
    # The backslash is escaped: '\l' is an invalid escape sequence in a non-raw string.
    with open(f'{savepath}\\log.txt', 'w') as arquivo:
        arquivo.write('Were removed:\n')
        for chave, valor in contagens.items():
            arquivo.write(f'{chave}: {valor}\n')

# Count variables taken from the lists built by the curation cells.
# NOTE(review): wrongSmiles and indexDropList_salts are re-created by the second
# salt-removal cell, so on a top-to-bottom run these two counts cover only that pass.
Wrong_Smiles = len(wrongSmiles)
Salts = len(indexDropList_salts)
Organometals = len(indexDropList_org)
Mixtures = len(indexDropList_mix)
Polymers = len(indexdroplist_MW)
Duplicates = countdprem

# Call the function to write the log.
escrever_log(Wrong_Smiles=Wrong_Smiles, Salts=Salts, Organometals=Organometals, Mixtures=Mixtures, Polymers=Polymers, Duplicates=Duplicates)