CURATION SCRIPT
by: Igor Sanches
Edited by: Francisco Feitosa

**IMPORT DEPENDENCIES AND IN-HOUSE FUNCTIONS**

In [1]:
#import libraries and dependencies

import pandas as pd
import math
import numpy as np

from rdkit import Chem
from chembl_structure_pipeline import standardizer
import rdkit.Chem.MolStandardize.rdMolStandardize as rdMolStandardize
from rdkit.Chem import PandasTools

from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem import inchi as rd_inchi

from molvs import standardize_smiles
from molvs import Standardizer
from rdkit.Chem import Draw

[14:54:43] Initializing Normalizer


In [2]:
#in-house functions
def metal_atomic_numbers(at):
    """Tell whether an atom's atomic number lies in the ranges this notebook treats as metals.

    Args:
        at: any object exposing ``GetAtomicNum()`` (an RDKit atom in this notebook).

    Returns:
        bool: True for atomic number 13 or for any value inside the inclusive
        ranges 21-31, 39-50, 57-83 and 89-115; False otherwise (for example
        6 or 11 return False).
    """
    atomic_number = at.GetAtomicNum()
    if atomic_number == 13:
        return True
    metal_ranges = ((21, 31), (39, 50), (57, 83), (89, 115))
    return any(low <= atomic_number <= high for low, high in metal_ranges)

def is_metal(smile):
    """Tell whether a SMILES string contains at least one metal atom.

    An atom counts as a metal when ``metal_atomic_numbers`` returns True for it.
    The previous implementation returned True only when *exactly one* metal
    atom was present, so structures with two or more metal atoms were not
    flagged by the organometallic filter.

    Args:
        smile: SMILES string to inspect.

    Returns:
        bool: True if any atom of the parsed molecule is a metal, else False.

    Raises:
        ValueError: if RDKit cannot parse ``smile`` (``MolFromSmiles`` returned None).
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # Fail with a readable message instead of an opaque error further down.
        raise ValueError(f"could not parse SMILES: {smile!r}")
    # Only atomic numbers are read, so no editable copy of the molecule is needed.
    return any(metal_atomic_numbers(at) for at in mol.GetAtoms())

def smiles_preparator(smile):
    """Delete the stereo marker characters from a SMILES string.

    Every '@', '/' and backslash character is removed by plain text
    replacement; the rest of the string is left as it is.

    Args:
        smile: SMILES text.

    Returns:
        str: the input with those three characters removed.
    """
    stripped = smile
    for marker in ('@', '/', "\\"):
        stripped = stripped.replace(marker, '')
    return str(stripped)

def salt_remover(mol):
    """Strip salt / counter-ion fragments from a molecule, one definition at a time.

    A ``SaltRemover`` is built for each entry of the definition list below and
    applied in order to the result of the previous pass, always with
    ``dontRemoveEverything=True``.
    The leading ``None`` entry presumably selects RDKit's built-in salt list
    (https://github.com/rdkit/rdkit/blob/master/Data/Salts.txt) - TODO confirm.

    Args:
        mol: RDKit molecule to clean.

    Returns:
        The molecule returned by the last ``StripMol`` call.
    """
    salt_definitions = (
        None, "[Cl,Br,I]", "[Li,Na,K,Ca,Mg]", "[O,N]", "[H]", "[Ba]", "[Al]",
        "[Cu]", "[Cs]", "[Zn]", "[Mn]", "Cl[Cr]Cl", "COS(=O)(=O)[O-]", "[Sb]",
        "[Cr]", "[Ni]", "[B]", "CCN(CC)CC", "NCCO", "O=CO",
        "O=S(=O)([O-])C(F)(F)F", "O=C(O)C(F)(F)F",
    )

    current = mol
    for definition in salt_definitions:
        current = SaltRemover(defnData=definition).StripMol(current, dontRemoveEverything=True)
    return current



**SET PATH**

In [3]:
# Output directory: the CSV files and the log written by this notebook go here.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
savepath = r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Skin\DADOS\GHS"

In [4]:
#df1 = pd.read_csv(r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Citotoxicidade\datasets\AID_1673\Raw data\AID_364_datatable.csv")
#df2 = pd.read_csv(r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Citotoxicidade\datasets\AID_1673\Raw data\AID_463_datatable.csv")
#df3 = pd.read_csv(r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Citotoxicidade\datasets\AID_1673\Raw data\AID_464_datatable.csv")
#df4 = pd.read_csv(r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Citotoxicidade\THP-1\AID_1117359_datatable.csv")

#df = pd.concat([df1, df2, df3])
#df

In [5]:
# Load the raw spreadsheet into df0 (machine-specific absolute path).
raw_table_path = r"C:\Users\franc\OneDrive\Documentos\LabMol\IC-Skin\DADOS\GHS\Skin_data_LLNA_smilesok_GHSprecuragem.xlsx"
df0 = pd.read_excel(raw_table_path)

**DATA PREPARATION**

In [None]:
# Keep only the PubChem assay columns - but only when the loaded table really
# has them. On a table without these columns the unguarded selection raises
# KeyError and stops a top-to-bottom run of the notebook.
pubchem_columns = ['PUBCHEM_EXT_DATASOURCE_SMILES', 'PUBCHEM_ACTIVITY_OUTCOME', 'PUBCHEM_CID']
if set(pubchem_columns).issubset(df0.columns):
    df0 = df0.loc[:, pubchem_columns]
df0

In [6]:
# Give the SMILES and label columns the names used by the rest of the notebook.
column_aliases = {'Canonical SMILES': 'Molecule', 'GHS': 'Outcome'}
df0 = df0.rename(columns=column_aliases)
df0

Unnamed: 0,ID,ID_type,CAS No,InChI,IUPAC,Molecule,EC3,Outcome,Source,Max Dose Tested
0,,,3344-77-2,1S/C12H25BrO/c13-11-9-7-5-3-1-2-4-6-8-10-12-14...,12-bromododecan-1-ol,C(CCCCCCBr)CCCCCO,6.90,1B,SSDB,
1,,,149-30-4,"1S/C7H5NS2/c9-7-8-5-3-1-2-4-6(5)10-7/h1-4H,(H,...","3H-1,3-benzothiazole-2-thione",C1=CC=C2C(=C1)NC(=S)S2,1.00,1B,SSDB,
2,,,1875-88-3,"1S/C8H9ClO/c9-8-3-1-7(2-4-8)5-6-10/h1-4,10H,5-6H2",,C1=CC(=CC=C1CCO)Cl,31.25,1B,SSDB,
3,,,4638-48-6,1S/C13H10ClNO2/c14-9-6-7-12(16)11(8-9)13(17)15...,5-chloro-2-hydroxy-N-phenylbenzamide,C1=CC=C(C=C1)NC(=O)C2=C(C=CC(=C2)Cl)O,5.00,1B,SSDB,
4,,,30618-84-9,"1S/C5H10O4S/c6-1-4(7)2-9-5(8)3-10/h4,6-7,10H,1...","2,3-dihydroxypropyl 2-sulfanylacetate",C(C(COC(=O)CS)O)O,4.70,1B,SSDB,
...,...,...,...,...,...,...,...,...,...,...
1624,,,931-36-2,"1S/C6H10N2/c1-3-6-7-4-5(2)8-6/h4H,3H2,1-2H3,(H...","Propoxylated reaction products of phenol, 4-no...",CCc1[nH]c(C)cn1,,1B,ECHA,
1625,,,22288-43-3,1S/C16H32O3/c1-8-10-11-13(9-2)14(17)18-19-16(6...,2-(propan-2-yloxy)ethyl acetate,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C,,1B,ECHA,
1626,,,1889-67-4,"1S/C18H22/c1-17(2,15-11-7-5-8-12-15)18(3,4)16-...",(9E)-undec-9-enal; (9Z)-undec-9-enal; undec-10...,CC(C)(c1ccccc1)C(C)(C)c2ccccc2,,1B,ECHA,
1627,,,75-66-1,"1S/C4H10S/c1-4(2,3)5/h5H,1-3H3",ditantalum(5+) pentaoxidandiide,CC(C)(C)S,,1B,ECHA,


In [None]:
def string_to_int(s):
    """Translate an activity label into an integer code.

    Args:
        s: label to translate.

    Returns:
        1 for "Active", 0 for "Inactive", None for any other value.
    """
    codes = dict(Active=1, Inactive=0)
    try:
        return codes[s]
    except KeyError:
        return None

# Convert textual Active/Inactive labels to 1/0.
# The conversion is skipped when the column holds no such label: string_to_int
# maps every other value to None, so applying it to labels such as "1A"/"1B"
# would blank the whole column and the dropna on 'Outcome' that follows would
# then discard every row.
if df0['Outcome'].isin(["Active", "Inactive"]).any():
    df0['Outcome'] = df0['Outcome'].apply(string_to_int)
df0

In [7]:
# Discard rows without an Outcome label and renumber the index from 0.
df0 = df0.dropna(subset=['Outcome']).reset_index(drop=True)
df0

Unnamed: 0,ID,ID_type,CAS No,InChI,IUPAC,Molecule,EC3,Outcome,Source,Max Dose Tested
0,,,3344-77-2,1S/C12H25BrO/c13-11-9-7-5-3-1-2-4-6-8-10-12-14...,12-bromododecan-1-ol,C(CCCCCCBr)CCCCCO,6.90,1B,SSDB,
1,,,149-30-4,"1S/C7H5NS2/c9-7-8-5-3-1-2-4-6(5)10-7/h1-4H,(H,...","3H-1,3-benzothiazole-2-thione",C1=CC=C2C(=C1)NC(=S)S2,1.00,1B,SSDB,
2,,,1875-88-3,"1S/C8H9ClO/c9-8-3-1-7(2-4-8)5-6-10/h1-4,10H,5-6H2",,C1=CC(=CC=C1CCO)Cl,31.25,1B,SSDB,
3,,,4638-48-6,1S/C13H10ClNO2/c14-9-6-7-12(16)11(8-9)13(17)15...,5-chloro-2-hydroxy-N-phenylbenzamide,C1=CC=C(C=C1)NC(=O)C2=C(C=CC(=C2)Cl)O,5.00,1B,SSDB,
4,,,30618-84-9,"1S/C5H10O4S/c6-1-4(7)2-9-5(8)3-10/h4,6-7,10H,1...","2,3-dihydroxypropyl 2-sulfanylacetate",C(C(COC(=O)CS)O)O,4.70,1B,SSDB,
...,...,...,...,...,...,...,...,...,...,...
1624,,,931-36-2,"1S/C6H10N2/c1-3-6-7-4-5(2)8-6/h4H,3H2,1-2H3,(H...","Propoxylated reaction products of phenol, 4-no...",CCc1[nH]c(C)cn1,,1B,ECHA,
1625,,,22288-43-3,1S/C16H32O3/c1-8-10-11-13(9-2)14(17)18-19-16(6...,2-(propan-2-yloxy)ethyl acetate,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C,,1B,ECHA,
1626,,,1889-67-4,"1S/C18H22/c1-17(2,15-11-7-5-8-12-15)18(3,4)16-...",(9E)-undec-9-enal; (9Z)-undec-9-enal; undec-10...,CC(C)(c1ccccc1)C(C)(C)c2ccccc2,,1B,ECHA,
1627,,,75-66-1,"1S/C4H10S/c1-4(2,3)5/h5H,1-3H3",ditantalum(5+) pentaoxidandiide,CC(C)(C)S,,1B,ECHA,


In [8]:
# Discard rows without a SMILES in 'Molecule' and renumber the index from 0.
df0 = df0.dropna(subset=['Molecule']).reset_index(drop=True)
df0

Unnamed: 0,ID,ID_type,CAS No,InChI,IUPAC,Molecule,EC3,Outcome,Source,Max Dose Tested
0,,,3344-77-2,1S/C12H25BrO/c13-11-9-7-5-3-1-2-4-6-8-10-12-14...,12-bromododecan-1-ol,C(CCCCCCBr)CCCCCO,6.90,1B,SSDB,
1,,,149-30-4,"1S/C7H5NS2/c9-7-8-5-3-1-2-4-6(5)10-7/h1-4H,(H,...","3H-1,3-benzothiazole-2-thione",C1=CC=C2C(=C1)NC(=S)S2,1.00,1B,SSDB,
2,,,1875-88-3,"1S/C8H9ClO/c9-8-3-1-7(2-4-8)5-6-10/h1-4,10H,5-6H2",,C1=CC(=CC=C1CCO)Cl,31.25,1B,SSDB,
3,,,4638-48-6,1S/C13H10ClNO2/c14-9-6-7-12(16)11(8-9)13(17)15...,5-chloro-2-hydroxy-N-phenylbenzamide,C1=CC=C(C=C1)NC(=O)C2=C(C=CC(=C2)Cl)O,5.00,1B,SSDB,
4,,,30618-84-9,"1S/C5H10O4S/c6-1-4(7)2-9-5(8)3-10/h4,6-7,10H,1...","2,3-dihydroxypropyl 2-sulfanylacetate",C(C(COC(=O)CS)O)O,4.70,1B,SSDB,
...,...,...,...,...,...,...,...,...,...,...
1624,,,931-36-2,"1S/C6H10N2/c1-3-6-7-4-5(2)8-6/h4H,3H2,1-2H3,(H...","Propoxylated reaction products of phenol, 4-no...",CCc1[nH]c(C)cn1,,1B,ECHA,
1625,,,22288-43-3,1S/C16H32O3/c1-8-10-11-13(9-2)14(17)18-19-16(6...,2-(propan-2-yloxy)ethyl acetate,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C,,1B,ECHA,
1626,,,1889-67-4,"1S/C18H22/c1-17(2,15-11-7-5-8-12-15)18(3,4)16-...",(9E)-undec-9-enal; (9Z)-undec-9-enal; undec-10...,CC(C)(c1ccccc1)C(C)(C)c2ccccc2,,1B,ECHA,
1627,,,75-66-1,"1S/C4H10S/c1-4(2,3)5/h5H,1-3H3",ditantalum(5+) pentaoxidandiide,CC(C)(C)S,,1B,ECHA,


In [9]:
# Number of rows in each Outcome class.
df0.groupby('Outcome').size()

Outcome
1A     613
1B    1016
dtype: int64

**STEREOCHEMISTRY REMOVAL**

In [10]:
# Strip the stereo markers from every input SMILES (values are cast to str first)
# and keep the result in a new column.
smiles = list(map(smiles_preparator, map(str, df0['Molecule'])))
df0['SMILES_no_stereo'] = smiles

df0

Unnamed: 0,ID,ID_type,CAS No,InChI,IUPAC,Molecule,EC3,Outcome,Source,Max Dose Tested,SMILES_no_stereo
0,,,3344-77-2,1S/C12H25BrO/c13-11-9-7-5-3-1-2-4-6-8-10-12-14...,12-bromododecan-1-ol,C(CCCCCCBr)CCCCCO,6.90,1B,SSDB,,C(CCCCCCBr)CCCCCO
1,,,149-30-4,"1S/C7H5NS2/c9-7-8-5-3-1-2-4-6(5)10-7/h1-4H,(H,...","3H-1,3-benzothiazole-2-thione",C1=CC=C2C(=C1)NC(=S)S2,1.00,1B,SSDB,,C1=CC=C2C(=C1)NC(=S)S2
2,,,1875-88-3,"1S/C8H9ClO/c9-8-3-1-7(2-4-8)5-6-10/h1-4,10H,5-6H2",,C1=CC(=CC=C1CCO)Cl,31.25,1B,SSDB,,C1=CC(=CC=C1CCO)Cl
3,,,4638-48-6,1S/C13H10ClNO2/c14-9-6-7-12(16)11(8-9)13(17)15...,5-chloro-2-hydroxy-N-phenylbenzamide,C1=CC=C(C=C1)NC(=O)C2=C(C=CC(=C2)Cl)O,5.00,1B,SSDB,,C1=CC=C(C=C1)NC(=O)C2=C(C=CC(=C2)Cl)O
4,,,30618-84-9,"1S/C5H10O4S/c6-1-4(7)2-9-5(8)3-10/h4,6-7,10H,1...","2,3-dihydroxypropyl 2-sulfanylacetate",C(C(COC(=O)CS)O)O,4.70,1B,SSDB,,C(C(COC(=O)CS)O)O
...,...,...,...,...,...,...,...,...,...,...,...
1624,,,931-36-2,"1S/C6H10N2/c1-3-6-7-4-5(2)8-6/h4H,3H2,1-2H3,(H...","Propoxylated reaction products of phenol, 4-no...",CCc1[nH]c(C)cn1,,1B,ECHA,,CCc1[nH]c(C)cn1
1625,,,22288-43-3,1S/C16H32O3/c1-8-10-11-13(9-2)14(17)18-19-16(6...,2-(propan-2-yloxy)ethyl acetate,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C,,1B,ECHA,,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C
1626,,,1889-67-4,"1S/C18H22/c1-17(2,15-11-7-5-8-12-15)18(3,4)16-...",(9E)-undec-9-enal; (9Z)-undec-9-enal; undec-10...,CC(C)(c1ccccc1)C(C)(C)c2ccccc2,,1B,ECHA,,CC(C)(c1ccccc1)C(C)(C)c2ccccc2
1627,,,75-66-1,"1S/C4H10S/c1-4(2,3)5/h5H,1-3H3",ditantalum(5+) pentaoxidandiide,CC(C)(C)S,,1B,ECHA,,CC(C)(C)S


**REMOVE SALTS AND INVALID SMILES**

In [11]:
# Remove salts from the stereo-free SMILES.
# A row is dropped when (a) parsing / salt stripping raises, or (b) the stripped
# molecule has 2 atoms or fewer. Only rows of kind (a) are saved to the CSV.
wrongSmiles = []          # one-row frames for SMILES that raised during processing
new_smiles = []           # desalted SMILES of the rows that are kept, in row order
indexDropList_salts = []  # positional indices of every row to drop, kinds (a) and (b)
for index, smile in enumerate(df0['SMILES_no_stereo']):
    try:
        mol = Chem.MolFromSmiles(smile)
        remov = salt_remover(mol)
        if remov.GetNumAtoms() <= 2:
            indexDropList_salts.append(index)
        else:
            new_smiles.append(Chem.MolToSmiles(remov, kekuleSmiles=True))
    except Exception:
        # Unparsable SMILES end up here (salt_remover fails on a None molecule).
        # `Exception` instead of a bare `except` so Ctrl-C still interrupts the loop.
        wrongSmiles.append(df0.iloc[[index]])
        indexDropList_salts.append(index)


# Decide on the full drop list, not on wrongSmiles alone: when rows were dropped
# only for being too small, the frame must still shrink, otherwise the
# new_smiles assignment below fails with a length mismatch.
if len(indexDropList_salts) == 0:
    print("no wrong smiles found")

else:
    #drop wrong smiles and too-small remainders
    df0 = df0.drop(df0.index[indexDropList_salts])

    print(f"{len(indexDropList_salts)} wrong smiles found")

    #save the rows that raised (pd.concat rejects an empty list, hence the guard)
    if wrongSmiles:
        wrongsmiles = pd.concat(wrongSmiles)
        wrongsmiles.to_csv(f'{savepath}\\wrongsmiles_tryp.csv', sep=',', header=True, index=False)

df0['SMILES_no_salts'] = new_smiles

df0 = df0.reset_index(drop=True)

[14:54:57] SMILES Parse Error: syntax error while parsing: [Cl]|[Sn](|[Cl])(|[Cl])|[Cl]
[14:54:57] SMILES Parse Error: Failed parsing SMILES '[Cl]|[Sn](|[Cl])(|[Cl])|[Cl]' for input: '[Cl]|[Sn](|[Cl])(|[Cl])|[Cl]'
[14:54:58] SMILES Parse Error: syntax error while parsing: C[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O)OCC(CC)CCCC)|[S]CC(=O)OCC(CC)CCCC
[14:54:58] SMILES Parse Error: Failed parsing SMILES 'C[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O)OCC(CC)CCCC)|[S]CC(=O)OCC(CC)CCCC' for input: 'C[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O)OCC(CC)CCCC)|[S]CC(=O)OCC(CC)CCCC'


63 wrong smiles found


**REMOVE ORGANOMETALLICS**

In [12]:
# Positions of the rows whose desalted SMILES is flagged by is_metal().
indexDropList_org = [
    position
    for position, smi in enumerate(df0['SMILES_no_salts'])
    if is_metal(smi)
]
# One-row frames of the flagged rows, kept for the audit CSV.
organometals = [df0.iloc[[position]] for position in indexDropList_org]

if not indexDropList_org:
    print("no organometallics found")
else:
    # Remove the flagged rows from the working frame.
    df0 = df0.drop(df0.index[indexDropList_org])

    print(f"{len(indexDropList_org)} organometallics found")

    # Save what was removed.
    organmetal = pd.concat(organometals)
    organmetal.to_csv(f'{savepath}\\organometallics_tryp.csv', sep=',', header=True, index=False)

df0 = df0.reset_index(drop=True)

3 organometallics found


**REMOVE MIXTURES**

In [13]:
# Remove mixtures: a '.' in the SMILES text marks more than one component.
indexDropList_mix = [
    position
    for position, smi in enumerate(df0['SMILES_no_salts'])
    if '.' in smi
]
# One-row frames of the flagged rows, kept for the audit CSV.
mixtureList = [df0.iloc[[position]] for position in indexDropList_mix]

if not indexDropList_mix:
    print("no mixtures found")
else:
    # Remove the flagged rows from the working frame.
    df0 = df0.drop(df0.index[indexDropList_mix])

    print(f"{len(indexDropList_mix)} mixtures found")

    # Save what was removed.
    mixtures = pd.concat(mixtureList)
    mixtures.to_csv(f'{savepath}\\mixtures_tryp.csv', sep=',', header=True, index=False)

df0 = df0.reset_index(drop=True)

27 mixtures found


In [14]:
# Remove molecules whose exact molecular weight is 1000 or more.

mols = [Chem.MolFromSmiles(smile) for smile in df0['SMILES_no_salts']]
indexdroplist_MW = [
    position
    for position, mol in enumerate(mols)
    if Chem.rdMolDescriptors.CalcExactMolWt(mol) >= 1000
]
# One-row frames of the flagged rows, kept for the audit CSV.
MWdroplist = [df0.iloc[[position]] for position in indexdroplist_MW]

if not indexdroplist_MW:
    print("no molecule removed")
else:
    # Remove the flagged rows from the working frame.
    df0 = df0.drop(df0.index[indexdroplist_MW])

    print(f"{len(indexdroplist_MW)} polymer found")

    # Save what was removed.
    MWW = pd.concat(MWdroplist)
    MWW.to_csv(f'{savepath}\\MW_tryp.csv', sep=',', header=True, index=False)

df0 = df0.reset_index(drop=True)

2 polymer found


**STANDARDISE**

In [15]:
# Standardise every desalted SMILES:
# SMILES -> RDKit mol -> molblock -> standardize_molblock -> RDKit mol -> SMILES.
standardized_smiles = []
for smile in df0['SMILES_no_salts']:
    parsed = Chem.MolFromSmiles(smile, sanitize=True)
    std_block = standardizer.standardize_molblock(Chem.MolToMolBlock(parsed))
    standardized_smiles.append(Chem.MolToSmiles(Chem.MolFromMolBlock(std_block)))

df0['final_smiles'] = standardized_smiles
df0 = df0.reset_index(drop=True)

[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharger
[14:54:59] Running Normalizer
[14:54:59] Running Uncharg

In [16]:
# NOTE(review): inert cell - everything below is one string literal, so none of
# it runs. It holds a step-by-step variant of the one-line standardisation cell.
""""rdMol = [Chem.MolFromSmiles(smile, sanitize=True) for smile in df0['SMILES_no_salts']]

molBlock = [Chem.MolToMolBlock(mol) for mol in rdMol]

stdMolBlock = [standardizer.standardize_molblock(mol_block) for mol_block in molBlock]

molFromMolBlock = [Chem.MolFromMolBlock(std_molblock) for std_molblock in stdMolBlock]

mol2smiles = [Chem.MolToSmiles(m) for m in molFromMolBlock]

df0['final_smiles'] = mol2smiles

df0 = df0.reset_index(drop=True)"""""

'"rdMol = [Chem.MolFromSmiles(smile, sanitize=True) for smile in df0[\'SMILES_no_salts\']]\n\nmolBlock = [Chem.MolToMolBlock(mol) for mol in rdMol]\n\nstdMolBlock = [standardizer.standardize_molblock(mol_block) for mol_block in molBlock]\n\nmolFromMolBlock = [Chem.MolFromMolBlock(std_molblock) for std_molblock in stdMolBlock]\n\nmol2smiles = [Chem.MolToSmiles(m) for m in molFromMolBlock]\n\ndf0[\'final_smiles\'] = mol2smiles\n\ndf0 = df0.reset_index(drop=True)'

**Remove Salts for the second time**

In [17]:
# Remove salts a second time, now on the standardised SMILES.
# A row is dropped when (a) parsing / salt stripping raises, or (b) the stripped
# molecule has 2 atoms or fewer. Only rows of kind (a) are saved to the CSV.
wrongSmiles = []          # one-row frames for SMILES that raised during processing
new_smiles = []           # desalted SMILES of the rows that are kept, in row order
indexDropList_salts = []  # positional indices of every row to drop, kinds (a) and (b)
for index, smile in enumerate(df0['final_smiles']):
    try:
        mol = Chem.MolFromSmiles(smile)
        remov = salt_remover(mol)
        if remov.GetNumAtoms() <= 2:
            indexDropList_salts.append(index)
        else:
            new_smiles.append(Chem.MolToSmiles(remov, kekuleSmiles=True))
    except Exception:
        # Unparsable SMILES end up here (salt_remover fails on a None molecule).
        # `Exception` instead of a bare `except` so Ctrl-C still interrupts the loop.
        wrongSmiles.append(df0.iloc[[index]])
        indexDropList_salts.append(index)


# Decide on the full drop list, not on wrongSmiles alone: when rows were dropped
# only for being too small, the frame must still shrink, otherwise the
# new_smiles assignment below fails with a length mismatch.
if len(indexDropList_salts) == 0:
    print("no wrong smiles found")

else:
    #drop wrong smiles and too-small remainders
    df0 = df0.drop(df0.index[indexDropList_salts])

    print(f"{len(indexDropList_salts)} wrong smiles found")

    #save the rows that raised (pd.concat rejects an empty list, hence the guard)
    if wrongSmiles:
        wrongsmiles = pd.concat(wrongSmiles)
        wrongsmiles.to_csv(f'{savepath}\\wrongsmiles_after_std.csv', sep=',', header=True, index=False)

df0['final_smiles_stand'] = new_smiles

df0 = df0.reset_index(drop=True)

no wrong smiles found


### DUPLICATES REMOVAL

In [18]:
# Compute an InChI string for every fully processed SMILES.
inchi_list = [
    Chem.inchi.MolToInchi(Chem.MolFromSmiles(smi))
    for smi in df0['final_smiles_stand']
]

# Store the InChI strings in the dataframe's 'InChI' column.
df0['InChI'] = inchi_list






















































































































































































































































































































































































































































































































































































































































































































































































In [31]:
#Multiclass

# Split the rows by Outcome class (only "1A" and "1B" are selected).
df_1A = df0[df0['Outcome'] == "1A"]
df_1B = df0[df0['Outcome'] == "1B"]

# Within each class keep a single row per InChI (the first occurrence).
df_1A = df_1A.drop_duplicates(subset=['InChI'])
df_1B = df_1B.drop_duplicates(subset=['InChI'])

# Stack the two classes back together.
df_no_dup_concord = pd.concat([df_1A, df_1B], axis=0)

# An InChI still present more than once now appears in both classes, i.e. has
# discordant Outcomes: keep=False removes every row of such compounds.
final_drop_dup = df_no_dup_concord.drop_duplicates(subset=['InChI'], keep=False)

# Final frame with a fresh 0..n-1 index.
df_final = final_drop_dup.reset_index(drop=True)

In [20]:
# NOTE(review): inert cell - everything below is one string literal, so none of
# it runs. It holds the binary (Outcome 1 / 0) variant of the duplicate removal.
"""#Binary

df0_active = df0.query('Outcome == 1')
df0_inactive = df0.query('Outcome == 0')

df0_active = df0_active.drop_duplicates(subset=['InChI'], inplace=False)
df0_inactive = df0_inactive.drop_duplicates(subset=['InChI'], inplace=False)

df_no_dup_concord = pd.concat([df0_active, df0_inactive], axis=0)

final_drop_dup = df_no_dup_concord.drop_duplicates(subset=['InChI'], keep=False, inplace=False)

lastcount = final_drop_dup['InChI'].count()

df_final = final_drop_dup

df_final = df_final.reset_index(drop=True)"""

"#Binary\n\ndf0_active = df0.query('Outcome == 1')\ndf0_inactive = df0.query('Outcome == 0')\n\ndf0_active = df0_active.drop_duplicates(subset=['InChI'], inplace=False)\ndf0_inactive = df0_inactive.drop_duplicates(subset=['InChI'], inplace=False)\n\ndf_no_dup_concord = pd.concat([df0_active, df0_inactive], axis=0)\n\nfinal_drop_dup = df_no_dup_concord.drop_duplicates(subset=['InChI'], keep=False, inplace=False)\n\nlastcount = final_drop_dup['InChI'].count()\n\ndf_final = final_drop_dup\n\ndf_final = df_final.reset_index(drop=True)"

In [36]:
# Display the deduplicated dataset.
df_final

Unnamed: 0,ID,ID_type,CAS No,InChI,IUPAC,Molecule,EC3,Outcome,Source,Max Dose Tested,SMILES_no_stereo,SMILES_no_salts,final_smiles,final_smiles_stand
0,,,87-66-1,"InChI=1S/C6H6O3/c7-4-2-1-3-5(8)6(4)9/h1-3,7-9H","BENZENE-1,2,3-TRIOL",C1=CC(=C(C(=C1)O)O)O,0.4000,1A,SSDB,,C1=CC(=C(C(=C1)O)O)O,OC1=CC=CC(O)=C1O,Oc1cccc(O)c1O,OC1=CC=CC(O)=C1O
1,,,1210-39-5,InChI=1S/C15H12O/c16-12-11-15(13-7-3-1-4-8-13)...,"3,3-di(phenyl)prop-2-enal",C1=CC=C(C=C1)C(=CC=O)C2=CC=CC=C2,0.6000,1A,SSDB,,C1=CC=C(C=C1)C(=CC=O)C2=CC=CC=C2,O=CC=C(C1=CC=CC=C1)C1=CC=CC=C1,O=CC=C(c1ccccc1)c1ccccc1,O=CC=C(C1=CC=CC=C1)C1=CC=CC=C1
2,,,106-51-4,InChI=1S/C6H4O2/c7-5-1-2-6(8)4-3-5/h1-4H,"cyclohexa-2,5-diene-1,4-dione",C1=CC(=O)C=CC1=O,0.0099,1A,SSDB,,C1=CC(=O)C=CC1=O,O=C1C=CC(=O)C=C1,O=C1C=CC(=O)C=C1,O=C1C=CC(=O)C=C1
3,,,94-36-0,InChI=1S/C14H10O4/c15-13(11-7-3-1-4-8-11)17-18...,benzoyl benzenecarboperoxoate,C1=CC=C(C=C1)C(=O)OOC(=O)C2=CC=CC=C2,0.0044,1A,SSDB,,C1=CC=C(C=C1)C(=O)OOC(=O)C2=CC=CC=C2,O=C(OOC(=O)C1=CC=CC=C1)C1=CC=CC=C1,O=C(OOC(=O)c1ccccc1)c1ccccc1,O=C(OOC(=O)C1=CC=CC=C1)C1=CC=CC=C1
4,,,534-85-0,InChI=1S/C12H12N2/c13-11-8-4-5-9-12(11)14-10-6...,"N-phenylbenzene-1,2-diamine",C1=CC=C(C=C1)NC2=CC=CC=C2N,0.5000,1A,SSDB,,C1=CC=C(C=C1)NC2=CC=CC=C2N,NC1=CC=CC=C1NC1=CC=CC=C1,Nc1ccccc1Nc1ccccc1,NC1=CC=CC=C1NC1=CC=CC=C1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466,,,931-36-2,"InChI=1S/C6H10N2/c1-3-6-7-4-5(2)8-6/h4H,3H2,1-...","Propoxylated reaction products of phenol, 4-no...",CCc1[nH]c(C)cn1,,1B,ECHA,,CCc1[nH]c(C)cn1,CCC1=NC=C(C)N1,CCc1ncc(C)[nH]1,CCC1=NC=C(C)N1
467,,,22288-43-3,InChI=1S/C16H32O3/c1-8-10-11-13(9-2)14(17)18-1...,2-(propan-2-yloxy)ethyl acetate,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C,,1B,ECHA,,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C
468,,,1889-67-4,"InChI=1S/C18H22/c1-17(2,15-11-7-5-8-12-15)18(3...",(9E)-undec-9-enal; (9Z)-undec-9-enal; undec-10...,CC(C)(c1ccccc1)C(C)(C)c2ccccc2,,1B,ECHA,,CC(C)(c1ccccc1)C(C)(C)c2ccccc2,CC(C)(C1=CC=CC=C1)C(C)(C)C1=CC=CC=C1,CC(C)(c1ccccc1)C(C)(C)c1ccccc1,CC(C)(C1=CC=CC=C1)C(C)(C)C1=CC=CC=C1
469,,,75-66-1,"InChI=1S/C4H10S/c1-4(2,3)5/h5H,1-3H3",ditantalum(5+) pentaoxidandiide,CC(C)(C)S,,1B,ECHA,,CC(C)(C)S,CC(C)(C)S,CC(C)(C)S,CC(C)(C)S


In [21]:
# Rows of df0 whose InChI does not appear in df_final at all.
# Note: rows removed as repeated copies of a compound that IS kept in df_final
# are not part of this selection, so they are neither saved nor counted here.
kept_inchi = df_final['InChI']
df0_dropped = df0.loc[~df0['InChI'].isin(kept_inchi)]

audit_columns = ['final_smiles_stand', 'Outcome', 'CAS No', 'InChI', 'Source']
dropped_duplist = df0_dropped[audit_columns]
dropped_duplist.to_csv(f'{savepath}\\dropped_duplicates.csv', sep=',', header=True, index=False)

countdprem = df0_dropped['InChI'].count()
print(f"{countdprem} duplicates removed")

461 duplicates removed


### SAVE FINAL DATASET

In [22]:
# Number of rows of the final dataset in each Outcome class.
df_final.groupby('Outcome').size()

Outcome
1A    124
1B    347
dtype: int64

In [23]:
# Expose the fully processed SMILES under the column name 'SMILES'.
df_final = df_final.rename(columns={'final_smiles_stand': 'SMILES'})

In [24]:
# Display the final dataset after the column rename.
df_final

Unnamed: 0,ID,ID_type,CAS No,InChI,IUPAC,Molecule,EC3,Outcome,Source,Max Dose Tested,SMILES_no_stereo,SMILES_no_salts,final_smiles,SMILES
0,,,87-66-1,"InChI=1S/C6H6O3/c7-4-2-1-3-5(8)6(4)9/h1-3,7-9H","BENZENE-1,2,3-TRIOL",C1=CC(=C(C(=C1)O)O)O,0.4000,1A,SSDB,,C1=CC(=C(C(=C1)O)O)O,OC1=CC=CC(O)=C1O,Oc1cccc(O)c1O,OC1=CC=CC(O)=C1O
1,,,1210-39-5,InChI=1S/C15H12O/c16-12-11-15(13-7-3-1-4-8-13)...,"3,3-di(phenyl)prop-2-enal",C1=CC=C(C=C1)C(=CC=O)C2=CC=CC=C2,0.6000,1A,SSDB,,C1=CC=C(C=C1)C(=CC=O)C2=CC=CC=C2,O=CC=C(C1=CC=CC=C1)C1=CC=CC=C1,O=CC=C(c1ccccc1)c1ccccc1,O=CC=C(C1=CC=CC=C1)C1=CC=CC=C1
2,,,106-51-4,InChI=1S/C6H4O2/c7-5-1-2-6(8)4-3-5/h1-4H,"cyclohexa-2,5-diene-1,4-dione",C1=CC(=O)C=CC1=O,0.0099,1A,SSDB,,C1=CC(=O)C=CC1=O,O=C1C=CC(=O)C=C1,O=C1C=CC(=O)C=C1,O=C1C=CC(=O)C=C1
3,,,94-36-0,InChI=1S/C14H10O4/c15-13(11-7-3-1-4-8-11)17-18...,benzoyl benzenecarboperoxoate,C1=CC=C(C=C1)C(=O)OOC(=O)C2=CC=CC=C2,0.0044,1A,SSDB,,C1=CC=C(C=C1)C(=O)OOC(=O)C2=CC=CC=C2,O=C(OOC(=O)C1=CC=CC=C1)C1=CC=CC=C1,O=C(OOC(=O)c1ccccc1)c1ccccc1,O=C(OOC(=O)C1=CC=CC=C1)C1=CC=CC=C1
4,,,534-85-0,InChI=1S/C12H12N2/c13-11-8-4-5-9-12(11)14-10-6...,"N-phenylbenzene-1,2-diamine",C1=CC=C(C=C1)NC2=CC=CC=C2N,0.5000,1A,SSDB,,C1=CC=C(C=C1)NC2=CC=CC=C2N,NC1=CC=CC=C1NC1=CC=CC=C1,Nc1ccccc1Nc1ccccc1,NC1=CC=CC=C1NC1=CC=CC=C1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466,,,931-36-2,"InChI=1S/C6H10N2/c1-3-6-7-4-5(2)8-6/h4H,3H2,1-...","Propoxylated reaction products of phenol, 4-no...",CCc1[nH]c(C)cn1,,1B,ECHA,,CCc1[nH]c(C)cn1,CCC1=NC=C(C)N1,CCc1ncc(C)[nH]1,CCC1=NC=C(C)N1
467,,,22288-43-3,InChI=1S/C16H32O3/c1-8-10-11-13(9-2)14(17)18-1...,2-(propan-2-yloxy)ethyl acetate,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C,,1B,ECHA,,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C,CCCCC(CC)C(=O)OOC(C)(C)CC(C)(C)C
468,,,1889-67-4,"InChI=1S/C18H22/c1-17(2,15-11-7-5-8-12-15)18(3...",(9E)-undec-9-enal; (9Z)-undec-9-enal; undec-10...,CC(C)(c1ccccc1)C(C)(C)c2ccccc2,,1B,ECHA,,CC(C)(c1ccccc1)C(C)(C)c2ccccc2,CC(C)(C1=CC=CC=C1)C(C)(C)C1=CC=CC=C1,CC(C)(c1ccccc1)C(C)(C)c1ccccc1,CC(C)(C1=CC=CC=C1)C(C)(C)C1=CC=CC=C1
469,,,75-66-1,"InChI=1S/C4H10S/c1-4(2,3)5/h5H,1-3H3",ditantalum(5+) pentaoxidandiide,CC(C)(C)S,,1B,ECHA,,CC(C)(C)S,CC(C)(C)S,CC(C)(C)S,CC(C)(C)S


In [25]:
# Keep only the columns that go into the curated file and write it to savepath.
df_final = df_final.loc[:, ['SMILES', 'Outcome', 'Source', 'CAS No']]
# The backslash is written as '\\' like in the other to_csv calls: '\c' is an
# invalid escape sequence that Python only tolerates with a warning. The
# resulting path is unchanged.
df_final.to_csv(f'{savepath}\\curated_binary.csv', sep=',', header=True, index=False)

In [26]:
def escrever_log(**contagens):
    """Write the removal counts to ``log.txt`` inside ``savepath``.

    The file is opened in write mode, so an existing log is overwritten.

    Args:
        **contagens: label/count pairs; after a ``Were removed:`` header each
            pair is written on its own line as ``label: count``.
    """
    # '\\' instead of '\l': the latter is an invalid escape sequence that Python
    # only tolerates with a warning. The resulting path is unchanged.
    with open(f'{savepath}\\log.txt', 'w') as arquivo:
        arquivo.write('Were removed:\n')
        for chave, valor in contagens.items():
            arquivo.write(f'{chave}: {valor}\n')

# Counts reported in the log.
# NOTE(review): wrongSmiles and indexDropList_salts are re-created by the second
# salt-removal cell, so the two values below reflect whichever salt-removal cell
# ran last, not the first pass.
Wrong_Smiles = len(wrongSmiles)
Salts = len(indexDropList_salts)
Organometals = len(indexDropList_org)
Mixtures = len(indexDropList_mix)
Polymers = len(indexdroplist_MW)
Duplicates = countdprem

# Write the log file.
escrever_log(Wrong_Smiles=Wrong_Smiles, Salts=Salts, Organometals=Organometals, Mixtures=Mixtures, Polymers=Polymers, Duplicates=Duplicates)