In [None]:
import pandas as pd
import numpy as np

import os
from pathlib import Path
from rdkit import Chem
from chembl_structure_pipeline import standardizer
from rdkit.Chem.MolStandardize.metal import MetalDisconnector
import rdkit.Chem.MolStandardize.rdMolStandardize as rdMolStandardize
from rdkit.Chem import PandasTools

from rdkit.Chem import inchi as rd_inchi

from molvs import standardize_smiles
from molvs import Standardizer

In [None]:
# Output settings: directory the curated CSVs are written to, plus the cell-line
# and incubation labels that are interpolated into the output file names.
savepath = r"D:\OneDrive\Documentos\LabMol\IC-Citotoxicidade\DeepCytosafe"
cell, incubation = "hek293", "24h"

In [None]:
# Read the raw assay data table (AID_624418, stored under the HEK293 folder)
# into the working frame `df0`.
df0 = pd.read_csv(
    filepath_or_buffer=r"D:\OneDrive\Documentos\LabMol\IC-Citotoxicidade\DeepCytosafe\HEK293\RAW\AID_624418_datatable.csv"
)

In [None]:
# HepG2 assays: only loaded when the configured `cell` label is "hepg2".
# Loading them unconditionally replaced the HEK293 table read in the previous
# cell, so a top-to-bottom run saved HepG2 data under the "hek293" file name.
if cell.lower() == "hepg2":
    df1 = pd.read_csv(r"D:\OneDrive\Documentos\LabMol\IC-Citotoxicidade\DeepCytosafe\HEPG2\RAW\AID_588856_datatable.csv")
    df2 = pd.read_csv(r"D:\OneDrive\Documentos\LabMol\IC-Citotoxicidade\DeepCytosafe\HEPG2\RAW\AID_720535_datatable.csv")
    # THP1 assays, currently disabled:
    #df3 = pd.read_csv(r"D:\OneDrive\Documentos\LabMol\IC-Citotoxicidade\DeepCytosafe\THP1\RAW\AID_489025_datatable.csv")
    #df4 = pd.read_csv(r"D:\OneDrive\Documentos\LabMol\IC-Citotoxicidade\DeepCytosafe\THP1\RAW\AID_1117359_datatable.csv")

    # Stack both assay tables row-wise into the working frame.
    df0 = pd.concat([df1, df2], axis=0)

In [None]:
# Keep only the structure (SMILES), activity outcome and compound-ID columns.
df0 = df0[['PUBCHEM_EXT_DATASOURCE_SMILES', 'PUBCHEM_ACTIVITY_OUTCOME', 'PUBCHEM_CID']]
df0

In [None]:
# Give the SMILES and outcome columns the short names used by the rest of the notebook.
df0 = df0.rename(
    {
        'PUBCHEM_EXT_DATASOURCE_SMILES': 'Molecule',
        'PUBCHEM_ACTIVITY_OUTCOME': 'Outcome',
    },
    axis="columns",
)
df0

In [None]:
# Remove rows that lack either a structure or an outcome label
# (dropna's default how="any" drops a row when any listed column is missing).
df0 = df0.dropna(subset=['Molecule', 'Outcome'])
df0

In [None]:
def string_to_int(s):
    """Translate an outcome label into its integer class.

    Args:
        s: Outcome label to encode.

    Returns:
        1 for "Active", 0 for "Inactive", and None for any other value.
    """
    label_to_class = {"Active": 1, "Inactive": 0}
    try:
        return label_to_class[s]
    except KeyError:
        # Any label outside the two known classes has no integer encoding.
        return None

# Encode the outcome labels as integers (1 = Active, 0 = Inactive).
# string_to_int returns None for every other label, which would reintroduce
# missing outcomes after the earlier dropna and leak unlabeled rows into the
# saved files; those rows are dropped here and the column is kept as integers.
df0 = (
    df0.assign(Outcome=df0['Outcome'].apply(string_to_int))
    .dropna(subset=['Outcome'])
    .astype({'Outcome': int})
)
df0

In [None]:
# Class balance check: number of rows per encoded outcome value.
df0.loc[:, "Outcome"].value_counts()

In [None]:
def _chembl_standardize_smiles(smiles):
    """Standardize one SMILES through the ChEMBL structure pipeline.

    The SMILES is parsed with RDKit, written to a molblock, passed through
    `standardizer.standardize_molblock`, read back and re-encoded as SMILES.

    Args:
        smiles: Input SMILES string.

    Returns:
        The SMILES of the standardized molecule, or None when RDKit cannot
        parse the input SMILES or the standardized molblock.
    """
    mol = Chem.MolFromSmiles(smiles, sanitize=True)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; handing that
        # to MolToMolBlock would raise and abort the whole cell.
        return None
    std_molblock = standardizer.standardize_molblock(Chem.MolToMolBlock(mol))
    std_mol = Chem.MolFromMolBlock(std_molblock)
    if std_mol is None:
        return None
    return Chem.MolToSmiles(std_mol)


# Reset the index first so the new column is assigned to a cleanly indexed frame.
df0 = df0.reset_index(drop=True)

# Molecules that fail standardization get a missing value; the next cell drops
# rows whose 'final_smiles' is missing.
df0['final_smiles'] = [_chembl_standardize_smiles(smiles) for smiles in df0['Molecule']]

In [None]:
# Discard rows without a standardized SMILES, then keep only the curated columns.
df0 = df0.dropna(subset=['final_smiles'])[['final_smiles', 'Outcome', 'PUBCHEM_CID']]
df0

In [None]:
# Save the table before duplicates are removed ("dupIN").
df0.to_csv(
    path_or_buf=os.path.join(savepath, f"Cytotox_{cell}_{incubation}_dupIN.csv"),
    index=False,
)

In [None]:
def _smiles_to_inchi(smiles):
    """Return the InChI string for `smiles`, or None when none can be produced.

    Args:
        smiles: SMILES string to convert.

    Returns:
        The InChI string, or None if RDKit cannot parse the SMILES or the
        generated InChI is empty.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; handing that
        # to MolToInchi would raise and abort the whole cell.
        return None
    # An empty InChI is normalised to None so it counts as missing instead of
    # acting as a shared duplicate key for unrelated molecules.
    return rd_inchi.MolToInchi(mol) or None


# Calculate the InChI and add it as a column of the dataframe.
inchi_list = [_smiles_to_inchi(smiles) for smiles in df0['final_smiles']]
df0['InChI'] = inchi_list

# Rows without an InChI cannot be compared with other rows, so they are left
# out of the de-duplicated set.
df0_with_inchi = df0.dropna(subset=['InChI'])

# Outcome encoding (see string_to_int): 1 = Active, 0 = Inactive.
df0_active = df0_with_inchi.query('Outcome == 1')
df0_inactive = df0_with_inchi.query('Outcome == 0')

# Concordant duplicates: keep a single row per InChI within each class.
df0_active = df0_active.drop_duplicates(subset=['InChI'])
df0_inactive = df0_inactive.drop_duplicates(subset=['InChI'])

# Inactive rows first, then active rows.
df_no_dup_concord = pd.concat([df0_inactive, df0_active], axis=0)

# An InChI that still appears twice carries both labels (discordant);
# keep=False removes every copy of it.
final_drop_dup = df_no_dup_concord.drop_duplicates(subset=['InChI'], keep=False)

# Number of unique, concordant compounds that remain.
lastcount = final_drop_dup['InChI'].count()

df_final = final_drop_dup.reset_index(drop=True)

In [None]:
# Display the de-duplicated table built in the previous cell.
df_final

In [None]:
# Save the de-duplicated table ("dupOUT").
df_final.to_csv(
    path_or_buf=os.path.join(savepath, f"Cytotox_{cell}_{incubation}_dupOUT.csv"),
    index=False,
)