In [None]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from pathlib import Path

from rdkit import Chem
from chembl_structure_pipeline import standardizer
import rdkit.Chem.MolStandardize.rdMolStandardize as rdMolStandardize
from rdkit.Chem import PandasTools
from rdkit.Chem import inchi as rd_inchi
from molvs import standardize_smiles
from molvs import Standardizer


def remove_invalid_smiles(df):
    """Drop the rows whose 'Molecule' entry cannot be parsed by RDKit.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Molecule' column holding SMILES strings.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` for which ``Chem.MolFromSmiles`` returns a molecule.
        The index is left as-is (not reset).

    Notes
    -----
    Prints the number of removed rows and the resulting shape. Entries that
    are not strings (e.g. NaN) are counted as invalid instead of being handed
    to the RDKit parser.
    """
    rows_before = df.shape[0]

    # One flag per row; a positional boolean array keeps the filter
    # independent of the index labels and works on an empty frame too.
    is_valid = np.array(
        [
            isinstance(smile, str) and Chem.MolFromSmiles(smile) is not None
            for smile in df['Molecule']
        ],
        dtype=bool,
    )
    df = df[is_valid]

    # The count must use the size *before* filtering; computing it from the
    # already-filtered frame always under-reports what was removed.
    print(f"Removed {rows_before - df.shape[0]} invalid SMILES")
    print(f"New dataframe shape: {df.shape}")
    return df

def remove_mixtures(df):
    """Drop the rows whose 'Molecule' SMILES describes a mixture.

    A SMILES string is treated as a mixture when it contains a '.' character,
    which separates disconnected components.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Molecule' column holding SMILES strings.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the mixture rows, with a fresh 0..n-1 index.

    Notes
    -----
    Prints how many mixtures were found (and the new shape when any were
    removed). Non-string entries are never considered mixtures.
    """
    # Positional boolean mask: filtering by position instead of dropping by
    # index label keeps unrelated rows safe when index labels repeat.
    is_mixture = np.array(
        [isinstance(smile, str) and '.' in smile for smile in df['Molecule']],
        dtype=bool,
    )

    if not is_mixture.any():
        print("no mixtures found")
    else:
        df = df[~is_mixture]
        print(f"{int(is_mixture.sum())} mixtures found")
        print(f"New dataframe shape: {df.shape}")
    return df.reset_index(drop=True)

In [None]:
# Input workbook loaded with pd.read_excel by both curation cells below
# (presumably the raw LLNA skin data, going by the file name).
# NOTE(review): absolute, machine-specific Windows path; the notebook only
# runs where this exact location exists — consider a configurable data dir.
file = Path(r"D:\OneDrive\Documentos\LabMol\IC-Skin\DADOS\Multiclass\RAW_Skin_data_LLNA_Multiclass.xlsx")
# Directory the final cell writes the curated CSV into (via os.path.join).
savepath = r"D:\OneDrive\Documentos\LabMol\IC-Skin\DADOS\Multiclass\DL"

In [None]:
### Binary ####
# Curate the data set for the binary task: load, drop missing/invalid/mixture
# entries, standardise structures, then remove duplicates by InChI.
# Each stage's resulting shape is logged to 'output.txt'.
# NOTE(review): the multiclass cell further down also opens 'output.txt' in
# 'w' mode and rebinds `df`, so after a top-to-bottom run both the log and the
# result of this cell are overwritten.

df = pd.read_excel(file)
df = df.rename(columns={'Canonical SMILES': 'Molecule', 'binary': 'Outcome'})

with open('output.txt', 'w') as f:
    f.write(f"Initial dataframe shape: {df.shape}\n")

    # A row is unusable when either the structure or the label is missing.
    df = df.dropna(subset=['Molecule', 'Outcome'])
    f.write(f"After removing NaN: {df.shape}\n")

    df = remove_invalid_smiles(df)

    df = remove_mixtures(df)
    f.write(f"After removing mixtures: {df.shape}\n")

    # Standardise each structure: SMILES -> molblock -> ChEMBL standardizer
    # -> molblock -> SMILES.
    final_smiles = []
    for smile in df['Molecule']:
        mol = Chem.MolFromSmiles(smile, sanitize=True)
        std_molblock = standardizer.standardize_molblock(Chem.MolToMolBlock(mol))
        std_mol = Chem.MolFromMolBlock(std_molblock)
        # MolFromMolBlock returns None when the block cannot be parsed;
        # record a missing value so the dropna below removes the row instead
        # of MolToSmiles raising on None.
        final_smiles.append(Chem.MolToSmiles(std_mol) if std_mol is not None else None)
    df['final_smiles'] = final_smiles
    df = df.dropna(subset=['final_smiles']).reset_index(drop=True)

    # InChI of the standardised structure is the key used to detect duplicates.
    df['InChI'] = [
        rd_inchi.MolToInchi(Chem.MolFromSmiles(smiles))
        for smiles in df['final_smiles']
    ]

    # Step 1 - concordant duplicates: within each class keep one row per InChI.
    # Rows whose Outcome is neither 0 nor 1 are not carried over.
    per_class = [
        df[df['Outcome'] == label].drop_duplicates(subset=['InChI'])
        for label in (0, 1)
    ]

    # Step 2 - discordant duplicates: an InChI still present more than once
    # now carries conflicting labels, so every such row is discarded.
    df = (
        pd.concat(per_class, axis=0)
        .drop_duplicates(subset=['InChI'], keep=False)
        .reset_index(drop=True)
    )
    f.write(f"After removing duplicates: {df.shape}\n")

In [None]:
### Multiclass ####
# Curate the data set for the multiclass task (classes "0", "1A", "1B"):
# load, drop missing/invalid/mixture entries, standardise structures, then
# remove duplicates by InChI. Each stage's shape is logged to 'output.txt'.
# NOTE(review): this cell opens 'output.txt' in 'w' mode and rebinds `df`,
# replacing the log and the result produced by the binary cell above.

df = pd.read_excel(file)
df = df.rename(columns={'Canonical SMILES': 'Molecule', 'LLNA (EPA,GHS)': 'Outcome'})

with open('output.txt', 'w') as f:
    f.write(f"Initial dataframe shape: {df.shape}\n")

    # A row is unusable when either the structure or the label is missing.
    df = df.dropna(subset=['Molecule', 'Outcome'])
    f.write(f"After removing NaN: {df.shape}\n")

    # The class labels are compared as strings below. read_excel may hand back
    # a numeric cell (e.g. 0) as a number, which would never equal "0" and
    # would silently drop that whole class, so normalise to text first.
    df['Outcome'] = df['Outcome'].astype(str)

    df = remove_invalid_smiles(df)

    df = remove_mixtures(df)
    f.write(f"After removing mixtures: {df.shape}\n")

    # Standardise each structure: SMILES -> molblock -> ChEMBL standardizer
    # -> molblock -> SMILES.
    final_smiles = []
    for smile in df['Molecule']:
        mol = Chem.MolFromSmiles(smile, sanitize=True)
        std_molblock = standardizer.standardize_molblock(Chem.MolToMolBlock(mol))
        std_mol = Chem.MolFromMolBlock(std_molblock)
        # MolFromMolBlock returns None when the block cannot be parsed;
        # record a missing value so the dropna below removes the row instead
        # of MolToSmiles raising on None.
        final_smiles.append(Chem.MolToSmiles(std_mol) if std_mol is not None else None)
    df['final_smiles'] = final_smiles
    df = df.dropna(subset=['final_smiles']).reset_index(drop=True)

    # InChI of the standardised structure is the key used to detect duplicates.
    df['InChI'] = [
        rd_inchi.MolToInchi(Chem.MolFromSmiles(smiles))
        for smiles in df['final_smiles']
    ]

    # Step 1 - concordant duplicates: within each class keep one row per InChI.
    # Rows whose Outcome is not one of the three labels are not carried over.
    per_class = [
        df[df['Outcome'] == label].drop_duplicates(subset=['InChI'])
        for label in ("0", "1A", "1B")
    ]

    # Step 2 - discordant duplicates: an InChI still present more than once
    # now carries conflicting labels, so every such row is discarded.
    df = (
        pd.concat(per_class, axis=0)
        .drop_duplicates(subset=['InChI'], keep=False)
        .reset_index(drop=True)
    )
    f.write(f"After removing duplicates: {df.shape}\n")

In [None]:
# Write the curated table currently bound to `df` into `savepath` as CSV,
# without the index column.
# NOTE(review): `df` is whatever the most recently executed curation cell
# produced. On a top-to-bottom run that is the multiclass cell, while the file
# name says "binary" — confirm which data set this file is meant to hold.
df.to_csv(os.path.join(savepath, "LLNA_Curated_binary_DL.csv"), index=False)
