In [2]:
import os
import pandas as pd
from rdkit import Chem

# Root folder of the datasets, relative to this notebook
DATAPATH = "../data"
# Column names shared by every processed dataset written below
SMICOL = "smiles"
INCHICOL = "inchikey"
ACTCOL = "activity"

# Compare Model Training datasets

First, we clean up the original files and compute the InChIKey of each SMILES when it is not already available. For each model we create a dataframe with three columns: smiles, inchikey and activity. Each processed dataset is stored under data/model_datasets/{model_name}_processed.csv

In [11]:
#eos30gr

# The workbook keeps the train, test and validation splits on its first three
# sheets. Parse the file once (a list of sheet indices returns a dict of frames)
# instead of re-opening it for every sheet.
eos30gr_sheets = pd.read_excel(
    os.path.join(DATAPATH, "model_datasets", "eos30gr.xlsx"), sheet_name=[0, 1, 2]
)
train_data, test_data, valid_data = (eos30gr_sheets[i] for i in (0, 1, 2))
eos30gr = pd.concat([train_data, test_data, valid_data])

# MolFromSmiles returns None for SMILES it cannot parse; those rows get a
# missing InChIKey and are dropped below.
inchikeys = []
for smi in eos30gr["Smiles"]:
    mol = Chem.MolFromSmiles(smi)
    inchikeys.append(Chem.MolToInchiKey(mol) if mol is not None else None)

eos30gr[INCHICOL] = inchikeys
total_len = len(eos30gr)
eos30gr = eos30gr.dropna(subset=[INCHICOL])
print("Smiles eliminated: ", total_len-len(eos30gr))

# Looking at the model, activity10 was the label chosen as activity
eos30gr = eos30gr.rename(columns={"Smiles":SMICOL, "activity10":ACTCOL})
eos30gr = eos30gr[[SMICOL, INCHICOL, ACTCOL]]
eos30gr.to_csv(os.path.join(DATAPATH, "model_datasets", "eos30gr_processed.csv"), index=False)


[17:44:13] non-ring atom 10 marked aromatic
[17:44:13] non-ring atom 12 marked aromatic
[17:44:13] non-ring atom 10 marked aromatic
[17:44:13] non-ring atom 14 marked aromatic
[17:44:13] non-ring atom 10 marked aromatic
[17:44:13] non-ring atom 10 marked aromatic
[17:44:13] non-ring atom 10 marked aromatic
[17:44:14] non-ring atom 21 marked aromatic
[17:44:14] non-ring atom 10 marked aromatic
[17:44:14] non-ring atom 10 marked aromatic
[17:44:14] non-ring atom 12 marked aromatic
[17:44:14] non-ring atom 12 marked aromatic
[17:44:15] Explicit valence for atom # 0 N, 4, is greater than permitted
[17:44:15] Explicit valence for atom # 0 N, 4, is greater than permitted
[17:44:16] Explicit valence for atom # 0 N, 4, is greater than permitted
[17:44:16] Explicit valence for atom # 0 N, 4, is greater than permitted
[17:44:16] Explicit valence for atom # 0 N, 4, is greater than permitted
[17:44:16] Explicit valence for atom # 0 N, 4, is greater than permitted
[17:44:16] Explicit valence for at

Smiles eliminated:  49


In [12]:
#eos2ta5

# Source files are stacked in this order: train/validation set, the three
# external test sets, then the two validation files.
eos2ta5_dir = os.path.join(DATAPATH, "model_datasets", "eos2ta5")
eos2ta5_files = [
    "train_validation_cardio_tox_data.csv",
    "external_test_set_neg.csv",
    "external_test_set_new.csv",
    "external_test_set_pos.csv",
    "valid_cardio_tox_data.csv",
    "validation_cardio_tox_data.csv",
]
eos2ta5 = pd.concat(
    [pd.read_csv(os.path.join(eos2ta5_dir, fname)) for fname in eos2ta5_files]
)

# One InChIKey per row; None marks SMILES that RDKit could not parse
inchikeys = []
for smi in eos2ta5["smiles"]:
    mol = Chem.MolFromSmiles(smi)
    inchikeys.append(None if mol is None else Chem.MolToInchiKey(mol))
eos2ta5[INCHICOL] = inchikeys

# Discard unparsable rows and report how many were lost
n_before = len(eos2ta5)
eos2ta5 = eos2ta5.dropna(subset=[INCHICOL])
print("Smiles eliminated: ", n_before - len(eos2ta5))

# The ACTIVITY column is used as the activity label for this dataset
eos2ta5 = eos2ta5.rename(columns={"smiles": SMICOL, "ACTIVITY": ACTCOL})
eos2ta5 = eos2ta5[[SMICOL, INCHICOL, ACTCOL]]
eos2ta5.to_csv(os.path.join(DATAPATH, "model_datasets", "eos2ta5_processed.csv"), index=False)

  train_data = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos2ta5", "train_validation_cardio_tox_data.csv"))


Smiles eliminated:  0


  eos2ta5[INCHICOL] = inchikeys


In [16]:
#eos4tcc

# (subfolder, file) pairs, stacked in this order: pretraining, finetuning, external
eos4tcc_dir = os.path.join(DATAPATH, "model_datasets", "eos4tcc")
eos4tcc_files = [
    ("pretraining_eos4tcc", "MLSMR_training.csv"),
    ("pretraining_eos4tcc", "MLSMR_validation.csv"),
    ("finetuning_eos4tcc", "test_all.csv"),
    ("finetuning_eos4tcc", "test_rev.csv"),
    ("finetuning_eos4tcc", "training.csv"),
    ("finetuning_eos4tcc", "val_all.csv"),
    ("finetuning_eos4tcc", "val_rev.csv"),
    ("external_eos4tcc", "EX1.csv"),
    ("external_eos4tcc", "EX2.csv"),
    ("external_eos4tcc", "EX3.csv"),
    ("external_eos4tcc", "EX4.csv"),
]
eos4tcc = pd.concat(
    [pd.read_csv(os.path.join(eos4tcc_dir, folder, fname)) for folder, fname in eos4tcc_files]
)

# One InChIKey per row; None marks SMILES that RDKit could not parse
inchikeys = []
for smi in eos4tcc["smiles"]:
    mol = Chem.MolFromSmiles(smi)
    inchikeys.append(None if mol is None else Chem.MolToInchiKey(mol))
eos4tcc[INCHICOL] = inchikeys

# Discard unparsable rows and report how many were lost
n_before = len(eos4tcc)
eos4tcc = eos4tcc.dropna(subset=[INCHICOL])
print("Smiles eliminated: ", n_before - len(eos4tcc))

# The label column is used as the activity label for this dataset
eos4tcc = eos4tcc.rename(columns={"smiles": SMICOL, "label": ACTCOL})
eos4tcc = eos4tcc[[SMICOL, INCHICOL, ACTCOL]]
eos4tcc.to_csv(os.path.join(DATAPATH, "model_datasets", "eos4tcc_processed.csv"), index=False)




Smiles eliminated:  0


In [17]:
#eos30f3

# This model has a single source table, so there is nothing to concatenate
eos30f3 = pd.read_csv(os.path.join(DATAPATH, "model_datasets", "eos30f3", "Cai_TableS3_fixed.csv"))

# One InChIKey per row; None marks SMILES that RDKit could not parse
inchikeys = []
for smi in eos30f3["smiles"]:
    mol = Chem.MolFromSmiles(smi)
    inchikeys.append(None if mol is None else Chem.MolToInchiKey(mol))
eos30f3[INCHICOL] = inchikeys

# Discard unparsable rows and report how many were lost
n_before = len(eos30f3)
eos30f3 = eos30f3.dropna(subset=[INCHICOL])
print("Smiles eliminated: ", n_before - len(eos30f3))

# The X10 column is used as the activity label; keep only the three shared columns
eos30f3 = eos30f3.rename(columns={"smiles": SMICOL, "X10": ACTCOL})
eos30f3 = eos30f3[[SMICOL, INCHICOL, ACTCOL]]

eos30f3.to_csv(os.path.join(DATAPATH, "model_datasets", "eos30f3_processed.csv"), index=False)

Smiles eliminated:  0


In [31]:
#eos43at
## The training files have no activity column, so only smiles and inchikey are stored

# One CSV per ChEMBL assay; stacked in the listed order
eos43at_dir = os.path.join(DATAPATH, "model_datasets", "eos43at")
eos43at_assays = [
    "CHEMBL1909307",
    "CHEMBL1909308",
    "CHEMBL1909313",
    "CHEMBL1909314",
    "CHEMBL1909317",
    "CHEMBL3039488",
    "CHEMBL3039491",
    "CHEMBL3301364",
    "CHEMBL3301365",
    "CHEMBL3301366",
    "CHEMBL3301370",
    "CHEMBL3301371",
    "CHEMBL3301372",
    "CHEMBL4029349",
]
eos43at = pd.concat(
    [pd.read_csv(os.path.join(eos43at_dir, assay + ".csv")) for assay in eos43at_assays]
)

# These files identify molecules by InChI. Derive both the SMILES and the
# InChIKey from the parsed molecule so that the inchikey column really holds
# InChIKeys (not InChI strings) and is comparable with the other datasets.
smiles = []
inchikeys = []
for inchi in eos43at["inchi"]:
    # Missing values arrive from pandas as float NaN, not None; only parse strings
    mol = Chem.MolFromInchi(inchi) if isinstance(inchi, str) else None
    if mol is not None:
        smiles.append(Chem.MolToSmiles(mol))
        inchikeys.append(Chem.MolToInchiKey(mol))
    else:
        smiles.append(None)
        inchikeys.append(None)

eos43at[SMICOL] = smiles
eos43at[INCHICOL] = inchikeys

# Drop rows whose InChI was missing or could not be converted
total_len = len(eos43at)
eos43at = eos43at.dropna(subset=[SMICOL, INCHICOL])
print("Smiles eliminated: ", total_len - len(eos43at))

# Keep only the shared identifier columns
eos43at = eos43at[[SMICOL, INCHICOL]]

# Save processed data to a new CSV file
eos43at.to_csv(os.path.join(DATAPATH, "model_datasets", "eos43at_processed.csv"), index=False)



Smiles eliminated:  0


Once all the datasets have been cleaned, we can compare them

In [None]:
# Identifiers of the models whose processed training sets were written above
# NOTE(review): eos43at_processed.csv is saved without an activity column, so the
# actives/inactives comparison below cannot include it as-is.
models = ["eos2ta5", "eos4tcc", "eos30f3", "eos30gr", "eos43at"]

# TODO: load the datasets and make comparisons

# TODO: proportion of actives and inactives in each dataset

# TODO: number of repeated smiles between models

# Build test dataset

We collate the data from the NCATS repository into a single file and eliminate any molecules that already exist in the training sets of the models.

In [16]:
# Load the two NCATS files (training and validation sets) from the test_data folder
ncats_dir = os.path.join(DATAPATH, "test_data")
df1 = pd.read_csv(os.path.join(ncats_dir, "training_set_ncats.csv"))
df2 = pd.read_csv(os.path.join(ncats_dir, "validation_set_ncats.csv"))

In [20]:
# Inspect the column names of the NCATS training file before merging
df1.columns

Index(['smiles', 'activity', 'source'], dtype='object')

In [22]:
# Merge both NCATS files, attach an InChIKey to every row and drop repeated entries

df = pd.concat([df1, df2])

# One InChIKey per row; None marks SMILES that RDKit could not parse
inchikeys = []
for smi in df["smiles"]:
    mol = Chem.MolFromSmiles(smi)
    inchikeys.append(None if mol is None else Chem.MolToInchiKey(mol))
df[INCHICOL] = inchikeys

# First count: rows removed because their SMILES could not be parsed
n_rows = len(df)
df = df.dropna(subset=[INCHICOL])
print("Smiles eliminated: ", n_rows - len(df))

# Second count: rows removed because the same SMILES string appeared earlier
# (the first occurrence is kept).
# NOTE(review): duplicates are detected on the raw SMILES text, so the same
# molecule written with two different SMILES survives; confirm whether
# deduplicating on the InChIKey column was intended instead.
n_rows = len(df)
df = df.drop_duplicates(subset=[SMICOL])
print("Smiles eliminated: ", n_rows - len(df))

df = df[[SMICOL, INCHICOL, ACTCOL]]
df.to_csv(os.path.join(DATAPATH, "test_data", "ncats.csv"), index=False)


Smiles eliminated:  0
Smiles eliminated:  32


In [None]:
# Now, from all the NCATS data, we eliminate the molecules that are duplicated in the training sets