In [10]:
import os
import pandas as pd
from rdkit import Chem

DATAPATH = "../data"
SMICOL = "smiles"
INCHICOL = "inchikey"
ACTCOL = "activity"

# Compare Model Training datasets

First, we clean up the original files and add the InChiKey of the smiles if not available. We want to create a dataframe with three columns, smiles, inchikey and activity. We will store each dataset under data/model_datasets/{model_name}_processed.csv

In [13]:
#eos30gr

train_data = pd.read_excel(os.path.join(DATAPATH, "model_datasets", "eos30gr.xlsx"), sheet_name=0)
test_data = pd.read_excel(os.path.join(DATAPATH, "model_datasets", "eos30gr.xlsx"), sheet_name=1)
valid_data = pd.read_excel(os.path.join(DATAPATH, "model_datasets", "eos30gr.xlsx"), sheet_name=2)
eos30gr = pd.concat([train_data, test_data, valid_data])

inchikeys = []
for smi in eos30gr["Smiles"]:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        inchikey = Chem.MolToInchiKey(mol)
    else:
        inchikey = None
    inchikeys += [inchikey]

eos30gr[INCHICOL] = inchikeys
total_len = len(eos30gr)
eos30gr.dropna(subset=[INCHICOL], inplace=True)
print("Smiles eliminated: ", total_len-len(eos30gr))
eos30gr.rename(columns={"Smiles":SMICOL, "activity10":ACTCOL}, inplace=True) #looking at the model, activity 10 was chosen for activity
eos30gr = eos30gr[[SMICOL, INCHICOL, ACTCOL]]
eos30gr.to_csv(os.path.join(DATAPATH, "model_datasets", "eos30gr_processed.csv"), index=False)


[22:43:01] non-ring atom 10 marked aromatic
[22:43:01] non-ring atom 12 marked aromatic
[22:43:01] non-ring atom 10 marked aromatic
[22:43:01] non-ring atom 14 marked aromatic
[22:43:01] non-ring atom 10 marked aromatic
[22:43:02] non-ring atom 10 marked aromatic
[22:43:02] non-ring atom 10 marked aromatic
[22:43:02] non-ring atom 21 marked aromatic
[22:43:02] non-ring atom 10 marked aromatic
[22:43:02] non-ring atom 10 marked aromatic
[22:43:02] non-ring atom 12 marked aromatic
[22:43:02] non-ring atom 12 marked aromatic
[22:43:03] Explicit valence for atom # 0 N, 4, is greater than permitted
[22:43:03] Explicit valence for atom # 0 N, 4, is greater than permitted
[22:43:03] Explicit valence for atom # 0 N, 4, is greater than permitted
[22:43:03] Explicit valence for atom # 0 N, 4, is greater than permitted
[22:43:04] Explicit valence for atom # 0 N, 4, is greater than permitted
[22:43:04] Explicit valence for atom # 0 N, 4, is greater than permitted
[22:43:04] Explicit valence for at

Smiles eliminated:  49


In [15]:
#eos2ta5

train_data = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos2ta5", "train_validation_cardio_tox_data.csv"))
test_data = pd.read_csv(os.path.join(DATAPATH, "model_datasets", "eos2ta5", 'external_test_set_neg.csv'))
test_data2 = pd.read_csv(os.path.join(DATAPATH, "model_datasets", "eos2ta5", 'external_test_set_new.csv'))
test_data3 = pd.read_csv(os.path.join(DATAPATH, "model_datasets", "eos2ta5", 'external_test_set_pos.csv'))

eos2ta5 = pd.concat([train_data, test_data,test_data2, test_data3])

inchikeys = []
for smi in eos2ta5["smiles"]:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        inchikey = Chem.MolToInchiKey(mol)
    else:
        inchikey = None
    inchikeys += [inchikey]

eos2ta5[INCHICOL] = inchikeys
total_len = len(eos2ta5)
eos2ta5.dropna(subset=[INCHICOL], inplace=True)
print("Smiles eliminated: ", total_len-len(eos2ta5))
eos2ta5.rename(columns={"smiles":SMICOL, "ACTIVITY":ACTCOL}, inplace=True) #looking at the model, activity 10 was chosen for activity
eos2ta5 = eos2ta5[[SMICOL, INCHICOL, ACTCOL]]
eos2ta5.to_csv(os.path.join(DATAPATH, "model_datasets", "eos2ta5_processed.csv"), index=False)

  train_data = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos2ta5", "train_validation_cardio_tox_data.csv"))
  eos2ta5[INCHICOL] = inchikeys


Smiles eliminated:  0


In [16]:
#eos4tcc
finetuning_data = pd.read_csv(os.path.join(DATAPATH, "model_datasets", "eos4tcc", "finetuning_eos4tcc", "test_all.csv"))
finetuning_data2 = pd.read_csv(os.path.join(DATAPATH, "model_datasets", "eos4tcc", "finetuning_eos4tcc", "test_rev.csv"))
finetuning_data3 = pd.read_csv(os.path.join(DATAPATH, "model_datasets", "eos4tcc", "finetuning_eos4tcc", "training.csv"))
finetuning_data4 = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos4tcc", "finetuning_eos4tcc", "val_all.csv"))
finetuning_data5 = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos4tcc", "finetuning_eos4tcc", "val_rev.csv"))
eos4tcc = pd.concat([finetuning_data, finetuning_data2,finetuning_data3,finetuning_data4,finetuning_data5 ])

inchikeys = []
for smi in eos4tcc["smiles"]:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        inchikey = Chem.MolToInchiKey(mol)
    else:
        inchikey = None
    inchikeys += [inchikey]

eos4tcc[INCHICOL] = inchikeys
total_len = len(eos4tcc)
eos4tcc.dropna(subset=[INCHICOL], inplace=True)
print("Smiles eliminated: ", total_len-len(eos4tcc))
eos4tcc.rename(columns={"smiles":SMICOL, "label":ACTCOL}, inplace=True) 
eos4tcc = eos4tcc[[SMICOL, INCHICOL, ACTCOL]]
eos4tcc.to_csv(os.path.join(DATAPATH, "model_datasets", "eos4tcc_processed.csv"), index=False)


Smiles eliminated:  0


In [17]:
# eos30f3
train_data = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos30f3", "Cai_TableS3_fixed.csv"))

# Concatenating Data
eos30f3 = pd.concat([train_data])

# Generating InChiKeys
inchikeys = []
for smi in eos30f3["smiles"]:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        inchikey = Chem.MolToInchiKey(mol)
    else:
        inchikey = None
    inchikeys += [inchikey]

# Adding InChiKeys to DataFrame and Dropping NaN Values
eos30f3[INCHICOL] = inchikeys
total_len = len(eos30f3)
eos30f3.dropna(subset=[INCHICOL], inplace=True)
print("Smiles eliminated: ", total_len - len(eos30f3))

# Renaming Columns
eos30f3.rename(columns={"smiles": SMICOL, "X10": ACTCOL}, inplace=True)

# Selecting Columns
eos30f3 = eos30f3[[SMICOL, INCHICOL, ACTCOL]]

# Saving Processed Data
eos30f3.to_csv(os.path.join(DATAPATH, "model_datasets", "eos30f3_processed.csv"), index=False)

Smiles eliminated:  0


Once all the datasets have been cleaned, we can compare them

In [7]:
models = ["eos2ta5", "eos4tcc", "eos30f3", "eos30gr"]

# load the datasets and make comparisons
eos2ta5 = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos2ta5_processed.csv"))
eos4tcc = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos4tcc_processed.csv"))
eos30f3 = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos30f3_processed.csv"))
eos30gr = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos30gr_processed.csv"))

# proportion of actives and inactives in each dataset
print("eos2ta5:")
print(eos2ta5['activity'].value_counts())

print("\neos4tcc:")
print(eos4tcc['activity'].value_counts())

print("\neos30f3:")
print(eos30f3['activity'].value_counts())

print("\neos30gr:")
print(eos30gr['activity'].value_counts())

# number of repeated smiles between models
# Check repeated smiles within each dataset
repeated_smiles_eos2ta5 = eos2ta5['smiles'].duplicated().sum()
repeated_smiles_eos4tcc = eos4tcc['smiles'].duplicated().sum()
repeated_smiles_eos30f3 = eos30f3['smiles'].duplicated().sum()
repeated_smiles_eos30gr = eos30gr['smiles'].duplicated().sum()

# Print the results
print(f"Number of repeated smiles in eos2ta5: {repeated_smiles_eos2ta5}")
print(f"Number of repeated smiles in eos4tcc: {repeated_smiles_eos4tcc}")
print(f"Number of repeated smiles in eos30f3: {repeated_smiles_eos30f3}")
print(f"Number of repeated smiles in eos30gr: {repeated_smiles_eos30gr}")



# Check repeated smiles between pairs of datasets
repeated_smiles_eos2ta5_eos4tcc = pd.concat([eos2ta5['smiles'], eos4tcc['smiles']]).duplicated().sum()
repeated_smiles_eos2ta5_eos30f3 = pd.concat([eos2ta5['smiles'], eos30f3['smiles']]).duplicated().sum()
repeated_smiles_eos2ta5_eos30gr = pd.concat([eos2ta5['smiles'], eos30gr['smiles']]).duplicated().sum()
repeated_smiles_eos4tcc_eos30f3 = pd.concat([eos4tcc['smiles'], eos30f3['smiles']]).duplicated().sum()
repeated_smiles_eos4tcc_eos30gr = pd.concat([eos4tcc['smiles'], eos30gr['smiles']]).duplicated().sum()
repeated_smiles_eos30f3_eos30gr = pd.concat([eos30f3['smiles'], eos30gr['smiles']]).duplicated().sum()

# Print the results
print(f"Number of repeated smiles between eos2ta5 and eos4tcc: {repeated_smiles_eos2ta5_eos4tcc}")
print(f"Number of repeated smiles between eos2ta5 and eos30f3: {repeated_smiles_eos2ta5_eos30f3}")
print(f"Number of repeated smiles between eos2ta5 and eos30gr: {repeated_smiles_eos2ta5_eos30gr}")
print(f"Number of repeated smiles between eos4tcc and eos30f3: {repeated_smiles_eos4tcc_eos30f3}")
print(f"Number of repeated smiles between eos4tcc and eos30gr: {repeated_smiles_eos4tcc_eos30gr}")
print(f"Number of repeated smiles between eos30f3 and eos30gr: {repeated_smiles_eos30f3_eos30gr}")

eos2ta5:
activity
0    6727
1    6718
Name: count, dtype: int64

eos4tcc:
activity
1    9284
0    6487
Name: count, dtype: int64

eos30f3:
activity
1    4355
0    3534
Name: count, dtype: int64

eos30gr:
activity
1.0    4332
0.0    3526
Name: count, dtype: int64
Number of repeated smiles in eos2ta5: 0
Number of repeated smiles in eos4tcc: 1449
Number of repeated smiles in eos30f3: 0
Number of repeated smiles in eos30gr: 5120
Number of repeated smiles between eos2ta5 and eos4tcc: 12028
Number of repeated smiles between eos2ta5 and eos30f3: 42
Number of repeated smiles between eos2ta5 and eos30gr: 5162
Number of repeated smiles between eos4tcc and eos30f3: 1497
Number of repeated smiles between eos4tcc and eos30gr: 6617
Number of repeated smiles between eos30f3 and eos30gr: 12978


# Build test dataset

We collate in a single file the data from the NCATS repository and eliminate any duplicate molecules that exist in the training sets of the models already.

In [17]:
df1 = pd.read_csv(os.path.join(DATAPATH, "test_data", "training_set_ncats.csv"))
df2 = pd.read_csv(os.path.join(DATAPATH, "test_data", "validation_set_ncats.csv"))

In [12]:
df1.columns

Index(['smiles', 'activity', 'source'], dtype='object')

In [18]:
#merge and remove duplicates. Obtain InChiKeys for all

df = pd.concat([df1, df2])
inchikeys = []
for smi in df["smiles"]:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        inchikey = Chem.MolToInchiKey(mol)
    else:
        inchikey = None
    inchikeys += [inchikey]

df[INCHICOL] = inchikeys
total_len = len(df)
df.dropna(subset=[INCHICOL], inplace=True)
print("Smiles eliminated: ", total_len-len(df))
total_len = len(df)
df.drop_duplicates(subset=[SMICOL], inplace=True)
print("Smiles eliminated: ", total_len-len(df))
df = df[[SMICOL, INCHICOL, ACTCOL]]
df.to_csv(os.path.join(DATAPATH, "test_data", "ncats.csv"), index=False)


Smiles eliminated:  0
Smiles eliminated:  32


In [19]:
# Now, from the all NCATS data, we eliminate duplicated molecules with training set
## concatenate the training set together
training_set = pd.concat([eos2ta5, eos4tcc, eos30f3, eos30gr], ignore_index=True)

# Save the training set to a CSV file
training_set.to_csv(os.path.join(DATAPATH, "model_datasets","training_set.csv"), index=False)

##Load the test dataset
test_dataset= pd.read_csv(os.path.join(DATAPATH, "test_data", "test_dataset.csv")) 

# Calculate the number of removed InChiKey
initial_inchikey_count = len(test_dataset)
processed_test_dataset = test_dataset[~test_dataset['InChiKey'].isin(training_set['inchikey'])]
removed_inchikey_count = initial_inchikey_count - len(processed_test_dataset)

# Print the number of removed smiles
print(f"Number of removed inchikey: {removed_inchikey_count}")

# Save the processed test dataset to a file
processed_test_dataset.to_csv(os.path.join(DATAPATH, "test_data", "processed_test_dataset.csv"), index=False)

Number of removed inchikey: 0


In [21]:
## Confirm that the training and test dataset has no Inchikey in common

training_set = pd.read_csv(os.path.join(DATAPATH,"model_datasets", "training_set.csv"))
test_set = pd.read_csv(os.path.join(DATAPATH,"test_data", "processed_test_dataset.csv"))

# Check for common inchikeys
common_inchikey = set(training_set['inchikey']).intersection(set(test_set['InChiKey']))

# Print the number of common Inchikey
print(f"Number of common inchikeys between training and test datasets: {len(common_inchikey)}")

print("Common inchikey:", common_inchikey)

Number of common inchikeys between training and test datasets: 0
Common inchikey: set()
