In [25]:
import os
import pandas as pd
from rdkit import Chem

DATAPATH = "../data"
SMICOL = "smiles"
INCHICOL = "inchikey"
ACTCOL = "activity"

# Compare Model Training datasets
First, we clean up the original files and add the InChiKey of the smiles if not available. We want to create a dataframe with three columns, smiles, inchikey and activity. We will store each dataset under data/model_datasets/{model_name}_processed.csv

In [26]:
#eos21q7

train_data = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos21q7", "Total_dataset.csv"))

eos21q7 = pd.concat([train_data])

inchikeys = []
for smi in eos21q7["smiles"]:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        inchikey = Chem.MolToInchiKey(mol)
    else:
        inchikey = None
    inchikeys += [inchikey]

eos21q7[INCHICOL] = inchikeys
total_len = len(eos21q7)
eos21q7.dropna(subset=[INCHICOL], inplace=True)
print("Inchikey eliminated: ", total_len-len(eos21q7))
eos21q7.rename(columns={"smiles":SMICOL, "toxicity":ACTCOL}, inplace=True) #looking at the model, toxicity was chosen for activity
eos21q7 = eos21q7[[SMICOL, INCHICOL, ACTCOL]]
eos21q7.to_csv(os.path.join(DATAPATH, "model_datasets", "eos21q7_processed.csv"), index=False)


Inchikey eliminated:  0


In [27]:
#eos7e3s
train_data_without_outcome = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos7e3s", "dilismiles.csv"))
train_data_with_outcome = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos7e3s", "dili_padel_2d.csv"))

##Add the columns together
eos7e3s = pd.concat([train_data_without_outcome, train_data_with_outcome], axis=1)

inchikeys = []
for smi in eos7e3s["col_smiles"]:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        inchikey = Chem.MolToInchiKey(mol)
    else:
        inchikey = None
    inchikeys += [inchikey]

eos7e3s[INCHICOL] = inchikeys
total_len = len(eos7e3s)
eos7e3s.dropna(subset=[INCHICOL], inplace=True)
print("Inchikey eliminated: ", total_len-len(eos7e3s))
eos7e3s.rename(columns={"col_smiles":SMICOL, "Outcome":ACTCOL}, inplace=True) #looking at the model, Outcome was chosen for activity
eos7e3s = eos7e3s[[SMICOL, INCHICOL, ACTCOL]]
eos7e3s.to_csv(os.path.join(DATAPATH, "model_datasets", "eos7e3s_processed.csv"), index=False)



  train_data_with_outcome = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos7e3s", "dili_padel_2d.csv"))


Inchikey eliminated:  0


Once all the datasets have been cleaned, we can compare them

In [9]:
models = ["eos21q7", "eos7e3s"]
# load the datasets and make comparisons
eos21q7 = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos21q7_processed.csv"))
eos7e3s = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos7e3s_processed.csv"))

# proportion of actives and inactives in each dataset
print("eos21q7:")
print(eos21q7['activity'].value_counts())

print("\neos7e3s:")
print(eos7e3s['activity'].value_counts())

# number of repeated inchikey between models
# Check repeated inchikey within each dataset
repeated_inchikey_eos21q7 = eos21q7['inchikey'].duplicated().sum()
repeated_inchikey_eos7e3s = eos7e3s['inchikey'].duplicated().sum()

# Print the results
print(f"Number of repeated inchikeys in eos21q7: {repeated_inchikey_eos21q7}")
print(f"Number of repeated inchikeys in eos7e3s: {repeated_inchikey_eos7e3s}")

# Check repeated inchikey between pairs of datasets
repeated_inchikey_eos21q7_eos7e3s = pd.concat([eos21q7['inchikey'], eos7e3s['inchikey']]).duplicated().sum()

# Print the results
print(f"Number of repeated inchikeys between eos21q7 and eos7e3s: {repeated_inchikey_eos21q7_eos7e3s}")


eos21q7:
activity
1    952
0    898
Name: count, dtype: int64

eos7e3s:
activity
1    394
0    194
Name: count, dtype: int64
Number of repeated inchikeys in eos21q7: 224
Number of repeated inchikeys in eos7e3s: 3
Number of repeated inchikeys between eos21q7 and eos7e3s: 555


In [10]:
## Get the percentage overlap between models

# Define the dataset paths
datasets = [
    {'path': '../data/model_datasets/eos21q7_processed.csv'},
    {'path': '../data/model_datasets/eos7e3s_processed.csv'}
]

# Read datasets into a list of DataFrames
dfs = [pd.read_csv(dataset['path']) for dataset in datasets]

# Dictionary to store the results
overlap_results = {}

# Method 1: Get inchi keys common to ALL the models' datasets
common_keys = set.intersection(*(set(df['inchikey']) for df in dfs))
total_common_molecules = len(common_keys)

for i, df in enumerate(dfs):
    total_molecules = len(df)
    percentage_overlap = (total_common_molecules / total_molecules) * 100
    overlap_results[f"Percentage overlap for {datasets[i]['path']} with all models"] = percentage_overlap

# Method 2: Pairwise percentage overlap
for i in range(len(dfs)):
    for j in range(i + 1, len(dfs)):
        common_keys_pairwise = set.intersection(set(dfs[i]['inchikey']), set(dfs[j]['inchikey']))
        total_molecules_i = len(dfs[i])
        total_molecules_j = len(dfs[j])
        percentage_overlap_pairwise = (len(common_keys_pairwise) / min(total_molecules_i, total_molecules_j)) * 100
        overlap_results[f"Percentage overlap between {datasets[i]['path']} and {datasets[j]['path']}"] = percentage_overlap_pairwise

# Display the results
for key, value in overlap_results.items():
    print(f"{key}: {value:.2f}%")

Percentage overlap for ../data/model_datasets/eos21q7_processed.csv with all models: 17.73%
Percentage overlap for ../data/model_datasets/eos7e3s_processed.csv with all models: 55.78%
Percentage overlap between ../data/model_datasets/eos21q7_processed.csv and ../data/model_datasets/eos7e3s_processed.csv: 55.78%


# Build test dataset

 Our first test dataset is the TDC dili dataset from https://tdcommons.ai/

In [28]:
tdc_dataset = pd.read_csv(os.path.join(DATAPATH, "test_data", "dili_tdc_dataset.csv"), sep='\,')

  tdc_dataset = pd.read_csv(os.path.join(DATAPATH, "test_data", "dili_tdc_dataset.csv"), sep='\,')


In [29]:
tdc_dataset.columns

Index(['Drug_ID', 'Drug', 'Y'], dtype='object')

In [42]:
#merge and remove duplicates. Obtain InChiKeys for all
# Rename the "Drug" column to "smiles"
tdc_dataset.rename(columns={"Drug": "smiles", "Y": "activity"}, inplace=True)

# List to store InChiKeys
inchikeys = []
for smi in tdc_dataset["smiles"]:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        inchikey = Chem.MolToInchiKey(mol)
    else:
        inchikey = None
    inchikeys.append(inchikey)

# Add the InChiKeys to the dataset
tdc_dataset[INCHICOL] = inchikeys

# Drop rows with missing InChiKeys
tdc_dataset.dropna(subset=[INCHICOL], inplace=True)

# Remove duplicates based on "inchikey" column
total_len = len(tdc_dataset)
tdc_dataset.drop_duplicates(subset=["inchikey"], inplace=True)
print("INchikey eliminated: ", total_len - len(tdc_dataset))

# Assuming you want to reorder the columns
tdc_dataset_processed = tdc_dataset[[SMICOL, INCHICOL, ACTCOL]]

# Saving the processed dataset to a CSV file
output_file = os.path.join("../data", "test_data", "tdc_dataset_processed.csv")
tdc_dataset_processed.to_csv(output_file, index=False)


INchikey eliminated:  0


In [40]:
print(tdc_dataset_processed)

                                                smiles  \
0                                 CC(=O)OCC[N+](C)(C)C   
1                                C[N+](C)(C)CC(=O)[O-]   
2         O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl   
3                                      O=C(O)c1ccccc1O   
4                       CC(NC(C)(C)C)C(=O)c1cccc(Cl)c1   
..                                                 ...   
470           CCCC(CCC)C(=O)O.CCCC(CCC)C(=O)[O-].[Na+]   
471  CCCCC(CC)COC(=O)CC(C(=O)OCC(CC)CCCC)S(=O)(=O)[...   
472  C=C1c2cccc(O)c2C(O)=C2C(=O)C3(O)C(O)=C(C(N)=O)...   
473                             O=C1OC(C(O)CO)C(O)=C1O   
474  CN(C)C1C(=O)C(C(N)=O)=C(O)C2(O)C(=O)C3=C(O)c4c...   

                        inchikey  activity  
0    OIPILFWXSMYKGL-UHFFFAOYSA-N       0.0  
1    KWIUHFFTVRNATP-UHFFFAOYSA-N       0.0  
2    WIIZWVCIJKGZOK-UHFFFAOYSA-N       0.0  
3    YGSDEFSMJLZEOE-UHFFFAOYSA-N       0.0  
4    SNPPWIUOZRMYNY-UHFFFAOYSA-N       0.0  
..                           ... 

In [48]:
## Now that we have arranged the tdc dataset in smiles_inchikey_activity
## we can  eliminate duplicated molecules with training set

# load the datasets and make comparisons
eos21q7 = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos21q7_processed.csv"))
eos7e3s = pd.read_csv(os.path.join(DATAPATH, "model_datasets","eos7e3s_processed.csv"))

## concatenate the training set together
training_set = pd.concat([eos21q7, eos7e3s], ignore_index=True)

# Save the training set to a CSV file
training_set.to_csv(os.path.join(DATAPATH, "model_datasets","training_set.csv"), index=False)

##Load the test dataset
test_dataset= pd.read_csv(os.path.join(DATAPATH, "test_data", "tdc_dataset_processed.csv")) 

# Calculate the number of removed InChiKey
initial_inchikey_count = len(test_dataset)
processed_test_dataset = test_dataset[~test_dataset['inchikey'].isin(training_set['inchikey'])]
removed_inchikey_count = initial_inchikey_count - len(processed_test_dataset)

# Print the number of removed smiles
print(f"Number of removed inchikey: {removed_inchikey_count}")

# Save the processed test dataset to a file
processed_test_dataset.to_csv(os.path.join(DATAPATH, "test_data", "processed_test_dataset.csv"), index=False)


Number of removed inchikey: 473


In [50]:
## Confirm that the training and test dataset has no Inchikey in common

training_set = pd.read_csv(os.path.join(DATAPATH,"model_datasets", "training_set.csv"))
test_set = pd.read_csv(os.path.join(DATAPATH,"test_data", "processed_test_dataset.csv"))

# Check for common inchikeys
common_inchikey = set(training_set['inchikey']).intersection(set(test_set['inchikey']))

# Print the number of common Inchikey
print(f"Number of common inchikeys between training and test datasets: {len(common_inchikey)}")

print("Common inchikey:", common_inchikey)

Number of common inchikeys between training and test datasets: 0
Common inchikey: set()
