In [13]:
import os
import pandas as pd
from rdkit import Chem

# Root folder (relative path) under which all input and output files are stored
DATAPATH = "../data"
# Column names used for every processed dataset: SMILES string, InChIKey and activity label
SMICOL = "smiles"
INCHICOL = "inchikey"
ACTCOL = "activity"

# Compare Model Training datasets

First, we clean up the original files and add the InChIKey of each SMILES if it is not already available. We want to create a dataframe with three columns: smiles, inchikey and activity. We will store each dataset under data/model_datasets/{model_name}_processed.csv

In [14]:
#eos30gr

# The train, test and validation splits are the first three sheets of the same
# workbook: read them in a single pass instead of parsing the file once per sheet.
eos30gr_path = os.path.join(DATAPATH, "model_datasets", "eos30gr.xlsx")
sheets = pd.read_excel(eos30gr_path, sheet_name=[0, 1, 2])
train_data, test_data, valid_data = sheets[0], sheets[1], sheets[2]
eos30gr = pd.concat([train_data, test_data, valid_data])

# One InChIKey per row; rows whose SMILES cannot be parsed get None so that
# they are dropped (and counted) below.
inchikeys = []
for smi in eos30gr["Smiles"]:
    # Non-string values (e.g. NaN from an empty cell) cannot be parsed as
    # SMILES; treat them as invalid rather than letting the cell crash.
    mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
    inchikeys.append(Chem.MolToInchiKey(mol) if mol is not None else None)

eos30gr[INCHICOL] = inchikeys
total_len = len(eos30gr)
eos30gr = eos30gr.dropna(subset=[INCHICOL])
print("Smiles eliminated: ", total_len-len(eos30gr))
# looking at the model, activity 10 was chosen for activity
eos30gr = eos30gr.rename(columns={"Smiles": SMICOL, "activity10": ACTCOL})
# Keep only the three standard columns, in the agreed order, before saving
eos30gr = eos30gr[[SMICOL, INCHICOL, ACTCOL]]
eos30gr.to_csv(os.path.join(DATAPATH, "model_datasets", "eos30gr_processed.csv"), index=False)


[10:44:57] non-ring atom 10 marked aromatic
[10:44:57] non-ring atom 12 marked aromatic
[10:44:57] non-ring atom 10 marked aromatic
[10:44:57] non-ring atom 14 marked aromatic
[10:44:57] non-ring atom 10 marked aromatic
[10:44:57] non-ring atom 10 marked aromatic
[10:44:57] non-ring atom 10 marked aromatic
[10:44:58] non-ring atom 21 marked aromatic
[10:44:58] non-ring atom 10 marked aromatic
[10:44:58] non-ring atom 10 marked aromatic
[10:44:58] non-ring atom 12 marked aromatic
[10:44:58] non-ring atom 12 marked aromatic
[10:44:58] Explicit valence for atom # 0 N, 4, is greater than permitted
[10:44:58] Explicit valence for atom # 0 N, 4, is greater than permitted
[10:44:59] Explicit valence for atom # 0 N, 4, is greater than permitted
[10:44:59] Explicit valence for atom # 0 N, 4, is greater than permitted
[10:44:59] Explicit valence for atom # 0 N, 4, is greater than permitted
[10:44:59] Explicit valence for atom # 0 N, 4, is greater than permitted
[10:44:59] Explicit valence for at

Smiles eliminated:  49


In [None]:
#eos2ta5

In [None]:
#eos4tcc

In [None]:
#eos30f3

In [None]:
#eos43at

Once all the datasets have been cleaned, we can compare them

In [None]:
# Identifiers of the models whose training datasets are compared
models = ["eos2ta5", "eos4tcc", "eos30f3", "eos30gr", "eos43at"]

# TODO: load the datasets and make comparisons

# TODO: proportion of actives and inactives in each dataset

# TODO: number of repeated smiles between models

# Build test dataset

We collate the data from the NCATS repository into a single file and eliminate any molecules that already exist in the training sets of the models.

In [16]:
# Both NCATS files live in the same folder; build its path once and load the
# training file into df1 and the validation file into df2.
ncats_dir = os.path.join(DATAPATH, "test_data")
df1, df2 = (
    pd.read_csv(os.path.join(ncats_dir, fname))
    for fname in ("training_set_ncats.csv", "validation_set_ncats.csv")
)

In [20]:
# Inspect the column names of the NCATS training file before merging
df1.columns

Index(['smiles', 'activity', 'source'], dtype='object')

In [22]:
#merge and remove duplicates. Obtain InChiKeys for all

df = pd.concat([df1, df2])

# One InChIKey per row; rows whose SMILES cannot be parsed get None so that
# they are dropped (and counted) below.
inchikeys = []
for smi in df[SMICOL]:
    # Non-string values (e.g. NaN from an empty field) cannot be parsed as
    # SMILES; treat them as invalid rather than letting the cell crash.
    mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
    inchikeys.append(Chem.MolToInchiKey(mol) if mol is not None else None)

df[INCHICOL] = inchikeys

# First report: rows dropped because no InChIKey could be computed
total_len = len(df)
df = df.dropna(subset=[INCHICOL])
print("Smiles eliminated: ", total_len-len(df))

# Second report: rows dropped because the same SMILES string appears more than once
# NOTE(review): duplicates are detected on the SMILES text, so one molecule written
# with two different SMILES would be kept twice -- confirm whether de-duplicating
# on the InChIKey column is intended instead.
total_len = len(df)
df = df.drop_duplicates(subset=[SMICOL])
print("Smiles eliminated: ", total_len-len(df))

# Keep only the three standard columns, in the agreed order, before saving
df = df[[SMICOL, INCHICOL, ACTCOL]]
df.to_csv(os.path.join(DATAPATH, "test_data", "ncats.csv"), index=False)


Smiles eliminated:  0
Smiles eliminated:  32


In [None]:
# Now, from all the NCATS data, we eliminate the molecules that are duplicated in the training sets