In [14]:
import os
import pandas as pd

# Directory (relative to the notebook's working directory) that holds the input
# CSV read below and receives the train/test/full CSVs written by later cells.
DATAPATH = "../data"

In [17]:
from rdkit import Chem
from standardiser import standardise
import numpy as np

def standardise_smiles(smiles):
    """Return one standardised canonical SMILES per input entry.

    Each entry is parsed with RDKit, passed through ``standardise.run`` and
    written back as a canonical SMILES string. The output list always has the
    same length and order as ``smiles`` so it can be assigned straight to a
    DataFrame column.

    Parameters
    ----------
    smiles : iterable
        Raw SMILES values. Entries may be ``None`` or float NaN (missing).

    Returns
    -------
    list
        For every input entry, in order:
        * ``np.nan`` if the entry is missing or RDKit cannot parse it;
        * the canonical SMILES of the standardised molecule if
          standardisation succeeds;
        * the canonical SMILES of the unstandardised molecule if
          standardisation raises or yields ``None``.
    """
    st_smiles = []
    for smi in smiles:
        # Treat both None and float NaN (pandas' missing value) as missing,
        # instead of stringifying NaN to "nan" and handing it to the parser.
        if smi is None or (isinstance(smi, float) and np.isnan(smi)):
            st_smiles.append(np.nan)
            continue

        mol = Chem.MolFromSmiles(str(smi).strip())
        if mol is None:
            st_smiles.append(np.nan)
            continue

        # Reset per molecule so a failure can never reuse the result of a
        # previous iteration (or hit an unbound name on the first one).
        st_smi = None
        try:
            std_mol = standardise.run(mol)
            if std_mol is not None:
                st_smi = Chem.MolToSmiles(std_mol, canonical=True)
        except Exception:
            # Best effort: standardisation failed, fall back to the parsed molecule.
            st_smi = None

        if st_smi is None:
            st_smi = Chem.MolToSmiles(mol, canonical=True)

        # Exactly one append per input keeps the output aligned with the input.
        st_smiles.append(st_smi)
    return st_smiles

In [18]:
## Load the full dataset from DATAPATH and display its column names.
df = pd.read_csv(os.path.join(DATAPATH, 'Total_dataset.csv'))
df.columns

Index(['smiles', 'toxicity', 'ref'], dtype='object')

In [19]:
# Standardise every SMILES and store the result next to the raw column.
df["st_smiles"] = standardise_smiles(df["smiles"].tolist())
# Number of rows left without a standardised SMILES (NaN in the new column).
print(df["st_smiles"].isna().sum())

[14:22:40] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 6 9
[14:22:41] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 7 9
[14:22:41] Can't kekulize mol.  Unkekulized atoms: 0 1 3 4 8 10


0


In [20]:
## Divide into train and test.
## According to the authors, rows whose "ref" is NCTR, Greene, Xu or Liew were used for training, while DILIrank was used for testing.
## Count the non-missing standardised SMILES per reference source.
df.groupby("ref")["st_smiles"].count()

ref
DILIrank    452
Greene      229
Liew        775
NCTR        348
Xu           46
Name: st_smiles, dtype: int64

In [23]:
# Benchmark protocol: fit on the four literature sets and evaluate on DILIrank;
# the complete dataset is exported as well for training the final model.

train_refs = ["NCTR", "Greene", "Xu", "Liew"]

# Select rows by source and keep only the modelling columns in one step.
train_df = df.loc[df["ref"].isin(train_refs), ["st_smiles", "toxicity"]]
test_df = df.loc[df["ref"] == "DILIrank", ["st_smiles", "toxicity"]]
final_df = df.loc[:, ["st_smiles", "toxicity"]]

# Persist the three splits without the pandas index.
train_df.to_csv(os.path.join(DATAPATH, "train.csv"), index=False)
test_df.to_csv(os.path.join(DATAPATH, "test.csv"), index=False)
final_df.to_csv(os.path.join(DATAPATH, "full_dataset.csv"), index=False)

In [25]:
# Class balance of the training split (0 = negative, 1 = positive).
outcome_counts = train_df["toxicity"].value_counts()
total_samples = train_df.shape[0]

# Absolute number of samples per class and overall.
print("Number of Negative (0) outcomes in train dataset:", outcome_counts.loc[0])
print("Number of Positive (1) outcomes in train dataset:", outcome_counts.loc[1])
print("Total number of train samples:", total_samples)

# The same breakdown expressed as fractions of the split.
outcome_ratio = train_df["toxicity"].value_counts(normalize=True)
print("\nRatio of Negative (0) outcomes:", outcome_ratio.loc[0])
print("Ratio of Positive (1) outcomes:", outcome_ratio.loc[1])

Number of Negative (0) outcomes in train dataset: 630
Number of Positive (1) outcomes in train dataset: 768
Total number of train samples: 1398

Ratio of Negative (0) outcomes: 0.45064377682403434
Ratio of Positive (1) outcomes: 0.5493562231759657


In [10]:
# Class balance of the held-out test split (0 = negative, 1 = positive).
# Uses the test_df built in the split cell; nothing is read from disk here.
outcome_counts = test_df["toxicity"].value_counts()
total_samples = test_df.shape[0]

# Absolute number of samples per class and overall.
print("Number of Negative (0) outcomes in test dataset:", outcome_counts.loc[0])
print("Number of Positive (1) outcomes in test dataset:", outcome_counts.loc[1])
print("Total number of test samples:", total_samples)

# The same breakdown expressed as fractions of the split.
outcome_ratio = test_df["toxicity"].value_counts(normalize=True)
print("\nRatio of Negative (0) outcomes:", outcome_ratio.loc[0])
print("Ratio of Positive (1) outcomes:", outcome_ratio.loc[1])

Number of Negative (0) outcomes in test dataset: 268
Number of Positive (1) outcomes in test dataset: 184
Total number of test samples: 452

Ratio of Negative (0) outcomes: 0.5929203539823009
Ratio of Positive (1) outcomes: 0.40707964601769914
