In [2]:
import os
import pandas as pd
from rdkit import Chem
import numpy as np
from rdkit.Chem import AllChem
from rdkit.Chem import QED
from rdkit.Chem import DataStructs

# Relative path of the data directory: Total_dataset.csv is read from it and
# train.csv / test.csv are written into it by the cells below.
DATAPATH = "../data"

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
# Read the complete dataset from the data directory and preview its first 10 rows.
dataset_path = os.path.join(DATAPATH, 'Total_dataset.csv')
df = pd.read_csv(dataset_path)
df.head(10)

Unnamed: 0,smiles,toxicity,ref
0,CC1=NN=C(O1)C(=O)NC(C)(C)C2=N/C(=C(/NCC3=CC=C(...,1,DILIrank
1,C1=CC2=C(C=C1OC(F)(F)F)SC(=N2)N,1,DILIrank
2,CCC1C(=O)N(CC(=O)N(C(C(=O)NC(C(=O)N(C(C(=O)NC(...,1,DILIrank
3,CC(C1=CC(=CC=C1)OC2=CC=CC=C2)C(=O)O,1,DILIrank
4,CC(=O)NC1=NN=C(S1)S(=O)(=O)N,1,DILIrank
5,CNCCC=C1C2=CC=CC=C2CCC3=CC=CC=C31,1,DILIrank
6,CC1=CC2=C(C=C1C(=C)C3=CC=C(C=C3)C(=O)O)C(CCC2(...,1,DILIrank
7,CNC(=O)C1=NC=CC(=C1)OC2=CC=C(C=C2)NC(=O)NC3=CC...,1,DILIrank
8,COC1=C(C=C(C=C1)CC2=NC=CC3=CC(=C(C=C32)OC)OC)OC,1,DILIrank
9,CS(=O)(=O)NC1=C(C=C(C=C1)[N+](=O)[O-])OC2=CC=C...,1,DILIrank


In [9]:
# Check that every SMILES string can be converted by RDKit (adapted from the authors' code).

# Chem.MolFromSmiles yields None for a SMILES string it cannot parse, so None
# entries mark the molecules that failed to convert.
mols = [Chem.MolFromSmiles(smiles) for smiles in df["smiles"]]

# Positions (0-based, in df row order) of the molecules that failed to convert.
none_list = []
for i, mol in enumerate(mols):
    if mol is None:
        none_list.append(i)
        print('add to none_list')

# Keep only the successfully converted molecules.
mols = [mol for mol in mols if mol is not None]

# Build a cleaned copy of df without the unconvertible rows and with a fresh
# 0..n-1 index. none_list holds positions, so they are mapped to index labels
# before dropping; reset_index is chained on the dropped frame (resetting df
# itself would silently bring the dropped rows back).
if len(none_list) != 0:
    test_df = df.drop(df.index[none_list], axis=0).reset_index(drop=True)
df.shape

# All molecules can be converted by RDKit, so no further check is needed at this stage.

(1850, 3)

In [6]:
# Divide the dataset into train and test sets.
# According to the authors, rows whose 'ref' is NCTR, GREENE, Xu or Liew were used for
# training, while rows whose 'ref' is DILIrank were used for testing.

# Boolean row masks based on the value in the 'ref' column
is_dilirank = df['ref'] == 'DILIrank'
not_dilirank = df['ref'] != 'DILIrank'
test = df.loc[is_dilirank]
train = df.loc[not_dilirank]

# Persist each split as its own CSV file (index column omitted)
test_path = os.path.join(DATAPATH, "test.csv")
train_path = os.path.join(DATAPATH, "train.csv")
test.to_csv(test_path, index=False)
train.to_csv(train_path, index=False)

# Sanity check: total rows, rows per split, and the sum of both splits
n_total, n_train, n_test = len(df), len(train), len(test)
print(n_total, n_train, n_test, n_train + n_test)

1850 1398 452 1850


In [7]:
# Class balance of the train dataset: absolute counts and relative frequencies of the
# negative (0) and positive (1) labels in the 'toxicity' column.
outcome_counts = train['toxicity'].value_counts()
outcome_ratio = train['toxicity'].value_counts(normalize=True)
total_samples = len(train)

# Absolute counts per label, followed by the split size
n_negative, n_positive = outcome_counts[0], outcome_counts[1]
print("Number of Negative (0) outcomes in train dataset:", n_negative)
print("Number of Positive (1) outcomes in train dataset:", n_positive)
print("Total number of train samples:", total_samples)

# Fraction of the train samples carrying each label
ratio_negative, ratio_positive = outcome_ratio[0], outcome_ratio[1]
print("\nRatio of Negative (0) outcomes:", ratio_negative)
print("Ratio of Positive (1) outcomes:", ratio_positive)

Number of Negative (0) outcomes in train dataset: 630
Number of Positive (1) outcomes in train dataset: 768
Total number of train samples: 1398

Ratio of Negative (0) outcomes: 0.45064377682403434
Ratio of Positive (1) outcomes: 0.5493562231759657


In [8]:
# Class balance of the test dataset: absolute counts and relative frequencies of the
# negative (0) and positive (1) labels in the 'toxicity' column.
# The in-memory `test` frame from the split cell is used; nothing is re-read from disk.
outcome_counts = test['toxicity'].value_counts()
outcome_ratio = test['toxicity'].value_counts(normalize=True)
total_samples = len(test)

# Absolute counts per label, followed by the split size
n_negative, n_positive = outcome_counts[0], outcome_counts[1]
print("Number of Negative (0) outcomes in test dataset:", n_negative)
print("Number of Positive (1) outcomes in test dataset:", n_positive)
print("Total number of test samples:", total_samples)

# Fraction of the test samples carrying each label
ratio_negative, ratio_positive = outcome_ratio[0], outcome_ratio[1]
print("\nRatio of Negative (0) outcomes:", ratio_negative)
print("Ratio of Positive (1) outcomes:", ratio_positive)

Number of Negative (0) outcomes in test dataset: 268
Number of Positive (1) outcomes in test dataset: 184
Total number of test samples: 452

Ratio of Negative (0) outcomes: 0.5929203539823009
Ratio of Positive (1) outcomes: 0.40707964601769914
