In [22]:
import os
import pandas as pd
from rdkit import Chem
import numpy as np
from rdkit.Chem import AllChem
from rdkit.Chem import QED
from rdkit.Chem import DataStructs

# Folder that holds the input CSV (Total_dataset.csv) and receives the
# generated train/test split and feature files; relative to the working directory.
DATAPATH = "../data"

In [2]:
## Data Analysis

In [23]:
## Read the complete dataset from the data folder
total_dataset_path = os.path.join(DATAPATH, 'Total_dataset.csv')
df1 = pd.read_csv(total_dataset_path)


In [24]:
# Preview the first ten rows to check which columns were loaded
df1.head(10)

Unnamed: 0,smiles,toxicity,ref
0,CC1=NN=C(O1)C(=O)NC(C)(C)C2=N/C(=C(/NCC3=CC=C(...,1,DILIrank
1,C1=CC2=C(C=C1OC(F)(F)F)SC(=N2)N,1,DILIrank
2,CCC1C(=O)N(CC(=O)N(C(C(=O)NC(C(=O)N(C(C(=O)NC(...,1,DILIrank
3,CC(C1=CC(=CC=C1)OC2=CC=CC=C2)C(=O)O,1,DILIrank
4,CC(=O)NC1=NN=C(S1)S(=O)(=O)N,1,DILIrank
5,CNCCC=C1C2=CC=CC=C2CCC3=CC=CC=C31,1,DILIrank
6,CC1=CC2=C(C=C1C(=C)C3=CC=C(C=C3)C(=O)O)C(CCC2(...,1,DILIrank
7,CNC(=O)C1=NC=CC(=C1)OC2=CC=C(C=C2)NC(=O)NC3=CC...,1,DILIrank
8,COC1=C(C=C(C=C1)CC2=NC=CC3=CC(=C(C=C32)OC)OC)OC,1,DILIrank
9,CS(=O)(=O)NC1=C(C=C(C=C1)[N+](=O)[O-])OC2=CC=C...,1,DILIrank


In [25]:
## Divide the data into train and test sets.
## According to the authors, rows whose 'ref' column is NCTR, GREENE, Xu or Liew were used
## for training, while rows from DILIrank were used for testing.
# os and pandas are already imported in the first cell, so they are not re-imported here.

# Create a folder for the train/test splits if it doesn't exist
train_test_splits_folder = os.path.join(DATAPATH, "train_test_splits")
os.makedirs(train_test_splits_folder, exist_ok=True)

# Read the full dataset into a pandas DataFrame
df = pd.read_csv(os.path.join(DATAPATH, 'Total_dataset.csv'))

# DILIrank rows form the test set; every other row goes to the training set
is_dilirank = df['ref'] == 'DILIrank'
dilirank_df = df[is_dilirank]
others_df = df[~is_dilirank]

# Write the two subsets to separate CSV files (the DataFrame index is not saved)
dilirank_df.to_csv(os.path.join(train_test_splits_folder, "test.csv"), index=False)
others_df.to_csv(os.path.join(train_test_splits_folder, "train.csv"), index=False)



In [28]:
# Class balance of the train split: number and share of negative (0) and
# positive (1) labels found in the 'toxicity' column
train_df = pd.read_csv(os.path.join(train_test_splits_folder, "train.csv"))
total_samples = len(train_df)

# Absolute and normalised label frequencies, both indexed by the label value
toxicity_labels = train_df['toxicity']
outcome_counts = toxicity_labels.value_counts()
outcome_ratio = toxicity_labels.value_counts(normalize=True)

# Absolute numbers
print("Number of Negative (0) outcomes in train dataset:", outcome_counts[0])
print("Number of Positive (1) outcomes in train dataset:", outcome_counts[1])
print("Total number of train samples:", total_samples)

# Relative shares of each label
print("\nRatio of Negative (0) outcomes:", outcome_ratio[0])
print("Ratio of Positive (1) outcomes:", outcome_ratio[1])

Number of Negative (0) outcomes in train dataset: 630
Number of Positive (1) outcomes in train dataset: 768
Total number of train samples: 1398

Ratio of Negative (0) outcomes: 0.45064377682403434
Ratio of Positive (1) outcomes: 0.5493562231759657


In [29]:
# Class balance of the test split: number and share of negative (0) and
# positive (1) labels found in the 'toxicity' column
test_df = pd.read_csv(os.path.join(train_test_splits_folder, "test.csv"))
total_samples = len(test_df)

# Absolute and normalised label frequencies, both indexed by the label value
toxicity_labels = test_df['toxicity']
outcome_counts = toxicity_labels.value_counts()
outcome_ratio = toxicity_labels.value_counts(normalize=True)

# Absolute numbers
print("Number of Negative (0) outcomes in test dataset:", outcome_counts[0])
print("Number of Positive (1) outcomes in test dataset:", outcome_counts[1])
print("Total number of test samples:", total_samples)

# Relative shares of each label
print("\nRatio of Negative (0) outcomes:", outcome_ratio[0])
print("Ratio of Positive (1) outcomes:", outcome_ratio[1])

Number of Negative (0) outcomes in test dataset: 268
Number of Positive (1) outcomes in test dataset: 184
Total number of test samples: 452

Ratio of Negative (0) outcomes: 0.5929203539823009
Ratio of Positive (1) outcomes: 0.40707964601769914


In [20]:
## Convert the train CSV to Morgan fingerprints and physicochemical (QED) properties,
## then combine both feature sets with the toxicity label and save them.
# os, numpy, pandas and the RDKit modules are imported once in the first cell;
# train_test_splits_folder and DATAPATH are defined in earlier cells.

# Load the training split
train_df = pd.read_csv(os.path.join(train_test_splits_folder, "train.csv"))

# Parse every SMILES string into an RDKit molecule
mols = [Chem.MolFromSmiles(smiles) for smiles in train_df["smiles"]]

# Collect the row positions whose SMILES did not transform to a molecule
none_list = []
for idx, mol in enumerate(mols):
    if mol is None:
        none_list.append(idx)
        print('add to none_list')

# Remove the unparsable rows from both the molecule list and the dataframe so
# that the two stay aligned row by row, then renumber the dataframe index
if none_list:
    mols = [mol for mol in mols if mol is not None]
    train_df = train_df.drop(none_list, axis=0).reset_index(drop=True)

assert len(mols) == len(train_df), "molecules and labels are out of sync"

# Morgan fingerprint bit vector (radius 3, 1024 bits) for every molecule
bit_info_list = []  # one bitInfo mapping per molecule
bit_info = {}  # dict handed to RDKit through the bitInfo argument
fps = []
for mol in mols:
    fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024, bitInfo=bit_info))
    # The same dict object is passed on every call, so store a snapshot of it
    bit_info_list.append(bit_info.copy())

# Convert each bit vector to a numpy array; ConvertToNumpyArray fills the
# array that is passed in
arr_list = []
for fp in fps:
    arr = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    arr_list.append(arr)

# One row per molecule, one column per fingerprint bit
train_x = np.stack(arr_list).astype(np.float32)
train_finprt = pd.DataFrame(train_x)

# Physicochemical properties computed by RDKit's QED module, one row per molecule
qe = pd.DataFrame([QED.properties(mol) for mol in mols])

# Attach the label; both frames use a 0..n-1 index, so rows line up by position
qe['toxicity'] = train_df['toxicity']

# Fingerprint columns first, then the QED properties and the toxicity label
train_finprt = pd.concat([train_finprt, qe], axis=1)

# Save the combined feature table to a CSV file
train_filepath = os.path.join(DATAPATH, 'train_finprt_with_qed.csv')
train_finprt.to_csv(train_filepath, index=False)



In [21]:
## Convert the test CSV to Morgan fingerprints and physicochemical (QED) properties,
## then combine both feature sets with the toxicity label and save them.
# train_test_splits_folder and DATAPATH are defined in earlier cells.

# Load the test split
test_df = pd.read_csv(os.path.join(train_test_splits_folder, "test.csv"))

# Parse every SMILES string into an RDKit molecule
mols = [Chem.MolFromSmiles(smiles) for smiles in test_df["smiles"]]

# Collect the row positions whose SMILES did not transform to a molecule
none_list = []
for idx, mol in enumerate(mols):
    if mol is None:
        none_list.append(idx)
        print('add to none_list')

# Remove the unparsable rows from both the molecule list and the dataframe so
# that the two stay aligned row by row, then renumber the dataframe index.
# The rows must be dropped from test_df itself: rebuilding test_df from
# train_df would pair the test molecules with training labels.
if none_list:
    mols = [mol for mol in mols if mol is not None]
    test_df = test_df.drop(none_list, axis=0).reset_index(drop=True)

assert len(mols) == len(test_df), "molecules and labels are out of sync"

# Morgan fingerprint bit vector (radius 3, 1024 bits) for every molecule
bit_info_list = []  # one bitInfo mapping per molecule
bit_info = {}  # dict handed to RDKit through the bitInfo argument
fps = []
for mol in mols:
    fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024, bitInfo=bit_info))
    # The same dict object is passed on every call, so store a snapshot of it
    bit_info_list.append(bit_info.copy())

# Convert each bit vector to a numpy array; ConvertToNumpyArray fills the
# array that is passed in
arr_list = []
for fp in fps:
    arr = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    arr_list.append(arr)

# One row per molecule, one column per fingerprint bit
test_x = np.stack(arr_list).astype(np.float32)
test_finprt = pd.DataFrame(test_x)

# Physicochemical properties computed by RDKit's QED module, one row per molecule
qe = pd.DataFrame([QED.properties(mol) for mol in mols])

# Attach the label; both frames use a 0..n-1 index, so rows line up by position
qe['toxicity'] = test_df['toxicity']

# Fingerprint columns first, then the QED properties and the toxicity label
test_finprt = pd.concat([test_finprt, qe], axis=1)

# Save the combined feature table to a CSV file
test_filepath = os.path.join(DATAPATH, 'test_finprt_with_qed.csv')
test_finprt.to_csv(test_filepath, index=False)

