In [15]:
import os
import pandas as pd
from rdkit import Chem
import numpy as np
from rdkit.Chem import AllChem
from rdkit.Chem import QED
from rdkit.Chem import DataStructs

# Directory (relative to the notebook's working directory) that holds the
# input dataset CSV and receives the feature CSVs written by this notebook.
DATAPATH = "../data"

In [2]:
## Data Analysis

In [16]:
# Read the complete dataset from the data directory into a DataFrame.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# suggests the CSV carries a saved row index; consider index_col=0 -- confirm.
dataset_path = os.path.join(DATAPATH, 'Total_dataset.csv')
df1 = pd.read_csv(dataset_path)


In [5]:
# Preview the first ten rows of the loaded dataset
df1.head(n=10)

Unnamed: 0,smiles,toxicity,ref
0,CC1=NN=C(O1)C(=O)NC(C)(C)C2=N/C(=C(/NCC3=CC=C(...,1,DILIrank
1,C1=CC2=C(C=C1OC(F)(F)F)SC(=N2)N,1,DILIrank
2,CCC1C(=O)N(CC(=O)N(C(C(=O)NC(C(=O)N(C(C(=O)NC(...,1,DILIrank
3,CC(C1=CC(=CC=C1)OC2=CC=CC=C2)C(=O)O,1,DILIrank
4,CC(=O)NC1=NN=C(S1)S(=O)(=O)N,1,DILIrank
5,CNCCC=C1C2=CC=CC=C2CCC3=CC=CC=C31,1,DILIrank
6,CC1=CC2=C(C=C1C(=C)C3=CC=C(C=C3)C(=O)O)C(CCC2(...,1,DILIrank
7,CNC(=O)C1=NC=CC(=C1)OC2=CC=C(C=C2)NC(=O)NC3=CC...,1,DILIrank
8,COC1=C(C=C(C=C1)CC2=NC=CC3=CC(=C(C=C32)OC)OC)OC,1,DILIrank
9,CS(=O)(=O)NC1=C(C=C(C=C1)[N+](=O)[O-])OC2=CC=C...,1,DILIrank


In [17]:
# Count the occurrences of positive (1) and negative (0) outcomes in the 'toxicity' column of our dataset
outcome_counts = df1['toxicity'].value_counts()

# Print the result. value_counts() only contains labels that actually occur, so
# .get(label, 0) reports an absent class as 0 instead of raising KeyError.
total_samples = len(df1)
print("Number of Negative (0) outcomes:", outcome_counts.get(0, 0))
print("Number of Positive (1) outcomes:", outcome_counts.get(1, 0))
print("Total number of samples:", total_samples)

# Calculate and print the ratio of negative (0) and positive (1) outcomes
outcome_ratio = df1['toxicity'].value_counts(normalize=True)
print("\nRatio of Negative (0) outcomes:", outcome_ratio.get(0, 0.0))
print("Ratio of Positive (1) outcomes:", outcome_ratio.get(1, 0.0))

Number of Negative (0) outcomes: 898
Number of Positive (1) outcomes: 952
Total number of samples: 1850

Ratio of Negative (0) outcomes: 0.4854054054054054
Ratio of Positive (1) outcomes: 0.5145945945945946


In [18]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import QED
from rdkit.Chem import DataStructs

# Morgan fingerprint settings used for every molecule below
FP_RADIUS = 3
FP_NBITS = 1024

# Reload the dataset so this cell starts from the file contents every time it
# is run: the filtering step below replaces df1 when invalid rows are found.
df1 = pd.read_csv(os.path.join(DATAPATH, 'Total_dataset.csv'))

# Parse every SMILES entry into an RDKit molecule. Entries that are not
# strings (e.g. NaN from an empty CSV field) are treated as unparseable rather
# than being handed to RDKit, so they end up in none_list like any other
# SMILES that yields no molecule.
parsed_mols = [
    Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    for smiles in df1["smiles"]
]

# Row positions whose SMILES did not convert to a molecule
none_list = [pos for pos, mol in enumerate(parsed_mols) if mol is None]
for pos in none_list:
    print('add to none_list')

# Keep only the successfully parsed molecules, in their original order
mols = [mol for mol in parsed_mols if mol is not None]

# Drop the same rows from the frame and renumber it so that row k of df1
# corresponds to mols[k]. df1 was just read from CSV, so its index labels
# equal row positions and dropping by label removes exactly those rows.
if none_list:
    df1 = df1.drop(index=none_list).reset_index(drop=True)

assert len(mols) == len(df1), "molecule list and dataframe are out of sync"

# Create fingerprints. Each molecule gets its own bitInfo dict so that the
# recorded bit information never depends on what a previous call left behind.
bit_info_list = []  # one bitInfo dict per molecule
bit_info = {}       # after the loop: the bitInfo dict of the last molecule
fps = []
for mol in mols:
    bit_info = {}
    fps.append(
        AllChem.GetMorganFingerprintAsBitVect(mol, FP_RADIUS, nBits=FP_NBITS, bitInfo=bit_info)
    )
    bit_info_list.append(bit_info)

# Convert each bit vector to a numpy array; ConvertToNumpyArray fills (and
# resizes) the array passed to it, which is why it starts out empty.
arr_list = []
for fp in fps:
    fp_array = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, fp_array)
    arr_list.append(fp_array)

# One row per molecule, one column per fingerprint bit, stored as float32
test_x = np.stack(arr_list).astype(np.float32)
test_finprt = pd.DataFrame(test_x)

# Create physicochemical properties (one row of QED.properties per molecule)
qe = pd.DataFrame([QED.properties(mol) for mol in mols])

# Adding the toxicity column to test_finprt and qe dataframes; all three
# frames share the same 0..n-1 index, so the labels line up row by row.
test_finprt['toxicity'] = df1['toxicity']
qe['toxicity'] = df1['toxicity']


In [19]:
# Destination files inside the data directory: the fingerprint table and the
# physicochemical-property table, each including its 'toxicity' column.
descriptors_filepath = os.path.join(DATAPATH, 'computed_descriptors.csv')
properties_filepath = os.path.join(DATAPATH, 'physicochemical_properties.csv')

# Write both tables as CSV without the row index
for frame, filepath in (
    (test_finprt, descriptors_filepath),
    (qe, properties_filepath),
):
    frame.to_csv(filepath, index=False)