# In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define a function to convert SMILES to molecules
def smiles_to_molecule(smiles):
    """Parse a SMILES string into an RDKit molecule.

    Args:
        smiles: SMILES text describing one compound.

    Returns:
        The object returned by ``Chem.MolFromSmiles``, or ``None`` when
        ``smiles`` is not a string (for example the NaN that pandas yields
        for an empty spreadsheet cell). ``Chem.MolFromSmiles`` itself
        returns ``None`` for text it cannot parse.
    """
    # MolFromSmiles raises on non-string arguments rather than returning
    # None, which would bypass the caller's "is not None" validity check.
    if not isinstance(smiles, str):
        return None
    return Chem.MolFromSmiles(smiles)

# Ipc is left out of the feature vector because it can take extremely large
# values.  It is removed by name: the descriptor list has grown between RDKit
# releases, so a fixed position can end up deleting a different descriptor.
ipc_name = "Ipc"
ipc_index = [42]   # legacy positional index of Ipc; no longer used below, kept for existing references

# Load the data from the Excel files
smiles_file = r"C:\Users\grvkr\Box\Gaurav Kumar\Purdue_Work\SAR_NM\Data\Siegwart_group_Data1_2021\Activity\SMILES_old.xlsx" 
target_file = r"C:\Users\grvkr\Box\Gaurav Kumar\Purdue_Work\SAR_NM\Data\Siegwart_group_Data1_2021\Activity\Activity_old.xlsx" 

smiles_df = pd.read_excel(smiles_file)
target_df = pd.read_excel(target_file)

# Feature vectors (one per usable molecule) and their matching target values
X = []
y = []

# Upper bounds on the rows and columns read from each sheet
Nrows = 572
Ncolumns = 1

# Clamp once so the loop never indexes past the end of either sheet.
n_rows = min(Nrows, len(smiles_df), len(target_df))

# Walk the same (row, column) cell of both sheets in lockstep
for i in range(n_rows):
    for j in range(Ncolumns):
        smiles = smiles_df.iloc[i, j]
        target = target_df.iloc[i, j]

        # Empty cells are read as NaN: a non-string SMILES cannot be parsed
        # and a missing target cannot be binned later, so skip the pair to
        # keep X and y aligned and usable.
        if not isinstance(smiles, str) or pd.isna(target):
            continue

        molecule = smiles_to_molecule(smiles)
        if molecule is None:  # SMILES could not be parsed
            continue

        # All molecular descriptors as a {name: value} dictionary
        molecular_descriptors_dict = Descriptors.CalcMolDescriptors(molecule)
        # Keep every descriptor value except Ipc, preserving dictionary order
        molecular_descriptors = np.array(
            [
                value
                for name, value in molecular_descriptors_dict.items()
                if name != ipc_name
            ]
        )

        X.append(molecular_descriptors)
        y.append(target)

# Convert the lists to NumPy arrays
X = np.array(X)
y = np.array(y)

# Target bins as half-open intervals [start, end); the i-th range maps to the
# i-th entry of `categories`.
category_ranges = [(0, 25000), (25000, float('inf'))]
categories = [1, 2]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _categorize(values, ranges, labels):
    """Return the category label of each value that falls inside a range.

    Args:
        values: Iterable of numeric target values.
        ranges: Sequence of ``(start, end)`` pairs, tested as
            ``start <= value < end`` in order.
        labels: Sequence of labels, one per entry of ``ranges``.

    Returns:
        A list with the label of the first matching range for each value.
        Values matching no range (e.g. NaN or negatives with the ranges
        above) are omitted, so the result can be shorter than ``values``.
    """
    assigned = []
    for value in values:
        for (start, end), label in zip(ranges, labels):
            if start <= value < end:
                assigned.append(label)
                break
    return assigned


# Determine majority class in training data
true_categories_train = _categorize(y_train, category_ranges, categories)
majority_class = max(set(true_categories_train), key=true_categories_train.count)

# Define the true categories based on the target value ranges
true_categories = _categorize(y_test, category_ranges, categories)

# One constant prediction per *binned* test label.  Sizing this from
# len(y_test) would misalign predictions and labels whenever a test value
# falls outside every range and is therefore dropped by _categorize.
y_pred_class = [majority_class] * len(true_categories)

# Score the constant majority-class predictions against the binned test labels.
# The weighted precision / recall / F1 are restricted to the label(s) that
# actually occur in the predictions.
predicted_labels = np.unique(y_pred_class)
weighted_options = {"average": 'weighted', "labels": predicted_labels}

accuracy = accuracy_score(true_categories, y_pred_class)
balanced_accuracy = balanced_accuracy_score(true_categories, y_pred_class)
precision = precision_score(true_categories, y_pred_class, **weighted_options)
recall = recall_score(true_categories, y_pred_class, **weighted_options)
f1 = f1_score(true_categories, y_pred_class, **weighted_options)
# NOTE(review): the hard class labels are passed as scores here, so with a
# constant prediction this carries no ranking information.
roc_auc = roc_auc_score(true_categories, y_pred_class)

# Report every metric on its own line, four decimal places each
print(
    f"Accuracy: {accuracy:.4f}",
    f"Balanced Accuracy: {balanced_accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    f"ROC AUC: {roc_auc:.4f}",
    sep="\n",
)
