In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def smiles_to_molecule(smiles):
    """Convert a SMILES string into an RDKit molecule.

    Args:
        smiles: SMILES text for one compound. Values that are not strings
            (for example the float NaN that pandas yields for an empty
            spreadsheet cell) are treated as invalid input.

    Returns:
        The result of ``Chem.MolFromSmiles(smiles)`` for string input, or
        ``None`` when ``smiles`` is not a string. Callers test the result
        against ``None`` to skip compounds that could not be converted.
    """
    # Guard before calling RDKit: a non-string argument would make
    # MolFromSmiles raise instead of returning None, aborting the whole
    # featurization loop on a single blank cell.
    if not isinstance(smiles, str):
        return None
    return Chem.MolFromSmiles(smiles)

# Name of the RDKit descriptor that is excluded from the feature vector
# because it can take extremely large values.
IPC_DESCRIPTOR_NAME = "Ipc"
# Legacy positional index of Ipc. It is no longer used for the removal below,
# because the position of a descriptor in RDKit's list is not guaranteed to be
# the same across RDKit releases; kept in case other cells still reference it.
ipc_index = [42]

# Load the data from the Excel files
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run where this exact directory layout exists.
smiles_file = r"C:\Users\grvkr\Box\Gaurav Kumar\Purdue_Work\SAR_NM\Data\Siegwart_group_Data1_2021\Activity\SMILES_old.xlsx"
target_file = r"C:\Users\grvkr\Box\Gaurav Kumar\Purdue_Work\SAR_NM\Data\Siegwart_group_Data1_2021\Activity\Activity_old.xlsx"

smiles_df = pd.read_excel(smiles_file)
target_df = pd.read_excel(target_file)

# Feature vectors (one per valid molecule) and their matching target values
X = []
y = []

# Upper bound on the number of rows read, and number of leading columns used
# from each sheet (with Ncolumns = 1 only the first column is read).
Nrows = 572
Ncolumns = 1

# Row i of the SMILES sheet is paired with row i of the activity sheet, so
# never iterate past the end of the shorter of the two frames.
n_usable_rows = min(Nrows, len(smiles_df), len(target_df))

for i in range(n_usable_rows):
    for j in range(Ncolumns):
        smiles = smiles_df.iloc[i, j]
        target = target_df.iloc[i, j]

        molecule = smiles_to_molecule(smiles)
        if molecule is None:
            # Invalid SMILES: skip the feature vector AND its target so that
            # X and y stay aligned.
            continue

        # All RDKit descriptors as a {name: value} dictionary
        molecular_descriptors_dict = Descriptors.CalcMolDescriptors(molecule)
        # Drop Ipc by name rather than by position, so the intended
        # descriptor is removed regardless of the descriptor ordering.
        molecular_descriptors = np.array([
            value
            for name, value in molecular_descriptors_dict.items()
            if name != IPC_DESCRIPTOR_NAME
        ])

        X.append(molecular_descriptors)
        y.append(target)

# Convert the lists to NumPy arrays
X = np.array(X)
y = np.array(y)

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Mean prediction model: every test sample is predicted as the mean of the
# training targets.
mean_value = np.mean(y_train)
# Build the constant prediction explicitly as float. np.full_like would
# inherit y_test's dtype, so integer-valued targets would silently truncate
# the mean and distort the regression metrics computed from y_pred.
y_pred = np.full(y_test.shape, mean_value, dtype=float)



# Half-open value bins [start, end) used to turn a continuous value into a
# class label; the bins below are the ones for luciferase expression.
category_ranges = [(0, 25000), (25000, float('inf'))]
# category_ranges = [(0, 90), (90, float('inf'))]  # Two ranges for cell viability
categories = [1, 2]  # label assigned to the bin at the same position


def _category_for(value):
    """Return the label of the first bin containing ``value``, or None if no bin does."""
    for label, (lower, upper) in zip(categories, category_ranges):
        if lower <= value < upper:
            return label
    return None


# Bin every prediction; values that fall in no bin contribute no entry.
y_classified = [
    label
    for label in map(_category_for, y_pred)
    if label is not None
]

# Print the results
print("Predicted Categories:")
print(y_classified)

# # If you want to add the predicted categories to your original DataFrame, you can do the following:
# test_df = pd.DataFrame({'Prediction': y_pred, 'Category': y_classified})
# print(test_df)


from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, r2_score, mean_absolute_percentage_error, matthews_corrcoef
from scipy.stats import pearsonr


# Map every true target value to its category using the same bins that were
# applied to the predictions.
true_categories = []

for target_value in y_test:
    for label, (start, end) in zip(categories, category_ranges):
        if start <= target_value < end:
            true_categories.append(label)
            break
    else:
        # A value outside every bin (e.g. negative or NaN) used to be dropped
        # silently, leaving true_categories shorter than y_classified and
        # misaligning the two label lists. Fail with a clear message instead.
        raise ValueError(
            f"Target value {target_value!r} does not fall in any of "
            f"category_ranges={category_ranges}"
        )

# Classification metrics.
# The weighted scores are computed over ALL categories. Restricting `labels`
# to the categories that happen to be predicted would exclude a class the
# model never predicts and inflate precision/recall/F1; zero_division=0
# scores such a class as 0 instead of emitting an undefined-metric warning.
accuracy = accuracy_score(true_categories, y_classified)
balanced_accuracy = balanced_accuracy_score(true_categories, y_classified)
precision = precision_score(true_categories, y_classified, average='weighted', labels=categories, zero_division=0)
recall = recall_score(true_categories, y_classified, average='weighted', labels=categories, zero_division=0)
f1 = f1_score(true_categories, y_classified, average='weighted', labels=categories, zero_division=0)
roc_auc = roc_auc_score(true_categories, y_classified)
MCC = matthews_corrcoef(true_categories, y_classified)

# Regression metrics on the raw (un-binned) values.
r2 = r2_score(y_test, y_pred)
MAPE = mean_absolute_percentage_error(y_test, y_pred)
# Pearson correlation is undefined when either input is constant, which is
# always the case for a constant baseline prediction; report NaN explicitly
# rather than calling pearsonr on degenerate input.
if np.ptp(y_pred) == 0 or np.ptp(y_test) == 0:
    PCC = float('nan')
else:
    PCC, _ = pearsonr(y_test, y_pred)

# Generate a classification report. `labels` lists both categories so that it
# lines up one-to-one with the two entries of `target_names`.
classification_report_result = classification_report(
    true_categories,
    y_classified,
    labels=categories,
    target_names=["Category 1", "Category 2"],
    zero_division=0,
)
# print("Classification Report:\n", classification_report_result)
print(accuracy)





Predicted Categories:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]




NameError: name 'mse' is not defined

In [2]:
# Show the accuracy value that was computed in the previous cell.
print(accuracy)

0.8260869565217391
