## This notebook is to check the model reproducibility with a sample from it's original publication

In [32]:
import os
import pandas as pd
from rdkit import Chem
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, matthews_corrcoef

DATAPATH = "../data"

## eos2ta5

In [33]:
# Load the test dataset and the predictions dataset
test_df = pd.read_csv(os.path.join(DATAPATH, "model_reproducibility", "model_datasets", "eos2ta5_Test-set-I.csv"))
predictions_df = pd.read_csv(os.path.join(DATAPATH, "model_reproducibility", "predictions_data", "reproducibility_predictions_eos2ta5.csv"))

# the predictions are in a column named 'probability' in the predictions dataset
predicted_probabilities = predictions_df['probability']

# Convert probabilities to binary predictions based on a threshold (e.g., 0.5)
predicted_labels = (predicted_probabilities >= 0.5).astype(int)

# Extract the ground truth labels from the test dataset
test_labels = test_df['ACTIVITY']  # 'ACTIVITY' column contains the ground truth labels

# Calculate confusion matrix
conf_matrix = confusion_matrix(test_labels, predicted_labels)

# Calculate accuracy
accuracy = accuracy_score(test_labels, predicted_labels)

# Calculate precision
precision = precision_score(test_labels, predicted_labels)

# Calculate recall
recall = recall_score(test_labels, predicted_labels)

# Calculate F1 score
f1 = f1_score(test_labels, predicted_labels)

# Calculate balanced accuracy
balanced_accuracy = balanced_accuracy_score(test_labels, predicted_labels)

# Calculate Matthews correlation coefficient
mcc = matthews_corrcoef(test_labels, predicted_labels)

# Calculate Negative Predictive Value (NPV)
tn, fp, fn, tp = conf_matrix.ravel()
npv = tn / (tn + fn)

# Calculate Specificity (SPE)
spe = tn / (tn + fp)

# Print the results
print("MCC:", mcc)
print("NPV:", npv)
print("ACC:", accuracy)
print("PPV:", precision)
print("SPE:", spe)
print("SEN:", recall)
print("B-ACC:", balanced_accuracy)


MCC: 0.5993902797701955
NPV: 0.6875
ACC: 0.8181818181818182
PPV: 0.8928571428571429
SPE: 0.7857142857142857
SEN: 0.8333333333333334
B-ACC: 0.8095238095238095


### Results according to the publication
<img src="../figures/eos2ta5.png" alt="eos2ta5_publication_result">

### eos2ta5 has the same results as the publication

## eos4tcc

In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
# Load the test dataset and the predictions dataset
test_df = pd.read_csv(os.path.join(DATAPATH, "model_reproducibility", "model_datasets", "eos4tcc_EX1.csv"))
predictions_df = pd.read_csv(os.path.join(DATAPATH, "model_reproducibility", "predictions_data", "reproducibility_predictions_eos4tcc.csv"))

# the predictions are in a column named 'score' in the predictions dataset
predicted_probabilities = predictions_df['score']

# Define a threshold for converting probabilities to binary predictions
threshold = 0.5  # You may adjust this threshold based on your preference or specific requirements

# Convert probabilities to binary predictions based on the threshold
binary_predictions = (predicted_probabilities >= threshold).astype(int)

# Extract the ground truth labels from the test dataset
test_labels = test_df['label']  # 'label' column contains the ground truth labels
# Calculate confusion matrix
conf_matrix = confusion_matrix(test_labels, predicted_labels)

# Calculate accuracy
accuracy = accuracy_score(test_labels, predicted_labels)

# Calculate precision
precision = precision_score(test_labels, predicted_labels)

# Calculate recall
recall = recall_score(test_labels, predicted_labels)

# Calculate F1 score
f1 = f1_score(test_labels, predicted_labels)

# Calculate balanced accuracy
balanced_accuracy = balanced_accuracy_score(test_labels, predicted_labels)

# Calculate Matthews correlation coefficient
mcc = matthews_corrcoef(test_labels, predicted_labels)

# Calculate Negative Predictive Value (NPV)
tn, fp, fn, tp = conf_matrix.ravel()
npv = tn / (tn + fn)

# Calculate Specificity (SPE)
spe = tn / (tn + fp)

# Print the results
print("SEN:", recall)
print("SPE:", spe)
print("MCC:", mcc)
print("B-ACC:", balanced_accuracy)
print("F1:", f1)


SEN: 0.8333333333333334
SPE: 0.7857142857142857
MCC: 0.5993902797701955
B-ACC: 0.8095238095238095
F1: 0.8620689655172413


### Results according to the publication
<img src="../figures/eos4tcc.png" alt="eos4tcc_publication_result">

<img src="../figures/eos4tcc2.png" alt="eos4tcc_publication_result">

### it can be seen that the publication and eos4tcc have the same range of values

## eos30gr

In [9]:
import pandas as pd
import os

# Load the entire Excel file
full_excel_data = pd.read_excel(os.path.join(DATAPATH, "model_reproducibility", "model_datasets", "eos30gr_TableS4.xlsx"), sheet_name=None)

# Print the available sheet names
sheet_names = full_excel_data.keys()  # Get the keys of the dictionary, which are the sheet names
print("Available sheet names:", sheet_names)

# Access the 'Validation set' sheet from the dictionary
validation_set_df = full_excel_data['Validation set']

# Specify the path where you want to save the downloaded CSV file
output_csv_path = os.path.join(DATAPATH, "model_reproducibility", "model_datasets", "eos30gr_validation_set.csv")  # Specify the full path to save the CSV file

# Save the 'Validation set' sheet as a separate CSV file
validation_set_df.to_csv(output_csv_path, index=False)


Available sheet names: dict_keys(['Training set', 'Test set', 'Validation set'])


In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
# Load the test dataset and the predictions dataset
test_df = pd.read_csv(os.path.join(DATAPATH, "model_reproducibility", "model_datasets", "eos30gr_validation_set.csv"))

# Load the predictions dataset
predictions_df = pd.read_csv(os.path.join(DATAPATH, "model_reproducibility", "predictions_data", "reproducibility_predictions_eos30gr.csv"))

# Drop rows with NaN values from the 'activity (blockers‘ IC50 ≤ 10 μM; decoys‘ IC50 ＞10 μM)' column in the test dataset
test_df = test_df.dropna(subset=['activity (blockers‘ IC50 ≤ 10 μM; decoys‘ IC50 ＞10 μM)'])

# Drop corresponding rows from predictions_df
predictions_df = predictions_df.loc[test_df.index]


# The predictions are in the column named 'activity10'
predicted_probabilities = predictions_df['activity10']

# Define a threshold for converting probabilities to binary predictions
threshold = 0.5  # You may adjust this threshold based on your preference or specific requirements

# Convert probabilities to binary predictions based on the threshold
binary_predictions = (predicted_probabilities >= threshold).astype(int)

# Convert probabilities to binary predictions based on a threshold (e.g., 0.5)
predicted_labels = (predicted_probabilities >= 0.5).astype(int)

# Extract the ground truth labels from the test dataset
test_labels = test_df['activity (blockers‘ IC50 ≤ 10 μM; decoys‘ IC50 ＞10 μM)']  # 'label' column contains the ground truth labels

# Calculate Sensitivity (SE)
SE = recall_score(test_labels, predicted_labels)

# Calculate Specificity (SP)
TN, FP, FN, TP = conf_matrix.ravel()
SP = TN / (TN + FP)

# Calculate Positive Predictive Value (Q+)
Q_plus = precision_score(test_labels, predicted_labels)

# Calculate Negative Predictive Value (Q-)
Q_minus = TN / (TN + FN)

# Calculate Overall Accuracy (Q)
Q = accuracy_score(test_labels, predicted_labels)

# Calculate Area Under the Curve (AUC)
AUC = roc_auc_score(test_labels, predicted_probabilities)

# Print the results
print("Sensitivity (SE):", SE)
print("Specificity (SP):", SP)
print("Positive Predictive Value (Q+):", Q_plus)
print("Negative Predictive Value (Q-):", Q_minus)
print("Overall Accuracy (Q):", Q)
print("Area Under the Curve (AUC):", AUC)



Sensitivity (SE): 0.9977011494252873
Specificity (SP): 0.9965397923875432
Positive Predictive Value (Q+): 1.0
Negative Predictive Value (Q-): 0.9988439306358381
Overall Accuracy (Q): 0.9987325728770595
Area Under the Curve (AUC): 1.0


### results of the publication
<img src="../figures/eos30gr.png" alt="eos30gr_publication_result">

## eos30f3