In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    balanced_accuracy_score, roc_auc_score, average_precision_score,
    matthews_corrcoef, ConfusionMatrixDisplay, roc_curve, auc,
    precision_recall_curve
)
from joblib import load, dump
from interpret.perf import ROC

# --- Data: pre-split feature matrices and label vectors ----------------------
x_train = pd.read_csv("output/composition_CTD_global/x_train_composition_CTD_global.csv")
x_test = pd.read_csv("output/composition_CTD_global/x_test_composition_CTD_global.csv")
# squeeze() turns each label frame into a Series (the printed counts below are
# reported under the name `target`).
y_train = pd.read_csv("output/composition_CTD_global/y_train_composition_CTD_global.csv").squeeze()
y_test = pd.read_csv("output/composition_CTD_global/y_test_composition_CTD_global.csv").squeeze()

# Number of training samples per label, ordered by label value.
group_counts = y_train.value_counts().sort_index()
print(group_counts)

# --- Model -------------------------------------------------------------------
# The EBM is restored from disk and used for prediction without any fit call in
# this cell, so it is presumably already trained.
# NOTE(review): joblib.load unpickles the file; only load model files you trust.
ebm = load("ebm_model.pkl")

# Seeded 5-fold stratified splitter; no cell visible here uses it.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)

# --- Feature subset fed to the model -----------------------------------------
selected_feature_names = [
    'AAC_A',
    'CKSAAGP_alphaticr.postivecharger.gap1',
    'CKSAAGP_alphaticr.uncharger.gap2',
    'CKSAAGP_uncharger.uncharger.gap3',
    'CTDC_hydrophobicity_PONP930101.G3',
    'CTDD_hydrophobicity_CASG920101.3.residue0',
    'CTDD_hydrophobicity_FASG890101.3.residue75',
    'CTDD_hydrophobicity_ZIMJ680101.3.residue50',
    'Charge'
]

x_train_final = x_train.loc[:, selected_feature_names]
x_test_final = x_test.loc[:, selected_feature_names]

# --- Predictions: hard labels and class probabilities for both splits --------
y_train_pred = ebm.predict(x_train_final)
y_train_pred_proba = ebm.predict_proba(x_train_final)
y_test_pred = ebm.predict(x_test_final)
y_test_pred_proba = ebm.predict_proba(x_test_final)

# --- Term ranking ------------------------------------------------------------
term_importances = ebm.term_importances()
term_names = ebm.term_names_
# Pair every term with its importance and sort ascending, so the printed list
# runs from the smallest importance value to the largest.
terms_by_importance = sorted(zip(term_names, term_importances), key=lambda pair: pair[1])
ranked_term_names = [term for term, _ in terms_by_importance]
print("\nRanked Features:")
print(ranked_term_names)

def binary_metrics(y_true, y_pred, y_pred_proba):
    """Compute eight classification metrics for one set of predictions.

    Parameters
    ----------
    y_true :
        Ground-truth labels.
    y_pred :
        Predicted labels, compared against ``y_true`` by the label-based metrics.
    y_pred_proba :
        Two-dimensional score array indexed as ``y_pred_proba[:, 1]``; column 1
        is assumed to hold the positive-class probability (confirm against the
        model's class order).

    Returns
    -------
    tuple
        ``(accuracy, balanced_accuracy, f1, precision, recall, auc_roc, auc_pr, mcc)``
        in exactly that positional order.
    """
    # Metrics that only need hard label predictions, evaluated in output order.
    hard_label_scores = [
        scorer(y_true, y_pred)
        for scorer in (
            accuracy_score,
            balanced_accuracy_score,
            f1_score,
            precision_score,
            recall_score,
        )
    ]

    # Both threshold-free metrics are scored on column 1 of the probability array.
    positive_scores = y_pred_proba[:, 1]
    ranking_scores = [
        roc_auc_score(y_true, positive_scores),
        average_precision_score(y_true, positive_scores),
    ]

    return (*hard_label_scores, *ranking_scores, matthews_corrcoef(y_true, y_pred))

# Fixed-width row labels, listed in the same order as the tuple returned by
# binary_metrics so they can be zipped with it.
metric_labels = (
    "Accuracy:             ",
    "Balanced Accuracy:    ",
    "F1 Score:             ",
    "Precision:            ",
    "Recall:               ",
    "AUC ROC:              ",
    "AUC PR:               ",
    "MCC:                  ",
)

train_metrics = binary_metrics(y_train, y_train_pred, y_train_pred_proba)
test_metrics = binary_metrics(y_test, y_test_pred, y_test_pred_proba)

# One report per split: a heading followed by one four-decimal line per metric.
for heading, scores in (
    ("\nTraining Metrics:", train_metrics),
    ("\nTest Metrics:", test_metrics),
):
    print(heading)
    for label, score in zip(metric_labels, scores):
        print(f"{label}{score:.4f}")


0    1145
1    1530
Name: target, dtype: int64

Ranked Features:
['CKSAAGP_alphaticr.uncharger.gap2', 'CKSAAGP_uncharger.uncharger.gap3', 'AAC_A', 'CKSAAGP_uncharger.uncharger.gap3 & CTDC_hydrophobicity_PONP930101.G3', 'CKSAAGP_alphaticr.postivecharger.gap1 & CTDC_hydrophobicity_PONP930101.G3', 'CTDD_hydrophobicity_FASG890101.3.residue75', 'CKSAAGP_uncharger.uncharger.gap3 & Charge', 'CTDC_hydrophobicity_PONP930101.G3 & CTDD_hydrophobicity_CASG920101.3.residue0', 'CTDC_hydrophobicity_PONP930101.G3 & Charge', 'CKSAAGP_alphaticr.postivecharger.gap1', 'CTDD_hydrophobicity_CASG920101.3.residue0 & Charge', 'CTDD_hydrophobicity_CASG920101.3.residue0 & CTDD_hydrophobicity_ZIMJ680101.3.residue50', 'CKSAAGP_alphaticr.postivecharger.gap1 & Charge', 'CKSAAGP_alphaticr.postivecharger.gap1 & CTDD_hydrophobicity_ZIMJ680101.3.residue50', 'CTDD_hydrophobicity_ZIMJ680101.3.residue50', 'CTDC_hydrophobicity_PONP930101.G3', 'CTDD_hydrophobicity_CASG920101.3.residue0', 'Charge']

Training Metrics:
Accuracy

In [3]:
from interpret import show

# Ask the loaded EBM for its global (whole-model) explanation, labelled 'EBM'.
# The result is kept in `ebm_global` because the later dashboard cell passes it
# to show() again.
ebm_global = ebm.explain_global(name='EBM')
# Render the explanation in the notebook output.
show(ebm_global)

In [7]:
# Display the four explanation objects together in a single show() call with
# share_tables=True.
# NOTE(review): `hist`, `ebm_local` and `ebm_perf` are not defined in any cell
# visible here (the execution counters jump from In [3] to In [7]); only
# `ebm_global` is. Confirm the cells that create them still exist, otherwise
# this raises NameError on a fresh kernel (Restart & Run All).
show([hist, ebm_global, ebm_local, ebm_perf], share_tables=True)