In [None]:
import pandas as pd

# Read the two CSV exports; the literal string 'NULL' is parsed as a missing value.
df1 = pd.read_csv('positive.csv', na_values=['NULL'])
df2 = pd.read_csv('negative.csv', na_values=['NULL'])

# Stack both files into one frame. ignore_index=True assigns a fresh 0..n-1
# index, so the label-based .loc lookups used after the split stay unambiguous.
df = pd.concat([df1, df2], ignore_index=True)

# Parse the acquisition timestamp column into a datetime dtype.
df['AcquisitionDateTime_DT'] = pd.to_datetime(df['AcquisitionDateTime_DT'])

print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
print(df.isnull().sum())

In [None]:
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Feature matrix: all columns of df except the ones listed here
# (the list includes the target column MI_Phys).
X = df.drop(
    columns=[
        "PatientID",
        "12SL_Codes",
        "Phys_Codes",
        "TestID",
        "Source",
        "Gender",
        "PatientAge",
        "AcquisitionDateTime_DT",
        "MI_Phys",
    ]
)
y = df["MI_Phys"]

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Keep the original algorithm's prediction (MI_12SL) for the test rows
# before that column is removed from the feature frames.
y_12SL = X_test["MI_12SL"]

# Partition train and test rows by the MI_12SL value (1 -> "pos", 0 -> "neg");
# MI_12SL itself is not kept as a feature in the subsets.
X_train_pos = X_train.loc[lambda frame: frame["MI_12SL"] == 1].drop(columns="MI_12SL")
X_train_neg = X_train.loc[lambda frame: frame["MI_12SL"] == 0].drop(columns="MI_12SL")
X_test_pos = X_test.loc[lambda frame: frame["MI_12SL"] == 1].drop(columns="MI_12SL")
X_test_neg = X_test.loc[lambda frame: frame["MI_12SL"] == 0].drop(columns="MI_12SL")

# The unsplit feature frames lose MI_12SL as well.
X_train = X_train.drop(columns="MI_12SL")
X_test = X_test.drop(columns="MI_12SL")

# Pick the labels that belong to each subset by index label.
y_train_pos = y_train.loc[X_train_pos.index]  # true positives or false positives
y_train_neg = y_train.loc[X_train_neg.index]  # true negatives or false negatives
y_test_pos = y_test.loc[X_test_pos.index]
y_test_neg = y_test.loc[X_test_neg.index]

# MI_12SL values restricted to each test subset (the original classifier's labels).
y_12SL_pos = y_12SL.loc[X_test_pos.index]
y_12SL_neg = y_12SL.loc[X_test_neg.index]


# Train Explainable Boosting Machine



In [None]:
from sklearn.metrics import f1_score

# Explainable Boosting Machine trained only on the MI_12SL == 1 training subset.
ebm = ExplainableBoostingClassifier(
    n_jobs=-2,
    min_samples_leaf=10,
    interactions=2,
    max_bins=255,
    learning_rate=0.2,
)
ebm.fit(X_train_pos, y_train_pos)

# Score the fitted model on the matching test subset.
y_pred_pos = ebm.predict(X_test_pos)
print("Accuracy:", accuracy_score(y_true=y_test_pos, y_pred=y_pred_pos))
print("F1 Score:", f1_score(y_true=y_test_pos, y_pred=y_pred_pos))

# Render the global explanation (feature importance view).
show(ebm.explain_global())

In [None]:
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_sample_weight

# Explainable Boosting Machine for the MI_12SL == 0 training subset.
ebm = ExplainableBoostingClassifier(
    random_state=42,
    n_jobs=-2,
    early_stopping_rounds=50,
    min_samples_leaf=100,
    interactions=100,
    max_bins=512,
    learning_rate=0.001,
)

# Per-sample weights derived from the class distribution of the subset's labels.
sample_weights = compute_sample_weight(y=y_train_neg, class_weight="balanced")

# Fit with those weights.
ebm.fit(X_train_neg, y_train_neg, sample_weight=sample_weights)

# Score the fitted model on the matching test subset.
y_pred_neg = ebm.predict(X_test_neg)
print("Accuracy:", accuracy_score(y_true=y_test_neg, y_pred=y_pred_neg))
print("F1 Score:", f1_score(y_true=y_test_neg, y_pred=y_pred_neg))

# Render the global explanation (feature importance view).
show(ebm.explain_global())

In [None]:
import optuna
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import numpy as np

def objective(trial, val_size=0.2, split_seed=42):
    """Optuna objective: validation F1 of an EBM fitted on the MI_12SL == 1 training subset.

    Each trial is scored on a validation split carved out of
    ``X_train_pos`` / ``y_train_pos`` rather than on ``X_test_pos`` /
    ``y_test_pos``. Selecting hyperparameters on the test subset would make any
    score later reported on that same subset optimistically biased.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    val_size : float, default 0.2
        Fraction of the training subset held out for scoring.
    split_seed : int, default 42
        Seed of the train/validation split. Keeping it fixed means every trial
        is scored on the same validation rows, so trial scores are comparable.

    Returns
    -------
    float
        F1 score of the fitted model on the validation split.
    """
    # Search space. The parameter names double as constructor keyword names,
    # so ``study.best_params`` can be splatted into the classifier afterwards.
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_bins": trial.suggest_int("max_bins", 64, 512),
        "interactions": trial.suggest_int("interactions", 0, 5),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 2, 100),
        "max_leaves": trial.suggest_int("max_leaves", 2, 64),
    }

    # Stratified hold-out taken from the training subset only; the test
    # subset stays untouched until the final evaluation.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train_pos,
        y_train_pos,
        test_size=val_size,
        stratify=y_train_pos,
        random_state=split_seed,
    )

    ebm = ExplainableBoostingClassifier(**params, n_jobs=-2)
    ebm.fit(X_fit, y_fit)

    return f1_score(y_val, ebm.predict(X_val))

# Run the study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best parameters:", study.best_params)
print("Best F1 score:", study.best_value)


In [None]:
from sklearn.metrics import classification_report

# Refit an EBM with the best hyperparameters found by the study,
# using the MI_12SL == 1 training subset.
best_ebm = ExplainableBoostingClassifier(n_jobs=-2, **study.best_params)
best_ebm.fit(X_train_pos, y_train_pos)

# Per-class precision / recall / F1 on the matching test subset.
print(
    classification_report(
        y_true=y_test_pos,
        y_pred=best_ebm.predict(X_test_pos),
    )
)
