# Imports and Load Data

In [None]:
from importlib import reload
from types import SimpleNamespace

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import shap
import statsmodels.formula.api as smf
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GroupShuffleSplit, StratifiedGroupKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

import modules.data_analysis_utils as dau
import modules.data_visualization_utils as dvu
import modules.shaply_utils as shu

# Machine-specific folder containing the project data files.
# Alternative location under a different Windows user profile, kept for reference:
# my_computer_fpath = "C:\\Users\\dfber\\OneDrive - Mass General Brigham\\Epidural project\\Data\\"
my_computer_fpath = (
    "C:\\Users\\User\\OneDrive - Mass General Brigham"
    "\\Epidural project\\Data\\"
)

In [None]:
# Read the dataset CSV from the local data folder into a DataFrame.
data_csv_path = f"{my_computer_fpath}processed_and_imputed_merlin_data.csv"
df = pd.read_csv(data_csv_path)

# Reduce to key features

In [None]:
# Narrow the DataFrame via the project helper `dau.reduce_cols`
# (which columns it keeps is defined in modules/data_analysis_utils, not here).
df = dau.reduce_cols(df)

In [None]:
# Build the modelling inputs from `df`:
#   - outcome:  'failed_catheter'
#   - group ID: 'unique_pt_id' (rather than 'anes_procedure_encounter_id_2273')
# The identifier columns and the outcome columns must not be used as features.
# errors='ignore' lets the drop succeed when some of these columns are absent.
non_feature_cols = [
    "unique_pt_id",
    "anes_procedure_encounter_id_2273",
    "failed_catheter",
    "has_subsequent_neuraxial_catheter",
    "has_subsequent_spinal",
    "has_subsequent_airway",
    "placement_to_delivery_hours",
    "rom_thru_delivery_hours",
]
y = df["failed_catheter"]
groups = df["unique_pt_id"]
X = df.drop(columns=non_feature_cols, errors='ignore')

# ML Trials

In [None]:
# Partition the feature columns by dtype: object/category columns are treated
# as categorical, every other dtype as numeric.
categorical_dtypes = ["object", "category"]
categorical_cols = X.select_dtypes(include=categorical_dtypes).columns.tolist()
numeric_cols = X.select_dtypes(exclude=categorical_dtypes).columns.tolist()

# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones (categories unseen during fit are ignored at transform time).
scale_numeric = ("num", StandardScaler(), numeric_cols)
encode_categorical = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
preprocessor = ColumnTransformer(transformers=[scale_numeric, encode_categorical])

# Single group-aware 80/20 split so that all rows of a given patient end up
# on the same side (train or test).
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

def objective(trial):
    """Optuna objective: mean grouped 5-fold cross-validated ROC AUC.

    Samples the number of features to keep (``k``) and the classifier
    hyperparameters from ``trial``, builds a preprocessing -> SelectKBest ->
    classifier pipeline, and scores it on the training split only.

    Reads the module-level ``X_train``, ``y_train``, ``groups``, ``train_idx``
    and ``preprocessor`` objects.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean ROC AUC across the cross-validation folds.

    Raises:
        ValueError: If the sampled classifier name is not supported.
    """
    # Number of top features to select. The upper bound is the raw column
    # count, which only approximates the post-encoding feature count.
    k = trial.suggest_int("k", 1, X_train.shape[1])

    # Classifier family. To search across families, offer
    # ["logistic", "random_forest", "xgboost"] here instead.
    classifier_name = trial.suggest_categorical("classifier", ["xgboost"])

    if classifier_name == "logistic":
        C = trial.suggest_float("C", 1e-3, 1e2, log=True)
        penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
        model = LogisticRegression(solver="liblinear", C=C, penalty=penalty, random_state=42)

    elif classifier_name == "random_forest":
        n_estimators = trial.suggest_int("rf_n_estimators", 50, 300)
        max_depth = trial.suggest_int("rf_max_depth", 3, 20)
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

    elif classifier_name == "xgboost":
        n_estimators = trial.suggest_int("xgb_n_estimators", 50, 300)
        max_depth = trial.suggest_int("xgb_max_depth", 3, 20)
        learning_rate = trial.suggest_float("xgb_learning_rate", 0.01, 0.3, log=True)
        # NOTE(review): `use_label_encoder` is reported as deprecated by recent
        # XGBoost releases - confirm against the installed version.
        model = XGBClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            random_state=42,
            use_label_encoder=False,
            eval_metric="logloss",
        )

    else:
        # Without this guard `model` would be unbound below.
        raise ValueError(f"Unsupported classifier: {classifier_name!r}")

    # Preprocessing, univariate feature selection, then the classifier.
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("select", SelectKBest(score_func=f_classif, k=k)),
        ("clf", model),
    ])

    # Keep every patient's rows inside a single fold. Plain `cv=5` would let
    # the same patient appear in both the fitting and validation folds, which
    # leaks information and inflates the tuned AUC.
    groups_train = groups.iloc[train_idx]
    cv = StratifiedGroupKFold(n_splits=5)
    scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        groups=groups_train,
        cv=cv,
        scoring="roc_auc",
    )
    return scores.mean()

# Run an Optuna study to maximize ROC AUC.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1)

print("Best trial:")
trial = study.best_trial
print(f"  Best AUC: {trial.value}")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Reconstruct the best pipeline using the optimal parameters.
best_classifier = trial.params["classifier"]



Best trial among 500 XGBoost trials:  
  Best AUC: 0.6987792467838857  
    k: 33  
    classifier: xgboost  
    xgb_n_estimators: 194  
    xgb_max_depth: 4  
    xgb_learning_rate: 0.04024859747455368  

# Manually enter the saved hyperparameters

In [None]:
# Hyperparameters saved from the earlier 500-trial search, entered by hand so
# the following cells can run without repeating the Optuna study.
# `trial` is rebuilt as a lightweight stand-in exposing only `.params`, so this
# cell does not depend on a `trial` object left over from a study run.
trial = SimpleNamespace(
    params={
        "k": 33,
        "classifier": "xgboost",
        "xgb_n_estimators": 194,
        "xgb_max_depth": 4,
        "xgb_learning_rate": 0.04024859747455368,
    }
)
best_classifier = trial.params["classifier"]

# Display info for best trial

In [None]:
# Re-import the visualization helpers so that edits made to the module on disk
# are picked up without restarting the kernel.
reload(dvu)

In [None]:
# Rebuild the classifier described by the selected hyperparameters.
best_params = trial.params
if best_classifier == "logistic":
    best_model = LogisticRegression(
        solver="liblinear",
        C=best_params["C"],
        penalty=best_params["penalty"],
        random_state=42,
    )
elif best_classifier == "random_forest":
    best_model = RandomForestClassifier(
        n_estimators=best_params["rf_n_estimators"],
        max_depth=best_params["rf_max_depth"],
        random_state=42,
    )
elif best_classifier == "xgboost":
    best_model = XGBClassifier(
        n_estimators=best_params["xgb_n_estimators"],
        max_depth=best_params["xgb_max_depth"],
        learning_rate=best_params["xgb_learning_rate"],
        random_state=42,
        use_label_encoder=False,
        eval_metric="logloss",
    )
else:
    # Fail loudly: otherwise a stale `best_model` from an earlier execution
    # would be reused silently, or a NameError would surface further down.
    raise ValueError(f"Unsupported classifier: {best_classifier!r}")

# Same three stages as during tuning: preprocessing, top-k feature selection,
# classifier. Fitted on the full training split.
final_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("select", SelectKBest(score_func=f_classif, k=best_params["k"])),
    ("clf", best_model),
])
final_pipeline.fit(X_train, y_train)

# Evaluate the final model on the held-out test set.
y_pred = final_pipeline.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class.
y_proba = final_pipeline.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
test_auc = roc_auc_score(y_test, y_proba)
print("Test AUC:", test_auc)

dvu.plot_roc_curve(y_test, y_proba, test_auc)

# Compare to ordinary logistic regression

In [None]:
# Baseline: ordinary logistic regression built with the project helpers.
# The helper receives a copy of `df`, so the original frame is not handed over.
logistic_splits = dau.preprocess_data(data=df.copy())
X_train_logistic, X_test_logistic, y_train_logistic, y_test_logistic = logistic_splits

logistic_model = dau.do_logistic_regression(
    X_train_logistic,
    X_test_logistic,
    y_train_logistic,
    y_test_logistic,
)

# Hard predictions and positive-class probabilities on the baseline's test split.
y_pred_logistic = logistic_model.predict(X_test_logistic)
y_pred_prob_logistic = logistic_model.predict_proba(X_test_logistic)[:, 1]

In [None]:
# Pick up any edits made to the visualization helper module before plotting.
reload(dvu)

In [None]:
# Overlay the ROC curves of both models. Each model must be scored against the
# labels of its own test split: the logistic baseline comes from
# `dau.preprocess_data`, so its AUC uses `y_test_logistic`, not `y_test`.
logistic_auc = roc_auc_score(y_test_logistic, y_pred_prob_logistic)
dvu.plot_roc_curve(
    [y_test, y_test_logistic],
    [y_proba, y_pred_prob_logistic],
    [test_auc, logistic_auc],
    labels=['XGBoost', 'Logistic Regression'],
)

# Shapley analysis

In [None]:
# Pick up any edits made to the analysis and visualization helper modules.
reload(dau)
reload(dvu)

In [None]:
# -------------------------------
# SHAP Analysis for Model Interpretability
# -------------------------------
fitted_preprocessor = final_pipeline.named_steps["preprocessor"]
fitted_selector = final_pipeline.named_steps["select"]
fitted_clf = final_pipeline.named_steps["clf"]

# Column names as they leave the preprocessor: numeric columns first, then the
# one-hot expanded categorical columns.
# (get_feature_names_out requires scikit-learn 1.0+)
feature_names_num = numeric_cols
onehot_encoder = fitted_preprocessor.named_transformers_["cat"]
feature_names_cat = onehot_encoder.get_feature_names_out(categorical_cols)
all_feature_names = np.concatenate([feature_names_num, feature_names_cat])

# Keep only the names of the columns retained by SelectKBest, then map them
# to display names with the project helper.
selected_mask = fitted_selector.get_support()
selected_feature_names = all_feature_names[selected_mask]
renamed_feature_names = dau.rename_feature_names_onehot_nodrop(selected_feature_names)

# Push the training and test data through the fitted preprocessing and
# selection steps, giving the matrices the classifier actually sees.
X_train_preprocessed = fitted_preprocessor.transform(X_train)
X_train_transformed = fitted_selector.transform(X_train_preprocessed)

X_test_preprocessed = fitted_preprocessor.transform(X_test)
X_test_transformed = fitted_selector.transform(X_test_preprocessed)

# Explainer choice: tree explainer for the tree-based models, linear explainer
# (with the transformed training data as background) for logistic regression.
if best_classifier != "logistic":
    explainer = shap.TreeExplainer(fitted_clf)
else:
    explainer = shap.LinearExplainer(
        fitted_clf,
        X_train_transformed,
        feature_perturbation="interventional",
    )

# SHAP values for every row of the transformed test set.
shap_values = explainer.shap_values(X_test_transformed)

In [None]:
# Plot the test-set SHAP values with the project helper, labelling each column
# with the renamed names of the selected features.
shu.plot_shapley(shap_values, X_test_transformed, feature_names=renamed_feature_names)