In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, matthews_corrcoef, cohen_kappa_score, confusion_matrix
)
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
import shap
import warnings
warnings.filterwarnings('ignore')

# Define the InjuryBoostStackNet class
class InjuryBoostStackNet:
    """Container for preprocessing state and three base classifiers.

    Holds a dict of label encoders, a feature scaler, and a random forest,
    a LightGBM classifier and an MLP classifier, each created with fixed
    hyperparameters and a fixed random seed.
    """

    def __init__(self):
        """Create the empty encoder store, the scaler and the unfitted models."""
        # Preprocessing state (the encoder dict starts empty)
        self.encoders = {}
        self.scaler = StandardScaler()

        # Base models
        self.rf = RandomForestClassifier(
            n_estimators=100,
            max_depth=15,
            random_state=42,
        )
        self.lgb = lgb.LGBMClassifier(
            n_estimators=150,
            num_leaves=31,
            learning_rate=0.05,
            random_state=42,
        )
        self.mlp = MLPClassifier(
            hidden_layer_sizes=(64, 32),
            activation='relu',
            solver='adam',
            alpha=0.0001,
            max_iter=500,
            random_state=21,
        )


In [None]:
    def preprocess(self, df):
        # Drop irrelevant ID columns
        df = df.drop(columns=["Player Name", "Team Name"], errors='ignore')

        # Label encode categorical features
        for col in ['Position', 'Injury Type']:
            if col in df.columns:
                self.encoders[col] = LabelEncoder()
                df[col] = self.encoders[col].fit_transform(df[col].astype(str))

        # Separate features and target
        if "Performance Drop Binary" not in df.columns:
            raise ValueError("Target column 'Performance Drop Binary' not found.")

        features = df.drop(columns=["Performance Drop", "Performance Drop Binary"], errors='ignore')
        target = df["Performance Drop Binary"]

        # Normalize features
        features_scaled = pd.DataFrame(self.scaler.fit_transform(features), columns=features.columns)

        return features_scaled, target


In [None]:
    def train_and_evaluate(self, X, y, n_splits=5):
        """
        Cross-validate the averaged RF / LightGBM / MLP ensemble.

        For every stratified fold the three base models are fitted on the
        training split, their positive-class probabilities are averaged and
        thresholded at 0.5, and the metrics are recorded. Plots are drawn for
        the last fold only.

        Because the same estimator objects are fitted in every fold, after
        this method returns they were last fitted on the final fold's
        training split.

        :param X: Feature DataFrame (rows are selected positionally via iloc)
        :param y: Binary target Series; the code treats 1 as the positive class
        :param n_splits: Number of stratified folds (default 5)
        :return: Dict mapping each metric name to its mean over the folds
        """
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        results = {
            'accuracy': [], 'precision': [], 'recall': [], 'f1': [],
            'roc_auc': [], 'specificity': [], 'mcc': [], 'kappa': []
        }

        for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            # Train base models and keep the probability of class 1
            rf_pred = self.rf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
            lgb_pred = self.lgb.fit(X_train, y_train).predict_proba(X_test)[:, 1]
            mlp_pred = self.mlp.fit(X_train, y_train).predict_proba(X_test)[:, 1]

            # Average ensemble
            final_pred = (rf_pred + lgb_pred + mlp_pred) / 3
            final_class = (final_pred >= 0.5).astype(int)

            # Confusion matrix for specificity. labels=[0, 1] pins the shape
            # to 2x2 so the four-way unpack cannot fail when a fold's labels
            # and predictions happen to contain a single class.
            tn, fp, _fn, _tp = confusion_matrix(
                y_test, final_class, labels=[0, 1]
            ).ravel()

            # Collect metrics
            results['accuracy'].append(accuracy_score(y_test, final_class))
            results['precision'].append(precision_score(y_test, final_class))
            results['recall'].append(recall_score(y_test, final_class))
            results['f1'].append(f1_score(y_test, final_class))
            results['roc_auc'].append(roc_auc_score(y_test, final_pred))
            results['specificity'].append(tn / (tn + fp))
            results['mcc'].append(matthews_corrcoef(y_test, final_class))
            results['kappa'].append(cohen_kappa_score(y_test, final_class))

            # 👉 Visualize only for the final fold.
            # The plot_* helpers are module-level functions defined in a later
            # cell of this notebook; they must exist when this method runs.
            if fold_idx == n_splits:
                print(f"\n📊 Visualizations for Fold {fold_idx}:")
                plot_confusion_matrix(y_test, final_class)
                plot_roc_curve(y_test, final_pred)
                plot_f1_threshold(y_test, final_pred)

        # Return mean metrics
        return {k: np.mean(v) for k, v in results.items()}


In [None]:
# Example Usage
if __name__ == "__main__":
    # Load your dataset
    # NOTE(review): nothing in this cell assigns `df`. It must already exist in
    # the kernel before this cell runs, otherwise preprocess(df) below raises
    # NameError on a fresh "Restart & Run All".

    # Initialize model
    model = InjuryBoostStackNet()

    # Preprocess data
    X, y = model.preprocess(df)

    # Train and evaluate
    # NOTE(review): train_and_evaluate calls plot_confusion_matrix,
    # plot_roc_curve and plot_f1_threshold, which are defined in a cell further
    # down; that cell has to be executed before this one.
    results = model.train_and_evaluate(X, y)

    # Print evaluation results (mean of each metric across the folds)
    print("Cross-Validation Results:")
    for metric, value in results.items():
        print(f"{metric}: {value:.4f}")


In [None]:
    def explain_shap(self, X, max_display=10):
        """
        Generate SHAP explanations using LightGBM.

        The explainer is built from ``self.lgb`` in its current state, so the
        model must already have been fitted before this is called.

        :param X: Scaled feature DataFrame
        :param max_display: Number of top features to show
        """
        # Use TreeExplainer on trained LightGBM model
        explainer = shap.TreeExplainer(self.lgb)
        shap_values = explainer.shap_values(X)

        # NOTE(review): depending on the installed shap version, a binary
        # classifier may yield one array per class (a list) or a 3-D array
        # with a trailing class axis instead of a single 2-D array. In those
        # cases keep only the positive class (index 1) so that both plots
        # below receive a (samples, features) matrix.
        if isinstance(shap_values, list):
            shap_values = shap_values[1]
        elif getattr(shap_values, "ndim", 2) == 3:
            shap_values = shap_values[..., 1]

        # Summary plot for feature importance (bar)
        print("SHAP Feature Importance:")
        shap.summary_plot(shap_values, X, plot_type="bar", max_display=max_display)

        # Summary plot for feature effect (beeswarm)
        shap.summary_plot(shap_values, X, max_display=max_display)


In [None]:
# After model training:
# explain_shap builds its explainer from model.lgb, which train_and_evaluate()
# re-fits in every fold; the explanations therefore describe the LightGBM model
# as fitted on the last fold's training split.
model.explain_shap(X)  # Where X is your preprocessed feature matrix


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, f1_score
import seaborn as sns

def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    """
    Draw the 2x2 confusion matrix as an annotated heatmap.

    :param y_true: True labels
    :param y_pred: Predicted labels
    :param title: Figure title
    """
    # The tick labels below are fixed to two classes, so pin the matrix to the
    # same two labels; otherwise its shape depends on which labels occur.
    # NOTE(review): assumes 0 means "No Drop" and 1 means "Drop" - confirm.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    class_names = ["No Drop", "Drop"]

    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False,
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

def plot_roc_curve(y_true, y_score):
    """
    Plot the ROC curve with its AUC in the legend and a diagonal reference line.

    :param y_true: True binary labels
    :param y_score: Scores or probabilities for the positive class
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score)
    auc_value = roc_auc_score(y_true, y_score)

    fig, ax = plt.subplots()
    ax.plot(false_pos_rate, true_pos_rate,
            label=f"ROC curve (AUC = {auc_value:.2f})")
    # Dashed diagonal from (0, 0) to (1, 1) as a visual baseline
    ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("ROC Curve")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

def plot_f1_threshold(y_true, y_score):
    """
    Plot the F1 score obtained at 100 evenly spaced thresholds in [0, 1].

    :param y_true: True binary labels
    :param y_score: Scores or probabilities for the positive class
    """
    # Accept lists / Series as well as arrays for the element-wise comparison
    scores = np.asarray(y_score)
    thresholds = np.linspace(0.0, 1.0, 100)
    # zero_division=0 makes the "no positive predictions" case an explicit 0
    f1_scores = [
        f1_score(y_true, (scores >= t).astype(int), zero_division=0)
        for t in thresholds
    ]

    fig, ax = plt.subplots()
    ax.plot(thresholds, f1_scores, label="F1 Score")
    ax.set_xlabel("Threshold")
    ax.set_ylabel("F1 Score")
    ax.set_title("F1 Score vs. Threshold")
    # The line carries a label, so actually show it
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()


In [None]:
import time
from memory_profiler import memory_usage

def track_training_time_and_memory(model, X, y):
    """
    Run ``model.train_and_evaluate(X, y)`` once and print its duration and
    the largest memory sample recorded while it ran.

    The timed span wraps the whole ``memory_usage`` call, so it includes the
    profiler's own sampling overhead. The metrics returned by
    ``train_and_evaluate`` are discarded.

    :param model: Object exposing ``train_and_evaluate(X, y)``
    :param X: Feature matrix forwarded to the model
    :param y: Target forwarded to the model
    """
    # perf_counter is monotonic, unlike time.time(), so the difference cannot
    # be distorted by system clock adjustments.
    start_time = time.perf_counter()
    mem_usage = memory_usage((model.train_and_evaluate, (X, y), {}), interval=0.1)
    elapsed = time.perf_counter() - start_time

    print(f"Total Training Time: {elapsed:.2f} seconds")
    print(f"Peak Memory Usage: {max(mem_usage):.2f} MB")
