In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score
)
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from sklearn.metrics import brier_score_loss, log_loss
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import label_binarize

# Create the artifact directory up front; a pre-existing directory is fine.
os.makedirs("./output", exist_ok=True)

# Read the dataset from CSV.
df = pd.read_csv("./data/simulated_weekly_burnout.csv")

# Turn the categorical "burnout" column into integer class ids and keep the
# fitted encoder so predictions can be mapped back to the original labels.
label_encoder = LabelEncoder()
df["burnout_label"] = label_encoder.fit_transform(df["burnout"])

# Feature matrix and encoded target.
features = ['avg_tired', 'avg_capable', 'avg_meaningful']
X, y = df[features], df["burnout_label"]

# Stratified 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Standardize features (only consumed by Logistic Regression). The scaler is
# fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

# Initialize models
# The number of classes comes from the fitted label encoder rather than a
# hard-coded literal, so the XGBoost configuration cannot drift from the
# labels actually present in the data.
n_classes = len(label_encoder.classes_)

models = {
    # `multi_class` is intentionally not passed: it is deprecated since
    # scikit-learn 1.5, and with the lbfgs solver a multinomial model is
    # already what gets fitted for 3+ classes (scikit-learn >= 0.22).
    "Logistic Regression": LogisticRegression(
        solver='lbfgs', max_iter=1000, random_state=42
    ),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(
        objective='multi:softprob', num_class=n_classes, eval_metric='mlogloss',
        random_state=42
    ),
}

# Train, evaluate, and save each model
# The decoded ground-truth labels and the class names do not depend on the
# model being evaluated, so they are computed once instead of per iteration.
class_names = label_encoder.classes_
y_test_labels = label_encoder.inverse_transform(y_test)

for name, model in models.items():
    print(f"\n=== {name} ===")

    # Logistic Regression is the only model fitted and evaluated on the
    # standardized features; the other models use the raw features.
    if name == "Logistic Regression":
        X_fit, X_eval = X_train_scaled, X_test_scaled
    else:
        X_fit, X_eval = X_train, X_test
    model.fit(X_fit, y_train)

    # Predict labels and probabilities
    y_pred = model.predict(X_eval)
    y_proba = model.predict_proba(X_eval)

    # Decode predicted class ids back to the original labels for display
    y_pred_labels = label_encoder.inverse_transform(y_pred)

    # 1) Classification Report
    print("Classification Report:")
    print(classification_report(y_test_labels, y_pred_labels))

    # 2) ROC-AUC (macro-averaged One-vs-Rest), computed on the encoded
    #    targets and the per-class probabilities
    roc_auc = roc_auc_score(
        y_true=y_test,
        y_score=y_proba,
        multi_class='ovr',
        average='macro'
    )
    print(f"ROC-AUC (macro, OVR): {roc_auc:.3f}\n")

    # 3) Confusion Matrix (rows = actual, columns = predicted), with the
    #    row/column order pinned to the encoder's class order
    cm = confusion_matrix(y_test_labels, y_pred_labels, labels=class_names)
    print("Confusion Matrix:")
    print(pd.DataFrame(cm, index=class_names, columns=class_names))

    # Plot heatmap
    plt.figure(figsize=(5, 4))
    sns.heatmap(
        cm, annot=True, fmt='d', cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names
    )
    plt.title(f"{name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()

    # Save the model under a filesystem-friendly name derived from its
    # display name (lower-cased, spaces replaced by underscores)
    safe_name = name.lower().replace(" ", "_")
    joblib.dump(model, f"./output/{safe_name}_model.pkl")

# Persist the fitted preprocessing objects next to the saved models so the
# same scaling and label mapping can be reloaded later.
preprocessing_artifacts = (
    (scaler, "./output/burnout_scaler.pkl"),
    (label_encoder, "./output/burnout_label_encoder.pkl"),
)
for artifact, artifact_path in preprocessing_artifacts:
    joblib.dump(artifact, artifact_path)





=== Logistic Regression ===


NameError: name 'multiclass_brier_score' is not defined