In [1]:
    "def evaluate_classifier(model, X_tr, X_te, y_tr, y_te):",
    "    model.fit(X_tr, y_tr)",
    "    preds = model.predict(X_te)",
    "    if hasattr(model, 'predict_proba'):",
    "        probs = model.predict_proba(X_te)",
    "        roc = roc_auc_score(y_te, probs, multi_class='ovr')",
    "    else:",
    "        probs = None",
    "        roc = np.nan",
    "    metrics = {",
    "        'accuracy': accuracy_score(y_te, preds)",
    "        , 'f1_macro': f1_score(y_te, preds, average='macro')",
    "        , 'roc_auc_ovr': roc",
    "    }",
    "    return metrics, preds, probs"
{
    "cells": [
        {
            "cell_type": "code",
            "id": "#VSC-0c71c676",
            "metadata": {
                "language": "python"
            },
            "source": [
                "from pathlib import Path",
                "import json",
                "import numpy as np",
                "import pandas as pd",
                "import matplotlib.pyplot as plt",
                "import seaborn as sns",
                "from sklearn.model_selection import train_test_split",
                "from sklearn.preprocessing import StandardScaler",
                "from sklearn.linear_model import LogisticRegression, LinearRegression",
                "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier",
                "from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error",
                "import joblib",
                "",
                "sns.set_theme(style='whitegrid', context='talk')",
                "DATA_DIR = Path('../data/processed')",
                "MODEL_DIR = Path('../models/saved')",
                "MODEL_DIR.mkdir(parents=True, exist_ok=True)"
            ]
        },
        {
            "cell_type": "code",
            "id": "#VSC-a146c755",
            "metadata": {
                "language": "python"
            },
            "source": [
                "tabular_path = DATA_DIR / 'tabular_ml_ready.parquet'",
                "if not tabular_path.exists():",
                "    raise FileNotFoundError('Run scripts/preprocess.py to create tabular_ml_ready.parquet')",
                "df = pd.read_parquet(tabular_path)",
                "feature_cols = [c for c in df.columns if c not in ['burnout_level', 'burnout_score']]",
                "X = df[feature_cols]",
                "y_class = df['burnout_level']",
                "y_reg = df['burnout_score']",
                "X_train, X_test, y_train, y_test = train_test_split(X, y_class, stratify=y_class, test_size=0.2, random_state=42)",
                "_, _, y_train_reg, y_test_reg = train_test_split(X, y_reg, stratify=None, test_size=0.2, random_state=42)",
                "X_train.shape, X_test.shape"
            ]
        },
        {
            "cell_type": "code",
            "id": "#VSC-e4b09336",
            "metadata": {
                "language": "python"
            },
            "source": [
                "def evaluate_classifier(model, X_tr, X_te, y_tr, y_te):",
                "    model.fit(X_tr, y_tr)",
                "    preds = model.predict(X_te)",
                "    return metrics, preds, probs"
            ]
        },
        {
            "cell_type": "code",
            "id": "#VSC-e26eb89b",
            "metadata": {
                "language": "python"
            },
            "source": [
                "results_clf = {}",
                "log_reg = LogisticRegression(max_iter=2000, multi_class='multinomial')",
                "metrics, preds = evaluate_classifier(log_reg, X_train, X_test, y_train, y_test)",
                "results_clf['logistic_regression'] = metrics",
                "joblib.dump(log_reg, MODEL_DIR / 'log_reg.pkl')",
                "",
                "rf_clf = RandomForestClassifier(n_estimators=400, random_state=42, class_weight='balanced_subsample')",
                "metrics, rf_preds = evaluate_classifier(rf_clf, X_train, X_test, y_train, y_test)",
                "results_clf['random_forest'] = metrics",
                "joblib.dump(rf_clf, MODEL_DIR / 'rf_classifier.pkl')",
                "",
                "gb_clf = GradientBoostingClassifier(random_state=42)",
                "metrics, gb_preds = evaluate_classifier(gb_clf, X_train, X_test, y_train, y_test)",
                "results_clf['gradient_boosting'] = metrics",
                "joblib.dump(gb_clf, MODEL_DIR / 'gb_classifier.pkl')",
                "results_clf"
            ]
        },
        {
            "cell_type": "code",
            "id": "#VSC-afa9dfd5",
            "metadata": {
                "language": "python"
            },
            "source": [
                "def plot_conf_matrix(y_true, y_pred, title):",
                "    cm = confusion_matrix(y_true, y_pred)",
                "    plt.figure(figsize=(6, 5))",
                "    sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')",
                "    plt.title(title)",
                "    plt.xlabel('Predicted')",
                "    plt.ylabel('True')",
                "    plt.tight_layout()",
                "",
                "plot_conf_matrix(y_test, rf_preds, 'Random Forest Confusion Matrix')",
                "plot_conf_matrix(y_test, gb_preds, 'Gradient Boosting Confusion Matrix')"
            ]
        },
        {
            "cell_type": "code",
            "id": "#VSC-90fa9194",
            "metadata": {
                "language": "python"
            },
            "source": [
                "rf_importances = pd.Series(rf_clf.feature_importances_, index=feature_cols)",
                "top_feats = rf_importances.sort_values(ascending=False).head(15)",
                "plt.figure(figsize=(8, 6))",
                "sns.barplot(x=top_feats.values, y=top_feats.index, palette='viridis')",
                "plt.title('Random Forest Feature Importance (Top 15)')",
                "plt.tight_layout()"
            ]
        },
        {
            "cell_type": "code",
            "id": "#VSC-b54fc877",
            "metadata": {
                "language": "python"
            },
            "source": [
                "def evaluate_regressor(model, X_tr, X_te, y_tr, y_te):",
                "    model.fit(X_tr, y_tr)",
                "    preds = model.predict(X_te)",
                "    return {"
            ]
        },
        {
            "cell_type": "code",
            "id": "#VSC-7c7b9531",
            "metadata": {
                "language": "python"
            },
            "source": [
                "results_reg = {}",
                "lin_reg = LinearRegression()",
                "results_reg['linear_regression'] = evaluate_regressor(lin_reg, X_train, X_test, y_train_reg, y_test_reg)",
                "joblib.dump(lin_reg, MODEL_DIR / 'linear_reg.pkl')",
                "",
                "rf_reg = RandomForestRegressor(n_estimators=400, random_state=42)",
                "results_reg['random_forest_regressor'] = evaluate_regressor(rf_reg, X_train, X_test, y_train_reg, y_test_reg)",
                "joblib.dump(rf_reg, MODEL_DIR / 'rf_regressor.pkl')",
                "results_reg"
            ]
        },
        {
            "cell_type": "code",
            "id": "#VSC-1e657785",
            "metadata": {
                "language": "python"
            },
            "source": [
                "display(pd.DataFrame(results_clf).T)",
                "display(pd.DataFrame(results_reg).T)"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "#VSC-dca1e2bd",
            "metadata": {
                "language": "markdown"
            },
            "source": [
                "## Notes",
                "- Re-run after feature tweaks to compare models.",
                "- Saved estimators live in `../models/saved/` for downstream ensembling or deployment."
            ]
        }
    ]
}

IndentationError: unexpected indent (1023727272.py, line 1)

In [2]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error
import joblib

# Locations of the preprocessed inputs and of the persisted estimators,
# both relative to the directory the notebook is run from.
DATA_DIR = Path('../data/processed')
MODEL_DIR = Path('../models/saved')

# Make sure the artifact directory exists before any joblib.dump call;
# missing parents are created and an existing directory is not an error.
MODEL_DIR.mkdir(exist_ok=True, parents=True)

# Shared seaborn styling for every figure produced below.
sns.set_theme(context='talk', style='whitegrid')

In [4]:
# Load the preprocessed feature table.
# NOTE: pd.read_parquet needs a parquet engine (pyarrow or fastparquet) installed
# in the kernel's environment; without one this cell fails with an ImportError.
tabular_path = DATA_DIR / 'tabular_ml_ready.parquet'
if not tabular_path.exists():
    raise FileNotFoundError('Run scripts/preprocess.py to create tabular_ml_ready.parquet')
df = pd.read_parquet(tabular_path)

# Everything except the two target columns is used as a feature.
feature_cols = [c for c in df.columns if c not in ['burnout_level', 'burnout_score']]
X = df[feature_cols]
y_class = df['burnout_level']
y_reg = df['burnout_score']

# Split features and BOTH targets in a single call so that every returned piece
# refers to the same rows. Two separate calls (one stratified, one not) pick
# different rows even with an identical random_state, which would pair X_train
# with regression targets belonging to other samples.
X_train, X_test, y_train, y_test, y_train_reg, y_test_reg = train_test_split(
    X, y_class, y_reg, stratify=y_class, test_size=0.2, random_state=42
)

# Guard against any future regression of the row alignment.
assert X_train.index.equals(y_train.index) and X_train.index.equals(y_train_reg.index)
assert X_test.index.equals(y_test.index) and X_test.index.equals(y_test_reg.index)

X_train.shape, X_test.shape

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [None]:
def evaluate_classifier(model, X_tr, X_te, y_tr, y_te):
    """Fit ``model`` on the training split and score it on the test split.

    The model is fitted in place, so the caller's estimator object is trained
    after this call returns.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict`` (optionally ``predict_proba``).
    X_tr, y_tr : training features and labels.
    X_te, y_te : held-out features and labels.

    Returns
    -------
    metrics : dict
        ``accuracy``, ``f1_macro`` and ``roc_auc_ovr``. The ROC AUC is ``nan``
        when the model has no ``predict_proba``.
    preds : predicted labels for ``X_te``.
    probs : full ``predict_proba`` output for ``X_te``, or ``None`` when the
        model has no ``predict_proba``.
    """
    model.fit(X_tr, y_tr)
    preds = model.predict(X_te)
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(X_te)
        if np.asarray(probs).shape[1] == 2:
            # Binary target: roc_auc_score expects 1-D scores of the class with
            # the greater label, i.e. the second predict_proba column.
            roc = roc_auc_score(y_te, np.asarray(probs)[:, 1])
        else:
            # Multiclass: pin the label order to the probability columns so the
            # score does not depend on which classes appear in y_te.
            roc = roc_auc_score(
                y_te,
                probs,
                multi_class="ovr",
                labels=getattr(model, "classes_", None),
            )
    else:
        probs = None
        roc = np.nan
    metrics = {
        "accuracy": accuracy_score(y_te, preds),
        "f1_macro": f1_score(y_te, preds, average="macro"),
        "roc_auc_ovr": roc,
    }
    return metrics, preds, probs

In [None]:
# Fit, score and persist each classifier; metrics are collected per model name.
results_clf = {}

# `multi_class` is a deprecated LogisticRegression argument; with the default
# lbfgs solver a multiclass target is already fitted with the multinomial loss,
# so the argument is omitted to stay compatible with newer scikit-learn.
log_reg = LogisticRegression(max_iter=2000)
metrics, preds, log_probs = evaluate_classifier(log_reg, X_train, X_test, y_train, y_test)
results_clf['logistic_regression'] = metrics
joblib.dump(log_reg, MODEL_DIR / 'log_reg.pkl')

# 'balanced_subsample' reweights classes inside every bootstrap sample.
rf_clf = RandomForestClassifier(n_estimators=400, random_state=42, class_weight='balanced_subsample')
metrics, rf_preds, rf_probs = evaluate_classifier(rf_clf, X_train, X_test, y_train, y_test)
results_clf['random_forest'] = metrics
joblib.dump(rf_clf, MODEL_DIR / 'rf_classifier.pkl')

gb_clf = GradientBoostingClassifier(random_state=42)
metrics, gb_preds, gb_probs = evaluate_classifier(gb_clf, X_train, X_test, y_train, y_test)
results_clf['gradient_boosting'] = metrics
joblib.dump(gb_clf, MODEL_DIR / 'gb_classifier.pkl')

results_clf

In [None]:
def plot_conf_matrix(y_true, y_pred, title):
    """Draw a confusion-matrix heatmap on a new figure.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels, same length as ``y_true``.
    title : figure title.

    Rows are true classes and columns are predicted classes; both axes are
    ticked with the actual class labels rather than positional indices.
    """
    # Sorted union of the labels seen in either array; passing it explicitly
    # keeps the matrix rows/columns and the tick labels in the same order.
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', xticklabels=labels, yticklabels=labels)
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()

# One confusion matrix per tree-based classifier, drawn in the same order as before.
for _cm_preds, _cm_title in (
    (rf_preds, 'Random Forest Confusion Matrix'),
    (gb_preds, 'Gradient Boosting Confusion Matrix'),
):
    plot_conf_matrix(y_test, _cm_preds, _cm_title)

In [None]:
# One-vs-rest ROC curves for the random forest, one line per class.
plt.figure(figsize=(8, 6))
# Column i of rf_probs is the probability of rf_clf.classes_[i]; take the class
# order from the fitted model rather than from the labels present in y_test,
# so curves stay matched to the right column even if the test split lacks a class.
classes = list(rf_clf.classes_)
for idx, label in enumerate(classes):
    is_label = (y_test == label).astype(int)
    if not is_label.any():
        # A ROC curve is undefined without any positive sample for this class.
        continue
    fpr, tpr, _ = roc_curve(is_label, rf_probs[:, idx])
    plt.plot(fpr, tpr, label=f"Class {label}")
# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1], "--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Random Forest ROC (One-vs-Rest)")
plt.legend()
plt.tight_layout()

In [None]:
# Rank the random forest's impurity-based importances and keep the 15 largest.
rf_importances = pd.Series(data=rf_clf.feature_importances_, index=feature_cols)
top_feats = rf_importances.sort_values(ascending=False).iloc[:15]

# Horizontal bars: importance on x, feature name on y.
_imp_fig, _imp_ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=top_feats.values, y=top_feats.index, palette='viridis', ax=_imp_ax)
_imp_ax.set_title('Random Forest Feature Importance (Top 15)')
_imp_fig.tight_layout()

In [None]:
def evaluate_regressor(model, X_tr, X_te, y_tr, y_te):
    """Fit ``model`` on the training split and report test-set error metrics.

    The model is fitted in place, so the caller's estimator object is trained
    after this call returns.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    X_tr, y_tr : training features and targets.
    X_te, y_te : held-out features and targets.

    Returns
    -------
    dict with keys ``mae``, ``mse`` and ``rmse``.
    """
    model.fit(X_tr, y_tr)
    preds = model.predict(X_te)
    mse = mean_squared_error(y_te, preds)
    return {
        "mae": mean_absolute_error(y_te, preds),
        "mse": mse,
        # Derive RMSE from the MSE instead of passing `squared=False`, which is
        # deprecated and no longer accepted by newer scikit-learn releases.
        "rmse": np.sqrt(mse),
    }

In [None]:
# Regressors for the continuous burnout score; both are scored on the same split.
lin_reg = LinearRegression()
rf_reg = RandomForestRegressor(n_estimators=400, random_state=42)

results_reg = {}
for _reg_name, _reg_model, _reg_file in (
    ('linear_regression', lin_reg, 'linear_reg.pkl'),
    ('random_forest_regressor', rf_reg, 'rf_regressor.pkl'),
):
    # Fit + score, then persist the fitted estimator next to the classifiers.
    results_reg[_reg_name] = evaluate_regressor(_reg_model, X_train, X_test, y_train_reg, y_test_reg)
    joblib.dump(_reg_model, MODEL_DIR / _reg_file)

results_reg

In [None]:
# Summary tables: one row per model, one column per metric
# (classification first, then regression).
for _results in (results_clf, results_reg):
    display(pd.DataFrame(_results).T)

## Notes
- Re-run after feature tweaks to compare models.
- Saved estimators live in `../models/saved/` for downstream ensembling or deployment.