# 09 - Error and Business Analysis (V3)

- Confusion matrix and per-slice error analysis
- Discuss FP/FN trade-offs, threshold policy
- Document recommendations and monitoring plan


In [None]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# --- Paths and column roles --------------------------------------------------
# INP is the feature table read below; ART is the directory that receives the
# files written by this notebook (created on first run if it is missing).
INP = Path('../v3_data/employee_promotion_features.csv')
ART = Path('../v3_artifacts')
ART.mkdir(exist_ok=True)

TARGET = 'Promotion_Eligible'      # label column, removed from the features
GROUP = 'Current_Position_Level'   # column used to slice the error analysis

# --- Load the data and hold out a test set -----------------------------------
df = pd.read_csv(INP)
y = df[TARGET]
X = df.drop(columns=[TARGET])

# 80/20 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- Preprocessing + model ---------------------------------------------------
# Numeric columns are standardised; every other column is one-hot encoded.
# handle_unknown='ignore' keeps categories unseen during fit from raising
# at prediction time.
num_cols = X.select_dtypes(include=np.number).columns.to_list()
cat_cols = X.select_dtypes(exclude=np.number).columns.to_list()

pre = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ]
)

model = GradientBoostingClassifier(random_state=42)
pipe = Pipeline(steps=[('pre', pre), ('model', model)])
pipe.fit(X_train, y_train)

# Keep only the second probability column, i.e. the score for the second
# entry of the fitted classes (the positive label for a 0/1 target).
proba = pipe.predict_proba(X_test)[:, 1]

# Decision threshold.
# Default to 0.5, but prefer the value stored under 'best_threshold_f1' in
# ART / 'best_threshold.json' when that file exists and holds a usable number
# (presumably written by an earlier threshold-tuning step -- verify upstream).
best_thr = 0.5
bt = ART / 'best_threshold.json'
if bt.exists():
    try:
        loaded_thr = float(json.loads(bt.read_text())['best_threshold_f1'])
    except (OSError, ValueError, KeyError, TypeError) as err:
        # Still best-effort (the default is kept), but no longer silent:
        # an unreadable file, invalid JSON, a missing key or a non-numeric
        # value is reported so the fallback is visible in the output.
        print(f'Warning: could not load threshold from {bt} ({err!r}); '
              f'falling back to {best_thr}')
    else:
        # A probability cut-off outside [0, 1] (or NaN, which fails this
        # comparison) would label every row the same way; reject it.
        if 0.0 <= loaded_thr <= 1.0:
            best_thr = loaded_thr
        else:
            print(f'Warning: threshold {loaded_thr} in {bt} is outside [0, 1]; '
                  f'falling back to {best_thr}')

# Hard predictions at the chosen threshold, then overall test-set metrics.
pred = (proba >= best_thr).astype(int)
cm = confusion_matrix(y_test, pred)
print('Threshold used:', best_thr)
print('Confusion matrix:\n', cm)
print('\nClassification report:\n', classification_report(y_test, pred, digits=3))

# Render the overall confusion matrix as an annotated heatmap and write it to
# the artifacts directory. The figure is closed afterwards so it is not left
# open once the file has been saved.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds', ax=ax)
ax.set_title('Confusion Matrix (V3)')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()
fig.savefig(ART / 'confusion_matrix.png', dpi=160)
plt.close(fig)

# Per-slice error rates
# For every value of GROUP in the test set, compute the false-positive rate
# FP / (FP + TN) and false-negative rate FN / (FN + TP) and save one row per
# slice to ART / 'slice_error_rates.csv'.
if GROUP in X_test.columns:
    # `pred` is a plain NumPy array ordered like the rows of X_test, while
    # groupby(...).groups yields index *labels* of X_test (row labels of the
    # original frame after the shuffle). Indexing the array with those labels
    # would pick the wrong rows or run out of bounds, so attach the test index
    # to the predictions and select both truth and prediction by label.
    pred_by_label = pd.Series(np.asarray(pred), index=y_test.index)

    rows = []
    for g, idx in X_test.groupby(GROUP).groups.items():
        yt = y_test.loc[idx]
        pt = pred_by_label.loc[idx]
        # labels=[0, 1] forces a 2x2 matrix even when a slice contains only
        # one class, so the four-way unpacking below always succeeds.
        tn, fp, fn, tp = confusion_matrix(yt, pt, labels=[0, 1]).ravel()
        rows.append({
            GROUP: g,
            # max(..., 1) avoids division by zero for slices with no
            # negatives (fp_rate) or no positives (fn_rate); the rate is 0.
            'fp_rate': fp / max(fp + tn, 1),
            'fn_rate': fn / max(fn + tp, 1),
        })
    pd.DataFrame(rows).to_csv(ART / 'slice_error_rates.csv', index=False)

