# Project 2: Injury Risk – Classification
Predict **injury_next_7d** from GPS load & wellness metrics.

### Tasks
1. Exploratory analysis & class balance.
2. Train **LogisticRegression** and **XGBoost/RandomForest** (use RF here).
3. Evaluate with **ROC-AUC, PR-AUC, and F1**, and plot a calibration curve.
4. Business framing: choose a decision threshold that caps weekly medical costs.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the dataset and separate the target column from the predictors.
df = pd.read_csv('../datasets/gps_load_injury_risk.csv')
X = df.drop(columns=['injury_next_7d'])
y = df['injury_next_7d']

# Split the predictors by dtype so each group gets suitable preprocessing.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# One-hot encode the categorical columns and standardise the numeric ones.
# Scaling matters for LogisticRegression: its default solver converges slowly
# (or not at all within max_iter) on features with very different magnitudes.
# The categorical transformer stays first so the output column order matches
# the previous "encode + passthrough" layout.
pre = ColumnTransformer(
    [
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', StandardScaler(), num_cols),
    ],
    remainder='passthrough',
)

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Baseline model: the shared preprocessing step followed by logistic regression.
logit = Pipeline(steps=[
    ('pre', pre),
    ('model', LogisticRegression(max_iter=200)),
])
logit.fit(X_tr, y_tr)

# Column 1 of predict_proba is the probability of the second class in
# `classes_` (presumably injury = 1 -- confirm the label encoding).
proba_l = logit.predict_proba(X_te)[:, 1]

# Threshold-free ranking metrics on the hold-out split.
logit_scores = {
    'ROC_AUC': roc_auc_score(y_te, proba_l),
    'PR_AUC': average_precision_score(y_te, proba_l),
}
print(logit_scores)

# Random forest on the same preprocessing step. NOTE: `pre` is the same object
# used by the logistic pipeline, so fitting here refits it on the same X_tr.
forest = RandomForestClassifier(n_estimators=400, random_state=42)
rf = Pipeline(steps=[('pre', pre), ('model', forest)])
rf.fit(X_tr, y_tr)

# Column 1 of predict_proba, scored with the same hold-out metrics as above.
proba = rf.predict_proba(X_te)[:, 1]
rf_scores = {
    'ROC_AUC': roc_auc_score(y_te, proba),
    'PR_AUC': average_precision_score(y_te, proba),
}
print(rf_scores)

# Turn the random-forest probabilities into hard labels at a fixed cut-off and
# print per-class precision / recall / F1 for the hold-out split.
threshold = 0.35
pred = (proba >= threshold).astype(int)
report = classification_report(y_te, pred)
print(report)

In [None]:
# Simple calibration curve: bucket the predicted probabilities into ten
# equal-width bins and compare each bin's mean prediction with the observed
# rate of the target in that bin.
bins = np.linspace(0, 1, 11)
df_cal = pd.DataFrame({'p': proba, 'y': y_te.values})

# include_lowest=True closes the first interval at 0. Without it, predictions
# of exactly 0.0 (routine for a random forest when every tree votes negative)
# fall outside all bins, get a NaN bin label and are silently dropped.
df_cal['bin'] = pd.cut(df_cal['p'], bins, include_lowest=True)

# observed=True keeps only bins that actually contain predictions, so empty
# bins do not show up as NaN rows (gaps in the plotted line).
cal = df_cal.groupby('bin', observed=True).agg(p_mean=('p', 'mean'), y_rate=('y', 'mean'))

cal.plot(y='y_rate', x='p_mean', kind='line', marker='o', title='Calibration Curve')
plt.xlabel('Predicted probability')
plt.ylabel('Observed injury rate')
plt.show()