In [3]:
# notebooks/modeling/07_export_model_for_app.ipynb

import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve

# 1. Load data and add age_group
df = pd.read_parquet("../../data/processed/model_ready.parquet")

def age_group(age):
    if age < 10: return "<10"
    elif age < 20: return "10–19"
    elif age < 30: return "20–29"
    elif age < 40: return "30–39"
    elif age < 50: return "40–49"
    elif age < 60: return "50–59"
    elif age < 70: return "60–69"
    else: return "70+"

df["age_group"] = df["age"].apply(age_group)

features = [
    "ride_type_simplified", "ride_incident_count", "duration_min",
    "age", "age_group", "gender", "simulated_medical_condition", "first_time_visitor",
    "season", "is_weekend", "temperature_max", "precipitation_sum"
]
target = "incident_occurred"

df_model = pd.get_dummies(df[features + [target]], drop_first=True).dropna()

# 2. Train final model
X = df_model.drop(columns=target)
y = df_model[target]
model = LogisticRegression(max_iter=1000)
model.fit(X, y)

# 3. Determine optimal threshold
y_probs = model.predict_proba(X)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y, y_probs)
f1s = 2 * (precisions * recalls) / (precisions + recalls + 1e-6)
best_thresh = thresholds[np.argmax(f1s)]

# 4. Save model and metadata
joblib.dump({
    "model": model,
    "threshold": best_thresh,
    "features": list(X.columns)
}, "../../outputs/models/final_logistic_model.joblib")

print(f"✅ Model exported to outputs/models/final_logistic_model.joblib")
print(f"🔧 Optimal threshold: {best_thresh:.3f}")


✅ Model exported to outputs/models/final_logistic_model.joblib
🔧 Optimal threshold: 0.415
