In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, f1_score
import joblib

# Load the pre-processed modelling table
df = pd.read_parquet("../../data/processed/model_ready_balanced.parquet")

# Define features and target
features = [
    "ride_type_simplified", "duration_min",
    "age", "age_group", "gender", "first_time_visitor",
    "season", "is_weekend", "temperature_max", "precipitation_sum"
]
target = "incident_occurred"

# Drop incomplete rows BEFORE one-hot encoding. With the default
# dummy_na=False, get_dummies turns a missing categorical value into an
# all-zero indicator row, so a dropna() applied afterwards cannot see it and
# (with drop_first=True) the row would silently be treated as the baseline
# category instead of being removed.
df_complete = df[features + [target]].dropna()

# One-hot encoding (first level of each categorical dropped as the baseline)
df_model = pd.get_dummies(df_complete, drop_first=True)
X = df_model.drop(columns=target)
y = df_model[target]

# Stratified hold-out split with a fixed seed; the test-set size is left at
# train_test_split's default because no test_size is passed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Build and fit the classifier in one step (fit() returns the estimator)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict probabilities (column 1 of predict_proba, i.e. model.classes_[1])
y_proba = model.predict_proba(X_test)[:, 1]

# Find the threshold that maximises F1.
# NOTE(review): the threshold is tuned on the same test split that the report
# below is computed on, so the reported scores are optimistic. Consider tuning
# on a separate validation split.
prec, rec, thresh = precision_recall_curve(y_test, y_proba)

# precision_recall_curve returns one more precision/recall point than
# thresholds (the final point has no associated threshold), so drop the last
# entry to align the arrays. F1 is then derived directly from the curve in a
# single vectorised step instead of re-running f1_score once per threshold.
prec_at_thresh = prec[:-1]
rec_at_thresh = rec[:-1]
pr_sum = prec_at_thresh + rec_at_thresh
f1_scores = np.divide(
    2 * prec_at_thresh * rec_at_thresh,
    pr_sum,
    out=np.zeros_like(pr_sum),  # F1 is defined as 0 where precision + recall == 0
    where=pr_sum > 0,
)
optimal_threshold = thresh[np.argmax(f1_scores)]
print(f"\n🔧 Optimal threshold: {optimal_threshold:.3f}")

# Turn probabilities into hard 0/1 predictions at the chosen cut-off
meets_threshold = y_proba >= optimal_threshold
y_pred_opt = meets_threshold.astype(int)

# Per-class precision/recall/F1 at that cut-off
report_text = classification_report(y_test, y_pred_opt)
print("\n📊 Classification report (optimized threshold):\n")
print(report_text)

# AUC is computed from the raw probabilities, so it does not depend on the threshold
auc_value = roc_auc_score(y_test, y_proba)
print("AUC:", auc_value)

# Persist the fitted model together with the chosen threshold and the
# feature column names of X, as a single (model, threshold, columns) tuple
model_path = "../../outputs/models/final_logistic_model_balanced.joblib"
artifact = (model, optimal_threshold, X.columns.tolist())
joblib.dump(artifact, model_path)
print("\n✅ Model retrained and saved: final_logistic_model_balanced.joblib")


🔧 Optimal threshold: 0.239

📊 Classification report (optimized threshold):

              precision    recall  f1-score   support

       False       0.89      0.88      0.88       669
        True       0.31      0.33      0.32       112

    accuracy                           0.80       781
   macro avg       0.60      0.60      0.60       781
weighted avg       0.80      0.80      0.80       781

AUC: 0.6467008327994875

✅ Model retrained and saved: final_logistic_model_balanced.joblib
