In [1]:
# 03_train_xgboost.ipynb (Python script version)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier
from joblib import dump

# Load balanced dataset
path = "../../data/processed/model_ready_balanced.parquet"
df = pd.read_parquet(path)

# Features to include
features = [
    "ride_type_simplified", "duration_min",
    "age", "age_group", "gender", "first_time_visitor",
    "season", "is_weekend", "temperature_max", "precipitation_sum"
]
target = "incident_occurred"

# One-hot encode categorical features
df_model = pd.get_dummies(df[features + [target]], drop_first=True).dropna()
X = df_model.drop(columns=target)
y = df_model[target]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Train XGBoostç
model = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
y_probs = model.predict_proba(X_test)[:, 1]
print("\n📊 Classification report:\n")
print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_probs))

# Save model and metadata
model_bundle = (model, 0.5, X.columns.tolist())  # default threshold
model_path = "../../outputs/models/final_xgboost_model_balanced.joblib"
dump(model_bundle, model_path)
print(f"\n✅ Model saved to {model_path}")

ValueError: feature_names must be string, and may not contain [, ] or <