# Churn Exploration & Baseline Model
Notebook to:
1) Load **fleet_churn.csv** (or, if it is missing, `fleet_churn_30.csv`) from `./data/`
2) Preprocess → one-hot encode categoricals
3) Train **XGBoost** baseline
4) Evaluate: ROC AUC, confusion matrix, classification report
5) Plot ROC curve and Feature Importances

> Tip: run this inside your project venv, where `xgboost`, `pandas`, `scikit-learn`, `matplotlib`, and `joblib` must be installed (plus `pyarrow` for the Parquet cells at the end).

In [None]:
# !pip install xgboost pandas scikit-learn matplotlib pyarrow
import os
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

# Candidate dataset files, listed in the order they are tried.
DATA_DIR = Path("data")
CSV_CANDIDATES = [
    DATA_DIR / file_name
    for file_name in ("fleet_churn.csv", "fleet_churn_30.csv")
]

# Take the first candidate that exists on disk; stop with a clear error
# when neither file is present.
data_path = next(
    (candidate for candidate in CSV_CANDIDATES if candidate.exists()),
    None,
)
if data_path is None:
    raise FileNotFoundError("Put fleet_churn.csv or fleet_churn_30.csv under ./data/")

print("Using dataset:", data_path)
df = pd.read_csv(data_path)
print(df.shape)
# Last expression of the cell: the notebook renders the first rows.
df.head()


In [None]:
# Basic cleaning and target
# Validate with an explicit raise rather than `assert`: assertions are
# stripped when Python runs with -O, which would let a missing target
# column surface one line later as a less helpful KeyError.
if "churn" not in df.columns:
    raise ValueError("Expected a 'churn' target column.")

# Target as an integer NumPy array.
y = df["churn"].astype(int).values
# Features: everything except the target and the customer identifier.
# errors="ignore" keeps this working when `customer_id` is absent.
X = df.drop(columns=["churn", "customer_id"], errors="ignore").copy()

# One-hot encode categorical columns
cat_cols = X.select_dtypes(include="object").columns.tolist()
# Missing categories become the literal string "NA" so they get their own
# dummy column; that is why dummy_na stays False below.
X[cat_cols] = X[cat_cols].fillna("NA")
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Train/val split: 80/20, stratified on the target, fixed seed for
# reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Last expression of the cell: the notebook displays both shapes.
X_train.shape, X_val.shape


In [1]:
# Train a baseline XGBoost model
# Hyper-parameters are collected in one mapping so the configuration can be
# read (and tweaked) separately from the constructor call.
baseline_params = {
    "n_estimators": 400,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.0,
    "reg_alpha": 0.0,
    "eval_metric": "auc",
    "random_state": 42,
    "tree_method": "hist",
}
model = XGBClassifier(**baseline_params)
model.fit(X_train, y_train)

# Column 1 of predict_proba is the positive-class probability; hard labels
# use a 0.5 threshold.
proba_val = model.predict_proba(X_val)[:, 1]
pred_val = (proba_val >= 0.5).astype(int)

auc = roc_auc_score(y_val, proba_val)
# end="\n\n" emits the blank separator line before the report.
print("Validation ROC AUC:", round(float(auc), 4), end="\n\n")
print(classification_report(y_val, pred_val, digits=4))

cm = confusion_matrix(y_val, pred_val)
# Last expression of the cell: the notebook displays the confusion matrix.
cm


NameError: name 'XGBClassifier' is not defined

In [None]:
# Plot ROC curve (matplotlib only; no style/colors specified)
# Uses the `proba_val` scores and the `auc` value computed in the training cell.
fpr, tpr, thr = roc_curve(y_val, proba_val)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, label=f'ROC AUC = {auc:.3f}')
# Dashed diagonal from (0, 0) to (1, 1) for visual reference.
ax.plot([0, 1], [0, 1], linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve — XGBoost (Validation)')
ax.legend(loc='lower right')
plt.show()


In [None]:
# Feature importances (gain-based per the original note; this notebook does
# not set `importance_type`, so the XGBoost default applies — TODO confirm)
importances = model.feature_importances_

# Pair every training column with its score, rank from highest to lowest,
# and keep the 25 strongest with a clean 0..n-1 index.
importance_table = pd.DataFrame(
    {"feature": X_train.columns, "importance": importances}
)
ranked_table = importance_table.sort_values("importance", ascending=False)
feat_imp = ranked_table.head(25).reset_index(drop=True)
# Last expression of the cell: the notebook renders the table.
feat_imp


In [None]:
# Plot top feature importances (matplotlib only)
# Reverse the row order once so the bars are drawn in the same sequence as
# plotting each reversed column separately.
bars = feat_imp.iloc[::-1]

fig, ax = plt.subplots(figsize=(7, 8))
ax.barh(bars["feature"], bars["importance"])
ax.set_xlabel("Importance")
ax.set_title("Top 25 Feature Importances — XGBoost")
fig.tight_layout()
plt.show()


## Save the trained model (optional)

In [None]:
# Save the trained model to disk for reuse (optional)
from joblib import dump

# Create ./models (and any missing parents); a no-op when it already exists.
models_dir = Path("models")
models_dir.mkdir(parents=True, exist_ok=True)

# Build the target path once and use it for both the dump and the message.
model_file = models_dir / "notebook_xgb_model.pkl"
dump(model, model_file)
print("Saved:", model_file)


In [2]:
import glob
import json
import os

import joblib
import pandas as pd
from sklearn.metrics import f1_score, roc_auc_score

# Validation metrics could be read from saved training logs (or parsed from
# an Airflow log); recomputing them from val.parquet gives a definitive answer.
val = pd.read_parquet("data/processed/val.parquet")

# NOTE: this rebinds the notebook-level `X` and `y` to the validation
# Parquet data, replacing the CSV-derived values from the cleaning cell.
y = val["label"].values
X = val.drop(columns=["label"])

# NOTE(review): joblib.load unpickles the file — only load model artifacts
# that come from a trusted source.
latest_model = joblib.load("models/latest_model.pkl")
proba = latest_model.predict_proba(X)[:, 1]

print("val_auc:", roc_auc_score(y, proba))
print("val_f1 @0.5:", f1_score(y, (proba >= 0.5).astype(int)))

val_auc: 0.5325090909090909
val_f1 @0.5: 0.1814516129032258


In [5]:
# how many features?
# (This snippet was pasted from a shell heredoc, `python - <<'PY' ... PY`;
# the wrapper lines are not needed inside the notebook.)
import pandas as pd

# NOTE: this rebinds the notebook-level `df` to the processed training set.
df = pd.read_parquet("data/processed/train.parquet")

n_rows, n_cols = df.shape
print("rows:", n_rows, "cols:", n_cols)
print("label mean (train):", df["label"].mean())
print("first 5 cols:", list(df.columns[:5]))

rows: 4800 cols: 28
label mean (train): 0.30395833333333333
first 5 cols: ['tenure_months', 'num_vehicles', 'avg_card_swipes_per_vehicle', 'monthly_txn_count', 'monthly_spend']
