# Customer Churn Model (XGBoost) with CUBS Features

This notebook:
- Loads train.csv (Jan-Aug) and validate.csv (Sep-Oct)
- Computes churn label from future snapshots
- Trains XGBoost classifier with CUBS category features
- Outputs scored results with business-readable reasons

**Inputs:** train.csv, validate.csv (from DAX query)

**Outputs:**
- model_quality_report.txt
- feature_importance.csv
- churn_scores_long.csv
- churn_scores_wide_12m.csv
- portfolio_summary.csv

## 0) Environment

In [None]:
import platform
import sys

# Record the interpreter and OS so the run can be reproduced later.
runtime_info = {"python": sys.version, "platform": platform.platform()}
for label, value in runtime_info.items():
    print(f"{label}:", value)

## 1) Imports

In [None]:
import os

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from xgboost import XGBClassifier

## 2) Load data + normalize column names

In [None]:
# Input extracts, read relative to the notebook's working directory.
TRAIN_PATH = "train.csv"
VALIDATE_PATH = "validate.csv"

train_df, validate_df = (pd.read_csv(path) for path in (TRAIN_PATH, VALIDATE_PATH))

def normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with cleaned column names.

    Each column name is converted to ``str``, stripped of surrounding
    whitespace, and then has one leading ``[`` and one trailing ``]`` removed.
    Brackets elsewhere in the name are kept. The input frame is not modified.
    """
    cleaned = df.columns.astype(str).str.strip()
    cleaned = cleaned.str.replace(r"^\[", "", regex=True)
    cleaned = cleaned.str.replace(r"\]$", "", regex=True)

    result = df.copy()
    result.columns = cleaned
    return result

train_df, validate_df = normalize_cols(train_df), normalize_cols(validate_df)

# Labels look ahead across the train/validate boundary, so they are computed
# on one combined frame.
all_df = pd.concat([train_df, validate_df], ignore_index=True)

for label, frame in (("Train", train_df), ("Validate", validate_df), ("Combined", all_df)):
    print(f"{label} shape:", frame.shape)
print("Columns:", list(train_df.columns))

## 3) Compute churn label

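For each customer-snapshot, check whether the customer has orders (`Orders_CY` > 0) in their next 3 monthly snapshots (fewer are used when fewer exist).
- WillChurn90 = 1 if no orders in next 90 days (churned)
- WillChurn90 = 0 if they had orders (retained)
- WillChurn90 is left empty when the customer has no later snapshot at all; those rows are dropped before training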

In [None]:
# Parse snapshot dates (unparseable values become NaT), then order each
# customer's snapshots chronologically on a fresh 0..n-1 index.
all_df = (
    all_df
    .assign(SnapshotDate=lambda frame: pd.to_datetime(frame["SnapshotDate"], errors="coerce"))
    .sort_values(["CustomerId", "SnapshotDate"])
    .reset_index(drop=True)
)

def compute_churn_label(df: pd.DataFrame, horizon: int = 3) -> pd.DataFrame:
    """
    Label each customer-snapshot with WillChurn90 from the customer's later snapshots.

    For every row, the same customer's next ``horizon`` snapshots (ordered by
    SnapshotDate) are inspected:

    - 0 (retained) if Orders_CY summed over those snapshots is > 0
    - 1 (churned) if at least one later snapshot exists and the sum is not > 0
    - NaN if the customer has no later snapshot, so no label can be computed

    Missing Orders_CY values count as 0. When fewer than ``horizon`` later
    snapshots exist, only the available ones are used. The window is
    positional ("next rows for this customer"), not calendar-based, so it only
    spans 90 days when snapshots are monthly without gaps.

    Parameters
    ----------
    df : DataFrame with CustomerId, SnapshotDate and Orders_CY columns.
    horizon : number of following snapshots to inspect (default 3).

    Returns
    -------
    A copy of ``df`` with a float WillChurn90 column (0.0, 1.0 or NaN). Row
    order and index are unchanged.

    Raises
    ------
    ValueError if ``horizon`` is smaller than 1.
    """
    if horizon < 1:
        raise ValueError("horizon must be at least 1")

    out = df.copy()

    # Work on a positional copy so the result does not depend on the caller's
    # index being unique or sorted.
    work = pd.DataFrame({
        "CustomerId": out["CustomerId"].to_numpy(),
        "SnapshotDate": out["SnapshotDate"].to_numpy(),
        "Orders": out["Orders_CY"].fillna(0).to_numpy(),
    })
    work = work.sort_values("SnapshotDate", kind="stable")
    per_customer = work.groupby("CustomerId", sort=False)["Orders"]

    # Orders were already NaN-filled, so a NaN after shifting can only mean
    # "this customer has no snapshot k steps ahead".
    has_future = per_customer.shift(-1).notna()
    future_orders = sum(
        per_customer.shift(-step).fillna(0) for step in range(1, horizon + 1)
    )

    labels = pd.Series(np.where(future_orders > 0, 0.0, 1.0), index=work.index)
    labels = labels.where(has_future)  # no later snapshot -> label stays NaN

    # work.index holds original row positions; restore that order before assigning.
    out["WillChurn90"] = labels.sort_index().to_numpy()
    return out

all_df = compute_churn_label(all_df)

# Sanity check: class balance, including rows that could not be labeled (NaN).
label_counts = all_df["WillChurn90"].value_counts(dropna=False)
print("Label distribution:")
print(label_counts)

## 4) Split back into train/validate

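- Train: Jan-Aug snapshots that have future data for labels
- Validate: Sep-Oct snapshots that have future data for labels
- Rows without labels are dropped: those where the customer has no later snapshot (e.g. Nov-Dec, or each customer's last available snapshot)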

In [None]:
# Keep only rows that received a label and store the label as an integer.
labeled_df = all_df.dropna(subset=["WillChurn90"]).astype({"WillChurn90": int})

# Date boundaries (inclusive upper bounds): Train = Jan-Aug, Validate = Sep-Oct.
train_cutoff = pd.Timestamp("2025-08-31")
validate_cutoff = pd.Timestamp("2025-10-31")

snapshot_dates = labeled_df["SnapshotDate"]
in_train = snapshot_dates <= train_cutoff
in_validate = (snapshot_dates > train_cutoff) & (snapshot_dates <= validate_cutoff)

train_df = labeled_df[in_train].copy()
validate_df = labeled_df[in_validate].copy()

for split_name, split in (("Train", train_df), ("Validate", validate_df)):
    print(f"{split_name}: {len(split)} rows ({split['SnapshotDate'].min()} to {split['SnapshotDate'].max()})")
print(f"\nTrain churn rate: {train_df['WillChurn90'].mean():.2%}")
print(f"Validate churn rate: {validate_df['WillChurn90'].mean():.2%}")

## 5) Preprocessing

- One-hot encode Segment and CostCenter
- Drop ID columns, date columns, and target from features
- All CUBS category features pass through as-is

In [None]:
TARGET_COL = "WillChurn90"

# Date columns are excluded from the feature matrix: the snapshot and overall
# purchase dates plus one first-purchase date per CUBS category.
DATE_COLS = ["SnapshotDate", "FirstPurchaseDate", "LastPurchaseDate"] + [
    f"{category}_FirstPurchaseDate"
    for category in ("Uniforms", "Sparring", "Belts", "Bags", "Customs")
]

# Identifier / categorical columns that are never passed to the model as-is.
ID_COLS = ["CustomerId", "AccountName", "Segment", "CostCenter"]

DROP_FROM_FEATURES = [TARGET_COL, "DataSplit", *DATE_COLS]

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Build the model feature matrix from a snapshot frame.

    Steps, all performed on a copy of ``df``:
    - Segment and CostCenter are null-filled with "UNKNOWN", cast to str and
      stripped (AccountName is null-filled with "" and stripped too, although
      it is dropped afterwards).
    - Columns listed in DROP_FROM_FEATURES and ID_COLS are removed when present.
    - Segment and CostCenter are one-hot encoded (all levels kept) and appended;
      if either column is absent, a constant "UNKNOWN" level is encoded instead.

    The set of dummy columns depends on the values present in ``df``, so
    callers must align columns between different frames themselves.
    """
    frame = df.copy()

    if "AccountName" in frame.columns:
        frame["AccountName"] = frame["AccountName"].fillna("").astype(str).str.strip()
    for col in ("Segment", "CostCenter"):
        if col in frame.columns:
            frame[col] = frame[col].fillna("UNKNOWN").astype(str).str.strip()

    # Everything that is not a model input: target, split marker, dates, IDs.
    excluded = [c for c in (DROP_FROM_FEATURES + ID_COLS) if c in frame.columns]
    pieces = [frame.drop(columns=excluded)]

    for col in ("Segment", "CostCenter"):
        if col in frame.columns:
            values = frame[col]
        else:
            values = pd.Series(["UNKNOWN"] * len(frame), index=frame.index)
        pieces.append(pd.get_dummies(values, prefix=col, drop_first=False))

    return pd.concat(pieces, axis=1)

## 6) Build train/validate matrices

In [None]:
X_train, y_train = preprocess(train_df), train_df[TARGET_COL].astype(int)
X_val, y_val = preprocess(validate_df), validate_df[TARGET_COL].astype(int)

# The training columns define the feature space: validation-only dummy
# columns are discarded and training-only ones are added as all-zero columns.
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

n_train_rows, n_features = X_train.shape
print("Feature count:", n_features)
print("Train rows:", n_train_rows)
print("Validate rows:", len(X_val))
print("\nFeature columns:")
print(X_train.columns.tolist())

## 7) Train model

In [None]:
# Gradient-boosted classifier; random_state is fixed so reruns are repeatable.
model = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class (churn).
val_probs = model.predict_proba(X_val)[:, 1]
val_preds = (val_probs >= 0.5).astype(int)

# Save the model and the training column order for deployment; scoring code
# needs the column list to rebuild the same feature layout.
# (os and joblib are imported in the notebook's imports cell.)
os.makedirs("model", exist_ok=True)
joblib.dump(model, "model/churn_model.pkl")
joblib.dump(list(X_train.columns), "model/model_columns.pkl")

print("Training complete.")
print("Saved to ./model/ folder: churn_model.pkl, model_columns.pkl")
print("Upload this folder to Azure ML Studio to register the model.")

## 8) Model quality report

In [None]:
# ROC AUC is computed from probabilities; the other metrics use the
# thresholded predictions.
metrics = {"roc_auc": float(roc_auc_score(y_val, val_probs))}
for metric_name, score_fn in (
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1", f1_score),
):
    metrics[metric_name] = float(score_fn(y_val, val_preds))

cm = confusion_matrix(y_val, val_preds)

report_lines = ["MODEL QUALITY REPORT", "-" * 60]
report_lines += [
    f"ROC AUC   : {metrics['roc_auc']:.6f}",
    f"Precision : {metrics['precision']:.6f}",
    f"Recall    : {metrics['recall']:.6f}",
    f"F1        : {metrics['f1']:.6f}",
]
report_lines += ["", "Confusion Matrix (rows=true, cols=pred):", str(cm)]

report_text = "\n".join(report_lines)
print(report_text)

with open("model_quality_report.txt", "w", encoding="utf-8") as report_file:
    report_file.write(report_text)

print("\nWrote: model_quality_report.txt")

## 9) Feature importance

In [None]:
# Pair each training column with the model's importance score, strongest first.
importance_table = pd.DataFrame({
    "feature": X_train.columns,
    "importance": model.feature_importances_,
})
feature_importance = importance_table.sort_values("importance", ascending=False)

feature_importance.to_csv("feature_importance.csv", index=False)
print("Wrote: feature_importance.csv")
feature_importance.head(25)

## 10) Per-customer reasons

Uses XGBoost native feature contributions to generate human-readable explanations.

In [None]:
# Per-row feature contributions from the booster: one column per feature,
# followed by a final bias column.
dm_val = xgb.DMatrix(X_val)
booster = model.get_booster()
contrib = booster.predict(dm_val, pred_contribs=True)

contrib_columns = [*X_train.columns, "BIAS"]
contrib_df = pd.DataFrame(contrib, columns=contrib_columns)

def feature_phrase(name: str) -> str:
    """Return a business-readable base phrase for a feature (no polarity).

    - One-hot columns ``Segment_<value>`` / ``CostCenter_<value>`` become
      "Customer segment is <value>" / "Cost center is <value>". Only the
      leading prefix is removed, so values that themselves contain
      "Segment_" or "CostCenter_" are kept intact.
    - Known numeric features are looked up in a fixed mapping.
    - Any other name falls back to the name with underscores replaced by spaces.
    """
    segment_prefix = "Segment_"
    cost_center_prefix = "CostCenter_"

    # Slice the prefix off instead of str.replace, which would also delete
    # later occurrences inside the category value.
    if name.startswith(segment_prefix):
        return f"Customer segment is {name[len(segment_prefix):]}"
    if name.startswith(cost_center_prefix):
        return f"Cost center is {name[len(cost_center_prefix):]}"

    mapping = {
        # Aggregate features
        "Orders_CY": "order count (current year)",
        "Orders_PY": "order count (prior year)",
        "Orders_Lifetime": "lifetime order count",
        "Spend_CY": "spend (current year)",
        "Spend_PY": "spend (prior year)",
        "Spend_Lifetime": "lifetime spend",
        "Units_CY": "units purchased (current year)",
        "Units_PY": "units purchased (prior year)",
        "Units_Lifetime": "lifetime units",
        "AOV_CY": "average order value",
        "DaysSinceLast": "days since last order",
        "TenureDays": "customer tenure (days)",

        # CUBS category features
        "Uniforms_Units_CY": "uniforms units (current year)",
        "Uniforms_Spend_CY": "uniforms spend (current year)",
        "Uniforms_Orders_CY": "uniforms orders (current year)",
        "Uniforms_Pct_of_Total_CY": "uniforms % of total spend",
        "Uniforms_DaysSinceLast": "days since last uniforms order",

        "Sparring_Units_CY": "sparring units (current year)",
        "Sparring_Spend_CY": "sparring spend (current year)",
        "Sparring_Orders_CY": "sparring orders (current year)",
        "Sparring_Pct_of_Total_CY": "sparring % of total spend",
        "Sparring_DaysSinceLast": "days since last sparring order",

        "Belts_Units_CY": "belts units (current year)",
        "Belts_Spend_CY": "belts spend (current year)",
        "Belts_Orders_CY": "belts orders (current year)",
        "Belts_Pct_of_Total_CY": "belts % of total spend",
        "Belts_DaysSinceLast": "days since last belts order",

        "Bags_Units_CY": "bags units (current year)",
        "Bags_Spend_CY": "bags spend (current year)",
        "Bags_Orders_CY": "bags orders (current year)",
        "Bags_Pct_of_Total_CY": "bags % of total spend",
        "Bags_DaysSinceLast": "days since last bags order",

        "Customs_Units_CY": "customs units (current year)",
        "Customs_Spend_CY": "customs spend (current year)",
        "Customs_Orders_CY": "customs orders (current year)",
        "Customs_Pct_of_Total_CY": "customs % of total spend",
        "Customs_DaysSinceLast": "days since last customs order",

        # Breadth features
        "CUBS_Categories_Active_CY": "product categories active (current year)",
        "CUBS_Categories_Active_PY": "product categories active (prior year)",
        "CUBS_Categories_Ever": "product categories ever purchased",
    }
    return mapping.get(name, name.replace("_", " "))


def reason_text(feature: str, mode: str) -> str:
    """Turn a feature name into a reason string.

    ``mode`` is 'risk' for churn drivers; any other value is treated as
    'safe' (protective). Segment / cost-center dummies are returned as their
    plain phrase. Other features get a qualifier chosen from the feature's
    assumed direction, not from the row's actual value:

    - volume, spend, tenure and breadth features: "Low" when risk, "High" when safe
    - days-since-last features: "High" when risk, "Low" when safe
    - anything else: "Unfavorable" when risk, "Favorable" when safe
    """
    phrase = feature_phrase(feature)

    if feature.startswith(("Segment_", "CostCenter_")):
        return phrase

    categories = ("Uniforms", "Sparring", "Belts", "Bags", "Customs")

    # Higher value is assumed to mean lower churn risk.
    protective_when_high = {
        "Orders_CY", "Orders_PY", "Orders_Lifetime",
        "Spend_CY", "Spend_PY", "Spend_Lifetime",
        "Units_CY", "Units_PY", "Units_Lifetime",
        "AOV_CY", "TenureDays",
        "CUBS_Categories_Active_CY", "CUBS_Categories_Active_PY", "CUBS_Categories_Ever",
    }
    protective_when_high.update(
        f"{category}_{measure}_CY"
        for category in categories
        for measure in ("Units", "Spend", "Orders")
    )

    # Higher value is assumed to mean higher churn risk.
    risky_when_high = {"DaysSinceLast"}
    risky_when_high.update(f"{category}_DaysSinceLast" for category in categories)

    is_risk = mode == "risk"
    if feature in protective_when_high:
        qualifier = "Low" if is_risk else "High"
    elif feature in risky_when_high:
        qualifier = "High" if is_risk else "Low"
    else:
        qualifier = "Unfavorable" if is_risk else "Favorable"
    return f"{qualifier} {phrase}"


def risk_band(p: float, high: float = 0.7, medium: float = 0.3) -> str:
    """Map a churn probability to a lettered risk band.

    Parameters
    ----------
    p : churn probability.
    high : inclusive lower bound of band A (default 0.7).
    medium : inclusive lower bound of band B (default 0.3).

    Returns
    -------
    "A - High Risk" when ``p >= high``, "B - Medium Risk" when
    ``medium <= p < high``, "C - Low Risk" below ``medium``, and "Unknown"
    when ``p`` is missing (NaN/None). Without the missing-value check a NaN
    would fail both comparisons and be reported as low risk.
    """
    if pd.isna(p):
        return "Unknown"
    if p >= high:
        return "A - High Risk"
    if p >= medium:
        return "B - Medium Risk"
    return "C - Low Risk"


def top_reasons(row_contrib: pd.Series, risk: float, n: int = 3) -> list:
    """Pick up to ``n`` reason strings for one scored row.

    Parameters
    ----------
    row_contrib : per-feature contributions for the row; a "BIAS" entry, if
        present, is ignored.
    risk : predicted churn probability for the row.
    n : number of reasons to return (default 3).

    Returns
    -------
    A list of at most ``n`` strings produced by ``reason_text``:
    - risk >= 0.7: the ``n`` largest contributions, worded as risk drivers
    - risk < 0.3: the ``n`` smallest contributions, worded as protective
    - otherwise: the largest contributions as risk drivers followed by one
      protective factor (2 + 1 for the default n=3; a single risk driver
      when n is 1)

    The 0.7 / 0.3 cut-offs are the same values used by ``risk_band``.
    Contributions are ranked without checking their sign, so a "risk driver"
    can have a negative contribution when few features push the score up.
    """
    if n <= 0:
        return []

    contributions = row_contrib.drop(labels=["BIAS"], errors="ignore")

    def strongest(count: int, ascending: bool) -> list:
        # Feature names of the `count` most extreme contributions on one side.
        return contributions.sort_values(ascending=ascending).head(count).index.tolist()

    if risk >= 0.7:
        # High risk: largest contributions
        return [reason_text(f, "risk") for f in strongest(n, ascending=False)]

    if risk < 0.3:
        # Low risk: smallest contributions (protective)
        return [reason_text(f, "safe") for f in strongest(n, ascending=True)]

    # Medium: reserve one slot for a protective factor when there is room,
    # and fill the rest with risk drivers so the total honours `n`.
    n_protective = 1 if n >= 2 else 0
    n_drivers = n - n_protective
    drivers = [reason_text(f, "risk") for f in strongest(n_drivers, ascending=False)]
    protective = [reason_text(f, "safe") for f in strongest(n_protective, ascending=True)]
    return (drivers + protective)[:n]


# Build the three reason strings for every validation row; contrib_df rows
# and val_probs are both in X_val row order.
reasons_rows = [
    top_reasons(contrib_df.iloc[position], float(probability), n=3)
    for position, probability in enumerate(val_probs)
]

reasons = pd.DataFrame(reasons_rows, columns=["Reason_1", "Reason_2", "Reason_3"])
print(f"Generated reasons for {len(reasons)} rows")

## 11) Long output

In [None]:
# Clean the descriptive columns the same way preprocess() does so the output
# labels match the one-hot feature names.
validate_out = validate_df.copy()
validate_out["AccountName"] = validate_out["AccountName"].fillna("").astype(str).str.strip()
for col in ("Segment", "CostCenter"):
    validate_out[col] = validate_out[col].fillna("UNKNOWN").astype(str).str.strip()

identity_cols = ["CustomerId", "AccountName", "Segment", "CostCenter", "SnapshotDate"]
final_long = validate_out[identity_cols].copy()
final_long["ChurnRiskPct"] = val_probs
final_long["RiskBand"] = final_long["ChurnRiskPct"].map(lambda p: risk_band(float(p)))

# Both frames are re-indexed 0..n-1 so scores and reasons line up by position.
scores_part = final_long.reset_index(drop=True)
reasons_part = reasons.reset_index(drop=True)
final_long = pd.concat([scores_part, reasons_part], axis=1)

final_long.to_csv("churn_scores_long.csv", index=False)
print("Wrote: churn_scores_long.csv")
final_long.head(10)

## 12) Wide output (trailing 12 months)

In [None]:
snapshot_ts = pd.to_datetime(final_long["SnapshotDate"])
final_long = final_long.assign(SnapshotMonth=snapshot_ts.dt.to_period("M").astype(str))

# Trailing 12-month window ending at the most recent scored month.
max_month = snapshot_ts.max().to_period("M")
months = pd.period_range(end=max_month, periods=12, freq="M").astype(str).tolist()

in_window = final_long["SnapshotMonth"].isin(months)
final_12 = final_long[in_window].copy()

# One row per customer, one column per month; reindex adds months with no
# scores as all-NaN columns so the layout is always 12 months wide.
monthly_pivot = final_12.pivot_table(
    index=["CustomerId", "AccountName", "Segment", "CostCenter"],
    columns="SnapshotMonth",
    values="ChurnRiskPct",
    aggfunc="max"
)
wide_scores = monthly_pivot.reindex(columns=months).reset_index()

start_col, current_col = months[0], months[-1]

if start_col in wide_scores.columns:
    wide_scores["Risk_Start_12m"] = wide_scores[start_col]
else:
    wide_scores["Risk_Start_12m"] = np.nan

if current_col in wide_scores.columns:
    wide_scores["Risk_Current"] = wide_scores[current_col]
else:
    wide_scores["Risk_Current"] = np.nan

wide_scores["Risk_Trend_12m"] = wide_scores["Risk_Current"] - wide_scores["Risk_Start_12m"]

def trend_dir(x):
    """Classify a risk change as "Up", "Down", "Flat" or "Unknown".

    Missing values give "Unknown". Changes smaller than 0.05 in magnitude are
    "Flat"; otherwise the sign decides between "Up" and "Down".
    """
    if pd.isna(x):
        return "Unknown"
    if abs(x) < 0.05:
        return "Flat"
    return "Up" if x > 0 else "Down"

# Row-wise statistics over the 12 month columns only; months without a score
# are skipped.
monthly_scores = wide_scores[months]
wide_scores["TrendDirection"] = wide_scores["Risk_Trend_12m"].apply(trend_dir)
wide_scores["Risk_Avg_12m"] = monthly_scores.mean(axis=1, skipna=True)
wide_scores["Risk_Median_12m"] = monthly_scores.median(axis=1, skipna=True)


def current_band(p):
    """Risk band for the current score, or "Unknown" when it is missing."""
    return risk_band(float(p)) if pd.notna(p) else "Unknown"


wide_scores["RiskBand_Current"] = wide_scores["Risk_Current"].apply(current_band)

# Attach the reasons from each customer's most recent snapshot in the window.
reason_cols = ["Reason_1", "Reason_2", "Reason_3"]
latest_rows = (
    final_12.sort_values(["CustomerId", "SnapshotDate"])
    .groupby("CustomerId")
    .tail(1)
)
latest_reasons = latest_rows[["CustomerId"] + reason_cols]

wide_out = wide_scores.merge(latest_reasons, on="CustomerId", how="left")
wide_out.to_csv("churn_scores_wide_12m.csv", index=False)

print("Wrote: churn_scores_wide_12m.csv")
wide_out.head(10)

## 13) Portfolio summary

In [None]:
portfolio = final_long.copy()

# (level, letter, RiskBand label) for the three bands, in report order.
bands = [
    ("High", "A", "A - High Risk"),
    ("Medium", "B", "B - Medium Risk"),
    ("Low", "C", "C - Low Risk"),
]
band_masks = [(level, code, portfolio["RiskBand"] == label) for level, code, label in bands]

summary_rows = [
    ("Rows (customer-snapshots)", int(len(portfolio))),
    ("Unique customers", int(portfolio["CustomerId"].nunique())),
    ("Avg churn risk (all rows)", float(portfolio["ChurnRiskPct"].mean())),
    ("Median churn risk (all rows)", float(portfolio["ChurnRiskPct"].median())),
]
# Counts for all bands first, then shares, matching the report layout.
summary_rows += [(f"{level} risk count ({code})", int(mask.sum())) for level, code, mask in band_masks]
summary_rows += [(f"{level} risk pct ({code})", float(mask.mean())) for level, code, mask in band_masks]

summary = pd.DataFrame({
    "Metric": [metric for metric, _ in summary_rows],
    "Value": [value for _, value in summary_rows],
})

summary.to_csv("portfolio_summary.csv", index=False)
print("Wrote: portfolio_summary.csv")
summary

## 14) File list

In [None]:
# List the CSV/TXT artifacts in the working directory, sorted by name.
# (os is imported in the notebook's imports cell.)
for fn in sorted(os.listdir(".")):
    if fn.endswith((".csv", ".txt")):
        print(fn)