In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    average_precision_score, precision_recall_curve,
    precision_score, recall_score, f1_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# 1) Load the churn dataset
# NOTE: point this path at wherever your copy of the CSV actually lives.
df = pd.read_csv("/content/sample_data/customer_churn_dataset_MASTER.csv")

# Column roles: the label, the categorical features, and every remaining
# column (except the customer ID) which is treated as a numeric feature.
target = "Churn"
categorical = ["Gender", "Subscription Type", "Contract Length"]
numeric = [
    col for col in df.columns
    if col not in (*categorical, target, "CustomerID")
]

# Rows without a churn label cannot be used for supervised learning.
df = df.dropna(subset=[target])

# Feature matrix (categoricals first, then numerics) and integer label vector.
X = df[categorical + numeric]
y = df[target].astype(int)

# 2) Train / validation / test split (70% / 15% / 15%).
#    Keep the stratified split below (Option A), or switch to the temporal
#    proxy split (Option B) if you want to mirror chronology.
# --- Option A (active: stratified random split) ---
# Hold out 30% first, then halve that hold-out into validation and test.
# Stratifying at both steps keeps the churn rate comparable across the splits.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)
# Both options now expose the test labels as `y_test`. The old Option A name
# `y_split_test` is kept as an alias so code written against it still runs.
y_split_test = y_test

# --- Option B (temporal proxy split using Tenure + Last Interaction) ---
# df_sorted = df.sort_values(by=["Tenure", "Last Interaction"])
# X_sorted = df_sorted[categorical + numeric]
# y_sorted = df_sorted[target].astype(int)
# n = len(df_sorted)
# train_end, val_end = int(0.70*n), int(0.85*n)
# X_train, y_train = X_sorted.iloc[:train_end], y_sorted.iloc[:train_end]
# X_val,   y_val   = X_sorted.iloc[train_end:val_end], y_sorted.iloc[train_end:val_end]
# X_test,  y_test  = X_sorted.iloc[val_end:], y_sorted.iloc[val_end:]

# 3) Preprocessor: one-hot encode the categorical columns (categories not seen
#    during fit are ignored rather than raising) and pass all other columns
#    through unchanged.
pre = ColumnTransformer(
    [("cat", OneHotEncoder(handle_unknown="ignore"), categorical)],
    remainder="passthrough",
)

In [None]:
# Linear baseline, with and without class re-weighting.
logreg = LogisticRegression(max_iter=500)
logreg_bal = LogisticRegression(class_weight="balanced", max_iter=500)

# Random-forest ensemble, with and without class re-weighting.
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
rf_bal = RandomForestClassifier(
    n_estimators=200, class_weight="balanced", n_jobs=-1, random_state=42
)

# Every pipeline is the shared preprocessor `pre` followed by one classifier.
pipe_logreg, pipe_logreg_bal, pipe_rf, pipe_rf_bal = (
    Pipeline(steps=[("pre", pre), ("clf", estimator)])
    for estimator in (logreg, logreg_bal, rf, rf_bal)
)

# Fit all four on the training split only; validation/test stay untouched.
pipe_logreg.fit(X_train, y_train)
pipe_logreg_bal.fit(X_train, y_train)
pipe_rf.fit(X_train, y_train)
pipe_rf_bal.fit(X_train, y_train)

In [None]:
def pr_metrics(pipe, Xv, yv):
    """Compute precision-recall metrics for a fitted classifier on one split.

    Parameters
    ----------
    pipe : fitted estimator or pipeline exposing ``predict_proba``.
    Xv : features of the split to score.
    yv : true binary labels aligned with ``Xv``.

    Returns
    -------
    tuple
        ``(ap, precision, recall, thresholds, proba)`` where ``ap`` is the
        average precision (PR-AUC), the next three entries are the arrays
        returned by ``precision_recall_curve``, and ``proba`` is the second
        column of ``predict_proba`` (the score used for both metrics).
    """
    scores = pipe.predict_proba(Xv)[:, 1]
    precision, recall, thresholds = precision_recall_curve(yv, scores)
    avg_precision = average_precision_score(yv, scores)  # PR-AUC
    return avg_precision, precision, recall, thresholds, scores

# The four pipelines (pipe_logreg, pipe_logreg_bal, pipe_rf, pipe_rf_bal) are
# built and fitted in the previous cell. This cell used to re-create and
# re-fit them verbatim, which doubled the training time (two extra 200-tree
# forests) and shadowed the already-fitted objects. It now only evaluates them.
models = {
    "LogReg": pipe_logreg,
    "LogReg_balanced": pipe_logreg_bal,
    "RandomForest": pipe_rf,
    "RandomForest_balanced": pipe_rf_bal,
}

# Validation-set PR metrics per model: average precision ("AP"), the PR-curve
# arrays ("p", "r", "t") and the raw scores ("proba") reused by the threshold
# sweep further down.
results = {}
for name, mdl in models.items():
    ap, p, r, t, proba = pr_metrics(mdl, X_val, y_val)
    results[name] = {"AP": ap, "p": p, "r": r, "t": t, "proba": proba}

def sweep_thresholds(y_true, proba, thresholds):
    """Evaluate precision, recall and F1 at each candidate decision threshold.

    Parameters
    ----------
    y_true : array-like of true binary labels.
    proba : array-like of scores for the positive class; any sequence that
        ``np.asarray`` can convert is accepted (ndarray, list, Series).
    thresholds : iterable of cut-offs; a sample is predicted positive when
        its score is ``>=`` the cut-off.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with columns ``threshold``, ``precision``,
        ``recall`` and ``f1``. The columns are present even when
        ``thresholds`` is empty, so callers can always sort or filter on them.
    """
    # Coerce once so plain Python lists support the vectorised comparison.
    scores = np.asarray(proba)
    rows = []
    for thr in thresholds:
        y_pred = (scores >= thr).astype(int)
        rows.append({
            "threshold": thr,
            # zero_division=0: report 0 (without a warning) when a metric's
            # denominator is empty, e.g. no positive predictions at this cut-off.
            "precision": precision_score(y_true, y_pred, zero_division=0),
            "recall":    recall_score(y_true, y_pred, zero_division=0),
            "f1":        f1_score(y_true, y_pred, zero_division=0),
        })
    # Explicit schema keeps the frame well-formed for an empty sweep.
    return pd.DataFrame(rows, columns=["threshold", "precision", "recall", "f1"])

# Candidate decision thresholds: 0.30 to 0.80 in steps of 0.05.
thr_values = np.arange(0.30, 0.81, 0.05).round(2)

# Sweep both random-forest variants using their validation-set scores.
rf_sweep = sweep_thresholds(
    y_val, results["RandomForest"]["proba"], thr_values
)
rf_bal_sweep = sweep_thresholds(
    y_val, results["RandomForest_balanced"]["proba"], thr_values
)

# After sorting by F1 (descending), the first row is the best threshold.
best_rf = rf_sweep.sort_values(by="f1", ascending=False).iloc[0]
best_rf_bal = rf_bal_sweep.sort_values(by="f1", ascending=False).iloc[0]

# Report: PR-AUC per model, best RF thresholds, and the validation base rate.
print("PR-AUC (Average Precision):")
for k, v in results.items():
    print("  {}: {:.4f}".format(k, v["AP"]))

print("\nBest RF threshold by F1 (unweighted):", dict(best_rf))
print("Best RF threshold by F1 (balanced):   ", dict(best_rf_bal))
print("\nValidation churn rate:", y_val.mean())

NameError: name 'pre' is not defined

In [None]:
# Optional SMOTE oversampling: only enable if churn becomes highly imbalanced
# (e.g., <20%) in another dataset.
from imblearn.over_sampling import SMOTE

# Step 1: encode the training split with the preprocessor (fit on train only).
X_train_enc = pre.fit_transform(X_train)

# Step 2: oversample the encoded training data.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_enc, y_train)

# Step 3: fit the classifier on the resampled data. For validation/test, encode
# with pre.transform (never fit_transform) before calling clf.
clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
clf.fit(X_train_res, y_train_res)