In [28]:
# ================================
# Imports (ALL imports in one cell)
# ================================

import sqlite3
from contextlib import closing

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.impute import SimpleImputer

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


In [29]:
# ================================
# Data Loading (LOCKED)
# ================================

# Load the two raw churn datasets; both paths are relative to the notebook's
# working directory.
df_ibm = pd.read_csv("../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv")
# NOTE(review): this file has an .xlsx extension but is parsed with read_csv, so
# it is presumably CSV text under a misleading name — confirm. A genuine Excel
# workbook would have to be read with pd.read_excel instead.
df_bigml = pd.read_csv("../data/raw/churn_bigml.xlsx")


In [30]:
# ================================
# Target Cleaning
# ================================

# Encode both churn targets as 0/1 integers.
#
# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time would silently wipe the already-encoded IBM
# labels. The dtype guard makes the cell safe to re-run, and the assert surfaces
# any label other than "Yes"/"No" here rather than deep inside model fitting.
if not pd.api.types.is_numeric_dtype(df_ibm["Churn"]):
    df_ibm["Churn"] = df_ibm["Churn"].map({"Yes": 1, "No": 0})
assert df_ibm["Churn"].notna().all(), "IBM 'Churn' contains labels other than Yes/No"

# The BigML target is cast straight to int (assumes a boolean-like column — TODO confirm).
df_bigml["Churn"] = df_bigml["Churn"].astype(int)


In [31]:
# ================================
# Feature Engineering (BigML)
# ================================

# Collapse the four per-period usage columns into one minutes total and one
# calls total, without touching the loaded frame in place.
df_bigml = df_bigml.assign(
    total_usage_minutes=lambda raw: (
        raw["Total day minutes"]
        + raw["Total eve minutes"]
        + raw["Total night minutes"]
        + raw["Total intl minutes"]
    ),
    total_usage_calls=lambda raw: (
        raw["Total day calls"]
        + raw["Total eve calls"]
        + raw["Total night calls"]
        + raw["Total intl calls"]
    ),
)

# Keep only the modelling columns (in this order) and switch them to snake_case.
df_bigml = df_bigml[
    [
        "International plan",
        "Voice mail plan",
        "Customer service calls",
        "total_usage_minutes",
        "total_usage_calls",
        "Churn",
    ]
].rename(
    columns={
        "International plan": "international_plan",
        "Voice mail plan": "voice_mail_plan",
        "Customer service calls": "customer_service_calls",
        "Churn": "churn",
    }
)


In [32]:
# ================================
# Feature Selection (IBM)
# ================================

# Select the IBM modelling columns, rename them to snake_case and coerce
# total_charges to numeric in one chained expression.
#
# Building the frame through rename/assign (each returns a new object) avoids
# assigning a column onto a frame that was produced by slicing, which is the
# pattern that triggers pandas' SettingWithCopyWarning.
df_ibm = (
    df_ibm[["tenure", "MonthlyCharges", "TotalCharges", "Churn"]]
    .rename(
        columns={
            "MonthlyCharges": "monthly_charges",
            "TotalCharges": "total_charges",
            "Churn": "churn",
        }
    )
    .assign(
        # errors="coerce" converts unparseable entries to NaN; the numeric
        # branch of the preprocessing pipeline imputes them with the median.
        total_charges=lambda frame: pd.to_numeric(
            frame["total_charges"], errors="coerce"
        )
    )
)


In [33]:
# ================================
# Dataset Union
# ================================

# Stack the IBM rows on top of the BigML rows under a fresh 0..n-1 index.
# The two frames share only the "churn" column, so every row is NaN in the
# features that belong to the other dataset.
df_combined = pd.concat([df_ibm, df_bigml], ignore_index=True)


In [34]:
# ================================
# Feature / Target Split
# ================================

# Separate the label vector from the feature matrix.
y = df_combined["churn"]
X = df_combined.drop(columns="churn")


In [35]:
# ================================
# Train/Test Split
# ================================

# Hold out 25% of the rows for testing. Stratifying on y keeps the churn rate
# the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)


In [36]:
# ================================
# Feature Type Detection
# ================================

# Route each column to the categorical or numeric preprocessing branch by dtype.
categorical_features = list(X.select_dtypes(include=["object"]).columns)
numeric_features = list(X.select_dtypes(include=["int64", "float64"]).columns)


In [37]:
# ================================
# Preprocessing Pipeline
# ================================

# Numeric branch: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply each branch to its own column list.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)


In [38]:
# ================================
# Evaluation Function (CHURN-ONLY)
# ================================

def evaluate_churn(y_true, y_pred):
    """Print the classification report followed by the confusion matrix.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to score against ``y_true``.
    """
    for metric in (classification_report, confusion_matrix):
        print(metric(y_true, y_pred))


In [39]:
# ================================
# Baseline Model (Logistic Regression)
# ================================

# Baseline: preprocessing -> SMOTE -> class-weighted logistic regression.
baseline_model = ImbPipeline(
    [
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=42)),
        (
            "classifier",
            LogisticRegression(class_weight="balanced", max_iter=1000),
        ),
    ]
)
baseline_model.fit(X_train, y_train)

# Score the fitted baseline on the held-out split.
y_pred_baseline = baseline_model.predict(X_test)
evaluate_churn(y_true=y_test, y_pred=y_pred_baseline)


              precision    recall  f1-score   support

           0       0.89      0.71      0.79      1437
           1       0.46      0.74      0.57       491

    accuracy                           0.72      1928
   macro avg       0.68      0.72      0.68      1928
weighted avg       0.78      0.72      0.73      1928

[[1017  420]
 [ 128  363]]




In [40]:
# ================================
# Gradient Boosting Model
# ================================

# Same preprocessing and SMOTE steps as the baseline, with a gradient boosting
# classifier (default hyper-parameters, fixed seed) as the final estimator.
gb_model = ImbPipeline(
    [
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("classifier", GradientBoostingClassifier(random_state=42)),
    ]
)
gb_model.fit(X_train, y_train)

# Score the fitted model on the held-out split.
y_pred_gb = gb_model.predict(X_test)
evaluate_churn(y_true=y_test, y_pred=y_pred_gb)




              precision    recall  f1-score   support

           0       0.88      0.76      0.81      1437
           1       0.49      0.70      0.58       491

    accuracy                           0.74      1928
   macro avg       0.69      0.73      0.70      1928
weighted avg       0.78      0.74      0.75      1928

[[1088  349]
 [ 149  342]]


In [41]:
# ================================
# Threshold Tuning (CHURN PRIORITY)
# ================================

def threshold_evaluation(model, X_test, y_test, thresholds):
    """Print a classification report for each candidate decision threshold.

    Args:
        model: Fitted estimator exposing ``predict_proba``; column 1 of its
            output is used as the positive-class score.
        X_test: Feature rows to score.
        y_test: True labels for ``X_test``.
        thresholds: Iterable of cut-offs; a row is labelled 1 when its score
            is greater than or equal to the cut-off.
    """
    # Score once; only the cut-off changes between iterations.
    positive_scores = model.predict_proba(X_test)[:, 1]
    for cutoff in thresholds:
        labels = (positive_scores >= cutoff).astype(int)
        print(f"\nThreshold: {cutoff}")
        print(classification_report(y_test, labels))


In [42]:
# ================================
# Threshold Sweep
# ================================

# Candidate probability cut-offs, from the default 0.5 downwards.
thresholds = [0.5, 0.4, 0.3, 0.25, 0.2]
threshold_evaluation(
    model=baseline_model,
    X_test=X_test,
    y_test=y_test,
    thresholds=thresholds,
)



Threshold: 0.5
              precision    recall  f1-score   support

           0       0.89      0.71      0.79      1437
           1       0.46      0.74      0.57       491

    accuracy                           0.72      1928
   macro avg       0.68      0.72      0.68      1928
weighted avg       0.78      0.72      0.73      1928


Threshold: 0.4
              precision    recall  f1-score   support

           0       0.92      0.60      0.72      1437
           1       0.42      0.84      0.56       491

    accuracy                           0.66      1928
   macro avg       0.67      0.72      0.64      1928
weighted avg       0.79      0.66      0.68      1928


Threshold: 0.3
              precision    recall  f1-score   support

           0       0.94      0.46      0.62      1437
           1       0.37      0.92      0.52       491

    accuracy                           0.57      1928
   macro avg       0.65      0.69      0.57      1928
weighted avg       0.79   

In [43]:
# ================================
# FINAL CHURN MODEL (LOCK CANDIDATE)
# ================================

# Positive-class probabilities from the baseline model on the held-out split.
final_probs = baseline_model.predict_proba(X_test)[:, 1]

# Label a row as churn when its probability reaches the 0.3 cut-off.
final_threshold = 0.3
final_preds = (final_probs >= final_threshold).astype(int)

evaluate_churn(y_true=y_test, y_pred=final_preds)


              precision    recall  f1-score   support

           0       0.94      0.46      0.62      1437
           1       0.37      0.92      0.52       491

    accuracy                           0.57      1928
   macro avg       0.65      0.69      0.57      1928
weighted avg       0.79      0.57      0.59      1928

[[657 780]
 [ 41 450]]


In [44]:
# ================================
# Improved Churn Model (Recall Floor + Precision Gain)
# ================================

# Gradient boosting without SMOTE: class imbalance is handled afterwards by
# choosing the decision threshold instead of resampling the training data.
# (precision_recall_curve is imported in the notebook's import cell.)
gb_tuned = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", GradientBoostingClassifier(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=3,
            random_state=42
        ))
    ]
)

gb_tuned.fit(X_train, y_train)

# Churn-class probabilities on the held-out split
y_probs = gb_tuned.predict_proba(X_test)[:, 1]

# NOTE(review): the threshold is tuned on the same test split that is then used
# to report performance, so the metrics printed below are optimistic. Consider
# selecting it on a validation split carved out of X_train.

# Bound to `pr_thresholds` rather than `thresholds` so the threshold-sweep list
# defined in an earlier cell is not overwritten by this array.
precisions, recalls, pr_thresholds = precision_recall_curve(y_test, y_probs)

# Enforce the recall floor. `precisions` and `recalls` carry one more entry than
# `pr_thresholds` (a final point with no associated threshold); dropping it
# guarantees every candidate index maps onto a real threshold.
recall_floor = 0.85
valid_idxs = np.flatnonzero(recalls[:-1] >= recall_floor)
if valid_idxs.size == 0:
    raise ValueError(f"No threshold reaches a recall of at least {recall_floor}")

# Among the thresholds that satisfy the floor, keep the most precise one.
best_idx = valid_idxs[np.argmax(precisions[valid_idxs])]
best_threshold = pr_thresholds[best_idx]

print("Selected Threshold:", best_threshold)

# Final predictions using the recall-constrained threshold
y_final = (y_probs >= best_threshold).astype(int)

evaluate_churn(y_test, y_final)


Selected Threshold: 0.17081091206089097
              precision    recall  f1-score   support

           0       0.92      0.62      0.74      1437
           1       0.43      0.85      0.58       491

    accuracy                           0.68      1928
   macro avg       0.68      0.74      0.66      1928
weighted avg       0.80      0.68      0.70      1928

[[894 543]
 [ 73 418]]


In [45]:
# ================================
# Lock Final Churn Model (Balanced, Recall-Constrained)
# ================================

# Freeze the tuned gradient boosting pipeline and its recall-constrained
# threshold as the final model.
FINAL_THRESHOLD = best_threshold
FINAL_MODEL = gb_tuned

# Re-score the held-out split with the frozen pair.
final_probs = FINAL_MODEL.predict_proba(X_test)[:, 1]
final_preds = (final_probs >= FINAL_THRESHOLD).astype(int)

print("FINAL MODEL PERFORMANCE")
print("Threshold:", FINAL_THRESHOLD)
# evaluate_churn prints the classification report, then the confusion matrix.
evaluate_churn(y_test, final_preds)


FINAL MODEL PERFORMANCE
Threshold: 0.17081091206089097
              precision    recall  f1-score   support

           0       0.92      0.62      0.74      1437
           1       0.43      0.85      0.58       491

    accuracy                           0.68      1928
   macro avg       0.68      0.74      0.66      1928
weighted avg       0.80      0.68      0.70      1928

[[894 543]
 [ 73 418]]


In [46]:
# ================================
# Store Final Metrics for Later Reporting
# ================================

# Keep the final metrics as objects (dict + array) for later reporting.
final_confusion = confusion_matrix(y_test, final_preds)
final_report = classification_report(y_test, final_preds, output_dict=True)

# Show both as the cell output.
(final_report, final_confusion)


({'0': {'precision': 0.9245087900723888,
   'recall': 0.6221294363256785,
   'f1-score': 0.7437603993344426,
   'support': 1437.0},
  '1': {'precision': 0.43496357960457854,
   'recall': 0.8513238289205702,
   'f1-score': 0.5757575757575758,
   'support': 491.0},
  'accuracy': 0.6804979253112033,
  'macro avg': {'precision': 0.6797361848384837,
   'recall': 0.7367266326231243,
   'f1-score': 0.6597589875460093,
   'support': 1928.0},
  'weighted avg': {'precision': 0.7998372660372775,
   'recall': 0.6804979253112033,
   'f1-score': 0.7009754478944832,
   'support': 1928.0}},
 array([[894, 543],
        [ 73, 418]]))

In [47]:
# ================================
# Sanity Check: Churn Capture Rate
# ================================

# Row 1 of the confusion matrix covers the actual churners:
# column 0 = predicted non-churn (missed), column 1 = predicted churn (captured).
churn_missed, churn_captured = final_confusion[1]
total_churn = churn_captured + churn_missed

print(f"Churners captured: {churn_captured}")
print(f"Churners missed: {churn_missed}")
print(f"Churn capture rate: {churn_captured / total_churn:.3f}")


Churners captured: 418
Churners missed: 73
Churn capture rate: 0.851


In [48]:
# ================================
# Create Churn Action Table
# ================================

# Attach the final probabilities and labels to a copy of the test features,
# then keep only the rows flagged as churn.
churn_action_table = X_test.assign(
    churn_probability=final_probs,
    predicted_churn=final_preds,
).loc[lambda scored: scored["predicted_churn"] == 1]

churn_action_table.head()


Unnamed: 0,tenure,monthly_charges,total_charges,international_plan,voice_mail_plan,customer_service_calls,total_usage_minutes,total_usage_calls,churn_probability,predicted_churn
6844,29.0,89.65,2623.65,,,,,,0.326971,1
3398,61.0,100.7,6018.65,,,,,,0.209655,1
419,1.0,75.3,75.3,,,,,,0.821473,1
5048,54.0,99.1,5437.1,,,,,,0.237096,1
1103,54.0,105.2,5637.85,,,,,,0.271958,1


In [49]:
# ================================
# Save Predicted Churners to SQLite Database
# ================================

# Persist the predicted churners to a local SQLite database.
# (sqlite3 and contextlib.closing are imported in the notebook's import cell.)
#
# closing() guarantees the connection is released even if to_sql raises; a bare
# connect()/close() pair would leak the handle on error.
with closing(sqlite3.connect("../reports/churn_predictions.db")) as conn:
    # Replace any table left by an earlier run; the DataFrame index is not stored.
    churn_action_table.to_sql(
        "predicted_churners",
        conn,
        if_exists="replace",
        index=False
    )


In [50]:
# ================================
# Sanity Check: Read From Database
# ================================

# Read a few rows back to confirm the table was written.
#
# The query result is bound to a name and placed on the cell's last line so the
# notebook actually displays it; a bare pd.read_sql(...) followed by another
# statement is evaluated and then discarded. closing() releases the connection
# even if the query fails.
with closing(sqlite3.connect("../reports/churn_predictions.db")) as conn:
    churners_preview = pd.read_sql(
        "SELECT * FROM predicted_churners LIMIT 5;",
        conn
    )

churners_preview
