# Telco Customer Churn Exploration
Load the cleaned dataset, inspect churn probabilities from the production random forest model, and explore several alternative models.


In [None]:
import numpy as np
import pandas as pd
import joblib

from pathlib import Path

#various models to try
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier



## Load the cleaned dataset
We will work off the feature-engineered dataset that powers the training pipeline.


In [7]:
# Relative location of the cleaned CSV (resolved against the kernel's working directory).
data_path = Path('../data/clean/telco_churn_clean.csv')

# Read the whole file into memory and preview the first five rows.
churn_df = pd.read_csv(filepath_or_buffer=data_path)
churn_df.head(n=5)


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Score customers with the production random forest model
Load the persisted sklearn preprocessor + model artifacts, transform the features, and attach churn probabilities.


In [None]:
# Where the persisted artifacts live (relative to the kernel's working directory).
model_path = Path('../models/random_forest_churn_model.pkl')
preprocessor_path = Path('../models/preprocessor.pkl')

# Deserialize the fitted model and the fitted preprocessor.
rf_model = joblib.load(model_path)
preprocessor = joblib.load(preprocessor_path)

# Everything except the label column is handed to the preprocessor.
feature_cols = [name for name in churn_df.columns if name != 'churn']
X_features = churn_df[feature_cols]
encoded_features = preprocessor.transform(X_features)
if hasattr(encoded_features, 'toarray'):
    # The transform may return an object exposing `.toarray()` (e.g. a SciPy
    # sparse matrix); convert it so downstream code always sees a dense array.
    encoded_features = encoded_features.toarray()

# Column 1 of predict_proba is used as the churn probability.
# NOTE(review): assumes the churn class is the second entry of rf_model.classes_ — confirm.
rf_probabilities = rf_model.predict_proba(encoded_features)[:, 1]

# Binary target: 1 when the lower-cased label is one of 'yes' / 'true' / '1', else 0.
churn_label_lower = churn_df['churn'].str.lower()
churn_flag = churn_label_lower.isin({'yes', 'true', '1'}).astype(int)

# Attach both new columns to a copy so churn_df itself is left untouched.
scored_df = churn_df.assign(
    churn_probability=rf_probabilities,
    churn_flag=churn_flag,
)
scored_df[['customerid', 'churn', 'churn_probability']].head()


Unnamed: 0,customerid,churn,churn_probability
0,7590-VHVEG,No,0.300896
1,5575-GNVDE,No,0.252841
2,3668-QPYBK,Yes,0.281614
3,7795-CFOCW,No,0.237972
4,9237-HQITU,Yes,0.323388


## Customers at the highest, median, and lowest churn risk


In [9]:
# Order customers from the lowest to the highest predicted churn probability.
sorted_scores = scored_df.sort_values('churn_probability')

# Middle row of the sorted frame (upper middle when the row count is even).
median_idx = len(sorted_scores) // 2

lowest_risk = sorted_scores.iloc[:5]
median_risk = sorted_scores.iloc[[median_idx]]
# Last five rows, flipped so the riskiest customer is listed first.
highest_risk = sorted_scores.iloc[-5:].iloc[::-1]

cols_to_show = ['customerid', 'contract', 'tenure', 'monthlycharges', 'churn', 'churn_probability']

# Print each heading followed by its table, highest risk first.
for heading, subset in (
    ('Highest churn probability customers:', highest_risk),
    ('Median churn probability customer:', median_risk),
    ('Lowest churn probability customers:', lowest_risk),
):
    print(heading)
    display(subset[cols_to_show])


Highest churn probability customers:


Unnamed: 0,customerid,contract,tenure,monthlycharges,churn,churn_probability
584,5192-EBGOV,Month-to-month,1,85.7,Yes,0.326292
3073,5277-ZLOOR,Month-to-month,2,85.55,Yes,0.326292
6359,2720-WGKHP,Month-to-month,2,94.0,Yes,0.325907
1971,9497-QCMMS,Month-to-month,1,93.55,Yes,0.325907
4792,9300-AGZNL,Month-to-month,1,94.0,Yes,0.325907


Median churn probability customer:


Unnamed: 0,customerid,contract,tenure,monthlycharges,churn,churn_probability
629,5099-BAILX,Month-to-month,43,110.75,Yes,0.266439


Lowest churn probability customers:


Unnamed: 0,customerid,contract,tenure,monthlycharges,churn,churn_probability
3977,5884-FBCTL,Two year,72,25.1,No,0.207695
981,8165-ZJRNM,Two year,72,23.75,No,0.207695
953,3261-CQXOL,Two year,71,25.45,No,0.207695
1229,1299-AURJA,Two year,70,24.7,No,0.207695
6090,7711-GQBZC,Two year,71,24.7,No,0.207695


## Train multiple models and compare metrics
Use the same encoded feature matrix to train several alternative classifiers (`XGBClassifier`, `LogisticRegression`, `CatBoostClassifier`, `LGBMClassifier`) and benchmark them against the production model on a shared hold-out split.


In [None]:
# Hold out a stratified 20% of the rows so every model is scored on the same customers.
X_train, X_test, y_train, y_test = train_test_split(
    encoded_features,
    churn_flag,
    test_size=0.2,
    random_state=42,
    stratify=churn_flag,
)


def positive_class_predictions(model, features, threshold=0.5):
    """Score `features` with a fitted classifier.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``
    features : feature matrix accepted by ``model.predict_proba``
    threshold : float, default 0.5
        Probabilities at or above this value are labelled 1.

    Returns
    -------
    tuple
        ``(proba, preds)`` where ``proba`` is column 1 of ``predict_proba`` and
        ``preds`` is the 0/1 integer array obtained by thresholding ``proba``.
    """
    proba = model.predict_proba(features)[:, 1]
    return proba, (proba >= threshold).astype(int)


# Production random forest: loaded from disk, not refit here.
# NOTE(review): if this model was trained on rows that now fall in X_test, its
# hold-out metrics are optimistic — confirm how the artifact was trained.
# NOTE(review): in the saved outputs its probabilities never exceed ~0.33, so the
# 0.5 threshold labels every customer as non-churn; consider a tuned threshold.
rf_test_proba, rf_test_preds = positive_class_predictions(rf_model, X_test)

# try xgboost
xgb_model = XGBClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42,
    eval_metric='logloss',
    tree_method='hist',
)
xgb_model.fit(X_train, y_train)
xgb_test_proba, xgb_test_preds = positive_class_predictions(xgb_model, X_test)

# try logistic regression
log_reg = LogisticRegression(
    max_iter=2000,
    C=1.0,
    solver='lbfgs',
)
log_reg.fit(X_train, y_train)
log_test_proba, log_test_preds = positive_class_predictions(log_reg, X_test)

# try catboost
cat_model = CatBoostClassifier(
    depth=6,
    learning_rate=0.05,
    iterations=600,
    loss_function='Logloss',
    random_seed=42,
    verbose=False,
)
cat_model.fit(X_train, y_train)
cat_test_proba, cat_test_preds = positive_class_predictions(cat_model, X_test)

# try lightgbm
# NOTE(review): LightGBM appears to apply `subsample` only when `subsample_freq` > 0;
# with the default it is likely ignored — confirm against the LightGBM docs before tuning.
lgbm_model = LGBMClassifier(
    n_estimators=600,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1,  # silence the [Info] training log that otherwise floods the cell output
)
lgbm_model.fit(X_train, y_train)
lgbm_test_proba, lgbm_test_preds = positive_class_predictions(lgbm_model, X_test)

# Metric helper used for every row of the comparison table
def summarise_metrics(y_true, probas, preds):
    """Compute the three evaluation metrics reported for each model.

    Parameters
    ----------
    y_true : true labels
    probas : predicted scores, passed to ``roc_auc_score`` and ``log_loss``
    preds : predicted hard labels, passed to ``accuracy_score``

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'roc_auc'`` and ``'log_loss'``, in that order.
    """
    return dict(
        accuracy=accuracy_score(y_true, preds),
        roc_auc=roc_auc_score(y_true, probas),
        log_loss=log_loss(y_true, probas),
    )

# One (label, probabilities, hard predictions) triple per model, in table order.
model_outputs = [
    ('RandomForest (pretrained)', rf_test_proba, rf_test_preds),
    ('XGBoost (fresh fit)', xgb_test_proba, xgb_test_preds),
    ('LogisticRegression', log_test_proba, log_test_preds),
    ('CatBoost', cat_test_proba, cat_test_preds),
    ('LightGBM', lgbm_test_proba, lgbm_test_preds),
]

# Build one metrics row per model, then rank the table by accuracy (best first).
comparison_rows = [
    {'model': label, **summarise_metrics(y_test, proba, preds)}
    for label, proba, preds in model_outputs
]
comparison = pd.DataFrame(comparison_rows).sort_values('accuracy', ascending=False)
comparison


[LightGBM] [Info] Number of positive: 1495, number of negative: 4130
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000237 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 668
[LightGBM] [Info] Number of data points in the train set: 5625, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265778 -> initscore=-1.016151
[LightGBM] [Info] Start training from score -1.016151




Unnamed: 0,model,accuracy,roc_auc,log_loss
2,LogisticRegression,0.803838,0.835528,0.42797
3,CatBoost,0.799574,0.838343,0.423485
1,XGBoost (fresh fit),0.777541,0.825909,0.441804
4,LightGBM,0.770434,0.816665,0.495925
0,RandomForest (pretrained),0.734186,0.81676,0.54616


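The models differ in accuracy and ROC AUC on the hold-out split: logistic regression has the highest accuracy (0.804), while CatBoost has the highest ROC AUC (0.838) and the lowest log loss (0.423).
The deployed model is the random forest; its ROC AUC (0.817) is similar to the alternatives, but it has the lowest accuracy (0.734) and the highest log loss (0.546). Its predicted probabilities never exceed about 0.33, so at the 0.5 threshold it labels every customer as non-churn and its accuracy equals the share of non-churners.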