## Telco Customer Churn - Part 2: Advanced Model Pipeline & Ensemble Methods

In [1]:
!pip install -q -r requirements.txt

In [108]:
import imblearn
import sklearn

print(f"imblearn version: {imblearn.__version__}")
print(f"scikit-learn version: {sklearn.__version__}")

imblearn version: 0.14.0
scikit-learn version: 1.7.0


In [9]:
pip install --upgrade imbalanced-learn==0.14.0 scikit-learn==1.7.0

Collecting scikit-learn==1.7.0
  Using cached scikit_learn-1.7.0-cp311-cp311-win_amd64.whl.metadata (14 kB)
Using cached scikit_learn-1.7.0-cp311-cp311-win_amd64.whl (10.7 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.5.2
    Uninstalling scikit-learn-1.5.2:
      Successfully uninstalled scikit-learn-1.5.2
Successfully installed scikit-learn-1.7.0
Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
category-encoders 2.7.0 requires scikit-learn<1.6.0,>=1.0.0, but you have scikit-learn 1.7.0 which is incompatible.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-learn 1.7.0 which is incompatible.


### 1. Imports and display settings

In [109]:
import warnings

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import (roc_auc_score, average_precision_score, precision_score,
recall_score, f1_score, confusion_matrix, classification_report)

warnings.filterwarnings('ignore')

Try imports for optional libraries

In [110]:
try:
    from xgboost import XGBClassifier
except Exception:
    XGBClassifier = None

try:
    from catboost import CatBoostClassifier, Pool
except Exception:
    CatBoostClassifier = None
    
try:
    import category_encoders as ce
except Exception:
    ce = None
    
pd.set_option('display.max_columns', 200)
plt.rcParams['figure.figsize'] = (10,6)

### 2. Load cleaned dataset

In [111]:
DATA_PATH = 'src/data/processed/telco-customer-churn_cleaned.csv'
df = pd.read_csv(DATA_PATH)

print('Loaded shape:', df.shape)
df.head()

Loaded shape: (7043, 24)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group,services_count,avg_charge_per_month
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0 months,1.0,29.85
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No,25-48,3.0,55.573529
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1-12,3.0,54.075
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,25-48,3.0,40.905556
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1-12,1.0,75.825


### 3. Quick checks & final cleaning

- Ensure datatypes are correct
- Fix TotalCharges NaNs if still present (common case: tenure == 0)

Convert TotalCharges to numeric if needed

In [112]:
if df['TotalCharges'].dtype == 'object':
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'].str.strip().replace('', np.nan), errors='coerce')

Fill TotalCharges NaN with MonthlyCharges*tenure (reasonable proxy) or 0 when tenure==0

In [113]:
mask = df['TotalCharges'].isnull()

if mask.sum() > 0:
    df.loc[mask, 'TotalCharges'] = df.loc[mask, 'MonthlyCharges'] * df.loc[mask, 'tenure']
    print('Filled', mask.sum(), 'TotalCharges NaNs using MonthlyCharges * tenure proxy')

Filled 11 TotalCharges NaNs using MonthlyCharges * tenure proxy


it filled 11 NaN with MonthlyCharges * tenure proxy (ran 2 times so the output was merged, thats why isnt showing anything here)

Ensure target is binary 0/1

In [114]:
df['Churn_flag'] = df['Churn'].map({'No':0, 'Yes':1}).astype(int)

Drop customerID if exists

In [115]:
if 'customerID' in df.columns:
    df.drop(columns=['customerID'], inplace=True)

In [116]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group,services_count,avg_charge_per_month,Churn_flag
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0 months,1.0,29.850000,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No,25-48,3.0,55.573529,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1-12,3.0,54.075000,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No,25-48,3.0,40.905556,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes,1-12,1.0,75.825000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No,13-24,7.0,82.937500,0
7039,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No,48+,6.0,102.262500,0
7040,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No,1-12,1.0,31.495455,0
7041,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes,1-12,2.0,76.650000,1


Show final dtypes

In [117]:
print(df.dtypes)

gender                   object
SeniorCitizen             int64
Partner                  object
Dependents               object
tenure                    int64
PhoneService             object
MultipleLines            object
InternetService          object
OnlineSecurity           object
OnlineBackup             object
DeviceProtection         object
TechSupport              object
StreamingTV              object
StreamingMovies          object
Contract                 object
PaperlessBilling         object
PaymentMethod            object
MonthlyCharges          float64
TotalCharges            float64
Churn                    object
tenure_group             object
services_count          float64
avg_charge_per_month    float64
Churn_flag                int32
dtype: object


### 4. Feature Engineering

- Tenure categories: New (0-12), Established (13-48), Loyal (49+)
- Service adoption score (count of services subscribed / total service features)
- Average monthly charges per active service
- Payment reliability indicators (proxy): electronic_check flag & paperless billing flag

In [118]:
SERVICES = ['PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup',
'DeviceProtection','TechSupport','StreamingTV','StreamingMovies']

Tenure categories

In [119]:
def tenure_category(t):
    if t <= 12:
        return 'New'
    elif t <= 48:
        return 'Established'
    else:
        return 'Loyal'


df['tenure_category'] = df['tenure'].apply(tenure_category) 

Service adoption score (binary map)

In [120]:
service_map = {'Yes':1, 'No':0, 'No phone service':0, 'No internet service':0}
service_flags = df[SERVICES].applymap(lambda x: service_map.get(x, 0))
df['services_count'] = service_flags.sum(axis=1)
df['service_adoption_score'] = df['services_count'] / len(SERVICES)

Average monthly charge per active service (avoid div by 0)

In [121]:
df['avg_charge_per_service'] = df['MonthlyCharges'] / (df['services_count'].replace(0, np.nan))

fill inf/NaN (no services) with MonthlyCharges (or 0) — chosen approach: MonthlyCharges

In [122]:
df['avg_charge_per_service'].fillna(df['MonthlyCharges'], inplace=True)

#### Payment reliability proxies
- 'Electronic check' historically correlates with higher churn in this dataset

In [123]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group,services_count,avg_charge_per_month,Churn_flag,tenure_category,service_adoption_score,avg_charge_per_service
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0 months,1,29.850000,0,New,0.111111,29.850000
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No,25-48,3,55.573529,0,Established,0.333333,18.983333
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1-12,3,54.075000,1,New,0.333333,17.950000
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No,25-48,3,40.905556,0,Established,0.333333,14.100000
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes,1-12,1,75.825000,1,New,0.111111,70.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No,13-24,7,82.937500,0,Established,0.777778,12.114286
7039,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No,48+,6,102.262500,0,Loyal,0.666667,17.200000
7040,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No,1-12,1,31.495455,0,New,0.111111,29.600000
7041,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes,1-12,2,76.650000,1,New,0.222222,37.200000


In [124]:
if 'PaymentMethod' in df.columns:
    df['is_electronic_check'] = df['PaymentMethod'].str.contains('Electronic check', na=False).astype(int)
else:
    df['is_electronic_check'] = 0

- Paperless billing often related to easier cancellation

In [125]:
if 'PaperlessBilling' in df.columns:
    df['paperless_flag'] = df['PaperlessBilling'].map({'Yes':1,'No':0}).fillna(0).astype(int)
else:
    df['paperless_flag'] = 0

- Interaction feature example: Contract x tenure_category

In [126]:
if 'Contract' in df.columns:
    df['contract_tenure_interaction'] = df['Contract'].astype(str) + '_' + df['tenure_category'].astype(str)

print('Feature engineering complete — new features:', 
      ['tenure_category',
       'services_count',
       'service_adoption_score',
       'avg_charge_per_service',
       'is_electronic_check',
       'paperless_flag',
       'contract_tenure_interaction'])

Feature engineering complete — new features: ['tenure_category', 'services_count', 'service_adoption_score', 'avg_charge_per_service', 'is_electronic_check', 'paperless_flag', 'contract_tenure_interaction']


### 5. Train-test split (stratified)

- Keep an untouched test set for final evaluation

In [127]:
TARGET = 'Churn_flag'
X = df.drop(columns=[TARGET, 'Churn'])
y = df[TARGET]

In [128]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

Train shape: (5634, 28) Test shape: (1409, 28)


In [129]:
np.savez('artifacts/X_train.npz', X_train)
np.savez('artifacts/Y_train.npz', y_train)
np.savez('artifacts/X_test.npz', X_test)
np.savez('artifacts/Y_test.npz', y_test)

### 6. Encoding strategies - define three preprocessing strategies:

1) One-Hot encoding for categorical
2) Ordinal encoding (simple) for high-cardinality
3) Target encoding (with smoothing) — implemented with category_encoders if available

We build modular ColumnTransformer that we can swap in pipelines.

Identify columns

In [130]:
numeric_features = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_features = X.select_dtypes(include=['object','category']).columns.tolist()

# Remove engineered numeric-like categories if present
for c in ['tenure_category','contract_tenure_interaction']:
    if c in cat_features:
        pass

In [131]:
print('Numeric features:\n', numeric_features)
print('Categorical features:\n', cat_features)

Numeric features:
 ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'services_count', 'avg_charge_per_month', 'service_adoption_score', 'avg_charge_per_service']
Categorical features:
 ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'tenure_group', 'tenure_category', 'contract_tenure_interaction']


#### 6.1 One-Hot encoding pipeline (with imputation and scaling)

In [132]:
numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
])


onehot_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


preprocessor_onehot = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', onehot_transformer, cat_features)
], remainder='drop')

#### 6.2 Ordinal encoding pipeline (useful for tree-based models or low-cardinality categories)

In [133]:
ordinal_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])


preprocessor_ordinal = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', ordinal_transformer, cat_features)
], remainder='drop')

#### 6.3 Target encoding pipeline (requires category_encoders library)

In [134]:
if ce is not None:
    target_enc = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('target_encoder', ce.TargetEncoder())
])
    preprocessor_target = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', target_enc, cat_features)
    ], remainder='drop')
else:
    preprocessor_target = preprocessor_ordinal
    print('category_encoders not available — using ordinal fallback for "target" pipeline')

### 7. Helper: model evaluation function

In [135]:
def evaluate_model(model, X_val, y_val, threshold=0.5, show_report=True, model_name=None):
    y_proba = model.predict_proba(X_val)[:,1]
    y_pred = (y_proba >= threshold).astype(int)
    roc = roc_auc_score(y_val, y_proba)
    ap = average_precision_score(y_val, y_proba)
    prec = precision_score(y_val, y_pred)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    cm = confusion_matrix(y_val, y_pred)
    
    if show_report:
        model_info = f"{model_name} - " if model_name else ""
        print(f'{model_info}ROC AUC: {roc:.4f}, PR AUC: {ap:.4f}')
        print(f'{model_info}Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}')
        print(f'{model_info}Confusion matrix:\n', cm)
    return {'roc_auc':roc, 'pr_auc':ap, 'precision':prec, 'recall':rec, 'f1':f1, 'confusion_matrix':cm, 'y_proba':y_proba}

### 8. Baseline models (Logistic Regression & Decision Tree)

- We'll build pipelines for baselines using the one-hot preprocessor.

In [136]:
baseline_lr = Pipeline(steps=[
    ('pre', preprocessor_onehot),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
])


baseline_dt = Pipeline(steps=[
    ('pre', preprocessor_onehot),
    ('clf', DecisionTreeClassifier(class_weight='balanced', random_state=42))
])

Fit baseline models quickly (on a subset if slow)

In [137]:
print('Training baseline Logistic Regression...')
baseline_lr.fit(X_train, y_train)
print('Evaluating on test set:')
evaluate_model(baseline_lr, X_test, y_test)

print('\nTraining baseline Decision Tree...')
baseline_dt.fit(X_train, y_train)
print('Evaluating on test set:')
evaluate_model(baseline_dt, X_test, y_test)

Training baseline Logistic Regression...
Evaluating on test set:
ROC AUC: 0.8471, PR AUC: 0.6661
Precision: 0.5008, Recall: 0.7968, F1: 0.6151
Confusion matrix:
 [[738 297]
 [ 76 298]]

Training baseline Decision Tree...
Evaluating on test set:
ROC AUC: 0.6272, PR AUC: 0.3522
Precision: 0.4620, Recall: 0.4385, F1: 0.4499
Confusion matrix:
 [[844 191]
 [210 164]]


{'roc_auc': 0.6271926425379111,
 'pr_auc': 0.35219000503275427,
 'precision': 0.4619718309859155,
 'recall': 0.4385026737967914,
 'f1': 0.4499314128943759,
 'confusion_matrix': array([[844, 191],
        [210, 164]], dtype=int64),
 'y_proba': array([0., 0., 0., ..., 0., 0., 0.])}

### 9. Ensemble Models: Random Forest (Bagging), XGBoost (Boosting), CatBoost (Advanced Boosting)

We'll create pipelines for each model type and run a randomized search for hyperparameters.

#### 9.1 Random Forest pipeline & hyperparameter tuning

In [138]:
rf_pipeline = Pipeline(steps=[
    ('pre', preprocessor_ordinal), # trees don't need one-hot
    ('clf', RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1))
])


rf_param_dist = {
    'clf__n_estimators': [100, 300, 600],
    'clf__max_depth': [None, 6, 12, 20],
    'clf__min_samples_split': [2, 5, 10],
    'clf__max_features': ['sqrt', 'log2', 0.3, 0.6]
}


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf_search = RandomizedSearchCV(rf_pipeline, rf_param_dist, n_iter=20, scoring='roc_auc', cv=cv, verbose=1, n_jobs=-1, random_state=42)
rf_search.fit(X_train, y_train)

print('Best RF params:', rf_search.best_params_)
rf_best = rf_search.best_estimator_

print('RF test evaluation:')
evaluate_model(rf_best, X_test, y_test)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best RF params: {'clf__n_estimators': 100, 'clf__min_samples_split': 10, 'clf__max_features': 'log2', 'clf__max_depth': 6}
RF test evaluation:
ROC AUC: 0.8415, PR AUC: 0.6506
Precision: 0.5184, Recall: 0.7914, F1: 0.6265
Confusion matrix:
 [[760 275]
 [ 78 296]]


{'roc_auc': 0.8415497687876204,
 'pr_auc': 0.6506113157547074,
 'precision': 0.5183887915936952,
 'recall': 0.7914438502673797,
 'f1': 0.6264550264550265,
 'confusion_matrix': array([[760, 275],
        [ 78, 296]], dtype=int64),
 'y_proba': array([0.07448482, 0.85838473, 0.20397021, ..., 0.25397529, 0.04124232,
        0.05679325])}

- Feature importance (from preprocessor -> feature names)
- helper to extract feature names after ColumnTransformer

In [139]:
def get_feature_names_from_column_transformer(column_transformer):
    # adapted helper for sklearn ColumnTransformer
    feature_names = []
    for name, transformer, cols in column_transformer.transformers_:
        if name == 'remainder' and transformer == 'drop':
            continue
        if hasattr(transformer, 'named_steps') and 'onehot' in transformer.named_steps:
            ohe = transformer.named_steps['onehot']
            in_cols = cols
            names = ohe.get_feature_names_out(in_cols)
            feature_names.extend(names.tolist())
        elif hasattr(transformer, 'named_steps') and 'ordinal' in transformer.named_steps:
            # ordinal encoder keeps same columns
            feature_names.extend(cols)
        elif transformer == 'drop':
            continue
        else:
            # numeric or pass-through
            feature_names.extend(cols)
    return feature_names

try:
    rf_feature_names = get_feature_names_from_column_transformer(rf_best.named_steps['pre'])
    importances = rf_best.named_steps['clf'].feature_importances_
    imp_df = pd.DataFrame({'feature': rf_feature_names, 'importance': importances}).sort_values('importance', ascending=False).head(30)
    display(imp_df)
except Exception as e:
    print('Could not extract RF feature importances cleanly:', e)

Unnamed: 0,feature,importance
20,Contract,0.16479
25,contract_tenure_interaction,0.141801
1,tenure,0.089449
14,OnlineSecurity,0.0886
7,avg_charge_per_service,0.076255
2,MonthlyCharges,0.061467
17,TechSupport,0.059219
5,avg_charge_per_month,0.053574
3,TotalCharges,0.048368
23,tenure_group,0.043172


#### 9.2 XGBoost pipeline & tuning (if available)

In [140]:
import xgboost 

if XGBClassifier is not None:
    xgb_pipeline = Pipeline(steps=[
        ('pre', preprocessor_ordinal),
        ('clf', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1, random_state=42))
])


    xgb_param_dist = {
        'clf__n_estimators': [100, 300, 600],
        'clf__learning_rate': [0.01, 0.05, 0.1],
        'clf__max_depth': [3,6,9],
        'clf__subsample': [0.6, 0.8, 1.0]
    }
    xgb_search = RandomizedSearchCV(xgb_pipeline, xgb_param_dist, n_iter=20, scoring='roc_auc', cv=cv, verbose=1, n_jobs=-1, random_state=42)
    xgb_search.fit(X_train, y_train)
    print('Best XGB params:', xgb_search.best_params_)
    xgb_best = xgb_search.best_estimator_
    print('XGB test evaluation:')
    evaluate_model(xgb_best, X_test, y_test)


    # feature importance
    try:
        xgb_feats = get_feature_names_from_column_transformer(xgb_best.named_steps['pre'])
        xgb_importances = xgb_best.named_steps['clf'].feature_importances_
        xgb_imp_df = pd.DataFrame({'feature': xgb_feats, 'importance': xgb_importances}).sort_values('importance', ascending=False).head(30)
        display(xgb_imp_df)
    except Exception as e:
        print('Could not extract XGB feature importances cleanly:', e)
    else:
        print('XGBoost not installed; skip XGB section')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best XGB params: {'clf__subsample': 0.8, 'clf__n_estimators': 100, 'clf__max_depth': 3, 'clf__learning_rate': 0.05}
XGB test evaluation:
ROC AUC: 0.8470, PR AUC: 0.6606
Precision: 0.6857, Recall: 0.5134, F1: 0.5872
Confusion matrix:
 [[947  88]
 [182 192]]


Unnamed: 0,feature,importance
20,Contract,0.350699
14,OnlineSecurity,0.105737
17,TechSupport,0.087818
13,InternetService,0.065517
1,tenure,0.0435
2,MonthlyCharges,0.038657
21,PaperlessBilling,0.035336
19,StreamingMovies,0.032441
25,contract_tenure_interaction,0.031629
5,avg_charge_per_month,0.023645


XGBoost not installed; skip XGB section


#### 9.3 CatBoost pipeline & tuning (if available)

- CatBoost can take categorical features natively, so we pass cat feature names.

In [141]:
if CatBoostClassifier is not None:
    # Identify categorical column indices for CatBoost (based on X_train)
    cat_cols_in_train = [c for c in cat_features if c in X_train.columns]
    
    # We will use a minimal preprocessing: impute numeric, leave cats as-is
    cat_num_transformer = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', 'passthrough', cat_cols_in_train)
    ], remainder='drop')
    
    # After transformation, categorical features will be at the end
    # Calculate the indices after transformation
    num_cols_count = len(numeric_features)
    cat_idx = list(range(num_cols_count, num_cols_count + len(cat_cols_in_train)))
    
    cat_pipeline = Pipeline(steps=[
        ('pre', cat_num_transformer),
        ('clf', CatBoostClassifier(verbose=0, random_state=42, auto_class_weights='Balanced', cat_features=cat_idx))
    ])


    cat_param_dist = {
        'clf__iterations': [100, 300],
        'clf__learning_rate': [0.01, 0.05, 0.1],
        'clf__depth': [4, 6, 8]
    }


    cat_search = RandomizedSearchCV(cat_pipeline, cat_param_dist, n_iter=12, scoring='roc_auc', cv=cv, verbose=1, n_jobs=1, random_state=42)
    cat_search.fit(X_train, y_train)
    print('Best CatBoost params:', cat_search.best_params_)
    cat_best = cat_search.best_estimator_
    print('CatBoost test evaluation:')
    evaluate_model(cat_best, X_test, y_test)


    # feature importance using CatBoost's internal method
    try:
        # need to transform data for catboost Pool
        X_test_trans = cat_best.named_steps['pre'].transform(X_test)
        # cat_best.named_steps['clf'] is CatBoostClassifier
        fi = cat_best.named_steps['clf'].get_feature_importance()
        # feature names: numeric_features + cat_cols_in_train
        feat_names = numeric_features + cat_cols_in_train
        fi_df = pd.DataFrame({'feature': feat_names, 'importance': fi}).sort_values('importance', ascending=False).head(30)
        display(fi_df)
    except Exception as e:
        print('Could not extract CatBoost feature importances cleanly:', e)
else:
    print('CatBoost not installed; skip CatBoost section')

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best CatBoost params: {'clf__learning_rate': 0.1, 'clf__iterations': 100, 'clf__depth': 4}
CatBoost test evaluation:
ROC AUC: 0.8457, PR AUC: 0.6660
Precision: 0.5136, Recall: 0.8075, F1: 0.6279
Confusion matrix:
 [[749 286]
 [ 72 302]]


Unnamed: 0,feature,importance
20,Contract,29.917168
1,tenure,11.156907
2,MonthlyCharges,9.675875
13,InternetService,7.091059
14,OnlineSecurity,5.139345
5,avg_charge_per_month,4.481611
3,TotalCharges,4.252478
22,PaymentMethod,4.207263
17,TechSupport,3.527987
23,tenure_group,2.863321


### Section 10: Model Stacking Ensemble

In [142]:
from sklearn.ensemble import StackingClassifier


print("\n--- Stacking Ensemble ---")

# First preprocess X_train and X_test using the appropriate preprocessor
# We'll use ordinal encoding which works well with most models
X_train_processed = preprocessor_ordinal.fit_transform(X_train)
X_test_processed = preprocessor_ordinal.transform(X_test)

print(f"Processed data shapes - X_train: {X_train_processed.shape}, X_test: {X_test_processed.shape}")


--- Stacking Ensemble ---
Processed data shapes - X_train: (5634, 26), X_test: (1409, 26)


In [143]:
# Prepare estimators list - only add models that are available
estimators = []

# Always include LogisticRegression and RandomForest
estimators.append(('lr', LogisticRegression(max_iter=1000, random_state=42)))
estimators.append(('rf', RandomForestClassifier(n_estimators=200, random_state=42)))

# Add XGBoost if available
if XGBClassifier is not None:
    estimators.append(('xgb', XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)))

# Add CatBoost if available
if CatBoostClassifier is not None:
    estimators.append(('cat', CatBoostClassifier(verbose=0, random_state=42)))

print(f"Creating stacking ensemble with {len(estimators)} base models:", [name for name, _ in estimators])

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    cv=5,
    n_jobs=-1
)

# Fit using the preprocessed data
stacking_clf.fit(X_train_processed, y_train)
print("Stacking Ensemble evaluation:")
y_proba = stacking_clf.predict_proba(X_test_processed)[:,1]
y_pred = (y_proba >= 0.5).astype(int)
print("Stacking Ensemble - Evaluation Results:")
evaluate_model(stacking_clf, X_test_processed, y_test)

# Save the stacking model if successful
try:
    # Save both the preprocessor and the model in a dict
    stacking_model = {
        'preprocessor': preprocessor_ordinal,
        'model': stacking_clf
    }
    joblib.dump(stacking_model, 'models/trained_models/stacking_ensemble_pipeline.joblib')
    print('Saved Stacking Ensemble pipeline to models/trained_models/stacking_ensemble_pipeline.joblib')
except Exception as e:
    print(f"Error saving stacking model: {e}")

# Add to results dictionary
if 'results' in locals():
    try:
        results['StackingEnsemble'] = {
            'roc_auc': roc_auc_score(y_test, y_proba),
            'pr_auc': average_precision_score(y_test, y_proba),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'confusion_matrix': confusion_matrix(y_test, y_pred),
            'y_proba': y_proba
        }
        print("Added stacking ensemble to results for comparison")
    except Exception as e:
        print(f"Error adding stacking ensemble to results: {e}")

Creating stacking ensemble with 4 base models: ['lr', 'rf', 'xgb', 'cat']
Stacking Ensemble evaluation:
Stacking Ensemble - Evaluation Results:
ROC AUC: 0.8426, PR AUC: 0.6442
Precision: 0.6678, Recall: 0.5107, F1: 0.5788
Confusion matrix:
 [[940  95]
 [183 191]]
Saved Stacking Ensemble pipeline to models/trained_models/stacking_ensemble_pipeline.joblib
Added stacking ensemble to results for comparison


### 11. Model comparison summary
- Collect metrics from all trained models and compare (ROC AUC, PR AUC, F1)

In [None]:
results = {}
for name, model in [('LogisticRegression', baseline_lr), ('DecisionTree', baseline_dt), ('RandomForest', rf_best)]:
    try:
        res = evaluate_model(model, X_test, y_test, show_report=False)
        results[name] = res
    except Exception:
        pass

if XGBClassifier is not None:
    try:
        results['XGBoost'] = evaluate_model(xgb_best, X_test, y_test, show_report=False)
    except Exception:
        pass
if CatBoostClassifier is not None:
    try:
        results['CatBoost'] = evaluate_model(cat_best, X_test, y_test, show_report=False)
    except Exception:
        pass

# Add Stacking Ensemble if available
if 'stacking_clf' in locals():
    try:
        results['StackingEnsemble'] = evaluate_model(stacking_clf, X_test, y_test, show_report=False)
    except Exception as e:
        print(f"Error evaluating stacking ensemble: {e}")

summary_df = pd.DataFrame({k: {m: v for m, v in res.items() if m in ['roc_auc','pr_auc','precision','recall','f1']} for k,res in results.items()}).T
summary_df

Unnamed: 0,roc_auc,pr_auc,precision,recall,f1
LogisticRegression,0.847129,0.666142,0.50084,0.796791,0.615067
DecisionTree,0.627193,0.35219,0.461972,0.438503,0.449931
RandomForest,0.84155,0.650611,0.518389,0.791444,0.626455
XGBoost,0.846979,0.660561,0.685714,0.513369,0.587156
CatBoost,0.845748,0.666034,0.513605,0.807487,0.627859


### 12. Save models & results

Random Forest

In [44]:
joblib.dump(rf_best, 'models/trained_models/rf_best_pipeline.joblib')
print('Saved RF pipeline to models/trained_models/rf_best_pipeline.joblib')

Saved RF pipeline to models/trained_models/rf_best_pipeline.joblib


Logistic Regression

In [46]:
joblib.dump(baseline_lr, 'models/trained_models/baseline_lr_pipeline.joblib')
print('Saved Baseline LR pipeline to models/trained_models/baseline_lr_pipeline.joblib') 

Saved Baseline LR pipeline to models/trained_models/baseline_lr_pipeline.joblib


Decision Tree

In [47]:
joblib.dump(baseline_dt, 'models/trained_models/baseline_dt_pipeline.joblib')
print('Saved Baseline DT pipeline to models/trained_models/baseline_dt_pipeline.joblib') 

Saved Baseline DT pipeline to models/trained_models/baseline_dt_pipeline.joblib


XGBoost

In [48]:
if XGBClassifier is not None and 'xgb_best' in locals():
    joblib.dump(xgb_best, 'models/trained_models/xgb_best_pipeline.joblib')
    print('Saved XGBoost pipeline to models/trained_models/xgb_best_pipeline.joblib')
else:
    print('XGBoost model not available to save')

Saved XGBoost pipeline to models/trained_models/xgb_best_pipeline.joblib


CatBoost

In [49]:
if CatBoostClassifier is not None and 'cat_best' in locals():
    joblib.dump(cat_best, 'models/trained_models/cat_best_pipeline.joblib')
    print('Saved CatBoost pipeline to models/trained_models/cat_best_pipeline.joblib')
else:
    print('CatBoost model not available to save')

Saved CatBoost pipeline to models/trained_models/cat_best_pipeline.joblib
