In [1]:
import warnings
warnings.filterwarnings("ignore")

import random
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from sklearn.preprocessing import (OrdinalEncoder, OneHotEncoder, LabelEncoder, 
                                   StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler,
                                   PowerTransformer)
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, RepeatedStratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, precision_recall_curve, confusion_matrix, classification_report, roc_curve, auc
from sklearn.utils import resample

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import StackingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import lightgbm as lgbm
from xgboost import XGBClassifier
# from catboost import CatBoostClassifier

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

%matplotlib inline

# Predicting Churn for IBM Subscription Service

**Goals:**
1. Use subscription service data to determine the key factors driving churn for *Telco*, a telecommunications business.
2. Train a machine learning model to predict customer churn, enabling more targeted churn-reduction strategies.

**In this notebook:**
- Prepare training and test datasets
- Encode categorical features
- Scale numerical features
- Create a model-fit function that runs stratified K-fold cross-validation scored by ROC-AUC
    - CatBoost Model
    - XGBoost Model
    - LGBM Model
    - Stacking Ensemble Model
- Plot feature importance

In [2]:
# Load the churn dataset from a pickle file (the file name suggests it was
# cleaned in an earlier step — confirm against the preprocessing notebook).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
df = pd.read_pickle("../data/telco_customer_churn_clean.pkl")
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,tenure_binned,MonthlyCharges_binned,TotalCharges_binned,Churn
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,Low,Low,Low,False
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,Medium,Medium,Low,False
2,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,Low,Medium,Low,True
3,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),Medium,Low,Low,False
4,Female,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,Low,Medium,Low,True


In [3]:
# One shared seed for every random number source used in this notebook,
# so that repeated runs produce the same results.
SEED = 2024
np.random.seed(SEED)  # NumPy's global generator
random.seed(SEED)     # Python's built-in `random` module

### Train/Test Split

In [5]:
# Fraction of rows assigned to the training set; the remainder is the test set.
TRAIN_TEST_RATIO = 0.8

# One shuffled split, stratified on the `Churn` column. `folds` ends up as a
# list holding a single (train_indices, test_indices) pair.
stratified = StratifiedShuffleSplit(n_splits=1, train_size=TRAIN_TEST_RATIO, random_state=SEED)
folds = list(stratified.split(df, df['Churn']))

### Modeling

In [None]:
class BaseModel:
    """Cross-validated training wrapper around a single classifier.

    Bundles an estimator with a display name and a random seed, trains it with
    stratified K-fold cross-validation while reporting ROC-AUC, and plots
    feature importances.
    """

    def __init__(self, model, model_name: str, seed: int):
        """Store the estimator and its bookkeeping values.

        Args:
            model: Estimator exposing ``fit(X, y, eval_set=...)`` and
                ``predict_proba(X)``.
            model_name: Label used in printed reports, the plot title and the
                saved image file name.
            seed: Random state handed to the K-fold splitter.
        """
        self.model = model
        self.model_name = model_name
        self.seed = seed

    @staticmethod
    def _take_rows(data, indices):
        """Select rows by position from a pandas object or any array-like."""
        if hasattr(data, "iloc"):
            # Positional selection: the fold indices are positions, not labels,
            # so a plain `data[indices]` would be wrong on a shuffled index.
            return data.iloc[indices]
        return np.asarray(data)[indices]

    def fit(self, X, y, X_test, y_test, n_splits, options=None):
        """Train with stratified K-fold CV and report validation / test ROC-AUC.

        The model is refit once per fold. For each fold the validation ROC-AUC
        is recorded and the positive-class probabilities for ``X_test`` are
        collected; the test predictions are then averaged over all folds.

        Results are printed and also stored on the instance as
        ``cv_auc_scores_``, ``test_predictions_`` and ``test_auc_score_``.

        Args:
            X: Training features (pandas DataFrame or array-like).
            y: Training labels (pandas Series or array-like), aligned with ``X``
                by position.
            X_test: Hold-out features.
            y_test: Hold-out labels.
            n_splits: Number of cross-validation folds.
            options: Extra arguments for ``model.fit``. A dict is expanded as
                keyword arguments, any other iterable as positional arguments,
                and ``None`` adds nothing.
        """
        if options is None:
            fit_args, fit_kwargs = (), {}
        elif isinstance(options, dict):
            fit_args, fit_kwargs = (), dict(options)
        else:
            fit_args, fit_kwargs = tuple(options), {}

        splitter = StratifiedKFold(n_splits=n_splits, random_state=self.seed, shuffle=True)
        auc_scores = []
        fold_test_predictions = []

        # The splitter object itself is not iterable; `.split` yields the
        # positional (train, validation) index arrays for each fold.
        for train_indices, val_indices in splitter.split(X, y):
            X_train = self._take_rows(X, train_indices)
            y_train = self._take_rows(y, train_indices)
            X_val = self._take_rows(X, val_indices)
            y_val = self._take_rows(y, val_indices)

            # `eval_set` is passed as a list of (X, y) pairs, the form accepted
            # by the XGBoost and LightGBM scikit-learn wrappers. An `eval_set`
            # supplied through `options` takes precedence.
            call_kwargs = {"eval_set": [(X_val, y_val)], **fit_kwargs}
            self.model.fit(X_train, y_train, *fit_args, **call_kwargs)

            # Validation ROC-AUC on the positive-class probability
            auc_scores.append(roc_auc_score(y_val, self.model.predict_proba(X_val)[:, 1]))

            fold_test_predictions.append(self.model.predict_proba(X_test)[:, 1])

        # Average the hold-out predictions of the per-fold models
        test_predictions = np.mean(fold_test_predictions, axis=0)

        print(f"Training Set for {self.model_name}".ljust(32), f"AUC score - mean={np.mean(auc_scores):.2f}\tstd={np.std(auc_scores):.4f}")

        # A single score: there is no spread to report for one hold-out set
        test_auc_score = roc_auc_score(y_test, test_predictions)
        print(f"Testing Set for {self.model_name}".ljust(32), f"AUC score - {test_auc_score:.2f}")

        self.cv_auc_scores_ = auc_scores
        self.test_predictions_ = test_predictions
        self.test_auc_score_ = test_auc_score

    def plot(self, importances, features):
        """Draw a horizontal bar chart of feature importances and save it as PNG.

        Bars are ordered from most to least important. The figure is written to
        ``../images/<model_name>_importances.png`` and then displayed.

        Args:
            importances: One importance value per feature.
            features: Feature names, in the same order as ``importances``.
        """
        imp_df = pd.DataFrame(
            {'feature': np.array(features), 'importance': np.array(importances)}
        ).sort_values(by='importance', ascending=False)

        fig, ax = plt.subplots(figsize=(10, 8), dpi=100)
        sns.barplot(data=imp_df, x='importance', y='feature', ax=ax)
        ax.set_title(f"{self.model_name} Feature Importances")
        ax.set_xlabel('Importance')
        ax.set_ylabel('Feature')

        # Save before showing: once `plt.show()` returns the figure has been
        # released and a later savefig would write an empty image.
        fig.savefig(f"../images/{self.model_name}_importances.png")
        plt.show()

