In [None]:
# Load needed packages
import logging
import multiprocessing as mp
import sys
import os
import glob
import re

import pandas as pd
import numpy as np

# Handle warnings
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils._testing import ignore_warnings

# Handle class imbalance
from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import (TomekLinks, 
                                     NeighbourhoodCleaningRule as NCR, 
                                     RandomUnderSampler)

# Neural Network
from scikeras.wrappers import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.layers.core import Dropout
from keras import regularizers
from tensorflow.keras.layers import BatchNormalization
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

# ML
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier

# Performance metrics
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate
from sklearn.metrics import make_scorer
from mlxtend.evaluate import lift_score

# Assemble pipeline(s)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn import set_config

# Create own Classifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from scipy.sparse import csr_matrix, isspmatrix
from GEVNN import MLP_AE

import pickle

# 1. Initial configuration and loading data

In [None]:
# Suppress annoying warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Calling ignore_warnings(...) and discarding the result does nothing (it only
# builds a decorator/context manager), so register an actual filter instead.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = ('ignore::UserWarning,ignore::RuntimeWarning')

# Take care of logging: write to a log file and to the console
logging.basicConfig(
    format='%(asctime)s:%(name)s:%(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        logging.FileHandler("churn_benchmarking.log"),
        logging.StreamHandler()
    ],
    datefmt='%Y-%m-%d %H:%M:%S')

# Determine number of cpus for parallel computing
n_cpus = mp.cpu_count()
logging.info(f"{n_cpus} cpus available")

# Check if files for benchmarking already exist; download them otherwise
call = "wget -qO- https://github.com/darinkist/customer_churn_benchmarking/raw/main/bnchmrk_datasets.tar.gz | tar -xvz"

if not os.path.isdir('00_data'):
    download_reason = "Directory 00_data not found - Downloading files"
elif not os.listdir('00_data'):
    download_reason = "No files found - Downloading files"
else:
    download_reason = None

if download_reason is None:
    logging.info("Files found")
else:
    logging.info(download_reason)
    # os.system returns the shell's exit status; surface failures in the log
    # instead of silently continuing (the empty-data check below still raises).
    if os.system(call) != 0:
        logging.warning("Download command exited with a non-zero status")

# Visualize pipelines
set_config(display="diagram")

# Load prepared (pre-cleaned) files for benchmarking
file_paths = [f for f in glob.glob("00_data/*") if f.endswith('_cleaned.csv')]
# Raw string: same pattern as before, without invalid string escape sequences.
# The data set name is the run of word/space/dash characters before "_cleaned.".
file_names = [re.search(r'[ \w-]+?(?=\_cleaned.)', f)[0] for f in file_paths]

dfs = [pd.read_csv(path, low_memory=False) for path in file_paths]
data_sets = dict(zip(file_names, dfs))

if not data_sets:
    logging.error('No data sets have been loaded')
    raise ValueError("No data sets have been loaded")

logging.info(f"{len(data_sets)} data sets have been loaded.")

In [None]:
# data_sets.keys()

# 2. Defining sampling approaches

In [None]:
# Registry of the sampling approaches to benchmark.
# Single-step approaches are stored as a (step_name, sampler) tuple, multi-step
# (hybrid) approaches as an imblearn pipeline whose steps are unpacked later.
# Key prefixes follow the sampler family: o_ = over-sampling,
# u_ = under-sampling, h_ = hybrid (over- followed by under-sampling).
sampl_app = {
    # Baseline: the step is None, i.e. no resampling at all
    'no_sampling': ('no_sampling', None),

    # Over-sampling
    'o_SMOTE': ('smote', SMOTE()),
    # NOTE(review): 'not minority' on an over-sampler excludes the minority
    # class from resampling - confirm this is the intended strategy.
    'o_ADASYN': ('adasyn', ADASYN(sampling_strategy='not minority')),

    # Under-sampling
    'u_TomekLinks': ('tomeklinks', TomekLinks()),
    'u_NCR': ('ncr', NCR()),

    # Hybrid: SMOTE followed by an under-sampler
    'h_SMOTE_RND': imbPipeline([
        ('smote', SMOTE()),
        ('rnd', RandomUnderSampler()),
    ]),
    'h_SMOTE_Tomek': imbPipeline([
        ('smote', SMOTE()),
        ('tomeklinks', TomekLinks()),
    ]),
    'h_SMOTE_NCR': imbPipeline([
        ('smote', SMOTE()),
        ('ncr', NCR(sampling_strategy='not majority')),
    ]),
}

# 3. Define models

In [None]:
# https://github.com/lhagiimn/GEV-NN-A-deep-neural-network-architecture-for-class-imbalance-problem-in-binary-classification
class GEV_NN_Classifier(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper around the GEV-NN ``MLP_AE`` model.

    All constructor arguments are stored unchanged and forwarded to
    ``MLP_AE`` when :meth:`fit` is called. ``batch_size`` can additionally be
    overridden per fit through the ``batch_size`` fit parameter.

    Attributes set by ``fit``
    -------------------------
    X_, y_ : validated training data as returned by ``check_X_y``.
    n_features_in_ : number of columns of the validated training data.
    classes_ : unique labels found in ``y``.
    model_ : object returned by ``MLP_AE(...).MLP_AE()``; used for prediction.
    is_fitted_ : True once ``model_`` has been built.
    """

    def __init__(self, epoch=50, batch_size=16, learning_rate=0.001, 
                 encoder=[32,16,8], decoder=[16,32], sofnn=[32], 
                 early_stopping=200, neurons=[32], activation='gev', 
                 reg_lambda=0.0001, loss_weight=0.25, rand=42, verbose_ae=0, 
                 verbose_mlp=0):
        # The list defaults are only stored and forwarded, never mutated here.
        self.epoch = epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.encoder = encoder
        self.decoder = decoder
        self.sofnn = sofnn
        self.early_stopping = early_stopping
        self.neurons = neurons
        self.activation = activation
        self.reg_lambda = reg_lambda
        self.loss_weight = loss_weight
        self.rand = rand
        self.verbose_ae = verbose_ae
        self.verbose_mlp = verbose_mlp

    def fit(self, X, y, **fit_params):
        """Validate the training data and build the underlying model.

        Parameters
        ----------
        X : list, pandas Series or numpy array
            Training features. Sparse matrices and any other container
            (e.g. a DataFrame) are rejected.
        y : array-like
            Training labels.
        **fit_params
            Only ``batch_size`` is honoured; it overwrites ``self.batch_size``.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``X`` is a scipy sparse matrix.
        ValueError
            If ``X`` is not a list, Series or ndarray.
        """
        if isspmatrix(X):
            raise TypeError("Sparse input is not supported")

        if not isinstance(X, (list, pd.core.series.Series, np.ndarray)):
            raise ValueError("Not supported")

        if 'batch_size' in fit_params:
            self.batch_size = fit_params['batch_size']

        self.X_, self.y_ = check_X_y(X, y)

        # Read the width from the validated array: a plain list is accepted
        # above but has no ``.shape`` attribute.
        self.n_features_in_ = self.X_.shape[1]
        self.classes_ = unique_labels(y)

        model = MLP_AE(trainX=self.X_, trainY=self.y_, epoch_number=self.epoch, 
                       batch_size=self.batch_size, 
                       learning_rate=self.learning_rate, encoder=self.encoder, 
                       decoder=self.decoder, sofnn=self.sofnn, 
                       early_stopping=self.early_stopping, neurons=self.neurons, 
                       activation=self.activation, reg_lambda=self.reg_lambda,
                       loss_weight=self.loss_weight, rand=self.rand, 
                       verbose_ae=self.verbose_ae, verbose_mlp=self.verbose_mlp)

        self.model_ = model.MLP_AE()
        # Flag the estimator as fitted only once the model actually exists.
        self.is_fitted_ = True
        return self

    def predict_proba(self, testX):
        """Return the second output of the underlying model for ``testX``.

        The model is called with ``[testX, testX]`` and yields two outputs;
        the first one is discarded.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If no model has been built yet.
        """
        # Check for ``model_`` explicitly: other trailing-underscore attributes
        # are set before the model is built and would mask a failed fit.
        check_is_fitted(self, "model_")
        testX = check_array(testX)

        _, pred_Y = self.model_.predict([testX, testX])
        return pred_Y

    def decision_function(self, testX):
        """Alias of :meth:`predict_proba`, for scorers that need a score."""
        return self.predict_proba(testX)

    def predict(self, testX):
        """Return hard 0/1 predictions using a 0.5 threshold.

        Labels are emitted as 0/1 and are not mapped back to ``classes_``.
        """
        y_pred = self.predict_proba(testX)
        y_pred = np.where(y_pred >= 0.5, 1, 0)

        # Drop the trailing axis (requires that axis 1 has length 1).
        return np.squeeze(y_pred, 1)

In [None]:
# Define different ML models

# Feed Forward Neural Network
# https://github.com/naomifridman/Neural-Network-Churn-Prediction/blob/master/FFNN_churn_predict_0_12174.ipynb
def ffnn_mdl(meta):
    """Build and compile the feed-forward network for the Keras wrapper.

    Parameters
    ----------
    meta : mapping
        Must provide ``"n_features_in_"``, used as the input dimension of the
        first dense layer.

    Returns
    -------
    A compiled ``Sequential`` model: three ReLU dense layers (128, 64, 32
    units) with batch normalisation / dropout in between, followed by a
    single sigmoid output unit; binary cross-entropy loss, Adam optimizer.
    """
    width = 128
    n_inputs = meta["n_features_in_"]

    # Layers in the order they are stacked
    stack = [
        Dense(width, input_dim=n_inputs, activation='relu'),
        BatchNormalization(),
        Dropout(rate=0.25),
        Dense(width // 2, activation='relu'),
        BatchNormalization(),
        Dropout(rate=0.25),
        Dense(width // 4, kernel_regularizer=regularizers.l2(0.1),
              activation='relu'),
        Dropout(rate=0.1),
        Dense(1, activation='sigmoid'),
    ]

    net = Sequential()
    for layer in stack:
        net.add(layer)

    # metrics=[auroc] causes issues # adadelta was original
    net.compile(loss='binary_crossentropy', optimizer='adam',
                metrics=['accuracy'])
    return net

# Training callbacks for the feed-forward network:
# two learning-rate schedules (training and validation accuracy) plus
# early stopping on the training loss.
reduce_lr = ReduceLROnPlateau(
    monitor='accuracy',
    factor=0.2,
    patience=1,
    min_lr=0.0001,
)
reduce_val_lr = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.4,
    patience=1,
    min_lr=0.0001,
)
es = EarlyStopping(
    monitor='loss',
    patience=3,
    verbose=1,
    mode='auto',
)

callbacks_list = [es, reduce_lr, reduce_val_lr]

# scikit-learn compatible wrapper; ffnn_mdl builds the network.
# A fifth of the training data is held out for the validation metrics.
ffnn = KerasClassifier(
    ffnn_mdl,
    epochs=50,
    batch_size=64,
    validation_split=0.2,
    callbacks=callbacks_list,
    verbose=0,
)

In [None]:
# GEV NN
gev_nn = GEV_NN_Classifier()

# Linear model (logistic regression)
lr = LogisticRegression(solver='saga',
                        warm_start=True,
                        max_iter=100)

# RandomForest
rf = RandomForestClassifier()

# XGB
# verbosity=0 already silences XGBoost; the deprecated `silent` flag is
# redundant and therefore no longer passed.
xgb = XGBClassifier(tree_method="hist",
                    verbosity=0)

# GaussianNB
gnb = GaussianNB()

# Soft-voting ensemble: LR, XGB, RF
# (this ensemble used to be defined twice with identical content)
lr_xgb_rf = VotingClassifier(estimators=[('lr', lr),
                                         ('xgb', xgb),
                                         ('rf', rf)
                                        ], voting='soft')

# Soft-voting ensemble: LR, XGB, RF, FFNN
lr_xgb_rf_ffnn = VotingClassifier(estimators=[('lr', lr),
                                              ('xgb', xgb),
                                              ('rf', rf),
                                              ('ffnn', ffnn)
                                             ], voting='soft')

# knn
knn = KNeighborsClassifier()

# svc
svc = SVC()

# lgb
lgb = LGBMClassifier()

# Store them as (name, estimator) tuples in a list; the name doubles as the
# pipeline step name and as the key in the benchmark results.
models = [('lr', lr),
          ('rf', rf),
          ('xgb', xgb),
          ('svc', svc),
          ('gnb', gnb),
          ('lgb', lgb),
          ('knn', knn),
          ('gev_nn', gev_nn),
          ('ffnn', ffnn),
          ('lr_xgb_rf', lr_xgb_rf),
          ('lr_xgb_rf_ffnn', lr_xgb_rf_ffnn)]

# 4. Initial pipeline

In [None]:
# Initial pipeline: preprocessing only; sampler and model steps are appended
# per benchmark run.

# Numeric columns: mean imputation, then scaling with MinMaxScaler
num_branch = make_pipeline(
    SimpleImputer(strategy='mean'),
    MinMaxScaler())

# Object (categorical) columns: most-frequent imputation, then one-hot encoding
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# - confirm against the pinned version.
cat_branch = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse=False, handle_unknown='ignore'))

transformation = ColumnTransformer([
    ('num', num_branch, make_column_selector(dtype_include='number')),
    ('cat', cat_branch, make_column_selector(dtype_include='object')),
])

ppl = imbPipeline([('transformation', transformation)])

# Number of preprocessing steps; used to cut the pipeline back to this state
initial_steps = len(ppl.steps)

In [None]:
# Show the initial pipeline (display mode was set to "diagram" via set_config)
ppl

# 5. Scores to track and the right batch size

In [None]:
# Metrics evaluated in every cross-validation run: a custom scorer built from
# mlxtend's lift_score plus three scorers referenced by their scikit-learn name
scorer = dict(
    lift_score=make_scorer(lift_score),
    roc_auc='roc_auc',
    f1_macro='f1_macro',
    recall='recall',
)

# Container for the benchmark results
# (data set -> model -> sampling approach -> cross-validation scores)
bnchmrk_results = dict()

In [None]:
# Determine batch size for NNs
def helper_batch_size(X):
    """Pick a neural-network batch size from the number of rows in ``X``.

    Parameters
    ----------
    X : object with a ``shape`` attribute
        Only ``X.shape[0]`` (the row count) is used.

    Returns
    -------
    int
        16, 32, 64, 128 or 256 for fewer than 500, 1000, 5000, 10000 or 20000
        rows respectively; 512 otherwise.
    """
    n_rows = X.shape[0]

    # (exclusive upper bound on the row count, batch size), in ascending order
    tiers = (
        (500, 16),
        (1000, 32),
        (5000, 64),
        (10000, 128),
        (20000, 256),
    )
    for upper_bound, size in tiers:
        if n_rows < upper_bound:
            return size

    return 512

In [None]:
# Debug switch for running a selective subset of the benchmark.
# Must be False when applying the whole process.
DEBUG = False

if DEBUG:
    # Restrict the run to one data set, one model and one sampling approach
    debug_ds = 'ibm_hr'
    data_sets = {debug_ds: data_sets[debug_ds]}
    models = [('gev_nn', gev_nn)]
    sampl_app = dict(smote=('smote', SMOTE()))

In [None]:
# Show the pipeline again; on a fresh run it still holds only the initial steps
ppl

In [None]:
#reset
#ppl = ppl[:initial_steps]

# 6. The benchmark loops

In [None]:
# Configure KFold once: with an integer random_state every call to split()
# produces the same folds for a given data set, so the splitter can be shared.
rsf = RepeatedStratifiedKFold(n_repeats=5, random_state=42)

for ds, data in data_sets.items():

    logging.info(f"= Starting benchmarking with {ds} data set =")

    # Define X and y
    X = data.drop("churn", axis=1)
    y = data["churn"]

    # Batch size for the NNs depends only on the size of the data set
    batch_size = helper_batch_size(X)

    model_results = {} # Store model performance

    for m in models:
        model_name = m[0]

        # Only the stand-alone NNs get a batch size routed to their fit call
        if model_name in ('ffnn', 'gev_nn'):
            fit_params = {f"{model_name}__batch_size": batch_size}
        else:
            fit_params = None

        sampling_results = {} # Store sampling performance for respective model
        for sa, sampler in sampl_app.items():

            logging.info(f"== Running {model_name} with {sa} strategy ==")

            # Work on a fresh slice of the initial pipeline instead of mutating
            # the global `ppl`: an exception raised mid-run then cannot leave
            # stale sampler/model steps behind for the next execution.
            bench_ppl = ppl[:initial_steps]

            # Some sampling approaches have multiple steps (e.g., SMOTE + RND)
            # and are stored as a pipeline; add all of their steps.
            if hasattr(sampler, 'steps'):
                bench_ppl.steps.extend(sampler.steps)
            else:
                bench_ppl.steps.append(sampler)

            # Add model to pipeline
            bench_ppl.steps.append(m)

            scores = cross_validate(bench_ppl, X, y,
                                    cv=rsf,
                                    scoring=scorer,
                                    verbose=0,
                                    n_jobs=1,
                                    error_score='raise',
                                    fit_params=fit_params,
                                    return_estimator=False
                                   )

            # Write results in dict
            sampling_results[sa] = scores

        # Write results in dict
        model_results[model_name] = sampling_results

    # Write results in dict
    bnchmrk_results[ds] = model_results

    # After one data set has been benchmarked we persist the results.
    # The file holds everything collected so far, not just this data set;
    # the context manager makes sure the handle is flushed and closed.
    with open(f"data_set_results_{ds}.pickle", "wb") as file_to_write:
        pickle.dump(bnchmrk_results, file_to_write)
    logging.info(f"Results for {ds} have been written to pickle file")

# Done
logging.info("Benchmarking finished")