# KKBox Customer Churn Prediction
### w/ XGBOOST

---

# Part III: <font color=green>*Model Creation and Evaluation*</font>
Please refer to the following article for a comprehensive review of the project: XXXXXX

---

In [1]:
# General Imports
from __future__ import absolute_import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
import time 

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

## Import Data

In [9]:
## Working Locally

# Folder holding the locally exported files. It is written as a raw string so
# the Windows backslashes stay literal ('\J' and '\D' are not valid escape
# sequences and only worked by accident in a normal string literal).
DATA_DIR = r'D:\J-5 Local'


def read_local_csv(file_name):
    """Read `file_name` from DATA_DIR with pandas and return the DataFrame."""
    return pd.read_csv('{}\\{}'.format(DATA_DIR, file_name))


# Import Subsamples
DRV_Jan2016_1to1 = read_local_csv('DRV_Jan2016_1to1_clust')
DRV_Jan2016_3to1 = read_local_csv('DRV_Jan2016_3to1_clust')
DRV_Jan2016_5to1 = read_local_csv('DRV_Jan2016_5to1_clust')
DRV_Jan2016_7to1 = read_local_csv('DRV_Jan2016_7to1_clust')
DRV_Jan2016_9to1 = read_local_csv('DRV_Jan2016_9to1_clust')
DRV_Jan2016_11to1 = read_local_csv('DRV_Jan2016_11to1_clust')
DRV_Jan2016_13to1 = read_local_csv('DRV_Jan2016_13to1_clust')

# Import Main Sets
DRV_Jan2016 = read_local_csv('DRV_Jan2016_With_Cluster')

# Import DRV_Feb2016 (Validation Set)
DRV_Feb2016 = read_local_csv('DRV_Feb2016_With_Cluster')

## Model Pre-Processing

### - <font color=blue>Data Pre-Processing</font> -

#### <font color=purple>*Split Features by Categorical or Continuous*</font>

In [10]:
# Names of the columns that are treated as categorical features
cat_feats = [
    'is_auto_renew',
    'total_spent_zero',
    'city_agg',
    'payment_method_agg',
    'never_active_subscriber',
    'Cluster',
]

#### <font color=purple>*Initial Feature Selection*</font>

In [11]:
# Drop all un-needed columns.
# One shared list is applied to every frame so they all end up with the same
# schema ('registration_init_time' used to be listed twice in each call).
DROP_COLS = ['Unnamed: 0', 'msno', 'membership_expire_date', 'is_net_paid_amount',
             'registration_init_time', 'city', 'bd', 'payment_method_id', 'registered_via']

DRV_Jan2016 = DRV_Jan2016.drop(DROP_COLS, axis=1)
DRV_Feb2016 = DRV_Feb2016.drop(DROP_COLS, axis=1)

DRV_Jan2016_1to1 = DRV_Jan2016_1to1.drop(DROP_COLS, axis=1)
DRV_Jan2016_3to1 = DRV_Jan2016_3to1.drop(DROP_COLS, axis=1)
DRV_Jan2016_5to1 = DRV_Jan2016_5to1.drop(DROP_COLS, axis=1)
DRV_Jan2016_7to1 = DRV_Jan2016_7to1.drop(DROP_COLS, axis=1)
DRV_Jan2016_9to1 = DRV_Jan2016_9to1.drop(DROP_COLS, axis=1)
DRV_Jan2016_11to1 = DRV_Jan2016_11to1.drop(DROP_COLS, axis=1)
DRV_Jan2016_13to1 = DRV_Jan2016_13to1.drop(DROP_COLS, axis=1)

#### <font color=purple>*Encode Categorical Variables*</font>

In [12]:
# Encode all categoricals against one shared set of levels.
# Encoding each frame on its own lets every frame derive its dummy columns from
# the values it happens to contain: a level that is absent from a subsample (or
# new in February) changes the column set, and drop_first may drop a different
# level. The levels observed in the full January frame are used as the reference
# so every encoded frame has the same columns in the same order.
cat_levels = {col: pd.Categorical(DRV_Jan2016[col]).categories for col in cat_feats}


def encode_categoricals(frame):
    """Return a copy of `frame` with `cat_feats` one-hot encoded using `cat_levels`.

    Values that are not among the reference levels become missing and are
    therefore encoded as all-zero dummy columns for that feature.
    """
    with_fixed_levels = frame.assign(
        **{col: pd.Categorical(frame[col], categories=cat_levels[col]) for col in cat_feats}
    )
    return pd.get_dummies(with_fixed_levels, prefix=cat_feats, columns=cat_feats, drop_first=True)


# Subsamples and validation first; DRV_Jan2016 itself is re-bound last
DRV_Feb2016 = encode_categoricals(DRV_Feb2016)

DRV_Jan2016_1to1 = encode_categoricals(DRV_Jan2016_1to1)
DRV_Jan2016_3to1 = encode_categoricals(DRV_Jan2016_3to1)
DRV_Jan2016_5to1 = encode_categoricals(DRV_Jan2016_5to1)
DRV_Jan2016_7to1 = encode_categoricals(DRV_Jan2016_7to1)
DRV_Jan2016_9to1 = encode_categoricals(DRV_Jan2016_9to1)
DRV_Jan2016_11to1 = encode_categoricals(DRV_Jan2016_11to1)
DRV_Jan2016_13to1 = encode_categoricals(DRV_Jan2016_13to1)

DRV_Jan2016 = encode_categoricals(DRV_Jan2016)

#### <font color=purple>*Feature Scaling*</font>

In [13]:
# Instantiate Scaler Object
scaler = StandardScaler()

# Fit the scaler ONCE, on the full January training frame, and reuse those
# statistics for every other frame. The estimators trained on the subsamples
# are later asked to predict DRV_Jan2016_scaled and DRV_Feb2016_scaled, so all
# of these arrays have to live on the same scale; re-fitting per frame (and on
# the validation set) gave each array its own mean/variance.
# NOTE: transform() expects the same feature columns that were seen in fit().
DRV_Jan2016_scaled = scaler.fit_transform(DRV_Jan2016.drop('is_churn', axis=1))
DRV_Feb2016_scaled = scaler.transform(DRV_Feb2016.drop('is_churn', axis=1))

# Scale Split Sets
DRV_Jan2016_1to1_scaled = scaler.transform(DRV_Jan2016_1to1.drop('is_churn', axis=1))
DRV_Jan2016_3to1_scaled = scaler.transform(DRV_Jan2016_3to1.drop('is_churn', axis=1))
DRV_Jan2016_5to1_scaled = scaler.transform(DRV_Jan2016_5to1.drop('is_churn', axis=1))
DRV_Jan2016_7to1_scaled = scaler.transform(DRV_Jan2016_7to1.drop('is_churn', axis=1))
DRV_Jan2016_9to1_scaled = scaler.transform(DRV_Jan2016_9to1.drop('is_churn', axis=1))
DRV_Jan2016_11to1_scaled = scaler.transform(DRV_Jan2016_11to1.drop('is_churn', axis=1))
DRV_Jan2016_13to1_scaled = scaler.transform(DRV_Jan2016_13to1.drop('is_churn', axis=1))

In [14]:
# Sanity check: dimensions of the scaled 1:1 subsample feature array
DRV_Jan2016_1to1_scaled.shape

(38013, 239)

In [15]:
# Sanity check: dimensions of the scaled 13:1 subsample feature array
DRV_Jan2016_13to1_scaled.shape

(264393, 239)

## Model Creation: Pipeline and Tuning

### - <font color=blue>Model Tuning</font> -

#### <font color=purple>XGBOOST Parameter Tuning</font>

In [16]:
# Seed shared by the estimator and the CV splitter so repeated runs are comparable
SEED = 42

# Create Param Grid (candidate values sampled by the randomized search)
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 6],
        'learning_rate': [.1, .075, .05, .025, .01],
        'n_estimators': [100, 250, 500, 750, 1000]
        }

# Instantiate Estimator Object
xgb = XGBClassifier(objective='binary:logistic', n_jobs=1, random_state=SEED)

# Instantiate StratKFold Object
# shuffle=True without a random_state produces different folds on every run,
# which makes cross-validation scores impossible to reproduce.
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=SEED)

***Create Custom Evaluator***

In [20]:
# Custom evaluator that we can use instead of BinaryClassificationEvaluator() in grid search
class ClassEvaluatorPandas:
    """Summarise binary-classification predictions with pandas-friendly outputs.

    Parameters
    ----------
    modelname : label written to the 'modelname' column of `evaluate()`.
    model     : fitted search object; only used by `modelparams()`.
    y_pred    : predicted labels, passed to scikit-learn's metric functions.
    y_true    : true labels, passed to scikit-learn's metric functions.

    Attributes set on construction: `cm` (confusion matrix) and the four
    counts `tp`, `fp`, `tn`, `fn`, where "positive" is the class in the second
    row/column of `cm`.
    """

    # Small constant added to denominators to prevent division by zero
    _EPS = 0.00001

    def __init__(self, modelname, model, y_pred, y_true):

        # Initialize variables
        self.modelname = modelname
        self.y_pred = y_pred
        self.y_true = y_true
        self.model = model

        # Calculate confusion matrix
        from sklearn.metrics import confusion_matrix
        self.cm = confusion_matrix(y_true, y_pred)

        # Calculate confusion matrix values
        self.tp, self.fp, self.tn, self.fn = self._unpack_confusion_matrix(self.cm)

    @staticmethod
    def _unpack_confusion_matrix(cm):
        """Return (tp, fp, tn, fn) from a 2x2 confusion matrix.

        scikit-learn lays the matrix out with true classes on the rows and
        predicted classes on the columns, labels in sorted order, i.e.
        [[tn, fp], [fn, tp]]. The previous code read tp from [0][0] and tn from
        [1][1], which reported precision/recall/f1 for the negative class.
        """
        tn, fp = cm[0][0], cm[0][1]
        fn, tp = cm[1][0], cm[1][1]
        return tp, fp, tn, fn

    def evaluate(self):
        """Return a one-row DataFrame: modelname, AUC, f1, precision, recall, error.

        precision, recall and f1 describe the positive class; error is the
        share of misclassified samples.
        """
        eps = self._EPS

        # Calculate Metrics and add epsilon to prevent division by zero
        precision = self.tp / float(self.tp + self.fp + eps)
        recall = self.tp / float(self.tp + self.fn + eps)
        f1 = (2 * precision * recall) / float(precision + recall + eps)
        error = (self.fp + self.fn + eps) / (self.tp + self.fp + self.tn + self.fn + eps)

        # AUC is computed from whatever was supplied as y_pred. When callers
        # pass hard class labels rather than scores, this is the AUC of a
        # single operating point, not of the full ROC curve.
        from sklearn.metrics import roc_auc_score
        AUC = roc_auc_score(self.y_true, self.y_pred)

        return pd.DataFrame(data=[[self.modelname, AUC, f1, precision, recall, error]],
                            columns=['modelname', 'AUC', 'f1', 'precision', 'recall', 'error'])

    def confusionmatrix(self):
        """Return the confusion matrix computed in the constructor."""
        return self.cm

    def modelparams(self):
        """Return one row per evaluated parameter combination with its CV score.

        The score column is always named 'AUC score' for backward
        compatibility; for a scikit-learn search it holds `mean_test_score`,
        i.e. whichever metric the search was configured with.
        """
        if hasattr(self.model, 'cv_results_'):
            # scikit-learn GridSearchCV / RandomizedSearchCV style results
            cv_results = self.model.cv_results_
            params_pd = pd.DataFrame(list(cv_results['params']))
            params_pd['AUC score'] = list(cv_results['mean_test_score'])
            return params_pd

        # Original code path; the attribute names look like the pyspark.ml
        # CrossValidatorModel interface -- TODO confirm if still needed.
        scores = self.model.avgMetrics
        params = [{p.name: v for p, v in m.items()} for m in self.model.getEstimatorParamMaps()]
        params_pd = pd.DataFrame(params)
        params_pd['AUC score'] = scores
        return params_pd

## Model Execution and Evaluation

### <font color=blue>Train Model: All Splits, XGB + RandomizedSearchCV </font>

In [None]:
# Tune and fit XGBoost on the 1:1 subsample, timing the whole search
start = time.time()
# random_state fixes which 5 parameter combinations are drawn from `params`,
# so re-running this cell repeats the same search.
rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall',
                          n_jobs=1, cv=skf, verbose=3, random_state=42)
# fit() returns the fitted search object itself
gbx1to1 = rscv.fit(DRV_Jan2016_1to1_scaled, DRV_Jan2016_1to1['is_churn'])
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] subsample=1.0, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.075, gamma=0.5, colsample_bytree=0.8 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  subsample=1.0, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.075, gamma=0.5, colsample_bytree=0.8, score=0.902, total= 3.3min
[CV] subsample=1.0, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.075, gamma=0.5, colsample_bytree=0.8 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.3min remaining:    0.0s


[CV]  subsample=1.0, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.075, gamma=0.5, colsample_bytree=0.8, score=0.910, total= 3.5min
[CV] subsample=1.0, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.075, gamma=0.5, colsample_bytree=0.8 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  6.8min remaining:    0.0s


[CV]  subsample=1.0, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.075, gamma=0.5, colsample_bytree=0.8, score=0.910, total= 3.6min
[CV] subsample=1.0, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.075, gamma=0.5, colsample_bytree=0.8 
[CV]  subsample=1.0, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.075, gamma=0.5, colsample_bytree=0.8, score=0.907, total= 3.4min
[CV] subsample=1.0, n_estimators=500, min_child_weight=1, max_depth=3, learning_rate=0.05, gamma=0.5, colsample_bytree=1.0 
[CV]  subsample=1.0, n_estimators=500, min_child_weight=1, max_depth=3, learning_rate=0.05, gamma=0.5, colsample_bytree=1.0, score=0.903, total= 2.7min
[CV] subsample=1.0, n_estimators=500, min_child_weight=1, max_depth=3, learning_rate=0.05, gamma=0.5, colsample_bytree=1.0 
[CV]  subsample=1.0, n_estimators=500, min_child_weight=1, max_depth=3, learning_rate=0.05, gamma=0.5, colsample_bytree=1.0, score=0.909, total= 2.5min
[CV] subsample=1.

In [None]:
# Tune and fit XGBoost on the 3:1 subsample, timing the whole search
start = time.time()
# random_state fixes which 5 parameter combinations are drawn from `params`,
# so re-running this cell repeats the same search.
rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall',
                          n_jobs=1, cv=skf, verbose=3, random_state=42)
# fit() returns the fitted search object itself
gbx3to1 = rscv.fit(DRV_Jan2016_3to1_scaled, DRV_Jan2016_3to1['is_churn'])
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

In [None]:
# Tune and fit XGBoost on the 5:1 subsample, timing the whole search
start = time.time()
# random_state fixes which 5 parameter combinations are drawn from `params`,
# so re-running this cell repeats the same search.
rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall',
                          n_jobs=1, cv=skf, verbose=3, random_state=42)
# fit() returns the fitted search object itself
gbx5to1 = rscv.fit(DRV_Jan2016_5to1_scaled, DRV_Jan2016_5to1['is_churn'])
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

In [None]:
# Tune and fit XGBoost on the 7:1 subsample, timing the whole search
start = time.time()
# random_state fixes which 5 parameter combinations are drawn from `params`,
# so re-running this cell repeats the same search.
rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall',
                          n_jobs=4, cv=skf, verbose=3, random_state=42)
# fit() returns the fitted search object itself
gbx7to1 = rscv.fit(DRV_Jan2016_7to1_scaled, DRV_Jan2016_7to1['is_churn'])
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

In [None]:
# Tune and fit XGBoost on the 9:1 subsample, timing the whole search
start = time.time()
# random_state fixes which 5 parameter combinations are drawn from `params`,
# so re-running this cell repeats the same search.
rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall',
                          n_jobs=4, cv=skf, verbose=3, random_state=42)
# fit() returns the fitted search object itself
gbx9to1 = rscv.fit(DRV_Jan2016_9to1_scaled, DRV_Jan2016_9to1['is_churn'])
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

In [None]:
# Tune and fit XGBoost on the 11:1 subsample, timing the whole search
start = time.time()
# random_state fixes which 5 parameter combinations are drawn from `params`,
# so re-running this cell repeats the same search.
rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall',
                          n_jobs=4, cv=skf, verbose=3, random_state=42)
# fit() returns the fitted search object itself
gbx11to1 = rscv.fit(DRV_Jan2016_11to1_scaled, DRV_Jan2016_11to1['is_churn'])
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

In [None]:
# Tune and fit XGBoost on the 13:1 subsample, timing the whole search
start = time.time()
# random_state fixes which 5 parameter combinations are drawn from `params`,
# so re-running this cell repeats the same search.
rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall',
                          n_jobs=4, cv=skf, verbose=3, random_state=42)
# fit() returns the fitted search object itself
gbx13to1 = rscv.fit(DRV_Jan2016_13to1_scaled, DRV_Jan2016_13to1['is_churn'])
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

In [None]:
# Fitted search objects keyed by the name of the split they were trained on
ensembles_created = dict(
    gbx1to1=gbx1to1,
    gbx3to1=gbx3to1,
    gbx5to1=gbx5to1,
    gbx7to1=gbx7to1,
    gbx9to1=gbx9to1,
    gbx11to1=gbx11to1,
    gbx13to1=gbx13to1,
)

#### - <font color=blue>Evaluate Train Model: All Splits, XGB + RandomizedSearchCV </font> -

In [None]:
# Train Model Results
# Rows are collected in a list and concatenated once: DataFrame.append copied
# the whole frame on every iteration and no longer exists in pandas 2.x.
train_rows = []

for model_name, model1 in ensembles_created.items():

    # Temporary Variables for our Loop
    temp = model1.predict(DRV_Jan2016_scaled)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=DRV_Jan2016['is_churn'])

    # Collect this model's train metrics and print its confusion matrix
    train_rows.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

# Row labels are kept exactly as evaluate() produced them (no ignore_index),
# matching what repeated append() calls yielded; a later cell subtracts the
# validation table from this one and depends on both sharing the same labels.
train_all_results = pd.concat(train_rows) if train_rows else pd.DataFrame()

In [None]:
# Display the per-model metrics computed on the January (train) frame
train_all_results

#### - <font color=blue>Evaluate Validation Model: All Splits, XGB + RandomizedSearchCV </font> -

In [None]:
# Validation Model Results
# Rows are collected in a list and concatenated once: DataFrame.append copied
# the whole frame on every iteration and no longer exists in pandas 2.x.
validation_rows = []

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created.items():

    # Temporary Variables for our Loop
    temp = model1.predict(DRV_Feb2016_scaled)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=DRV_Feb2016['is_churn'])

    # Collect this model's validation metrics and print its confusion matrix
    validation_rows.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

# Row labels are kept exactly as evaluate() produced them (no ignore_index),
# matching what repeated append() calls yielded; a later cell subtracts this
# table from the train table and depends on both sharing the same labels.
validation_all_results = pd.concat(validation_rows) if validation_rows else pd.DataFrame()

In [None]:
# Display the per-model metrics computed on the February (validation) frame
validation_all_results

#### <font color=purple>Generalization Between Train and Validation</font>

In [None]:
# Train-minus-validation gap for every metric, one row per model.
# Both tables are aligned explicitly on 'modelname' instead of relying on their
# row labels happening to be identical, so the subtraction always pairs each
# model with itself.
metric_cols = list(train_all_results.columns[1:])
train_metrics = train_all_results.set_index('modelname')[metric_cols]
validation_metrics = validation_all_results.set_index('modelname')[metric_cols]

results_all = (train_metrics - validation_metrics).rename_axis('modelname').reset_index()
# Keep the original column layout: metric gaps first, model name last
results_all = results_all[metric_cols + ['modelname']]
results_all.sort_values('AUC', ascending=True)