# KKBox Customer Churn Prediction
### w/ XGBOOST

---

# Part III: <font color=green>*Model Creation and Evaluation*</font>
Please refer to the following article for a comprehensive review of the project: XXXXXX

---

In [1]:
# General Imports
from __future__ import absolute_import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
import time 

# General Imports
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

## Import Data

In [2]:
## Working Locally

# Import Subsamples
DRV_Jan2016_1to1 = pd.read_csv('D:\J-5 Local\DRV_Jan2016_1to1_clust')
DRV_Jan2016_3to1 = pd.read_csv('D:\J-5 Local\DRV_Jan2016_3to1_clust')
DRV_Jan2016_5to1 = pd.read_csv('D:\J-5 Local\DRV_Jan2016_5to1_clust')
DRV_Jan2016_7to1 = pd.read_csv('D:\J-5 Local\DRV_Jan2016_7to1_clust')
DRV_Jan2016_9to1 = pd.read_csv('D:\J-5 Local\DRV_Jan2016_9to1_clust')
DRV_Jan2016_11to1 = pd.read_csv('D:\J-5 Local\DRV_Jan2016_11to1_clust')
DRV_Jan2016_13to1 = pd.read_csv('D:\J-5 Local\DRV_Jan2016_13to1_clust')

# Import Main Sets
DRV_Jan2016 = pd.read_csv('D:\J-5 Local\DRV_Jan2016_With_Cluster')

# Import DRV_Feb2016 (Validation Set) 
DRV_Feb2016 = pd.read_csv('D:\J-5 Local\DRV_Feb2016_With_Cluster')

## Model Pre-Processing

### - <font color=blue>Data Pre-Processing</font> -

#### <font color=purple>*Split Feautres by Categorical or Continuous*</font> -

In [3]:
# Create list of Categorical feature names
cat_feats = ['is_auto_renew', 'total_spent_zero', 'city_agg', 'payment_method_agg', 'never_active_subscriber', 'Cluster']

#### <font color=purple>*Initial Feature Selection*</font>

In [4]:
# Drop all un-needed columns
DRV_Jan2016 = DRV_Jan2016.drop(['Unnamed: 0', 'msno', 'membership_expire_date', 'is_net_paid_amount', 'registration_init_time','registration_init_time',
                          'city','bd','payment_method_id','registered_via'], axis=1)
DRV_Feb2016 = DRV_Feb2016.drop(['Unnamed: 0', 'msno', 'membership_expire_date', 'is_net_paid_amount', 'registration_init_time','registration_init_time',
                          'city','bd','payment_method_id','registered_via'], axis=1)

DRV_Jan2016_1to1 = DRV_Jan2016_1to1.drop(['Unnamed: 0', 'msno', 'membership_expire_date', 'is_net_paid_amount', 'registration_init_time','registration_init_time',
                          'city','bd','payment_method_id','registered_via'], axis=1)
DRV_Jan2016_3to1 = DRV_Jan2016_3to1.drop(['Unnamed: 0', 'msno', 'membership_expire_date', 'is_net_paid_amount', 'registration_init_time','registration_init_time',
                          'city','bd','payment_method_id','registered_via'], axis=1)
DRV_Jan2016_5to1 = DRV_Jan2016_5to1.drop(['Unnamed: 0', 'msno', 'membership_expire_date', 'is_net_paid_amount', 'registration_init_time','registration_init_time',
                          'city','bd','payment_method_id','registered_via'], axis=1)
DRV_Jan2016_7to1 = DRV_Jan2016_7to1.drop(['Unnamed: 0', 'msno', 'membership_expire_date', 'is_net_paid_amount', 'registration_init_time','registration_init_time',
                          'city','bd','payment_method_id','registered_via'], axis=1)
DRV_Jan2016_9to1 = DRV_Jan2016_9to1.drop(['Unnamed: 0', 'msno', 'membership_expire_date', 'is_net_paid_amount', 'registration_init_time','registration_init_time',
                          'city','bd','payment_method_id','registered_via'], axis=1)
DRV_Jan2016_11to1 = DRV_Jan2016_11to1.drop(['Unnamed: 0', 'msno', 'membership_expire_date', 'is_net_paid_amount', 'registration_init_time','registration_init_time',
                          'city','bd','payment_method_id','registered_via'], axis=1)
DRV_Jan2016_13to1 = DRV_Jan2016_13to1.drop(['Unnamed: 0', 'msno', 'membership_expire_date', 'is_net_paid_amount', 'registration_init_time','registration_init_time',
                          'city','bd','payment_method_id','registered_via'], axis=1)

#### <font color=purple>*Encode Categorical Variables*</font>

In [5]:
# Encoded all categoricals
DRV_Jan2016 = pd.get_dummies(DRV_Jan2016, prefix=cat_feats, columns=cat_feats, drop_first=True)
DRV_Feb2016 = pd.get_dummies(DRV_Feb2016, prefix=cat_feats, columns=cat_feats, drop_first=True)

DRV_Jan2016_1to1 = pd.get_dummies(DRV_Jan2016_1to1, prefix=cat_feats, columns=cat_feats, drop_first=True)
DRV_Jan2016_3to1 = pd.get_dummies(DRV_Jan2016_3to1, prefix=cat_feats, columns=cat_feats, drop_first=True)
DRV_Jan2016_5to1 = pd.get_dummies(DRV_Jan2016_5to1, prefix=cat_feats, columns=cat_feats, drop_first=True)
DRV_Jan2016_7to1 = pd.get_dummies(DRV_Jan2016_7to1, prefix=cat_feats, columns=cat_feats, drop_first=True)
DRV_Jan2016_9to1 = pd.get_dummies(DRV_Jan2016_9to1, prefix=cat_feats, columns=cat_feats, drop_first=True)
DRV_Jan2016_11to1 = pd.get_dummies(DRV_Jan2016_11to1, prefix=cat_feats, columns=cat_feats, drop_first=True)
DRV_Jan2016_13to1 = pd.get_dummies(DRV_Jan2016_13to1, prefix=cat_feats, columns=cat_feats, drop_first=True)

#### <font color=purple>*Feature Scaling*</font>

In [6]:
# Instantiate Scaler Object
scaler = StandardScaler()

# # Scale Train and Validation Sets
DRV_Jan2016_scaled = scaler.fit_transform(DRV_Jan2016.drop('is_churn', axis=1))
DRV_Feb2016_scaled = scaler.fit_transform(DRV_Feb2016.drop('is_churn', axis=1))

# Scale Split Sets
DRV_Jan2016_1to1_scaled = scaler.fit_transform(DRV_Jan2016_1to1.drop('is_churn', axis=1))
DRV_Jan2016_3to1_scaled = scaler.fit_transform(DRV_Jan2016_3to1.drop('is_churn', axis=1))
DRV_Jan2016_5to1_scaled = scaler.fit_transform(DRV_Jan2016_5to1.drop('is_churn', axis=1))
DRV_Jan2016_7to1_scaled = scaler.fit_transform(DRV_Jan2016_7to1.drop('is_churn', axis=1))
DRV_Jan2016_9to1_scaled = scaler.fit_transform(DRV_Jan2016_9to1.drop('is_churn', axis=1))
DRV_Jan2016_11to1_scaled = scaler.fit_transform(DRV_Jan2016_11to1.drop('is_churn', axis=1))
DRV_Jan2016_13to1_scaled = scaler.fit_transform(DRV_Jan2016_13to1.drop('is_churn', axis=1))

In [7]:
DRV_Jan2016_1to1_scaled.shape

(38013, 239)

In [8]:
DRV_Jan2016_13to1_scaled.shape

(264393, 239)

## Model Creation: Pipeline and Tuning

### - <font color=blue>Model Tuning</font> -

#### <font color=purple>XGBOOST Parameter Tuning</font>

In [9]:
# Create Param Grid
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 6],
        'learning_rate': [.1, .075, .05, .025, .01],
        'n_estimators': [100, 250, 500, 750, 1000]
        }

# Instatiate Esitmator Object
xgb = XGBClassifier(objective='binary:logistic', n_jobs=1)

# Instatiate StratKFold Object
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=4, shuffle = True)

# # Instatiate Random Search CV Object
# rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall', 
#                                    n_jobs=4, cv=skf, verbose=3)

***Create Custom Evaluator***

In [10]:
# Custom evaluator that we can use instead of BinaryClassificationEvaluator() in grid search
class ClassEvaluatorPandas:

    def __init__(self, modelname, model, y_pred, y_true):
        
        # Initialize variables
        self.modelname = modelname
        self.y_pred = y_pred
        self.y_true = y_true
        self.model = model
        
        # Calculate confusion matrix
        from sklearn.metrics import confusion_matrix
        self.cm = confusion_matrix(y_true,y_pred)
        
        # Calculate confusion matrix values
        self.tp = self.cm[0][0]
        self.fp = self.cm[0][1]
        self.tn = self.cm[1][1]
        self.fn = self.cm[1][0]
        
    def evaluate(self):
        
        # Calculate Metrics and add epsilon to prevent division by zero
        precision = self.tp / float(self.tp + self.fp + 0.00001)
        recall = self.tp / float(self.tp + self.fn + 0.00001)
        f1 = (2 * precision * recall) / float(precision + recall + 0.00001)
        error = (self.fp + self.fn + 0.00001) / (self.tp + self.fp + self.tn + self.fn + 0.00001)
        
        # Instantiate Evaluator and call AUC metric
        from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score
        
        AUC = roc_auc_score(self.y_true, self.y_pred)
        
        return pd.DataFrame(data=[[self.modelname, AUC, f1, precision, recall, error]], 
                            columns=['modelname', 'AUC', 'f1', 'precision', 'recall', 'error'])
    
    def confusionmatrix(self):
        from sklearn.metrics import confusion_matrix
        # Print Confusion Matrix
        return self.cm
        
    
    def modelparams(self):
        scores = self.model.avgMetrics
        params = [{p.name: v for p, v in m.items()} for m in self.model.getEstimatorParamMaps()]
        params_pd = pd.DataFrame(params)
        params_pd['AUC score'] = scores
        return params_pd

## Model Execution and Evaluation

### <font color=blue>Train Model: All Splits, XGB + GridCV </font>

In [11]:
start = time.time()
rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall', 
                                   n_jobs=1, cv=skf, verbose=3)
gbx1to1 = rscv.fit(DRV_Jan2016_1to1_scaled, DRV_Jan2016_1to1['is_churn'])
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] subsample=0.6, n_estimators=100, min_child_weight=1, max_depth=6, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  subsample=0.6, n_estimators=100, min_child_weight=1, max_depth=6, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6, score=0.905, total=  38.2s
[CV] subsample=0.6, n_estimators=100, min_child_weight=1, max_depth=6, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   38.1s remaining:    0.0s


[CV]  subsample=0.6, n_estimators=100, min_child_weight=1, max_depth=6, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6, score=0.905, total=  37.1s
[CV] subsample=0.6, n_estimators=100, min_child_weight=1, max_depth=6, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s


[CV]  subsample=0.6, n_estimators=100, min_child_weight=1, max_depth=6, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6, score=0.911, total=  37.1s
[CV] subsample=0.6, n_estimators=100, min_child_weight=1, max_depth=6, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6 
[CV]  subsample=0.6, n_estimators=100, min_child_weight=1, max_depth=6, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6, score=0.902, total=  37.0s
[CV] subsample=0.6, n_estimators=1000, min_child_weight=5, max_depth=4, learning_rate=0.025, gamma=0.5, colsample_bytree=1.0 
[CV]  subsample=0.6, n_estimators=1000, min_child_weight=5, max_depth=4, learning_rate=0.025, gamma=0.5, colsample_bytree=1.0, score=0.906, total= 6.1min
[CV] subsample=0.6, n_estimators=1000, min_child_weight=5, max_depth=4, learning_rate=0.025, gamma=0.5, colsample_bytree=1.0 
[CV]  subsample=0.6, n_estimators=1000, min_child_weight=5, max_depth=4, learning_rate=0.025, gamma=0.5, colsample_bytree=1.0, score=0.905, total= 6.1min
[CV] subsample=

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 48.6min finished


Time spent for training: 2971


In [12]:
start = time.time()
rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall', 
                                   n_jobs=1, cv=skf, verbose=3)
gbx3to1 = rscv.fit(DRV_Jan2016_3to1_scaled, DRV_Jan2016_3to1['is_churn'])
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] subsample=0.8, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.05, gamma=5, colsample_bytree=0.8 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  subsample=0.8, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.05, gamma=5, colsample_bytree=0.8, score=0.802, total= 6.4min
[CV] subsample=0.8, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.05, gamma=5, colsample_bytree=0.8 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.4min remaining:    0.0s


[CV]  subsample=0.8, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.05, gamma=5, colsample_bytree=0.8, score=0.788, total= 6.4min
[CV] subsample=0.8, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.05, gamma=5, colsample_bytree=0.8 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 12.8min remaining:    0.0s


[CV]  subsample=0.8, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.05, gamma=5, colsample_bytree=0.8, score=0.799, total= 6.4min
[CV] subsample=0.8, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.05, gamma=5, colsample_bytree=0.8 
[CV]  subsample=0.8, n_estimators=500, min_child_weight=5, max_depth=5, learning_rate=0.05, gamma=5, colsample_bytree=0.8, score=0.788, total=13.0min
[CV] subsample=1.0, n_estimators=250, min_child_weight=5, max_depth=5, learning_rate=0.025, gamma=1, colsample_bytree=1.0 
[CV]  subsample=1.0, n_estimators=250, min_child_weight=5, max_depth=5, learning_rate=0.025, gamma=1, colsample_bytree=1.0, score=0.795, total= 9.2min
[CV] subsample=1.0, n_estimators=250, min_child_weight=5, max_depth=5, learning_rate=0.025, gamma=1, colsample_bytree=1.0 
[CV]  subsample=1.0, n_estimators=250, min_child_weight=5, max_depth=5, learning_rate=0.025, gamma=1, colsample_bytree=1.0, score=0.783, total= 9.2min
[CV] subsample=1.0, n_estimato

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 230.1min finished


Time spent for training: 14909


In [13]:
start = time.time()
rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall', 
                                   n_jobs=1, cv=skf, verbose=3)
gbx5to1 = rscv.fit(DRV_Jan2016_5to1_scaled, DRV_Jan2016_5to1['is_churn'])
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] subsample=0.8, n_estimators=250, min_child_weight=10, max_depth=3, learning_rate=0.01, gamma=0.5, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  subsample=0.8, n_estimators=250, min_child_weight=10, max_depth=3, learning_rate=0.01, gamma=0.5, colsample_bytree=0.6, score=0.572, total= 2.7min
[CV] subsample=0.8, n_estimators=250, min_child_weight=10, max_depth=3, learning_rate=0.01, gamma=0.5, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.7min remaining:    0.0s


[CV]  subsample=0.8, n_estimators=250, min_child_weight=10, max_depth=3, learning_rate=0.01, gamma=0.5, colsample_bytree=0.6, score=0.601, total= 2.7min
[CV] subsample=0.8, n_estimators=250, min_child_weight=10, max_depth=3, learning_rate=0.01, gamma=0.5, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  5.4min remaining:    0.0s


[CV]  subsample=0.8, n_estimators=250, min_child_weight=10, max_depth=3, learning_rate=0.01, gamma=0.5, colsample_bytree=0.6, score=0.579, total= 2.7min
[CV] subsample=0.8, n_estimators=250, min_child_weight=10, max_depth=3, learning_rate=0.01, gamma=0.5, colsample_bytree=0.6 
[CV]  subsample=0.8, n_estimators=250, min_child_weight=10, max_depth=3, learning_rate=0.01, gamma=0.5, colsample_bytree=0.6, score=0.585, total= 2.7min
[CV] subsample=0.8, n_estimators=750, min_child_weight=10, max_depth=5, learning_rate=0.1, gamma=0.5, colsample_bytree=0.8 
[CV]  subsample=0.8, n_estimators=750, min_child_weight=10, max_depth=5, learning_rate=0.1, gamma=0.5, colsample_bytree=0.8, score=0.699, total=14.5min
[CV] subsample=0.8, n_estimators=750, min_child_weight=10, max_depth=5, learning_rate=0.1, gamma=0.5, colsample_bytree=0.8 
[CV]  subsample=0.8, n_estimators=750, min_child_weight=10, max_depth=5, learning_rate=0.1, gamma=0.5, colsample_bytree=0.8, score=0.731, total=14.5min
[CV] subsample=0.

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 162.1min finished


Time spent for training: 10640


In [14]:
start = time.time()
rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall', 
                                   n_jobs=4, cv=skf, verbose=3)
gbx7to1 = rscv.fit(DRV_Jan2016_7to1_scaled, DRV_Jan2016_7to1['is_churn'])
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed: 44.6min remaining:    0.0s
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed: 44.6min finished


Time spent for training: 3485


In [15]:
start = time.time()
rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall', 
                                   n_jobs=4, cv=skf, verbose=3)
gbx9to1 = rscv.fit(DRV_Jan2016_9to1_scaled, DRV_Jan2016_9to1['is_churn'])
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed: 123.3min remaining:    0.0s
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed: 123.3min finished


Time spent for training: 8927


In [16]:
start = time.time()
rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall', 
                                   n_jobs=4, cv=skf, verbose=3)
gbx11to1 = rscv.fit(DRV_Jan2016_11to1_scaled, DRV_Jan2016_11to1['is_churn'])
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed: 119.0min remaining:    0.0s
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed: 119.0min finished


Time spent for training: 9012


In [17]:
start = time.time()
rscv = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, scoring='recall', 
                                   n_jobs=4, cv=skf, verbose=3)
gbx13to1 = rscv.fit(DRV_Jan2016_13to1_scaled, DRV_Jan2016_13to1['is_churn'])
end = time.time()
print('Time spent for training: {}'.format(round(end-start)))

Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed: 161.3min remaining:    0.0s
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed: 161.4min finished


Time spent for training: 11891


In [18]:
# Dictionary of Models that were created
ensembles_created = {
                  'gbx1to1' : gbx1to1,
                  'gbx3to1' : gbx3to1,
                  'gbx5to1' : gbx5to1,
                  'gbx7to1' : gbx7to1,
                  'gbx9to1' : gbx9to1,
                  'gbx11to1' : gbx11to1,
                  'gbx13to1' : gbx13to1}

#### - <font color=blue>Evaluate Train Model: All Splits, XGB + GridCV </font> -

In [19]:
# Train Model Results
train_all_results = pd.DataFrame()

for model_name, model1 in ensembles_created.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(DRV_Jan2016_scaled)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=DRV_Jan2016['is_churn'])
   
    # Create a Dataframe of Train Results and Print Confusion Matrixes
    train_all_results = train_all_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

gbx1to1
[[549877 122874]
 [  1386  17762]]

gbx3to1
[[627060  45691]
 [  4122  15026]]

gbx5to1
[[642246  30505]
 [  4186  14962]]

gbx7to1
[[635090  37661]
 [  4233  14915]]

gbx9to1
[[650096  22655]
 [  4820  14328]]

gbx11to1
[[661803  10948]
 [  7297  11851]]

gbx13to1
[[666593   6158]
 [  8973  10175]]



In [20]:
train_all_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,gbx1to1,0.872486,0.898477,0.817356,0.997486,0.179593
0,gbx3to1,0.858406,0.961793,0.932083,0.993469,0.071995
0,gbx5to1,0.868022,0.973698,0.954656,0.993524,0.050139
0,gbx7to1,0.861476,0.968065,0.944019,0.993379,0.060549
0,gbx9to1,0.857301,0.979301,0.966325,0.99264,0.03971
0,gbx11to1,0.801321,0.986398,0.983727,0.989094,0.026369
0,gbx13to1,0.761117,0.988773,0.990847,0.986718,0.021869


#### - <font color=blue>Evaluate Valuation Model: All Splits, XGB + GridCV </font> -

In [21]:
# Validation Model Results
validation_all_results = pd.DataFrame()

# Create a Dataframe of Validation Results and Print Confusion Matrixes
for model_name, model1 in ensembles_created.items():
    
    # Temporary Variables for our Loop
    temp = model1.predict(DRV_Feb2016_scaled)
    temp_class = ClassEvaluatorPandas(modelname=model_name, model=model1, y_pred=temp, y_true=DRV_Feb2016['is_churn'])

    # Validation Results and Print Confusion Matrixes
    validation_all_results = validation_all_results.append(temp_class.evaluate())
    print('{}'.format(model_name))
    print(temp_class.confusionmatrix())
    print('')

gbx1to1
[[426173 102031]
 [  1571  18352]]

gbx3to1
[[491210  36994]
 [  4714  15209]]

gbx5to1
[[507790  20414]
 [  7481  12442]]

gbx7to1
[[510516  17688]
 [  8466  11457]]

gbx9to1
[[519327   8877]
 [ 10797   9126]]

gbx11to1
[[520290   7914]
 [ 11156   8767]]

gbx13to1
[[522139   6065]
 [ 12350   7573]]



In [22]:
validation_all_results

Unnamed: 0,modelname,AUC,f1,precision,recall,error
0,gbx1to1,0.86399,0.891619,0.806834,0.996327,0.189011
0,gbx3to1,0.846676,0.95927,0.929963,0.990495,0.076092
0,gbx5to1,0.792928,0.973262,0.961352,0.985481,0.050891
0,gbx7to1,0.770788,0.975019,0.966513,0.983687,0.047715
0,gbx9to1,0.720629,0.981405,0.983194,0.979633,0.035893
0,gbx11to1,0.712531,0.981998,0.985017,0.979008,0.034791
0,gbx13to1,0.684316,0.982666,0.988518,0.976894,0.033596


#### <font color=purple>Generalization Between Train and Validation</font>

In [23]:
results_all = train_all_results[train_all_results.columns[1:]] - validation_all_results[validation_all_results.columns[1:]]
results_all['modelname'] = train_all_results['modelname']
results_all.sort_values('AUC', ascending=True)

Unnamed: 0,AUC,f1,precision,recall,error,modelname
0,0.008496,0.006858,0.010522,0.001159,-0.009418,gbx1to1
0,0.011731,0.002523,0.002121,0.002975,-0.004097,gbx3to1
0,0.075094,0.000435,-0.006696,0.008043,-0.000753,gbx5to1
0,0.076801,0.006106,0.002329,0.009824,-0.011727,gbx13to1
0,0.088791,0.0044,-0.001291,0.010086,-0.008422,gbx11to1
0,0.090687,-0.006954,-0.022494,0.009692,0.012834,gbx7to1
0,0.136672,-0.002104,-0.016869,0.013007,0.003816,gbx9to1
