In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [25]:
# Load the (unscaled) dataset and separate the target column 'Y' from the
# feature columns. After this cell `data` holds features only and `Y` the labels.
data = pd.read_csv('../data/data.csv')
Y = data['Y']
data = data.drop(columns=['Y'])


# Alternative input: the pre-scaled dataset. That file also carries an
# 'Unnamed: 0' column, which is dropped together with the target.
# data = pd.read_csv('../data/data_scaled.csv')
# Y = data['Y']
# data = data.drop(['Unnamed: 0', 'Y'], axis = 1)

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): the split is not stratified, although the recorded test-set
# report shows a strongly imbalanced target (123 positives out of 1614 rows).
# Consider `stratify=Y` -- doing so changes the split and therefore every
# recorded score below.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    Y,
    test_size=0.3,
    random_state=42,
)

In [27]:
# Oversampling
#
# Rebalance the training split only; X_test / y_test are left untouched.
# The resampled arrays are bound back to X_train / y_train, so every later
# cell (including the cross-validated grid searches) sees the oversampled data.
# NOTE(review): because the grid searches below cross-validate on this already
# oversampled set, resampled copies of a minority row can presumably land in
# both the fitting and the validation fold, which would inflate the CV scores.
# Resampling inside each fold (e.g. an imblearn Pipeline) avoids this -- confirm.

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

ros = RandomOverSampler(random_state=0)
# Alternative samplers that were tried:
# ros = SMOTE(random_state=0)
# ros = ADASYN(random_state=0)
X_train, y_train = ros.fit_resample(X_train, y_train)



In [28]:
import xgboost

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

In [29]:
# STEP 1 - First XGB - Highly Overfitted
# Fix learning rate and n_estimators
#
# Baseline model with a fixed learning rate and number of trees. It is fitted
# on the (oversampled) training data and scored with F1 on both the training
# data and the held-out test split.

xgb1_model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1200,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=42,
)
xgb1_model.fit(X_train, y_train)

# Predictions on both splits, so the train/test gap (overfitting) is visible.
y_predicted_train = xgb1_model.predict(X_train)
y_predicted = xgb1_model.predict(X_test)

print(f'performance over the training set: {f1_score(y_train, y_predicted_train)}')
print(f'performance over the test set: {f1_score(y_test, y_predicted)}\n')
print(classification_report(y_test, y_predicted))

performance over the training set: 1.0
performance over the test set: 0.18888888888888886

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1491
           1       0.30      0.14      0.19       123

    accuracy                           0.91      1614
   macro avg       0.62      0.56      0.57      1614
weighted avg       0.88      0.91      0.89      1614



In [None]:
# Data not Scaled performance
# performance over the training set: 1.0
# performance over the test set: 0.18888888888888886

#               precision    recall  f1-score   support

#            0       0.93      0.97      0.95      1491
#            1       0.30      0.14      0.19       123

#     accuracy                           0.91      1614
#    macro avg       0.62      0.56      0.57      1614
# weighted avg       0.88      0.91      0.89      1614


# DataScaled performance
# performance over the training set: 1.0
# performance over the test set: 0.17877094972067037

#               precision    recall  f1-score   support

#            0       0.93      0.97      0.95      1491
#            1       0.29      0.13      0.18       123

#     accuracy                           0.91      1614
#    macro avg       0.61      0.55      0.57      1614
# weighted avg       0.88      0.91      0.89      1614

In [None]:
# STEP 2 - Tuning max_depth and min_child_weight
#
# Exhaustive 5-fold grid search (scored with F1) over tree depth and minimum
# child weight; all other hyper-parameters stay at their STEP 1 values.
# The winning values are kept in best_max_depth / best_min_child_weight for
# the following steps.
# NOTE(review): X_train is the oversampled training set at this point, so the
# CV scores reported here are presumably optimistic -- confirm.

param_test2 = {
    'max_depth': range(1, 7, 1),
    'min_child_weight': range(0, 7, 1),
}

# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError. Without it the
# score is the plain mean over folds, which is what `iid=False` requested.
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1200,
        max_depth=3,          # starting value; replaced by the grid
        min_child_weight=4,   # starting value; replaced by the grid
        gamma=0,
        colsample_bytree=0.8,
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=42,
    ),
    param_grid=param_test2,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch2.fit(X_train, y_train)
best_max_depth = gsearch2.best_params_['max_depth']
best_min_child_weight = gsearch2.best_params_['min_child_weight']
gsearch2.best_params_, gsearch2.best_score_

In [None]:
# Testing New Parameters
#
# Refit with the max_depth / min_child_weight selected in STEP 2 and report
# F1 on the training data and on the held-out test split.

xgb2_model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1200,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=0,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=42,
)
xgb2_model.fit(X_train, y_train)

y_predicted_train = xgb2_model.predict(X_train)
y_predicted = xgb2_model.predict(X_test)

print(f'performance over the training set: {f1_score(y_train, y_predicted_train)}')
print(f'performance over the test set: {f1_score(y_test, y_predicted)}\n')
print(classification_report(y_test, y_predicted))

In [None]:
# STEP 3 - Tuning Gamma
#
# 5-fold grid search (F1) over gamma in {0.0, 0.1, 0.2, 0.3, 0.4}, using the
# tree-shape parameters selected in STEP 2. The winner is kept in best_gamma.

param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)],
}

# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1200,
        max_depth=best_max_depth,
        min_child_weight=best_min_child_weight,
        gamma=0,              # starting value; replaced by the grid
        colsample_bytree=0.8,
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=42,
    ),
    param_grid=param_test3,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch3.fit(X_train, y_train)
best_gamma = gsearch3.best_params_['gamma']
gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Testing New Parameters
#
# Refit with the values selected in STEP 2 and STEP 3 (tree shape and gamma)
# and report F1 on the training data and on the held-out test split.

xgb3_model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1200,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=42,
)
xgb3_model.fit(X_train, y_train)

y_predicted_train = xgb3_model.predict(X_train)
y_predicted = xgb3_model.predict(X_test)

print(f'performance over the training set: {f1_score(y_train, y_predicted_train)}')
print(f'performance over the test set: {f1_score(y_test, y_predicted)}\n')
print(classification_report(y_test, y_predicted))

In [None]:
# STEP 4 - Tuning colsample_bytree and subsample
#
# 5-fold grid search (F1) over row and column subsampling ratios in
# {0.7, 0.8, 0.9, 1.0}. The winners are kept in best_subsample /
# best_colsample_bytree.
#
# The base estimator uses the values selected in STEP 2 and STEP 3
# (best_max_depth, best_min_child_weight, best_gamma) rather than hard-coded
# numbers, so this step builds on the earlier searches like every other step.

param_test4 = {
    'subsample': [i / 10.0 for i in range(7, 11)],
    'colsample_bytree': [i / 10.0 for i in range(7, 11)],
}

# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1200,
        max_depth=best_max_depth,
        min_child_weight=best_min_child_weight,
        gamma=best_gamma,
        subsample=1,            # starting value; replaced by the grid
        colsample_bytree=0.8,   # starting value; replaced by the grid
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=42,
    ),
    param_grid=param_test4,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch4.fit(X_train, y_train)
best_subsample = gsearch4.best_params_['subsample']
best_colsample_bytree = gsearch4.best_params_['colsample_bytree']
gsearch4.best_params_, gsearch4.best_score_

In [None]:
# Testing New Parameters
#
# Refit with the values selected in STEP 2-4 (tree shape, gamma, subsampling)
# and report F1 on the training data and on the held-out test split.

xgb4_model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1200,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=42,
)
xgb4_model.fit(X_train, y_train)

y_predicted_train = xgb4_model.predict(X_train)
y_predicted = xgb4_model.predict(X_test)

print(f'performance over the training set: {f1_score(y_train, y_predicted_train)}')
print(f'performance over the test set: {f1_score(y_test, y_predicted)}\n')
print(classification_report(y_test, y_predicted))

In [None]:
# STEP 5 - Tuning Regularization Parameters
# Lambda L2 Regularization
#
# Coarse 5-fold grid search (F1) over reg_lambda, with all previously tuned
# parameters fixed. The winner is kept in best_reg_lambda and refined by the
# "closer look" cells that follow.

param_test5 = {
    'reg_lambda': [1e-2, 0.1, 0.5, 1, 2, 10],
}

# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1200,
        max_depth=best_max_depth,
        min_child_weight=best_min_child_weight,
        gamma=best_gamma,
        subsample=best_subsample,
        colsample_bytree=best_colsample_bytree,
        reg_lambda=2,         # starting value; replaced by the grid
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=42,
    ),
    param_grid=param_test5,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch5.fit(X_train, y_train)
best_reg_lambda = gsearch5.best_params_['reg_lambda']
gsearch5.best_params_, gsearch5.best_score_

In [None]:
# Closer Look - Lambda L2 Regularization
#
# Finer 5-fold grid search (F1) around the current best_reg_lambda: the
# candidates are 0.7x, 0.8x, 1x, 1.5x and 3x that value.
# Note: this cell rebinds param_test5, gsearch5 and best_reg_lambda, so
# re-running it re-centres the grid on its own previous result.

param_test5 = {
    'reg_lambda': [
        0.7 * best_reg_lambda,
        0.8 * best_reg_lambda,
        best_reg_lambda,
        1.5 * best_reg_lambda,
        3 * best_reg_lambda,
    ],
}

# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1200,
        max_depth=best_max_depth,
        min_child_weight=best_min_child_weight,
        gamma=best_gamma,
        subsample=best_subsample,
        colsample_bytree=best_colsample_bytree,
        reg_lambda=best_reg_lambda,   # starting value; replaced by the grid
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=42,
    ),
    param_grid=param_test5,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch5.fit(X_train, y_train)
best_reg_lambda = gsearch5.best_params_['reg_lambda']
gsearch5.best_params_, gsearch5.best_score_

In [None]:
# Even Closer Look - Lambda L2 Regularization
#
# Second refinement of reg_lambda: 5-fold grid search (F1) over 0.66x, 0.8x,
# 1x, 1.5x and 3x the current best_reg_lambda.
# Note: this cell rebinds param_test5, gsearch5 and best_reg_lambda, so
# re-running it re-centres the grid on its own previous result.

param_test5 = {
    'reg_lambda': [
        0.66 * best_reg_lambda,
        0.8 * best_reg_lambda,
        best_reg_lambda,
        1.5 * best_reg_lambda,
        3 * best_reg_lambda,
    ],
}

# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1200,
        max_depth=best_max_depth,
        min_child_weight=best_min_child_weight,
        gamma=best_gamma,
        subsample=best_subsample,
        colsample_bytree=best_colsample_bytree,
        reg_lambda=best_reg_lambda,   # starting value; replaced by the grid
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=42,
    ),
    param_grid=param_test5,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch5.fit(X_train, y_train)
best_reg_lambda = gsearch5.best_params_['reg_lambda']
gsearch5.best_params_, gsearch5.best_score_

In [None]:
# Alpha L1 Regularization
#
# 5-fold grid search (F1) over reg_alpha with reg_lambda and all earlier
# parameters fixed at their tuned values. The winner is kept in best_reg_alpha.
# Note: this cell reuses the names param_test5 / gsearch5 from the lambda
# searches above.

param_test5 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}

# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=1200,
        max_depth=best_max_depth,
        min_child_weight=best_min_child_weight,
        gamma=best_gamma,
        subsample=best_subsample,
        colsample_bytree=best_colsample_bytree,
        reg_lambda=best_reg_lambda,
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=42,
    ),
    param_grid=param_test5,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch5.fit(X_train, y_train)
best_reg_alpha = gsearch5.best_params_['reg_alpha']
gsearch5.best_params_, gsearch5.best_score_


In [None]:
# STEP 6 - Reducing Learning Rate and Adding More Trees
#
# 5-fold grid search (F1) over learning_rate in {0.05, 0.07, ..., 0.19} with
# n_estimators raised to 3000 and every other parameter fixed at its tuned
# value. The winner is kept in best_LR.

param_test6 = {
    'learning_rate': [i / 100.0 for i in range(5, 20, 2)],
}

# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
gsearch6 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,    # starting value; replaced by the grid
        n_estimators=3000,
        max_depth=best_max_depth,
        min_child_weight=best_min_child_weight,
        gamma=best_gamma,
        subsample=best_subsample,
        colsample_bytree=best_colsample_bytree,
        reg_lambda=best_reg_lambda,
        reg_alpha=best_reg_alpha,
        objective='binary:logistic',
        scale_pos_weight=1,
        seed=42,
    ),
    param_grid=param_test6,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch6.fit(X_train, y_train)
best_LR = gsearch6.best_params_['learning_rate']
gsearch6.best_params_, gsearch6.best_score_

In [None]:
# Final Evaluation
#
# Fit the model with every hyper-parameter selected by the searches above
# (STEP 2-6) and report F1 on the training data and on the held-out test split.

xgb6_model = XGBClassifier(
    learning_rate=best_LR,
    n_estimators=3000,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    reg_lambda=best_reg_lambda,
    reg_alpha=best_reg_alpha,
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=42,
)
xgb6_model.fit(X_train, y_train)

y_predicted_train = xgb6_model.predict(X_train)
y_predicted = xgb6_model.predict(X_test)

print(f'performance over the training set: {f1_score(y_train, y_predicted_train)}')
print(f'performance over the test set: {f1_score(y_test, y_predicted)}\n')
print(classification_report(y_test, y_predicted))

In [30]:
# Final Evaluation
#
# Same evaluation as the tuned-parameter cell, but with hand-entered
# hyper-parameters, so it can run without executing the grid searches.
# NOTE(review): these literals are presumably the values found by an earlier
# run of the searches -- confirm. This cell rebinds xgb6_model.

xgb6_model = XGBClassifier(
    learning_rate=0.07,
    n_estimators=2000,
    max_depth=3,
    min_child_weight=5,
    gamma=0,
    subsample=1,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=3,
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=42,
)
xgb6_model.fit(X_train, y_train)

y_predicted_train = xgb6_model.predict(X_train)
y_predicted = xgb6_model.predict(X_test)

print(f'performance over the training set: {f1_score(y_train, y_predicted_train)}')
print(f'performance over the test set: {f1_score(y_test, y_predicted)}\n')
print(classification_report(y_test, y_predicted))

performance over the training set: 0.9937464468447982
performance over the test set: 0.2375478927203065

              precision    recall  f1-score   support

           0       0.94      0.93      0.93      1491
           1       0.22      0.25      0.24       123

    accuracy                           0.88      1614
   macro avg       0.58      0.59      0.59      1614
weighted avg       0.88      0.88      0.88      1614



In [23]:
# data not scaled
# performance over the training set: 0.9937464468447982
# performance over the test set: 0.2375478927203065

#               precision    recall  f1-score   support

#            0       0.94      0.93      0.93      1491
#            1       0.22      0.25      0.24       123

#     accuracy                           0.88      1614
#    macro avg       0.58      0.59      0.59      1614
# weighted avg       0.88      0.88      0.88      1614


# data_scaled
# performance over the training set: 0.9937464468447982
# performance over the test set: 0.2375478927203065

#               precision    recall  f1-score   support

#            0       0.94      0.93      0.93      1491
#            1       0.22      0.25      0.24       123

#     accuracy                           0.88      1614
#    macro avg       0.58      0.59      0.59      1614
# weighted avg       0.88      0.88      0.88      1614

Unnamed: 0,Number_Of_Dependant,Years_At_Residence,Years_At_Business,Nb_Of_Products,DAY(BirthDate),DAY(Customer_Open_Date),DAY(Prod_Decision_Date),YEAR(BirthDate),YEAR(Customer_Open_Date),YEAR(Prod_Decision_Date),...,Prod_Category_I,Prod_Category_J,Prod_Category_K,Prod_Category_L,Prod_Category_M,KMeans 2,KMeans 3,Agglomerative Clustering Cosine,Agglomerative Clustering Euclidean,DBSCAN eps=3
0,0.703491,2.243852,-0.451898,-0.299213,1.637976,0.280783,1.080961,-0.433226,0.284388,-0.770762,...,-0.027277,-0.115644,-0.227615,-0.214193,-0.095872,0.633855,0.622740,0.802342,0.301022,-0.193059
1,-0.790403,0.739525,-0.451898,3.061464,-0.916956,0.280783,0.485969,-1.892971,0.284388,-0.770762,...,-0.027277,-0.115644,-0.227615,-0.214193,-0.095872,0.633855,-1.788445,-1.246352,0.301022,-0.486433
2,-0.790403,0.238083,-0.451898,3.061464,0.244376,0.761181,0.723966,0.661583,0.284388,-0.770762,...,-0.027277,-0.115644,-0.227615,-0.214193,-0.095872,0.633855,0.622740,0.802342,0.301022,-0.125357
3,0.703491,-0.062783,-0.313478,-0.299213,0.825043,-0.079516,0.009976,0.296646,0.603939,1.297417,...,-0.027277,-0.115644,-0.227615,-0.214193,-0.095872,0.633855,0.622740,0.802342,0.301022,-0.193059
4,1.450438,4.149332,3.285447,3.061464,-0.800823,0.040584,-0.228021,-1.436801,-0.354713,-0.770762,...,-0.027277,-0.115644,-0.227615,-0.214193,-0.095872,0.633855,0.622740,0.802342,0.301022,-0.486433
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6987,-0.790403,1.240967,-0.313478,-0.299213,1.405709,1.721978,-1.537003,1.482689,0.284388,-0.770762,...,-0.027277,-0.115644,-0.227615,-0.214193,-0.095872,0.633855,0.622740,0.802342,0.301022,-0.486433
6988,-0.790403,0.438659,1.209144,-0.299213,0.128243,-1.640811,-1.537003,0.114178,0.284388,-0.770762,...,-0.027277,-0.115644,-0.227615,-0.214193,10.430528,-1.577649,-1.788445,-1.246352,0.301022,-0.486433
6989,-0.790403,0.438659,1.209144,-0.299213,0.128243,-1.640811,-1.537003,0.114178,0.284388,-0.770762,...,-0.027277,-0.115644,-0.227615,-0.214193,10.430528,-1.577649,-1.788445,-1.246352,0.301022,-0.486433
6990,-0.790403,0.238083,-0.451898,-0.299213,-0.916956,1.241580,-0.109023,1.391455,0.284388,-0.770762,...,-0.027277,-0.115644,-0.227615,4.668684,-0.095872,0.633855,0.622740,0.802342,0.301022,-0.035088
