In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [21]:
data = pd.read_csv('../data/data.csv')
Y = data['Y']
data = data.drop(['Y'], axis = 1)


data_scaled = pd.read_csv('../data/data_scaled.csv')
Y_scaled = data_scaled['Y']
data_scaled = data_scaled.drop(['Unnamed: 0', 'Y'], axis = 1)

In [144]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, Y, test_size=0.2)
# X_train, X_test, y_train, y_test = train_test_split(data_scaled, Y, test_size=0.2)

In [97]:
import xgboost

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

In [131]:
# Not Scaled Dataset
# STEP 1 - First XGB - Highly Overfitted
# Fiz Learning Rate and n_estimators

xgb1_model = XGBClassifier( learning_rate=0.1, 
                                        n_estimators=1200,
                                        max_depth=5,
                                        min_child_weight=1,                         
                                        gamma=0,
                                        colsample_bytree=0.8,
                                        objective ='binary:logistic',
                                        scale_pos_weight = 1,
                                        seed=42)


xgb1_model.fit(X_train,y_train)
y_predicted = xgb1_model.predict(X_test)
y_predicted_train = xgb1_model.predict(X_train)


print('performance over the training set: ' + str(f1_score(y_train, y_predicted_train)))
print('performance over the test set: ' + str(f1_score(y_test, y_predicted)) + '\n')
print(classification_report(y_test, y_predicted))

performance over the training set: 1.0
performance over the test set: 0.18691588785046728

              precision    recall  f1-score   support

           0       0.93      0.99      0.96       992
           1       0.43      0.12      0.19        84

    accuracy                           0.92      1076
   macro avg       0.68      0.55      0.57      1076
weighted avg       0.89      0.92      0.90      1076



In [146]:
# STEP 2 - Tuning max_depth and min_child_weight

param_test2 = {
    'max_depth':range(1,7,1),
    'min_child_weight':range(0,7,1)
}

gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, 
                                                  n_estimators=1200, 
                                                  max_depth=3,
                                                  min_child_weight=4,
                                                  gamma=0,  
                                                  colsample_bytree=0.8,
                                                  objective= 'binary:logistic',
                                                  scale_pos_weight=1, 
                                                  seed=42), 
                        param_grid = param_test2, 
                        scoring='f1',
                        n_jobs=4,
                        iid=False, 
                        cv=5)


gsearch2.fit(X_train, y_train)
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 3, 'min_child_weight': 4}, 0.16407333382683148)

In [151]:
#Testing New Parameters

xgb2_model = XGBClassifier( learning_rate=0.1, 
                                        n_estimators=1200,
                                        max_depth=3,
                                        min_child_weight=5,                         
                                        gamma=0,
                                        colsample_bytree=0.8,
                                        objective ='binary:logistic',
                                        scale_pos_weight = 1,
                                        seed=42)


xgb2_model.fit(X_train,y_train)
y_predicted = xgb2_model.predict(X_test)
y_predicted_train = xgb2_model.predict(X_train)


print('performance over the training set: ' + str(f1_score(y_train, y_predicted_train)))
print('performance over the test set: ' + str(f1_score(y_test, y_predicted)) + '\n')
print(classification_report(y_test, y_predicted))

performance over the training set: 0.7890625
performance over the test set: 0.15094339622641512

              precision    recall  f1-score   support

           0       0.93      0.98      0.96       993
           1       0.35      0.10      0.15        83

    accuracy                           0.92      1076
   macro avg       0.64      0.54      0.55      1076
weighted avg       0.88      0.92      0.89      1076



In [137]:
# STEP 3 - Tuning Gamma 

param_test3 = { 
    'gamma':[i/10.0 for i in range(0,5)] 
}

gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, 
                                                  n_estimators=1200, 
                                                  max_depth = 3,
                                                  min_child_weight = 5,
                                                  gamma=0, 
                                                  colsample_bytree=0.8,
                                                  objective= 'binary:logistic',
                                                  scale_pos_weight=1,
                                                  seed=42), 
                        param_grid = param_test3, 
                        scoring='f1',
                        n_jobs=4,
                        iid=False, 
                        cv=5)

gsearch3.fit(X_train, y_train)
gsearch3.best_params_, gsearch3.best_score_

({'gamma': 0.0}, 0.18547316732055688)

In [138]:
# Testing New Parameters

xgb3_model = XGBClassifier( learning_rate=0.1, 
                                        n_estimators=1200,
                                        max_depth=3,
                                        min_child_weight=5,                         
                                        gamma=0,
                                        colsample_bytree=0.8,
                                        objective = 'binary:logistic',
                                        scale_pos_weight = 1,
                                        seed=42)


xgb3_model.fit(X_train,y_train)
y_predicted = xgb3_model.predict(X_test)
y_predicted_train = xgb3_model.predict(X_train)


print('performance over the training set: ' + str(f1_score(y_train, y_predicted_train)))
print('performance over the test set: ' + str(f1_score(y_test, y_predicted)) + '\n')
print(classification_report(y_test, y_predicted))

performance over the training set: 0.7953216374269005
performance over the test set: 0.18018018018018017

              precision    recall  f1-score   support

           0       0.93      0.98      0.96       992
           1       0.37      0.12      0.18        84

    accuracy                           0.92      1076
   macro avg       0.65      0.55      0.57      1076
weighted avg       0.89      0.92      0.89      1076



In [140]:
# STEP 4 - Tuning colsample_bytree and subsaample

param_test4 = {
    'subsample':[i/10.0 for i in range(7,11)],
    'colsample_bytree':[i/10.0 for i in range(7,11)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, 
                                                  n_estimators=1200, 
                                                  max_depth = 3,
                                                  min_child_weight = 5,
                                                  gamma=0, 
                                                  subsample=1, 
                                                  colsample_bytree=0.8,
                                                  objective= 'binary:logistic', 
                                                  scale_pos_weight=1,
                                                  seed=42), 
                        param_grid = param_test4, 
                        scoring='f1',
                        n_jobs=4,
                        iid=False, 
                        cv=5)

gsearch4.fit(X_train, y_train)
gsearch4.best_params_, gsearch4.best_score_

({'colsample_bytree': 0.8, 'subsample': 1.0}, 0.18547316732055688)

In [145]:
# Testing New Parameters

xgb4_model = XGBClassifier( learning_rate=0.1, 
                                        n_estimators=1200,
                                        max_depth=3,
                                        min_child_weight=5,                         
                                        gamma=0,
                                        subsample=1,
                                        colsample_bytree=0.8,
                                        objective ='binary:logistic',
                                        scale_pos_weight = 1,
                                        seed=42)


xgb4_model.fit(X_train,y_train)
y_predicted = xgb4_model.predict(X_test)
y_predicted_train = xgb4_model.predict(X_train)


print('performance over the training set: ' + str(f1_score(y_train, y_predicted_train)))
print('performance over the test set: ' + str(f1_score(y_test, y_predicted)) + '\n')
print(classification_report(y_test, y_predicted))

performance over the training set: 0.7890625
performance over the test set: 0.15094339622641512

              precision    recall  f1-score   support

           0       0.93      0.98      0.96       993
           1       0.35      0.10      0.15        83

    accuracy                           0.92      1076
   macro avg       0.64      0.54      0.55      1076
weighted avg       0.88      0.92      0.89      1076



In [157]:
# STEP 5 - Tuning Regularization Parameters
# Lambda L2 Regularization

param_test5 = {
    'reg_lambda':[1e-2, 0.1, 0.5, 1, 2, 10]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, 
                                                  n_estimators=1200, 
                                                  max_depth = 3,
                                                  min_child_weight = 5,
                                                  gamma=0, 
                                                  subsample=1, 
                                                  colsample_bytree=0.8,
                                                  reg_lambda = 2,
                                                  objective= 'binary:logistic', 
                                                  scale_pos_weight=1,
                                                  seed=42), 
                        param_grid = param_test5, 
                        scoring='f1',
                        n_jobs=4,
                        iid=False, 
                        cv=5)

gsearch5.fit(X_train, y_train)
gsearch5.best_params_, gsearch5.best_score_

({'reg_lambda': 2}, 0.1663915032336085)

In [158]:
# Closer Look - Lambda L2 Regularization

param_test5 = {
    'reg_lambda':[2, 3, 4, 5]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, 
                                                  n_estimators=1200, 
                                                  max_depth = 3,
                                                  min_child_weight = 5,
                                                  gamma=0, 
                                                  subsample=1, 
                                                  colsample_bytree=0.8,
                                                  reg_lambda = 3,
                                                  objective= 'binary:logistic', 
                                                  scale_pos_weight=1,
                                                  seed=42), 
                        param_grid = param_test5, 
                        scoring='f1',
                        n_jobs=4,
                        iid=False, 
                        cv=5)

gsearch5.fit(X_train, y_train)
gsearch5.best_params_, gsearch5.best_score_

({'reg_lambda': 3}, 0.17262618083670717)

In [161]:
# Even Closer Look - Lambda L2 Regularization

param_test5 = {
    'reg_lambda':[2.6, 2.8, 3, 3.2, 3.4]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, 
                                                  n_estimators=1200, 
                                                  max_depth = 3,
                                                  min_child_weight = 5,
                                                  gamma=0, 
                                                  subsample=1, 
                                                  colsample_bytree=0.8,
                                                  reg_lambda = 3,
                                                  objective= 'binary:logistic', 
                                                  scale_pos_weight=1,
                                                  seed=42), 
                        param_grid = param_test5, 
                        scoring='f1',
                        n_jobs=4,
                        iid=False, 
                        cv=5)

gsearch5.fit(X_train, y_train)
gsearch5.best_params_, gsearch5.best_score_

({'reg_lambda': 3}, 0.17262618083670717)

In [162]:
# Alpha L1 Regularization

param_test5 = {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, 
                                                  n_estimators=1200, 
                                                  max_depth = 3,
                                                  min_child_weight = 5,
                                                  gamma=0, 
                                                  subsample=1, 
                                                  colsample_bytree=0.8,
                                                  reg_lambda = 3,
                                                  objective= 'binary:logistic', 
                                                  scale_pos_weight=1,
                                                  seed=42), 
                        param_grid = param_test5, 
                        scoring='f1',
                        n_jobs=4,
                        iid=False, 
                        cv=5)

gsearch5.fit(X_train, y_train)
gsearch5.best_params_, gsearch5.best_score_


({'reg_alpha': 1e-05}, 0.17262618083670717)

In [163]:
# STEP 6 - Reducing Learning Rate and Adding More Trees

param_test6 = {
    'learning_rate':[i/100.0 for i in range(5,20,2)]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, 
                                                  n_estimators=3000, 
                                                  max_depth = 3,
                                                  min_child_weight = 5,
                                                  gamma=0, 
                                                  subsample=1, 
                                                  colsample_bytree=0.8,
                                                  reg_alpha=0,
                                                  reg_lambda=3,
                                                  objective= 'binary:logistic', 
                                                  scale_pos_weight=1,
                                                  seed=42), 
                        param_grid = param_test6, 
                        scoring='f1',
                        n_jobs=4,
                        iid=False, 
                        cv=5)

gsearch6.fit(X_train, y_train)
gsearch6.best_params_, gsearch6.best_score_

({'learning_rate': 0.07}, 0.16661248265677714)

In [165]:
#Final Evaluation


xgb6_model = XGBClassifier(learning_rate=0.07, 
                                        n_estimators=3000,
                                        max_depth=3,
                                        min_child_weight=5,                         
                                        gamma=0,
                                        subsample=1,
                                        colsample_bytree=0.8,
                                        reg_alpha=0,
                                        reg_lambda=3,
                                        objective ='binary:logistic',
                                        scale_pos_weight = 1,
                                        seed=42)


xgb6_model.fit(X_train,y_train)
y_predicted = xgb6_model.predict(X_test)
y_predicted_train = xgb6_model.predict(X_train)


print('performance over the training set: ' + str(f1_score(y_train, y_predicted_train)))
print('performance over the test set: ' + str(f1_score(y_test, y_predicted)) + '\n')
print(classification_report(y_test, y_predicted))

performance over the training set: 0.9236111111111112
performance over the test set: 0.14953271028037382

              precision    recall  f1-score   support

           0       0.93      0.98      0.96       993
           1       0.33      0.10      0.15        83

    accuracy                           0.92      1076
   macro avg       0.63      0.54      0.55      1076
weighted avg       0.88      0.92      0.89      1076



In [166]:
#Final Evaluation


xgb6_model = XGBClassifier(learning_rate=0.07, 
                                        n_estimators=2000,
                                        max_depth=3,
                                        min_child_weight=5,                         
                                        gamma=0,
                                        subsample=1,
                                        colsample_bytree=0.8,
                                        reg_alpha=0,
                                        reg_lambda=3,
                                        objective ='binary:logistic',
                                        scale_pos_weight = 1,
                                        seed=42)


xgb6_model.fit(X_train,y_train)
y_predicted = xgb6_model.predict(X_test)
y_predicted_train = xgb6_model.predict(X_train)


print('performance over the training set: ' + str(f1_score(y_train, y_predicted_train)))
print('performance over the test set: ' + str(f1_score(y_test, y_predicted)) + '\n')
print(classification_report(y_test, y_predicted))

performance over the training set: 0.7914230019493177
performance over the test set: 0.15686274509803924

              precision    recall  f1-score   support

           0       0.93      0.99      0.96       993
           1       0.42      0.10      0.16        83

    accuracy                           0.92      1076
   macro avg       0.68      0.54      0.56      1076
weighted avg       0.89      0.92      0.90      1076

