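### **Gradient boosting (scikit-learn)**

Note: although this notebook was originally titled "XGBoost", it does not import the `xgboost` library. The models used are scikit-learn's `DecisionTreeClassifier` and `GradientBoostingClassifier`.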

#### Imports

In [70]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
#import matplotlib.pyplot as plt
#import seaborn as sns

#### Constants

In [71]:
# Alternative dataset pair, currently disabled:
#TRAIN_SET = "df.csv"
#TEST_SET = "df1.csv"
# CSV file read as the training set by the data-loading cell.
TRAIN_SET = "original.csv"
# CSV file read as the test set; the notebook refers to it as the "shifted" test set.
TEST_SET = "shift_07original.csv"

#### Import data

Import training set

In [72]:
# Read the training CSV, then split it into the label column and the feature columns.
train_set = pd.read_csv(TRAIN_SET)
y_train = train_set['target']
x_train = train_set.drop(columns=['target'])
# Preview the first rows of the feature frame.
x_train.head()

Unnamed: 0,feature_1,feature_2
0,14.866983,19.95109
1,3.830674,14.511167
2,-4.163375,14.254921
3,7.459221,18.798361
4,-1.236098,13.234694


Import shifted test set

In [73]:
# Read the shifted test CSV, then split it into the label column and the feature columns.
test_set = pd.read_csv(TEST_SET)
y_test = test_set['target']
x_test = test_set.drop(columns=['target'])
# Preview the first rows of the feature frame.
x_test.head()

Unnamed: 0,feature_1,feature_2
0,7.255285,20.3021
1,9.457898,18.473645
2,45.318798,93.759114
3,4.753057,21.030337
4,9.727295,17.360757


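#### Decision tree

Baseline: a single decision tree fit on the training set, then evaluated on the shifted test set.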

In [106]:
# Baseline single decision tree (Gini impurity, depth capped at 7).
# random_state is fixed so that re-running the notebook reproduces the same tree:
# scikit-learn permutes the candidate features at every split, so ties between
# equally good splits are otherwise broken differently from run to run.
model = DTC(criterion='gini', max_depth=7, random_state=42)
model.fit(x_train, y_train)
# Accuracy on the data the tree was fit on (not an estimate of generalisation).
print(model.score(x_train, y_train))

1.0


In [107]:
# Hard class predictions on the shifted test set.
y_pred = model.predict(x_test)
# Column 1 of predict_proba is used as the score for the ROC AUC.
test_probabilities = model.predict_proba(x_test)
y_pred_proba = test_probabilities[:, 1]

test_accuracy = model.score(x_test, y_test)
test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_pred_proba)

print("score:", test_accuracy)
print(test_report)
print("AUC:", test_auc)

score: 0.701
              precision    recall  f1-score   support

           0       0.99      0.61      0.75       745
           1       0.46      0.98      0.62       255

    accuracy                           0.70      1000
   macro avg       0.72      0.79      0.69      1000
weighted avg       0.85      0.70      0.72      1000

AUC: 0.7915909988156337


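#### Gradient boosting

The hyperparameters below are the best combination reported by the grid search in the last section of this notebook.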

In [102]:
# Gradient boosting classifier using the best hyperparameters printed by the
# grid search cell of this notebook.
# random_state is fixed because subsample < 1.0 (row subsampling per stage) and
# max_features='log2' (feature subsampling per split) make training stochastic;
# without a seed the scores below change on every run.
model = GBC(
    learning_rate=0.05,
    max_depth=6,
    max_features='log2',
    min_samples_leaf=3,
    min_samples_split=3,
    n_estimators=100,
    subsample=0.7,
    random_state=42,
)
model.fit(x_train, y_train)
# Accuracy on the data the model was fit on (not an estimate of generalisation).
print("score:", model.score(x_train, y_train))

score: 1.0


In [103]:
# Hard class predictions on the shifted test set.
y_pred = model.predict(x_test)
# Column 1 of predict_proba is used as the score for the ROC AUC.
test_probabilities = model.predict_proba(x_test)
y_pred_proba = test_probabilities[:, 1]

test_accuracy = model.score(x_test, y_test)
test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_pred_proba)

print("score:", test_accuracy)
print(test_report)
print("AUC:", test_auc)

score: 0.702
              precision    recall  f1-score   support

           0       0.99      0.61      0.75       745
           1       0.46      0.98      0.63       255

    accuracy                           0.70      1000
   macro avg       0.73      0.79      0.69      1000
weighted avg       0.86      0.70      0.72      1000

AUC: 0.7517804974338729


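#### Grid search over parameters

Exhaustive 3-fold cross-validated search over seven hyperparameters with three candidate values each: 3^7 = 2,187 combinations, i.e. 6,561 model fits. This is slow; the best combination it reports is the one hard-coded in the gradient boosting section.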

In [100]:
def grid_search(model, x=None, y=None, param_grid=None, cv=3):
    """Run an exhaustive cross-validated hyperparameter search for ``model``.

    Prints the best parameter combination and its mean cross-validated score
    (no ``scoring`` is passed, so the estimator's default ``score`` is used),
    then returns the fitted search so the results can be reused instead of
    being copied by hand from the printed output.

    Parameters
    ----------
    model : estimator
        scikit-learn estimator to tune. It must accept every key of
        ``param_grid`` as a constructor parameter.
    x, y : array-like, optional
        Training features and labels. When omitted, the notebook-level
        ``x_train`` and ``y_train`` are used, as in the original version.
    param_grid : dict, optional
        Mapping of parameter name to candidate values. Defaults to the
        grid below (seven parameters with three values each, i.e.
        3**7 = 2187 candidates, each fit ``cv`` times).
    cv : int, default 3
        Number of cross-validation folds passed to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object (``best_params_``, ``best_score_``,
        ``best_estimator_``, ``cv_results_``).
    """
    # Fall back to the notebook globals so existing `grid_search(model)` calls
    # behave as before. `is None` is used because DataFrames have no truth value.
    if x is None:
        x = x_train
    if y is None:
        y = y_train
    if param_grid is None:
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [4, 5, 6],
            'min_samples_split': [2, 3, 4],
            'min_samples_leaf': [2, 3, 4],
            'max_features': [None, 'sqrt', 'log2'],
            'learning_rate': [0.075, 0.05, 0.025],
            'subsample': [0.6, 0.7, 0.8]
        }
    # Named `search` (not `grid_search`) so the local does not shadow this function.
    search = GridSearchCV(model, param_grid, n_jobs=-1, cv=cv)
    search.fit(x, y)
    print(search.best_params_)
    print(search.best_score_)
    return search

In [101]:
# Tune whichever estimator `model` currently refers to. The search grid uses keys
# such as n_estimators, learning_rate and subsample, so `model` must accept them.
grid_search(model)

{'learning_rate': 0.05, 'max_depth': 6, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 100, 'subsample': 0.7}
0.9950099800399202
