In [1]:
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline
# Notebook-wide default figure size, given to matplotlib as (width, height) in inches.
plt.rcParams['figure.figsize'] = [10, 8]

# Put the parent directory on the import path (once) so that the
# `custom_scripts` package imported below can be resolved from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from custom_scripts.config import *
from custom_scripts.evaluate_performance import evaluate_performance
from custom_scripts.prepare_data import prepare_data

from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import KFold

import xgboost as xgb

## Load the data

In [55]:
# Load the training split plus one held-out test split per day.
# NOTE(review): `all_day_loader` is not imported by name in this notebook; presumably it
# arrives via `from custom_scripts.config import *` — confirm and import it explicitly.
# NOTE(review): test_frac=0.2 presumably holds out 20% of the data for testing — confirm
# against the loader's implementation.
train_data, tue_test_data, wed_test_data = all_day_loader(test_frac=0.2)

In [51]:
# Sanity check: size of the training split.
train_data.shape

(1439121, 68)

In [52]:
# Sanity check: size of the Tuesday test split.
tue_test_data.shape

(89129, 68)

In [53]:
# Sanity check: size of the Wednesday test split.
wed_test_data.shape

(138282, 68)

## Preparing the Dataset for Classification 

Here we separate the class variable (the "Label" column) from the rest of the dataset. The problem is set up as binary classification, with "BENIGN" passed as the negative class.

In [8]:
# Every split is prepared with the same settings: 'Label' is the class column,
# the task is binary, and 'BENIGN' is the negative class.
binary_label_args = dict(class_column='Label', classes='binary', neg_class='BENIGN')

X_train, y_train = prepare_data(data=train_data, **binary_label_args)

X_tue_test, y_tue_test = prepare_data(data=tue_test_data, **binary_label_args)

X_wed_test, y_wed_test = prepare_data(data=wed_test_data, **binary_label_args)

In [9]:
# Key the held-out features and labels by day so evaluation can loop over both.
X_test = dict(Tuesday=X_tue_test, Wednesday=X_wed_test)
y_test = dict(Tuesday=y_tue_test, Wednesday=y_wed_test)

## Training XGBoost

Reference for the scikit-learn style XGBoost API used below: https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn

In [10]:
# Binary logistic objective with a fixed seed; every other hyperparameter is
# left at the library default (the estimator repr printed by this cell lists them).
xgb_model = xgb.XGBClassifier(
    random_state=42,
    objective="binary:logistic",
)
# Kept as a separate statement so the cell still displays the fitted estimator.
xgb_model.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

## Save the trained model

Let's pickle the trained model to a binary file so that it can be reloaded later without retraining.

In [11]:
filename='./models/xgb_model_3days.pkl'

# open(..., 'wb') does not create missing parent directories, so on a fresh
# checkout without './models' the dump would fail with FileNotFoundError.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Serialize the fitted classifier with pickle.
# NOTE(review): a pickle is only safe to load from a trusted source, and the XGBoost
# docs do not guarantee pickles are portable across library versions — consider
# `xgb_model.save_model(...)` if the file must outlive this environment.
with open(filename,'wb') as file:
    pickle.dump(xgb_model,file)

## Predict labels

In [44]:
# Predict on each held-out day and report the confusion matrix (rows = true
# class, columns = predicted class, in the order [0, 1]) followed by the values
# printed by evaluate_performance.
# NOTE(review): with just_numbers=True the metrics are printed without labels;
# their order is defined in custom_scripts.evaluate_performance — confirm there.
y_pred = {}

for day, day_features in X_test.items():
    y_pred[day] = xgb_model.predict(day_features)
    print(day)
    day_truth = y_test[day]
    day_pred = y_pred[day]
    print(confusion_matrix(day_truth, day_pred, labels=[0, 1]))
    evaluate_performance(day_truth, day_pred, dec_digits=4, pos_label=1, just_numbers=True)

Tuesday
[[86253    57]
 [    6  2813]]
0.9993
0.9801
0.9979
0.9889
0.9986
Wednesday
[[87545   338]
 [   97 50302]]
0.9969
0.9933
0.9981
0.9957
0.9971
