# XGBoost Modeling

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)
Using TensorFlow backend.


# 1.1 Load Data

In [2]:
# Read the pre-split training and validation tables from the working directory.
train, val = (pd.read_csv(csv_path) for csv_path in ("train.csv", "val.csv"))

# 1.2 Preprocessing

First I will have to make sure there are at least 6 rows in each class for SMOTE to work properly: with its default of 5 nearest neighbours, SMOTE needs every class to contain at least 6 samples.

In [3]:
# Count how many training rows belong to each glass Type (largest class first).
train["Type"].value_counts()

2    48
1    34
7    19
3    10
5     6
6     3
Name: Type, dtype: int64

In [4]:
# Duplicate the class-6 rows so that class reaches the 6-row minimum mentioned above.
# The length guard keeps this cell idempotent: re-running it will not keep
# appending further copies once the class already has 6 rows.
class_6_rows = train[train["Type"] == 6]
if len(class_6_rows) < 6:
    # ignore_index=True renumbers the rows so the duplicated rows do not
    # repeat the index labels of the originals.
    train = pd.concat([train, class_6_rows], axis=0, ignore_index=True)

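Here I will separate the target column from the features and then standardize the features. The scaler is fit on the training set only and then applied to the validation set.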

In [5]:
# Separate the feature columns from the "Type" label for each split.
X_train = train.drop(columns="Type")
y_train = train["Type"]

X_val = val.drop(columns="Type")
y_val = val["Type"]

In [6]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_val = sc.transform(X_val)

Now I will apply SMOTE to oversample the minority classes so that all classes are balanced.

In [7]:
# Oversample the training set with SMOTE so the classes are balanced.
# A fixed random_state makes the synthetic samples -- and therefore every
# score computed from them below -- the same on each Restart & Run All.
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

# 2.1 Hyperparameter Tuning

To find the optimal parameters I will use a grid search.

In [8]:
# Coarse sweep over the tree depth of the XGBoost classifier.
# NOTE(review): X_train was already SMOTE-resampled before this cross-validation,
# so synthetic rows can sit in a different fold from the rows they were
# interpolated from -- the CV scores may be optimistic.
params = {"max_depth": [2, 3, 4, None]}
gscv = GridSearchCV(estimator=XGBClassifier(), param_grid=params).fit(X_train, y_train)

# Show the winning depth as the cell output.
gscv.best_params_



{'max_depth': 4}

Now I will run a finer-grained search around the best depth found above.

In [10]:
# Narrower sweep over the tree depths 4, 5 and 6.
params = {"max_depth": [4, 5, 6]}
gscv = GridSearchCV(estimator=XGBClassifier(), param_grid=params).fit(X_train, y_train)

# Show the winning depth as the cell output.
gscv.best_params_



{'max_depth': 4}

In [11]:
# Keep a reference to the best estimator found by the grid search.
# Nothing is written to disk here; the model only lives in this kernel session.
model = gscv.best_estimator_

# 3.1 Evaluation

Here I will see how well the model does on the training and validation sets.

In [12]:
# `model` is gscv.best_estimator_; GridSearchCV (refit=True by default) has
# already refit it on all of X_train, so it can predict without another fit.

# training predictions
train_pred = model.predict(X_train)

# validation predictions
val_pred = model.predict(X_val)

# scores
# NOTE: X_train / y_train are the SMOTE-resampled arrays at this point, so the
# training report is computed over synthetic rows as well as real ones.
print("training performance")
print(classification_report(y_train, train_pred))

print("validation performance")
print(classification_report(y_val, val_pred))

training performance
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        48
           2       1.00      1.00      1.00        48
           3       1.00      1.00      1.00        48
           5       1.00      1.00      1.00        48
           6       1.00      1.00      1.00        48
           7       1.00      1.00      1.00        48

    accuracy                           1.00       288
   macro avg       1.00      1.00      1.00       288
weighted avg       1.00      1.00      1.00       288

validation performance
              precision    recall  f1-score   support

           1       0.88      0.78      0.82        18
           2       0.50      0.83      0.62         6
           3       1.00      0.50      0.67         4
           5       0.60      0.75      0.67         4
           6       1.00      0.50      0.67         2
           7       0.83      0.83      0.83         6

    accuracy                     