## XGBoost
### XGBoost -->  “Extreme Gradient Boosting” - Gradient Boosted Trees
### “Gradient Boosting” originates from the paper “Greedy Function Approximation: A Gradient Boosting Machine” by Friedman.
https://xgboost.readthedocs.io/en/latest/tutorials/model.html 

### AIS Data – Activity Classification – Demo Data Extracted from Main Data

In [1]:
#### INPUTS (X): VesselName, MMSI, VesselType, Length, Width, SOG
##### MMSI --> Maritime Mobile Service Identity --> nine-digit ID
##### SOG --> Speed Over Ground
##### OUTPUT (y): Status (integer codes as produced by LabelEncoder, i.e. in sorted label order)
#####             0 - 'at anchor'
#####             1 - 'engaged in fishing'
#####             2 - 'undefined'
#####             3 - 'under way using engine'

In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Read the AIS demo extract into a DataFrame and preview its first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a row index
# that was written out with the CSV — confirm against the file.
dataset = pd.read_csv(filepath_or_buffer='AIS_2017_01_Zone01.csv')
dataset.head(n=5)

Unnamed: 0,VesselName,MMSI,VesselType,Length,Width,SOG,Status
0,ALEUTIAN NO 1,366988820,1001,37.22,9.73,9.2,engaged in fishing
1,ALEUTIAN NO 1,366988820,1001,37.22,9.73,9.6,engaged in fishing
2,ALEUTIAN NO 1,366988820,1001,37.22,9.73,9.3,engaged in fishing
3,ALEUTIAN NO 1,366988820,1001,37.22,9.73,9.5,engaged in fishing
4,ALEUTIAN NO 1,366988820,1001,37.22,9.73,9.6,engaged in fishing


In [4]:
# Preview the last five rows to check the end of the extract
dataset.tail(n=5)

Unnamed: 0,VesselName,MMSI,VesselType,Length,Width,SOG,Status
9952,WESTWOOD PACIFIC,211517000,1004,183.2,29.94,13.0,under way using engine
9953,WESTWOOD PACIFIC,211517000,1004,183.2,29.94,13.1,under way using engine
9954,WESTWOOD PACIFIC,211517000,1004,183.2,29.94,13.1,under way using engine
9955,WESTWOOD PACIFIC,211517000,1004,183.2,29.94,13.1,under way using engine
9956,WESTWOOD PACIFIC,211517000,1004,183.2,29.94,13.1,under way using engine


In [5]:
# Features: every column after the first one and before the last one.
# Target: the last column (Status).
# NOTE(review): judging by the preview, the feature slice includes VesselName (text)
# and MMSI (a vessel identifier) — confirm these are intended as model inputs.
X = dataset.iloc[:, 1:-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()
print(y[:3])

['engaged in fishing' 'engaged in fishing' 'engaged in fishing']


In [6]:
## Target categories
# {'at anchor', 'engaged in fishing', 'undefined', 'under way using engine'}
# Turn the Status strings into integer class ids. LabelEncoder numbers the
# classes in sorted label order, so 'engaged in fishing' becomes 1.
from sklearn.preprocessing import LabelEncoder
labelencoder_y = LabelEncoder()
labelencoder_y.fit(y)
y = labelencoder_y.transform(y)
print(y[:3])

[1 1 1]


In [7]:
# Hold out 25% of the rows as a test set; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [8]:
# Fit a gradient-boosted tree classifier on the training split.
# random_state is pinned explicitly (same seed as the train/test split) so the
# baseline model, the cross-validation and the grid search that reuse this
# estimator do not depend on the library's implicit default seed.
from xgboost import XGBClassifier
classifier = XGBClassifier(random_state = 0)
classifier.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...)

In [9]:
# Predict a class id for every row of the held-out test set
y_pred = classifier.predict(X=X_test)

In [10]:
# Confusion matrix on the test set (rows: true class, columns: predicted class)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[ 247    0    0    0]
 [   0  189    0    1]
 [   0    0  635    0]
 [   2    0    1 1415]]


### k-Fold Cross Validation

In [11]:
# Score the classifier with 10-fold cross-validation on the training split
# and report the mean of the ten fold scores
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
accuracies.mean()

0.9986607735679088

In [12]:
# Spread of the fold scores (standard deviation across the 10 folds)
np.std(accuracies)

0.001339047428180596

### Grid Search

In [13]:
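# Reference signature, apparently from an older xgboost release: it does not match the
# estimator printed above, which reports max_depth=None, learning_rate=None and
# objective='multi:softprob'. Check the installed version's documentation for current defaults.
# Definition : XGBClassifier(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, 
#                            objective="binary:logistic", booster='gbtree', n_jobs=1, nthread=None, gamma=0, 
#                            min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, 
#                            reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, 
#                            missing=None, **kwargs)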

In [14]:
# Hyper-parameter tuning: exhaustive search over max_depth and gamma,
# scored by accuracy with 10-fold cross-validation on the training split
from sklearn.model_selection import GridSearchCV
parameters = [{'max_depth': [2,4], 'gamma': [0.05, 0.1]}]
grid_search = GridSearchCV(classifier, parameters, scoring='accuracy', cv=10, n_jobs=1)
grid_search = grid_search.fit(X_train, y_train)
# Keep the winning score and parameter combination for the cells that follow
best_accuracy2, best_parameters2 = grid_search.best_score_, grid_search.best_params_

In [15]:
# Parameter combination taken from grid_search.best_params_
best_parameters2

{'gamma': 0.1, 'max_depth': 2}

In [16]:
# Score taken from grid_search.best_score_ (the search was scored by accuracy)
best_accuracy2

0.9987948218252815

In [17]:
# Rebuild the classifier with the hyper-parameters selected by the grid search
# and refit it on the training split.
# The parameters are unpacked from best_parameters2 rather than retyped as
# literals, so this cell cannot drift out of sync with the search result
# (the search output shown in this notebook is gamma=0.1, max_depth=2).
classifier = XGBClassifier(**best_parameters2, random_state = 0)
classifier.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.1, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=2, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...)

In [18]:
# Predict the test set again, this time with the tuned classifier
y_pred = classifier.predict(X=X_test)

In [19]:
# Confusion matrix for the tuned classifier (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[ 247,    0,    0,    0],
       [   0,  189,    0,    1],
       [   0,    0,  635,    0],
       [   0,    0,    1, 1417]], dtype=int64)