## XGBoost Training - Full

In [None]:
import os
import re

import numpy as np
import pandas as pd

import xgboost as xgb
from xgboost import DMatrix

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

Load the data

In [None]:
# Input locations for the full Yale New Haven training set (features and labels).
# The leading "~" is stored as-is; it is not expanded in this cell.
training_feats_filepath = (
    "~/scratch/datasets/yale_new_haven/training_test_sets/full_dataset"
    "/features/normalized_preprocessing/xgboost"
    "/yale_new_haven_training_features_xgb.csv"
)
training_labels_filepath = (
    "~/scratch/datasets/yale_new_haven/training_test_sets/full_dataset"
    "/labels/full_dataset_training_labels.csv"
)

In [None]:
# Features and labels live in separate CSV files; each one is read into its own DataFrame.
df_train = pd.read_csv(filepath_or_buffer=training_feats_filepath)
y_train = pd.read_csv(filepath_or_buffer=training_labels_filepath)

In [None]:
# Set the row identifiers aside as 32-bit integers, then keep every column
# except 'ID' as the feature table.
train_ids = df_train['ID'].astype(np.int32)
df_train = df_train.drop(columns=['ID'])

Training/Validation split

In [None]:
# Hold out 10% of the rows for validation. The fixed random_state makes the
# split, and therefore every metric computed from it, reproducible across
# kernel restarts; without it each run draws a different validation set.
# NOTE: this rebinds y_train to the training portion only, so the cell should
# not be re-run without reloading the labels first.
X_train, X_valid, y_train, y_valid = train_test_split(
    df_train, y_train, test_size=0.1, random_state=42
)

Create DMatrix objects for XGBoost

In [None]:
# Wrap each split in a DMatrix, passing the features as data and the targets as labels.
dtrain = DMatrix(data=X_train, label=y_train)
dvalid = DMatrix(data=X_valid, label=y_valid)

In [None]:
# Rebind the names of the large intermediates to a placeholder value so these
# names no longer keep the original objects alive and their memory can be freed.
df_train = X_train = X_valid = 0

#### Set up the tree

In [None]:
# Booster configuration handed to xgb.train.
params = dict(
    # Tree construction algorithm; 'hist' and 'gpu_hist' are the ones
    # recommended for large datasets.
    tree_method='hist',

    # --- parameters Hong used ---
    eta=0.3,                        # learning rate
    nthread=5,                      # maximum number of threads to run simultaneously
    eval_metric='auc',              # evaluation metric
    objective='binary:logistic',    # objective function

    # --- parameters Hong optimized for ---
    max_depth=20,                   # maximum depth of a tree
    # Subsample ratio of columns at each level: columns are re-sampled once for
    # every new depth level reached in a tree, drawn from the set of columns
    # chosen for the current tree.
    colsample_bylevel=0.05,
)

# Number of rounds for boosting.
num_boost_round = 50

# Validation sets for which metrics will be evaluated during training.
evals = [(dtrain, 'train'), (dvalid, 'eval')]

# Activates early stopping: the validation metric needs to improve at least
# once in every `early_stopping_rounds` rounds for training to continue.
early_stopping_rounds = 3

# Print the evaluation metric on the validation set at each boosting stage.
verbose_eval = True

### Train the tree

In [None]:
# Train the booster using the parameters, evaluation sets and stopping options
# configured earlier in the notebook.
xgb_tree = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=verbose_eval,
)

### Initial Performance

In [None]:
# AUROC: the value is parsed out of the text returned by xgb_tree.eval with a
# regex that captures the digits/dots following "auc:". The patterns are raw
# strings because "\d" and "\." are invalid escape sequences in an ordinary
# string literal, which recent Python versions warn about.
xgb_valid_auroc = float(re.findall(r"auc:([\d\.]+)", xgb_tree.eval(dvalid))[0])
xgb_train_auroc = float(re.findall(r"auc:([\d\.]+)", xgb_tree.eval(dtrain))[0])

# Accuracy: the model's predictions are rounded to the nearest integer before
# being compared with the labels.
xgb_train_acc = accuracy_score(y_train, np.round(xgb_tree.predict(dtrain)))
xgb_valid_acc = accuracy_score(y_valid, np.round(xgb_tree.predict(dvalid)))

In [None]:
# Report both metrics as percentages rounded to two decimals,
# one indented line per split, with a blank line between the two sections.
print("AUROC")
print(f"\tTraining: {np.round(xgb_train_auroc*100, 2)}")
print(f"\tValidation: {np.round(xgb_valid_auroc*100, 2)}")
print()
print("Accuracy")
print(f"\tTraining: {np.round(xgb_train_acc*100, 2)}")
print(f"\tValidation: {np.round(xgb_valid_acc*100, 2)}")

### Save model

In [None]:
# Destination file for the trained model. The path is built from the current
# user's home directory, like the "~/scratch/..." input paths, instead of being
# tied to one user's absolute home path. "~" is expanded explicitly here so the
# value is a plain filesystem path by the time it is used.
model_filepath = os.path.expanduser("~/scratch/models/full/xgboost_full.ubj")

In [None]:
# Write the trained booster to the configured model file.
xgb_tree.save_model(fname=model_filepath)