## loading libraries

In [None]:
# Python libraries
# Libraries to read and manipulate data
import numpy as np
import pandas as pd

# Library providing the XGBoost gradient-boosted tree classifier
from xgboost import XGBClassifier

# Libraries to tune the model and compute different metric scores
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

# Library to suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# loading packages related to parallelization
import ipyparallel as ipp
from ipyparallel import Client
from ipyparallel.joblib import IPythonParallelBackend
from sklearn.utils import parallel_backend
from sklearn.utils import register_parallel_backend

# libraries concerning multithreaded processing
import multiprocessing 

# so I don't have to waste time redoing all the ML models
import pickle

## connecting to the cluster

In [None]:
# Prerequisite: the ipyparallel cluster must already be running before this
# cell is executed.
# Load the handle of the running cluster from its cluster file (no path is
# given, so ipyparallel's default lookup is used).
cluster = ipp.Cluster.from_file()

# Register ipyparallel as a joblib backend under the name 'ipyparallel' so
# that scikit-learn can be pointed at it with parallel_backend('ipyparallel').
# The factory is called without arguments, so the backend falls back to its
# own default view of the engines.
register_parallel_backend('ipyparallel', lambda: IPythonParallelBackend())

In [None]:
# Display the cluster object as this cell's output to check the cluster
# connection.
cluster

## loading test and training data sets

In [None]:
# Load the training features (X_train) and labels (y_train) from pickle files
# in the current working directory. The context managers close each file
# handle as soon as it has been read instead of leaving it open until
# garbage collection.
# NOTE: unpickling can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open("X_train.pkl", "rb") as features_file:
    X_train = pickle.load(features_file)
with open("y_train.pkl", "rb") as labels_file:
    y_train = pickle.load(labels_file)

## machine learning hyperparameter tuning 

In [None]:
%%time

# Base estimator handed to the grid search. random_state fixes the seed so
# repeated runs are comparable.
xgb_grid = XGBClassifier(random_state=1, eval_metric='logloss', nthread=-1)

# Search space: every combination of the values below is evaluated
# (3**6 = 729 candidates, each cross-validated on 5 folds).
parameters = {
    "n_estimators": [10, 30, 50],
    "scale_pos_weight": [1, 2, 5],
    "subsample": [0.7, 0.9, 1],
    "learning_rate": [0.05, 0.1, 0.2],
    "colsample_bytree": [0.7, 0.9, 1],
    "colsample_bylevel": [0.5, 0.7, 1],
}

# Candidates are ranked by their F1 score.
scorer = metrics.make_scorer(metrics.f1_score)

# Exhaustive 5-fold cross-validated search over the grid.
grid_obj = GridSearchCV(xgb_grid, parameters, scoring=scorer, cv=5, n_jobs=1)

# Run the search with joblib routed through the registered 'ipyparallel'
# backend. fit() returns the search object itself, so no rebinding is needed.
with parallel_backend('ipyparallel'):
    grid_obj.fit(X_train, y_train)

# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# retrained on all of X_train / y_train with the winning parameters. Fitting
# it a second time would only repeat that training.
xgb_grid = grid_obj.best_estimator_

# Show the tuned model as the cell output.
xgb_grid