In [None]:
import pandas as pd
import category_encoders
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_validate, GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier

from pipeline.custom_transformers import NAEncoder, ColumnDropper

In [None]:
# Extra strings that should be parsed as missing values (NaN) in the feature
# file, on top of the markers pandas already recognises by default.
missing_markers = ['N/A or Unknown', 'unknown']

# Feature matrix for the training set.
X_train = pd.read_csv(
    'data/X_train.csv',
    na_values=missing_markers,
)

# Training labels. An explicit column name is supplied, so pandas treats the
# file as having no header row and labels the single data column 'injury'.
y_train = pd.read_csv(
    'data/y_train.csv',
    names=['injury'],
)

In [None]:
def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since it was started.

    Parameters
    ----------
    start_time : datetime, optional
        Value previously returned by ``timer()``. When omitted (or otherwise
        falsy) a new measurement is started.

    Returns
    -------
    datetime or None
        The current ``datetime.now()`` when starting a measurement; ``None``
        when reporting, in which case the elapsed time is printed as hours,
        minutes and seconds (seconds rounded to two decimals).
    """
    if start_time:
        # Reporting mode: break the elapsed seconds down into h / m / s.
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None

    # Start mode: hand the current timestamp back to the caller.
    return datetime.now()

In [None]:
# Feature preprocessing, applied in order:
#   1-2. NAEncoder on the listed columns (project-local transformer from
#        pipeline.custom_transformers; presumably encodes missing values in
#        those columns -- confirm against its implementation),
#   3.   ColumnDropper removes 'age_in_years' (project-local; presumably drops
#        the column -- confirm),
#   4.   one-hot encoding of the remaining categorical columns.
feature_pipeline = make_pipeline(
    NAEncoder(['other_person_location']),
    NAEncoder(['other_factor_1', 'other_factor_2', 'other_factor_3']),
    ColumnDropper('age_in_years'),
    category_encoders.OneHotEncoder(impute_missing=False),
)

# The pipeline is fitted once on the complete training set, i.e. outside any
# cross-validation loop that consumes X later on.
X = feature_pipeline.fit_transform(X_train)
y = y_train

# Hyper-parameter grid, searched exhaustively below.
# Size: 5 * 5 * 5 * 10 * 2 * 3 = 7,500 candidates, each fitted once per fold
# (22,500 fits with 3 folds).
params = {
    'min_child_weight': [1, 2, 3, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.8, 0.85, 0.9, 0.95, 1],
    'colsample_bytree': [0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.8, 1.0],
    'booster': ['dart', 'gbtree'],
    'learning_rate': [0.1, 0.05, 0.005],
}

folds = 3

# Shuffled stratified folds; the fixed random_state makes every call to
# skf.split() produce the same partition.
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=30)

# Base estimator; the grid above overrides the tuned parameters per candidate.
# NOTE(review): `silent` and `nthread` are reported as deprecated in newer
# XGBoost releases (in favour of `verbosity` and `n_jobs`) -- confirm against
# the installed version before changing them.
clf = XGBClassifier(
    n_estimators=100,
    objective='binary:logistic',
    max_depth=7,
    # Mean of the label values (the positive rate if labels are 0/1).
    base_score=np.mean(y_train.values),
    silent=True,
    # One thread per fit; parallelism comes from GridSearchCV's n_jobs below.
    nthread=1,
)

# Despite its name this is an exhaustive GridSearchCV, not a randomized
# search; the variable name is kept so that other cells referring to it still
# work.
random_search = GridSearchCV(
    clf,
    param_grid=params,
    scoring='roc_auc',
    n_jobs=4,
    # Pass the splitter itself instead of `skf.split(X, y)`. The latter is a
    # one-shot generator: it is exhausted after the first fit, so the search
    # object could not be re-fitted, and it cannot be cloned. Because skf is
    # seeded, the folds are the same as before.
    cv=skf,
    verbose=3,
)

# Running the GridSearch, takes about 5 hours
start_time = timer(None)  # timing starts from this point for "start_time" variable
random_search.fit(X, y)
timer(start_time)

# Best mean cross-validated ROC AUC and the estimator refitted with the
# corresponding parameters.
print(random_search.best_score_)
print(random_search.best_estimator_)