In [1]:
"""
MOVE BACK ONE DIRECTORY BEFORE RUNNING
REPOPULATES TMP FOLDER
TAKES A LONG TIME
"""

import os,re
import json
import numpy as np
import pandas as pd
import pickle
import datetime
import matplotlib.pyplot as plt

from dataset import dataset

import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix, classification_report

import shap
from matplotlib.pyplot import figure

In [2]:
# dataset() comes from the project-local `dataset` module and is unpacked as a
# (description, df) pair. Later cells read df.index entries via `.year` and the
# columns 'EventDate', 'TotalFatalInjuries' and 'TotalSeriousInjuries'.
# NOTE(review): `description` is not used in the visible cells - confirm it is needed.
description, df = dataset()

In [3]:
# Columns of df that are turned into model features (see dummy_vars).
model_categories = [
    'AircraftCategory',
    'AmateurBuilt',
    'NumberOfEngines',
    'WeatherCondition',
    'BroadPhaseOfFlight',
]

In [5]:
def dummy_vars(df, model_categories):
    """Build one feature block per requested column of ``df``.

    Text columns (object or pandas string dtype) are one-hot encoded. Empty
    strings are relabelled ``'Missing'`` and that indicator is dropped, so an
    empty value (or a literal ``'Missing'``) ends up as an all-zero row.
    Every other column is coerced to numeric; unparseable values become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named in ``model_categories``.
    model_categories : iterable of str
        Column names to encode, in the order the blocks should be returned.

    Returns
    -------
    list
        One entry per column, indexed like ``df``: a DataFrame of indicator
        columns named ``<column>_<level>`` for text columns, or a Series named
        after the column otherwise.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    feature_blocks = []
    for column in model_categories:
        values = df[column]
        # Object dtype is always one-hot encoded (as before); the dedicated
        # string dtype is included so it is not silently coerced to all-NaN.
        is_text = (pd.api.types.is_object_dtype(values)
                   or pd.api.types.is_string_dtype(values))
        if is_text:
            indicators = pd.get_dummies(values.replace('', 'Missing'))
            # errors='ignore': a column without empty strings has no 'Missing'
            # level, and dropping it unconditionally raised KeyError.
            indicators = indicators.drop(columns=['Missing'], errors='ignore')
            # f-string so non-str levels in an object column do not break naming.
            indicators.columns = [f'{column}_{level}' for level in indicators.columns]
            feature_blocks.append(indicators)
        else:
            feature_blocks.append(pd.to_numeric(values.rename(column), errors='coerce'))
    return feature_blocks

def model_dataset(df_, model_categories, over_sample=True, random_state=None):
    """Assemble feature/target splits for the injury classifier.

    Features are the blocks produced by ``dummy_vars`` plus a ``year`` column
    taken from ``df_.index`` (missing years are filled with the mean year).
    The target is True when either 'TotalFatalInjuries' or
    'TotalSeriousInjuries' is >= 1. Rows whose 'EventDate' string starts with
    '06' form the test split; all other rows form the training split.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Source frame; it is copied, not modified. Index entries must expose
        ``.year``; 'EventDate' values must be sliceable strings.
        NOTE(review): the '06' prefix presumably means a month-first date
        format (June) - confirm against the dataset loader.
    model_categories : iterable of str
        Columns forwarded to ``dummy_vars``.
    over_sample : bool, default True
        When True, the minority class of the training split is randomly
        oversampled with ``RandomOverSampler``. The test split is never
        resampled.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to ``RandomOverSampler`` so the resampled training set
        can be reproduced. None keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``.
    """
    df = df_.copy()
    year_data = pd.DataFrame(
        data=[stamp.year for stamp in df.index],
        columns=['year'],
        index=df.index,
    )

    feature_blocks = dummy_vars(df, model_categories)
    feature_blocks.append(year_data.fillna(year_data['year'].mean()))
    df_model = pd.concat(feature_blocks, axis=1)

    # A row is positive when at least one of the two injury counts is >= 1.
    df_target = (df[['TotalFatalInjuries', 'TotalSeriousInjuries']] >= 1).any(axis=1)

    # Positional boolean array (not an index-aligned Series), built once and
    # inverted once instead of calling np.logical_not for each split.
    june_mask = np.array([date[:2] == '06' for date in df['EventDate']], dtype=bool)
    not_june = ~june_mask

    x_train, y_train = df_model[not_june], df_target[not_june]
    x_test, y_test = df_model[june_mask], df_target[june_mask]

    if not over_sample:
        return x_train, y_train, x_test, y_test

    sampler = RandomOverSampler(sampling_strategy='minority', random_state=random_state)
    x_over, y_over = sampler.fit_resample(x_train, y_train)
    return x_over, y_over, x_test, y_test

In [7]:
# Build the splits (training side oversampled) and wrap them for XGBoost.
x_train, y_train, x_test, y_test = model_dataset(df, model_categories, over_sample=True)
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# Booster configuration: binary classifier scored by AUC.
param = {
    'max_depth': 6,
    'eta': .03,
    'objective': 'binary:logistic',
    'nthread': 4,
    'eval_metric': 'auc',
}
num_round = 160

# Both sets are scored after every boosting round; the log prints them in this order.
evallist = [(dtest, 'eval'), (dtrain, 'train')]

bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=evallist)


[0]	eval-auc:0.73834	train-auc:0.75184
[1]	eval-auc:0.74618	train-auc:0.76050
[2]	eval-auc:0.74699	train-auc:0.76094
[3]	eval-auc:0.74730	train-auc:0.76102
[4]	eval-auc:0.74810	train-auc:0.76140
[5]	eval-auc:0.74814	train-auc:0.76141
[6]	eval-auc:0.74834	train-auc:0.76135
[7]	eval-auc:0.74813	train-auc:0.76133
[8]	eval-auc:0.75121	train-auc:0.76311
[9]	eval-auc:0.75133	train-auc:0.76326
[10]	eval-auc:0.75243	train-auc:0.76387
[11]	eval-auc:0.75214	train-auc:0.76395
[12]	eval-auc:0.75174	train-auc:0.76399
[13]	eval-auc:0.75156	train-auc:0.76392
[14]	eval-auc:0.75225	train-auc:0.76426
[15]	eval-auc:0.75377	train-auc:0.76584
[16]	eval-auc:0.75520	train-auc:0.77023
[17]	eval-auc:0.75540	train-auc:0.77039
[18]	eval-auc:0.75550	train-auc:0.77048
[19]	eval-auc:0.75549	train-auc:0.77051
[20]	eval-auc:0.75539	train-auc:0.77035
[21]	eval-auc:0.75552	train-auc:0.77041
[22]	eval-auc:0.75553	train-auc:0.77067
[23]	eval-auc:0.75669	train-auc:0.77150
[24]	eval-auc:0.75682	train-auc:0.77148
[25]	eval-

In [17]:
# Persist the trained booster before the SHAP step so it is on disk even if
# the explanation below fails. Every file is opened in a `with` block so the
# handle is flushed and closed as soon as the dump finishes.
with open("../tmp/bst.pkl", "wb") as pkl_file:
    pickle.dump(bst, pkl_file)

# SHAP explanations for the held-out rows.
explainer = shap.Explainer(bst)
shap_values = explainer(x_test)

with open("../tmp/shap_values.pkl", "wb") as pkl_file:
    pickle.dump(shap_values, pkl_file)

# Store the exact feature rows the SHAP values were computed on alongside them.
with open("../tmp/x_test_for_shaps.pkl", "wb") as pkl_file:
    pickle.dump(x_test, pkl_file)

# Hard class labels: predicted score above 0.5 counts as positive.
yhat = bst.predict(dtest) > .5
confusion = [
    confusion_matrix(y_test, yhat),
    pd.DataFrame.from_dict(classification_report(y_test, yhat, output_dict=True)),
]
with open("../tmp/confusion.pkl", "wb") as pkl_file:
    pickle.dump(confusion, pkl_file)