In [45]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from model_trainer.trainers.xgboost_trainer import XGBoostTrainer
from model_trainer.preprocessing.feature_selection import FeatureSelector
from model_trainer.preprocessing.encoder import Encoder

from fastai.tabular.all import *

In [46]:
df = pd.read_csv('../../../data/dataset_initial.csv')
df['Weather_Condition_Arr'] = df['Weather_Condition_Arr'].apply(lambda x: literal_eval(x) if str(x)!='nan' else x)
X, y = df[df.columns.drop('Severity')], df['Severity']

In [47]:
X_train_val_test, Y_train_val_test = X.copy(),y.copy()
X_train_val,X_test,Y_train_val,Y_test = train_test_split(X_train_val_test, Y_train_val_test,train_size=0.8,stratify=Y_train_val_test)

In [51]:
fs = FeatureSelector()
enc = Encoder(categorical_columns)
X_train, X_val, Y_train, Y_val = train_test_split(X_train_val,Y_train_val,train_size=0.8,stratify=Y_train_val)
X_train = fs.fit_transform(X_train)
X_val = fs.transform(X_val)
X_train = enc.fit_transform(X_train)
X_val = enc.transform(X_val)
Y_train, Y_val = Y_train-1, Y_val-1

In [53]:
categorical_columns = ['Side', 'State', 'Amenity','Bump','Crossing',
                         'Give_Way', 'Junction','No_Exit',
                         'Railway','Roundabout','Station','Stop',
                         'Traffic_Calming','Traffic_Signal','Turning_Loop',
                         'Sunrise_Sunset','Civil_Twilight','Nautical_Twilight',
                         'Astronomical_Twilight']

In [54]:
from sklearn.utils import class_weight

weights = {0:27,1:1,2:4,3:5}

sample_weights = class_weight.compute_sample_weight(
            class_weight=weights,
            y=Y_train
)

In [55]:
from hyperopt import space_eval,hp
from hyperopt.pyll import scope

xgboost_large_space = {
    "eta":hp.loguniform("eta",np.log(0.01),np.log(0.3)),
    "gamma":hp.uniform("gamma",0,10),
    "max_depth":scope.int(hp.quniform("max_depth",3,10,1)),
    "min_child_weight":hp.uniform("min_child_weight",0,10),
    "max_delta_step":hp.uniform("max_delta_step",1,10),
    "subsample":hp.uniform("subsample",0.3,1),
    "lambda":hp.uniform("lambda",0,5),
    "alpha":hp.uniform("alpha",0,5)
}

best_trial = {}
trials = pickle.load(open("xgboost-trials.p",'rb'))
for key, val in trials.best_trial['misc']['vals'].items():
    best_trial[key] = val[0]

best_params = space_eval(xgboost_large_space, best_trial)


In [56]:
import xgboost 

model = xgboost.XGBClassifier(**best_params,verbosity=2)
model.fit(X_train,Y_train,sample_weight=sample_weights)

[16:57:31] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 380 extra nodes, 10 pruned nodes, max_depth=8
[16:57:34] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 446 extra nodes, 30 pruned nodes, max_depth=8
[16:57:38] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 448 extra nodes, 22 pruned nodes, max_depth=8
[16:57:41] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 462 extra nodes, 18 pruned nodes, max_depth=8
[16:57:45] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 394 extra nodes, 2 pruned nodes, max_depth=8
[16:57:48] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 444 extra nodes, 34 pruned nodes, max_depth=8
[16:57:52] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 460 extra nodes, 24 pruned nodes, max_depth=8
[16:57:55] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 502 extra nodes, 6 pruned nodes, max_depth=8
[16:57:59] INFO: ../src/tree/updater_prune.cc:101: tree pruning end, 406 extra nod

In [57]:
from sklearn.metrics import matthews_corrcoef

matthews_corrcoef(Y_train,model.predict(X_train))

0.47270814377747156

In [36]:
len(X_train), len(Y_train)


(1821018, 1821018)

In [37]:
len(X_val),len(Y_val)

(1821018, 455255)