## Imports

In [None]:
import pickle
import json

import pandas as pd
import numpy as np
import category_encoders
from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.externals import joblib
from sklearn.model_selection import cross_validate

from pipeline.custom_transformers import NAEncoder, ColumnDropper

## Data handling

In [None]:
# Load the training features. The two placeholder strings below are parsed
# as NaN (in addition to pandas' default NA markers).
X_train = pd.read_csv(
    'data/X_train.csv',
    na_values=['N/A or Unknown', 'unknown'],
)

# Load the training labels into a single column called 'injury'.
# Passing `names=` without `header=` makes pandas treat the file as
# header-less, so its first line is read as data.
# NOTE(review): confirm y_train.csv really has no header row.
y_train = pd.read_csv(
    'data/y_train.csv',
    names=['injury'],
)

In [None]:
# Persist the training schema next to the pipeline: the ordered list of
# feature column names as JSON ...
with open('pipeline/columns.json', 'w') as columns_file:
    columns_file.write(json.dumps(X_train.columns.tolist()))

# ... and the per-column dtypes (a pandas Series) as a pickle.
# NOTE(review): presumably both files are read back at prediction time to
# rebuild input frames with the same layout - confirm against the consumer.
with open('pipeline/dtypes.pickle', 'wb') as dtypes_file:
    pickle.dump(X_train.dtypes, dtypes_file)

## Baseline

In [None]:
# Baseline model: one-hot encode the features, then fit a logistic
# regression using the library's default settings.
baseline_encoder = category_encoders.OneHotEncoder()
baseline_model = LogisticRegression()
pipeline = make_pipeline(baseline_encoder, baseline_model)

# The labels are held in a one-column DataFrame; flatten them to a 1-D
# array before fitting.
train_labels = y_train.values.ravel()
pipeline.fit(X_train, train_labels)

# Serialize the fitted baseline. The final-pipeline cell further down this
# file dumps to the same path, so this artifact is replaced when both cells
# are run.
joblib.dump(pipeline, 'pipeline/pipeline.pickle')

## Final pipeline

In [None]:
# Hyperparameters for the gradient-boosted classifier, collected in one
# place. `base_score` is the mean of the training labels.
# NOTE(review): `silent`, `nthread` and `missing=None` are accepted as-is by
# the xgboost release this was written for; newer releases may warn about
# or reject them - confirm before upgrading.
xgb_params = {
    'base_score': np.mean(y_train.values),
    'booster': 'dart',
    'colsample_bylevel': 1,
    'colsample_bytree': 0.55,
    'gamma': 1,
    'learning_rate': 0.1,
    'max_delta_step': 0,
    'max_depth': 7,
    'min_child_weight': 3,
    'missing': None,
    'n_estimators': 100,
    'n_jobs': 1,
    'nthread': 1,
    'objective': 'binary:logistic',
    'random_state': 0,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'silent': True,
    'subsample': 1,
}

# Final model: project-specific preprocessing, one-hot encoding, then
# XGBoost.
# NOTE(review): ColumnDropper / NAEncoder come from
# pipeline.custom_transformers; presumably they drop 'age_in_years' and
# encode missing values in the listed columns - verify in that module.
pipeline = make_pipeline(
    ColumnDropper('age_in_years'),
    NAEncoder(['other_person_location']),
    NAEncoder(['other_factor_1', 'other_factor_2', 'other_factor_3']),
    category_encoders.OneHotEncoder(),
    XGBClassifier(**xgb_params),
)

# Fit on the full training set; labels are flattened to a 1-D array.
train_labels = y_train.values.ravel()
pipeline.fit(X_train, train_labels)

# Serialize the fitted pipeline to the shared artifact path.
joblib.dump(pipeline, 'pipeline/pipeline.pickle')