In [14]:
import pickle
import json

import pandas as pd
import numpy as np
import category_encoders

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.externals import joblib

# only dev
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error

In [2]:
X_train = pd.read_csv('data/X_train.csv', na_values=['N/A or Unknown', 'unknown'])
y_train = pd.read_csv('data/y_train.csv', names=['injury'])

pipeline = make_pipeline(
    category_encoders.OneHotEncoder(),
    LogisticRegression(),
)
pipeline.fit(X_train, y_train.values.ravel())



Pipeline(memory=None,
     steps=[('onehotencoder', OneHotEncoder(cols=['m_or_f', 'person_attributes', 'seat', 'other_person_location', 'other_factor_1', 'other_factor_2', 'other_factor_3'],
       drop_invariant=False, handle_unknown='impute', impute_missing=True,
       return_df=True, use_cat_names=False, verbose=0)), ('lo...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [11]:
pipeline.predict_proba(X_train)

array([[0.38392799, 0.61607201],
       [0.00950265, 0.99049735],
       [0.35764692, 0.64235308],
       ...,
       [0.31969188, 0.68030812],
       [0.41521442, 0.58478558],
       [0.40260234, 0.59739766]])

## Export

In [3]:
with open('pipeline/columns.json', 'w') as fh:
    json.dump(X_train.columns.tolist(), fh)
    
with open('pipeline/dtypes.pickle', 'wb') as fh:
    pickle.dump(X_train.dtypes, fh)
    
joblib.dump(pipeline, 'pipeline/pipeline.pickle')

['pipeline/pipeline.pickle']

## Crossvalidate

In [4]:
cross_validate(
    pipeline, 
    X_train, y_train.values.ravel(), 
    scoring='roc_auc', n_jobs=-1, cv=5, return_train_score=True
)

{'fit_time': array([0.73312545, 0.70991421, 0.69916797, 0.71180272, 0.44470477]),
 'score_time': array([0.45597053, 0.47089839, 0.46401501, 0.44071221, 0.17825747]),
 'test_score': array([0.61592543, 0.62495886, 0.62403097, 0.58720851, 0.60856003]),
 'train_score': array([0.61733323, 0.61384798, 0.61454839, 0.62484153, 0.61905545])}

## XGBoost

In [5]:
import xgboost as xgb
import numpy as np
from sklearn.pipeline import TransformerMixin, make_pipeline

from pipeline.custom_transformers import NAEncoder, ColumnDropper

In [23]:
clf_xgb = xgb.XGBClassifier(max_depth=3, 
                             learning_rate=0.1, 
                             subsample=.95, 
                             colsample_bytree=0.25,
                             n_estimators=200,
                             random_state=30, 
                             min_child_weight = 2,
                             booster = 'dart',
                             objective='binary:logistic', 
                             base_score=np.mean(y_train.values)
                             )
clf_xgb = xgb.XGBClassifier(base_score=np.mean(y_train.values), booster='dart', # 0.6616844177928936
       colsample_bylevel=1, colsample_bytree=0.55, gamma=1,
       learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=3, missing=None, n_estimators=100, n_jobs=1,
       nthread=1, objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, silent=True,
       subsample=1)

clf = make_pipeline(ColumnDropper('age_in_years'),
                    NAEncoder(['other_person_location']),
                    NAEncoder(['other_factor_1', 'other_factor_2', 'other_factor_3']),
                    category_encoders.OneHotEncoder(), 
                    clf_xgb)

cvx = cross_validate(
    clf, 
    X_train, y_train.values.ravel(), 
    scoring='roc_auc', n_jobs=-1, cv=15, return_train_score=False
)
cvx['test_score'].mean(), cvx['test_score'].std()

(0.6111012639332903, 0.012417192105302857)

In [24]:
clf.fit(X_train, y_train.values.ravel())

Pipeline(memory=None,
     steps=[('columndropper', <pipeline.custom_transformers.ColumnDropper object at 0x7f9c394d3dd8>), ('naencoder-1', <pipeline.custom_transformers.NAEncoder object at 0x7f9c394d3b70>), ('naencoder-2', <pipeline.custom_transformers.NAEncoder object at 0x7f9c394d3ba8>), ('onehotencoder', OneHotEncoder(col... reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1))])

In [25]:
clf.predict_proba(X_train)

array([[0.39437127, 0.6056287 ],
       [0.0052529 , 0.9947471 ],
       [0.34424102, 0.655759  ],
       ...,
       [0.34424102, 0.655759  ],
       [0.39437127, 0.6056287 ],
       [0.39437127, 0.6056287 ]], dtype=float32)

In [26]:
joblib.dump(clf, 'pipeline/pipeline.pickle')

['pipeline/pipeline.pickle']

In [10]:
joblib.load('pipeline/pipeline.pickle').predict_proba(X_train)

array([[0.39437127, 0.6056287 ],
       [0.0052529 , 0.9947471 ],
       [0.34424102, 0.655759  ],
       ...,
       [0.34424102, 0.655759  ],
       [0.39437127, 0.6056287 ],
       [0.39437127, 0.6056287 ]], dtype=float32)

In [None]:
clf_xgb = xgb.XGBClassifier(
                            objective='binary:logistic',
                             n_estimators=100,
                             random_state=30, 
                             base_score=np.mean(y_train.values)
                             )

clf_xgb = xgb.XGBClassifier(base_score=0.6616844177928936, booster='dart',
       colsample_bylevel=1, colsample_bytree=0.55, gamma=1,
       learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=3, missing=None, n_estimators=100, n_jobs=1,
       nthread=1, objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

clf = make_pipeline(ColumnDropper('age_in_years'),
                    NAEncoder(['other_person_location']),
                    NAEncoder(['other_factor_1', 'other_factor_2', 'other_factor_3']),
                    category_encoders.OneHotEncoder(), 
                    clf_xgb)

cvx = cross_validate(
    clf, 
    X_train, y_train.values.ravel(), 
    scoring='roc_auc', n_jobs=-1, cv=15, return_train_score=False
)
cvx['test_score'].mean(), cvx['test_score'].std()

In [None]:
clf

In [24]:
joblib.load('pipeline/pipeline.pickle')

Pipeline(memory=None,
     steps=[('columndropper', <custom_transformers.ColumnDropper object at 0x7f4d4593fd68>), ('naencoder-1', <custom_transformers.NAEncoder object at 0x7f4d4593f748>), ('naencoder-2', <custom_transformers.NAEncoder object at 0x7f4d45bbd320>), ('onehotencoder', OneHotEncoder(cols=['m_or_f', 'person_attrib...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [102]:
clf = make_pipeline(
            ColumnDropper('age_in_years'),
            NAEncoder(['other_person_location']),
            NAEncoder(['other_factor_1', 'other_factor_2', 'other_factor_3']),
            category_encoders.OneHotEncoder(), 
            xgb.XGBClassifier(base_score=0.6616844177928936, booster='dart',
       colsample_bylevel=1, colsample_bytree=0.55, gamma=1,
       learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=3, missing=None, n_estimators=100, n_jobs=1,
       nthread=1, objective='binary:logistic', random_state=30, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, silent=True,
       subsample=1),
)
clf.fit(X_train, y_train.values.ravel())

Pipeline(memory=None,
     steps=[('columndropper', <custom_transformers.ColumnDropper object at 0x7f4d45819cc0>), ('naencoder-1', <custom_transformers.NAEncoder object at 0x7f4d45819710>), ('naencoder-2', <custom_transformers.NAEncoder object at 0x7f4d45819c88>), ('onehotencoder', OneHotEncoder(cols=['m_or_f', 'person_attrib...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [99]:
for x in range(5):
    print(mean_absolute_error(clf.predict_proba(X_train), 
                              clf.predict_proba(X_train)))

0.0
0.0
0.0
0.0
0.0


In [100]:
joblib.dump(clf, 'pipeline/pipeline.pickle')

['pipeline/pipeline.pickle']

In [101]:
for x in range(10):
    print(mean_absolute_error(joblib.load('pipeline/pipeline.pickle').predict_proba(X_train), 
                                              joblib.load('pipeline/pipeline.pickle').predict_proba(X_train)))

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [46]:
float(clf.predict_proba(X_train)[0,1])
#clf.predict_proba(X_train)[0,1]


0.6056287288665771

In [51]:
int(np.nan)

ValueError: cannot convert float NaN to integer

In [64]:
observation =  {"m_or_f": "m", "person_attributes": "driving", "seat": "front_left",
                    "other_person_location": "N/A", "other_factor_1": "N/A", "other_factor_2": "N/A", 
                "other_factor_3": "N/A", "age_in_years": np.nan
               }
dtypes = X_train.dtypes.apply(lambda x: float if x == int else x)
pd.DataFrame([observation], columns=X_train.columns).astype(dtypes)

Unnamed: 0,m_or_f,person_attributes,seat,other_person_location,other_factor_1,other_factor_2,other_factor_3,age_in_years
0,m,driving,front_left,,,,,


In [66]:
for obs in observation:
    obs_conv = obs.astype(str)

AttributeError: 'str' object has no attribute 'astype'