In [1]:
%matplotlib inline
import pandas as pd, seaborn as sns, numpy as np, os, matplotlib.pyplot as plt
from tqdm import tqdm
import missingno as msno
from helper_funcs import *
import json, os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.externals import joblib
from imblearn.ensemble import BalancedRandomForestClassifier # !!!!!! balanced!
from IPython.display import display, Markdown

def md(txt): display(Markdown(txt))

if not 'df' in globals(): 
    df = pd.read_parquet('data/involved_hebrew.parquet')
    df, val_map, imputed, cols = remove_hebrew_and_impute(df)
    X,Y, x_cols = get_XY(df, cols)

X.shape, Y.shape

100%|████████████████████████████████████████████████████████████████████████████████| 289/289 [00:03<00:00, 79.12it/s]


((1623850, 131), (1623850,))

# Split the data and train model

In [2]:
if not 'X_train' in globals(): 
    X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42, test_size=0.25)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1217887, 131) (405963, 131) (1217887,) (405963,)


In [3]:
if not 'brf' in globals(): 
    brf = BalancedRandomForestClassifier(n_estimators=30, random_state=0)
    brf.fit(X_train, y_train)
print('trained!')

trained!


# Evaluate model

In [4]:
if not 'y_pred' in globals(): 
    y_pred = brf.predict(X_test)

md('### f1 score = %.3f'%(f1_score(y_test, y_pred, average='weighted')))
md('### precision = %.3f'%(precision_score(y_test, y_pred, average='weighted')))
md('### recall = %.3f'%(recall_score(y_test, y_pred, average='weighted')))
md('### accuracy(balanced) = %.3f'%(balanced_accuracy_score(y_test, y_pred)))

### f1 score = 0.857

### precision = 0.982

### recall = 0.769

### accuracy(balanced) = 0.782

-----
# Save the model

In [None]:
model_fname = 'models/2018_11_20_injury_risk_balanced_RF_classifier_02.joblib'
if not os.path.isdir('models'): 
    os.mkdir('models')
if not os.path.isfile(model_fname):
    joblib.dump(brf, model_fname)

---
# Train model on all the data

In [5]:
brf.fit(X, Y)

BalancedRandomForestClassifier(bootstrap=True, class_weight=None,
                criterion='gini', max_depth=None, max_features='auto',
                max_leaf_nodes=None, min_impurity_decrease=0.0,
                min_samples_leaf=2, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
                oob_score=False, random_state=0, replacement=False,
                sampling_strategy='auto', verbose=0, warm_start=False)

# save the full model

In [6]:
model_fname = 'models/2018_11_20_injury_risk_balanced_RF_classifier_full_03.joblib'
if not os.path.isdir('models'): 
    os.mkdir('models')
if not os.path.isfile(model_fname):
    joblib.dump(brf, model_fname)

---
# Evaluate naive feature importance

In [7]:
importance = pd.DataFrame(list(zip(x_cols, brf.feature_importances_)), 
                          columns=['feature', 'importance']).sort_values('importance', ascending=False)
importance[0:30].style.bar()

Unnamed: 0,feature,importance
0,license_acquiring_date,0.082491
23,vehicle_type=1.0,0.0815605
20,age_group=99,0.0654334
1,accident_year,0.0522524
2,accident_month,0.0488292
69,home_district=99,0.0436107
121,car_id=1.0,0.0352574
40,safety_measures=5.0,0.0275678
122,car_id=2.0,0.0270691
38,safety_measures=2.0,0.0225927


# evaluate features using 'permutation_importances'

In [8]:
imp_fname = 'data/2018_11_20_injury_risk_balanced_RF_classifier_full_03_permutations_importnace.csv'
if not os.path.isfile(imp_fname):
    xx = pd.DataFrame(X, columns=x_cols)
    permu_imp = permutation_importances(brf, xx, Y)
    imp = pd.DataFrame(list(zip(x_cols, permu_imp)), columns=['feature', 'permutation_importance'])
    imp.to_csv(imp_fname, index=False)
else:
    imp = pd.read_csv(imp_fname)
imp.sort_values('permutation_importance', ascending=False)[0:30].style.bar()   

Unnamed: 0,feature,permutation_importance
23,vehicle_type=1.0,0.0768382
20,age_group=99,0.0426977
0,license_acquiring_date,0.0397759
121,car_id=1.0,0.0293862
40,safety_measures=5.0,0.0250811
1,accident_year,0.0235801
122,car_id=2.0,0.0180276
125,involve_id=1,0.0168936
2,accident_month,0.0159267
69,home_district=99,0.0154821


# combine importance to the 'parent' feature

In [9]:
splt = pd.concat([imp, pd.DataFrame(imp['feature'].str.split('=').tolist(), 
                             columns=['main_feature', 'num'])], 
          axis=1)
gb = splt.groupby('main_feature').sum()
gb = gb.sort_values('permutation_importance', ascending=False)
gb = gb.reset_index()
gb.style.bar()

Unnamed: 0,main_feature,permutation_importance
0,vehicle_type,0.10648
1,age_group,0.0764009
2,car_id,0.0488509
3,safety_measures,0.0409203
4,license_acquiring_date,0.0397759
5,home_district,0.0343374
6,involve_id,0.0327232
7,home_region,0.0260697
8,accident_year,0.0235801
9,sex,0.022308


In [10]:
splt.head()

Unnamed: 0,feature,permutation_importance,main_feature,num
0,license_acquiring_date,0.039776,license_acquiring_date,
1,accident_year,0.02358,accident_year,
2,accident_month,0.015927,accident_month,
3,age_group=1,0.001031,age_group,1.0
4,age_group=2,0.000631,age_group,2.0


In [14]:
translation = [f + '=' + val_map[f].get(str(v), v) if f in val_map.keys() else f +'=' + str(v) for f,v in splt[['main_feature', 'num']].values]
splt['translation'] = translation
splt = splt[['translation', 'permutation_importance']].sort_values('permutation_importance', ascending=False)
splt.style.bar()

Unnamed: 0,translation,permutation_importance
23,vehicle_type=רכב נוסעים פרטי,0.0768382
20,age_group=לא ידוע,0.0426977
0,license_acquiring_date=None,0.0397759
121,car_id=1.0,0.0293862
40,safety_measures= לא,0.0250811
1,accident_year=None,0.0235801
122,car_id=2.0,0.0180276
125,involve_id=1,0.0168936
2,accident_month=None,0.0159267
69,home_district=99,0.0154821
