In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.externals import joblib
import pickle

In [53]:
# Read the train and test CSVs from the local Data/ directory.
features, test = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/train.csv', 'Data/test.csv')
)

In [54]:
# Detach the target column: pop() returns 'Response' as a Series and removes
# it from `features` in the same step.
labels = features.pop('Response')

In [55]:
def turn_to_bin(x):
    """Return 1 when ``x`` equals the string 'Yes', otherwise 0."""
    return 1 if x == 'Yes' else 0

def data_preprocessing(df):
    """One-hot encode the categorical columns and binarise 'Vehicle_Damage'.

    'Gender', 'Region_Code', 'Vehicle_Age' and 'Policy_Sales_Channel' are each
    expanded with ``pd.get_dummies`` (prefixes 'Sex', 'Reg_Code', 'Vic_Age'
    and 'Sale_Channel'). The dummy columns are appended to the right of the
    existing columns, then the four source columns and 'id' are removed.
    'Vehicle_Damage' is mapped through ``turn_to_bin``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least 'id', 'Gender', 'Region_Code',
        'Vehicle_Age', 'Policy_Sales_Channel' and 'Vehicle_Damage'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    # Source column -> prefix of its dummy columns. Insertion order fixes the
    # left-to-right order of the dummy groups in the result.
    one_hot_prefixes = {
        'Gender': 'Sex',
        'Region_Code': 'Reg_Code',
        'Vehicle_Age': 'Vic_Age',
        'Policy_Sales_Channel': 'Sale_Channel',
    }
    dummy_frames = [
        pd.get_dummies(df[column], prefix=prefix)
        for column, prefix in one_hot_prefixes.items()
    ]

    encoded = pd.concat([df, *dummy_frames], axis=1)
    encoded = encoded.drop(columns=['id', *one_hot_prefixes])

    encoded['Vehicle_Damage'] = encoded['Vehicle_Damage'].apply(turn_to_bin)

    return encoded

# Encode both splits with the same pipeline.
# NOTE: this cell is not re-runnable on its own -- once the raw categorical
# columns have been dropped, a second call raises KeyError.
features = data_preprocessing(features)

# get_dummies only creates columns for the categories present in the frame it
# is given, so an independently encoded test split can differ from the
# training split in column set or order. Align it to the training columns:
# dummies missing from test are added as all-zero columns, and categories
# never seen in training are dropped.
test = data_preprocessing(test).reindex(columns=features.columns, fill_value=0)


In [56]:
# Preview the first five encoded training rows.
features.head(5)

Unnamed: 0,Age,Driving_License,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Sex_Female,Sex_Male,Reg_Code_0.0,Reg_Code_1.0,...,Sale_Channel_152.0,Sale_Channel_153.0,Sale_Channel_154.0,Sale_Channel_155.0,Sale_Channel_156.0,Sale_Channel_157.0,Sale_Channel_158.0,Sale_Channel_159.0,Sale_Channel_160.0,Sale_Channel_163.0
0,44,1,0,1,40454.0,217,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,76,1,0,0,33536.0,183,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,47,1,0,1,38294.0,27,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,21,1,1,0,28619.0,203,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4,29,1,1,0,27496.0,39,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [57]:
# List the column labels of the encoded training frame.
print(features.columns)

Index(['Age', 'Driving_License', 'Previously_Insured', 'Vehicle_Damage',
       'Annual_Premium', 'Vintage', 'Sex_Female', 'Sex_Male', 'Reg_Code_0.0',
       'Reg_Code_1.0',
       ...
       'Sale_Channel_152.0', 'Sale_Channel_153.0', 'Sale_Channel_154.0',
       'Sale_Channel_155.0', 'Sale_Channel_156.0', 'Sale_Channel_157.0',
       'Sale_Channel_158.0', 'Sale_Channel_159.0', 'Sale_Channel_160.0',
       'Sale_Channel_163.0'],
      dtype='object', length=219)


In [58]:
# Number of rows in the training features, measured on the 'Age' column.
features['Age'].shape[0]

381109

In [59]:
# Number of target values; should equal the number of feature rows.
labels.shape[0]

381109

In [60]:
# Number of rows in the encoded test frame.
test.shape[0]

127037

In [61]:
# Preview the first five encoded test rows.
test.head(5)

Unnamed: 0,Age,Driving_License,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Sex_Female,Sex_Male,Reg_Code_0.0,Reg_Code_1.0,...,Sale_Channel_152.0,Sale_Channel_153.0,Sale_Channel_154.0,Sale_Channel_155.0,Sale_Channel_156.0,Sale_Channel_157.0,Sale_Channel_158.0,Sale_Channel_159.0,Sale_Channel_160.0,Sale_Channel_163.0
0,25,1,1,0,35786.0,53,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,40,1,0,1,33762.0,111,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,47,1,0,1,40050.0,199,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,24,1,1,1,37356.0,187,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4,27,1,1,0,59097.0,297,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [62]:
# Hold out 33% of the labelled rows for evaluation; the fixed seed keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    random_state=42,
    test_size=0.33,
)


In [63]:
# Random forest with 10 trees; each split considers 3 candidate features.
# n_jobs=-1 uses every available core, random_state pins the result, and
# verbose=True makes joblib report progress.
bin_model = RandomForestClassifier(
    n_estimators=10,
    max_features=3,
    random_state=50,
    n_jobs=-1,
    verbose=True,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

# Hard class predictions for the held-out split.
y_pred = bin_model.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    7.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.5s finished


In [64]:
# Hard-label metrics on the held-out split (y_pred comes from the fitting cell).
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    sep="\n",
)

# Training-split predictions, kept to compare against the held-out scores.
train_rf_predictions = bin_model.predict(X_train)
train_rf_probs = bin_model.predict_proba(X_train)[:, 1]

# Held-out predictions: hard labels and the probability in column 1
# (the positive class for 0/1 labels).
rf_predictions = bin_model.predict(X_test)
rf_probs = bin_model.predict_proba(X_test)[:, 1]

# ROC AUC is computed from the probabilities rather than the hard labels.
# NOTE(review): the recorded run shows ~0.999 on train versus ~0.78 on the
# hold-out split, so the forest is overfitting heavily.
roc_value_train = roc_auc_score(y_train, train_rf_probs)
roc_value = roc_auc_score(y_test, rf_probs)

print("Model ROC value", roc_value, sep="\n")

print("Training ROC value", roc_value_train, sep="\n")

[[106004   4247]
 [ 13221   2294]]
              precision    recall  f1-score   support

           0       0.89      0.96      0.92    110251
           1       0.35      0.15      0.21     15515

    accuracy                           0.86    125766
   macro avg       0.62      0.55      0.57    125766
weighted avg       0.82      0.86      0.84    125766

0.8611071354738165
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    1.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
Model ROC value
0.781959158186415
Training ROC value
0.9993668641236721
[Parallel(n_jobs=

In [65]:
# Per-feature importance scores from the fitted forest, in the same order as
# the columns of the frame it was trained on.
importances = bin_model.feature_importances_

In [66]:
# Largest single importance first, then the full per-feature array.
print(importances.max(), importances, sep="\n")

0.28927114054282116
[2.06975201e-01 5.47882209e-04 7.26805226e-02 4.36759448e-02
 2.56574865e-01 2.89271141e-01 2.39448947e-03 3.09288756e-03
 7.01883390e-04 4.60274266e-04 6.88028372e-04 1.58476110e-03
 7.84854743e-04 5.66986747e-04 9.47694628e-04 1.06499062e-03
 2.25244789e-03 7.95811522e-04 7.76405857e-04 1.54058423e-03
 7.98305836e-04 8.91464363e-04 9.69746086e-04 1.31625650e-03
 5.15296874e-04 7.18947521e-04 1.25457762e-03 7.39279435e-04
 6.28455387e-04 8.43298216e-04 4.91963574e-04 5.76683172e-04
 8.66159942e-04 4.30740803e-04 7.99869251e-04 7.55872440e-04
 4.66034459e-03 1.70347673e-03 1.43075433e-03 5.26723946e-04
 6.36085652e-04 1.43784669e-03 4.34380568e-04 1.34053059e-03
 1.16959532e-03 9.10039722e-04 8.72296672e-04 1.19725694e-03
 6.88460819e-04 2.24537084e-03 2.61616057e-04 8.81753561e-04
 2.37238064e-04 1.24497790e-03 1.90704780e-03 1.18171012e-03
 1.07487806e-03 5.29076166e-04 1.33824573e-03 1.96254375e-04
 2.00701750e-04 7.29328458e-03 5.88082511e-03 4.08465641e-03
 3.7

In [67]:
# Name of the feature at position 4, which holds the second-largest value
# (~0.257) in the importances printed above. The index is hard-coded, so it
# must be re-checked if the column layout or the model changes.
features.columns[4]

'Annual_Premium'

In [68]:
# Name of the feature at position 5, which holds the maximum importance
# (~0.289) printed above. The index is hard-coded, so it must be re-checked
# if the column layout or the model changes.
features.columns[5]

'Vintage'