In [7]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib
from sklearn.linear_model import LinearRegression
from sklearn import linear_model,dummy,metrics

In [8]:
ds = pd.read_csv('SBI_Life_insurance.csv')
ds

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,insuranceclaim
0,19,0,27.900,0,1,3,16884.92400,1
1,18,1,33.770,1,0,2,1725.55230,1
2,28,1,33.000,3,0,2,4449.46200,0
3,33,1,22.705,0,0,1,21984.47061,0
4,32,1,28.880,0,0,1,3866.85520,1
...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830,0
1334,18,0,31.920,0,0,0,2205.98080,1
1335,18,0,36.850,0,0,2,1629.83350,1
1336,21,0,25.800,0,0,3,2007.94500,0


In [10]:
ds.head(20)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,insuranceclaim
0,19,0,27.9,0,1,3,16884.924,1
1,18,1,33.77,1,0,2,1725.5523,1
2,28,1,33.0,3,0,2,4449.462,0
3,33,1,22.705,0,0,1,21984.47061,0
4,32,1,28.88,0,0,1,3866.8552,1
5,31,0,25.74,0,0,2,3756.6216,0
6,46,0,33.44,1,0,2,8240.5896,1
7,37,0,27.74,3,0,1,7281.5056,0
8,37,1,29.83,2,0,0,6406.4107,0
9,60,0,25.84,0,0,1,28923.13692,0


In [11]:
ds.shape

(1338, 8)

In [12]:
ds.describe()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,insuranceclaim
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.207025,0.505232,30.663397,1.094918,0.204783,1.515695,13270.422265,0.585202
std,14.04996,0.50016,6.098187,1.205493,0.403694,1.104885,12110.011237,0.492871
min,18.0,0.0,15.96,0.0,0.0,0.0,1121.8739,0.0
25%,27.0,0.0,26.29625,0.0,0.0,1.0,4740.28715,0.0
50%,39.0,1.0,30.4,1.0,0.0,2.0,9382.033,1.0
75%,51.0,1.0,34.69375,2.0,0.0,2.0,16639.912515,1.0
max,64.0,1.0,53.13,5.0,1.0,3.0,63770.42801,1.0


In [13]:
y = ds.insuranceclaim
x = ds.drop('insuranceclaim', axis=1)

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.3,random_state=111,stratify=y)

In [15]:
# Standardization
x_train_scaled = preprocessing.scale(x_train)
x_train_scaled

array([[-0.94338187, -0.97467943, -0.12354801, ..., -0.49966611,
         0.45774833, -0.82045141],
       [-0.66068444, -0.97467943,  1.38371744, ...,  2.00133645,
         0.45774833,  2.35580033],
       [-0.66068444, -0.97467943, -1.28330045, ..., -0.49966611,
        -1.37716576, -0.70821089],
       ...,
       [ 1.74224373, -0.97467943,  0.37914895, ..., -0.49966611,
        -0.45970871,  0.13562721],
       [ 1.31819759, -0.97467943,  0.9594379 , ..., -0.49966611,
        -0.45970871, -0.07224772],
       [ 1.17684887,  1.02597835,  0.18516901, ...,  2.00133645,
         0.45774833,  2.59960164]])

In [16]:
x_train_scaled.mean(axis=0)

array([ 1.55621005e-16, -1.13869028e-17, -8.91974054e-17,  6.45257826e-17,
       -2.65694399e-17,  1.13869028e-17, -1.29051565e-16])

In [17]:
x_train_scaled.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1.])

In [18]:
#Cross validation Pipeline:
scaler = preprocessing.StandardScaler().fit(x_train)

In [19]:
x_train_scaled = scaler.transform(x_train)

In [20]:
x_train_scaled.mean(axis=0)

array([ 1.55621005e-16, -1.13869028e-17, -8.91974054e-17,  6.45257826e-17,
       -2.65694399e-17,  1.13869028e-17, -1.29051565e-16])

In [21]:
x_train_scaled.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1.])

In [23]:
x_test_scaled = scaler.transform(x_test)
x_test_scaled.mean(axis=0)

array([-0.03322975,  0.12020791, -0.00273302,  0.02625004,  0.04159564,
        0.04466444,  0.05088865])

In [24]:
x_test_scaled.std(axis=0)

array([0.97475912, 0.99584968, 1.02104255, 1.01942911, 1.0299188 ,
       1.0429598 , 1.07778593])

In [25]:
pipeline = make_pipeline(preprocessing.StandardScaler(),RandomForestRegressor(n_estimators=110))

In [26]:
# Hyperparameter
pipeline.get_params()

{'memory': None,
 'steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('randomforestregressor',
   RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                         max_features='auto', max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=110,
                         n_jobs=None, oob_score=False, random_state=None,
                         verbose=0, warm_start=False))],
 'verbose': False,
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1,

In [27]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],'randomforestregressor__max_depth': [None,4,2,1]}

In [33]:
# cross-validation
r= GridSearchCV(pipeline,hyperparameters, cv=10)
r.fit(x_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split

In [36]:
r.best_params_

{'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'auto'}

In [37]:
# confirming Model be Retained
r.refit

True

In [38]:
# start predicting the data 
y_pred = r.predict(x_test)

In [39]:
#Apply metrics on the completed Model
r2_score(y_test, y_pred)

0.9628429078189673

In [40]:
mean_squared_error(y_test, y_pred)

0.009023477653057029

In [41]:
# To save the model
joblib.dump(r, 'rf_regressor.pkl')

['rf_regressor.pkl']

In [42]:
# To load the model from pickle file
r1 = joblib.load('rf_regressor.pkl')

In [44]:
r1.predict(x_test)

array([0.        , 0.96363636, 1.        , 0.95454545, 0.        ,
       0.05454545, 1.        , 1.        , 0.        , 0.00909091,
       1.        , 0.        , 0.02727273, 0.92727273, 0.        ,
       0.06363636, 0.        , 0.98181818, 1.        , 0.02727273,
       0.        , 1.        , 1.        , 0.        , 0.        ,
       0.        , 1.        , 0.68181818, 0.95454545, 0.00909091,
       1.        , 0.26363636, 0.        , 0.98181818, 1.        ,
       0.02727273, 1.        , 1.        , 0.68181818, 0.93636364,
       0.9       , 1.        , 1.        , 1.        , 1.        ,
       0.09090909, 0.95454545, 0.        , 0.4       , 0.        ,
       0.86363636, 1.        , 0.        , 1.        , 0.99090909,
       1.        , 0.92727273, 0.50909091, 0.05454545, 0.13636364,
       0.        , 0.97272727, 0.4       , 0.78181818, 0.98181818,
       0.06363636, 0.07272727, 0.        , 0.06363636, 1.        ,
       0.92727273, 0.00909091, 0.        , 1.        , 1.     