In [3]:
# load sample dataset
import pandas as pd
import seaborn as sns

from ydata_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score

from yellowbrick.regressor import PredictionError

df = pd.read_csv('./data/insurance.csv')
print(df.shape)
df

(1338, 7)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
# simple check for nulls
df.isna().sum()[df.isna().sum() > 0]

Series([], dtype: int64)

In [10]:
# eda (automated)
# profile.to_widgets() -- research to fix...

profile = ProfileReport(df)
# profile.to_notebook_iframe()
# profile.to_file('./eda/reg_insurance.html')


In [11]:
# take care of any missing values 
# n/a in this case

In [12]:
# set aside and save unseen data set
data_unseen = df.sample(n=100, random_state=42)
data        = df.drop(data_unseen.index)
print(f'Data for model: {data.shape},\nData for unseen predictions: {data_unseen.shape}')
data_unseen.to_csv('../data/insurance_unseen.csv', index=False)

Data for model: (1238, 7),
Data for unseen predictions: (100, 7)


In [13]:
# data.columns!='charges'
X = data.loc[: , data.columns!='charges']
y = data.loc[: , data.columns=='charges']

In [14]:
# split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# encoding 
# get the categorical and numeric column names
num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
print(num_cols, '\n', cat_cols)

['age', 'bmi', 'children'] 
 ['sex', 'smoker', 'region']


In [16]:
# pipeline for numerical columns
num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)
num_pipe

In [17]:
# pipeline for categorical columns
cat_pipe = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='N/A'),
    OneHotEncoder(handle_unknown='ignore', sparse=False)
)
cat_pipe

In [18]:
# combine both the pipelines
full_pipe = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])
full_pipe

In [19]:
# build the model
gbr_insurance = make_pipeline(full_pipe, GradientBoostingRegressor(random_state=42))
gbr_insurance

In [20]:
# train the model
gbr_insurance.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)


In [21]:
# make predictions on the test set
y_pred = gbr_insurance.predict(X_test)

In [22]:
# measure accuracy
print('R2:', r2_score(y_test, y_pred))

R2: 0.7795169872357341


In [23]:
# done manually to break out the example above
y_test['y_pred'] = y_pred
test_scores = y_test.copy()
test_scores

Unnamed: 0,charges,y_pred
659,14394.39815,13425.827498
183,7419.47790,9480.202686
946,7160.09400,7445.930078
1020,8798.59300,9419.609322
1061,11554.22360,13501.033685
...,...,...
953,38998.54600,39455.332135
187,5325.65100,6242.769364
31,2198.18985,3040.883118
950,11534.87265,10686.499242


In [24]:
r2_score(test_scores['charges'], test_scores['y_pred'])

0.7795169872357341

In [25]:
import joblib
joblib.dump(gbr_insurance, './models/insurance.pkl')
print(gbr_insurance)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['age', 'bmi', 'children']),
                                                 ('cat',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(fill_value='N/A',
                                                                                 strategy='constant')),
                                                                  ('onehotencoder',
                                       