In [27]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [28]:
health = pd.read_excel('insurance_cleaned.xlsx')

In [29]:
health.head()

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,charges,children?
0,0,19,female,27.9,0,True,southwest,16884.924,False
1,1,18,male,33.77,1,False,southeast,1725.5523,True
2,2,28,male,33.0,3,False,southeast,4449.462,True
3,3,33,male,22.705,0,False,northwest,21984.47061,False
4,4,32,male,28.88,0,False,northwest,3866.8552,False


# Feature Engineering

In [30]:
health.drop(columns=['Unnamed: 0','children?'],inplace=True)

## to build a model we have to elminate all the categorical variables

### `sex` column, `smoker` column and `region` columns

### let's make all the columns into dummies columns

In [31]:
health['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [32]:
health['smoker'] = np.where(health['smoker']==True,'Yes','No')

In [33]:
health_corrected = pd.get_dummies(health)
#health_corrected.drop(columns=['region','smoker','sex'],inplace=True)

In [34]:
health_corrected.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_No,smoker_Yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


In [35]:
features = health_corrected.drop(columns=['charges']).copy()
labels = health_corrected['charges']

# First, let's try predict with all data together

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=1)

### Linear Regression

In [37]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

lreg = LinearRegression()
lreg.fit(X_train,y_train)

print("test data accuracy was ",lreg.score(X_test,y_test))

test data accuracy was  0.7339256331232803


### Ridge Regression

In [38]:
#ridge tries to control the coefficients
from sklearn.linear_model import Ridge

# Create linear regression
regr = Ridge(alpha=1.0)

# Train the linear regression
regr_model = regr.fit(X_train, y_train)

# Score the model
print("test data accuracy was ",regr.score(X_test,y_test))

test data accuracy was  0.7340735290114273


### Lasso Regression

In [39]:
from sklearn.linear_model import Lasso

# Create linear regression
lass = Lasso(alpha=1.0)

# Train the linear regression
lass_model = lass.fit(X_train, y_train)

# Score the model
print("test data accuracy was ",lass.score(X_test,y_test))

test data accuracy was  0.7339192069733206


### Polynomial Features

In [40]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
lm_poly=LinearRegression()

X_train_poly = poly.fit_transform(X_train.values)
X_test_poly = poly.transform(X_test.values)

lm_poly.fit(X_train_poly,y_train)

pred = lm_poly.predict(X_test_poly)

# Score the model
print("test data accuracy was ",lm_poly.score(X_test_poly,y_test))

test data accuracy was  0.840680977448351


### Bagging Regression

In [41]:
# bagging means we train many "weak" predictors but then we combine their predictions and some will hopefully make up for the others' failures
# big jump in quality!
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

bagging_reg = BaggingRegressor(
    DecisionTreeRegressor(max_depth=3), # depth 3 to force tree to be "weak"
    n_estimators=10, # 10 trees
    max_samples=100, # we limit each weaker tree to 100 datapoints
    random_state=1) # fixing random state because I want my examples to work and to look smart

bagging_reg.fit(X_train, y_train)
bagging_reg.score(X_test,y_test)

0.8413078664479368

### Random Forest

In [42]:
# random forests not only shuffle the dataset, they also randomly select some features
# some trees will focus one one part of the data, some in others, then they meet to vote and get a holistic result 
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor(n_estimators=10, # same 10 trees
                               max_depth=3, # depth 3 to force tree to be "weak"
                               random_state=1) # fixing rand because I'm insecure and afraid you will judge me if I get a bad random selection that does not prove my point
forest.fit(X_train, y_train)
forest.score(X_test,y_test)

0.8450462951460791

In [43]:
print(f'Linear Regression val R^2: {lreg.score(X_test,y_test):.3f}')
print(f'Ridge Regression val R^2: {regr.score(X_test,y_test):.3f}')
print(f'Lasso Regression val R^2: {lass.score(X_test,y_test):.3f}')
print(f'Bagging Regression val R^2: {bagging_reg.score(X_test,y_test):.3f}')
print(f'Random Forest val R^2: {forest.score(X_test,y_test):.3f}')

Linear Regression val R^2: 0.734
Ridge Regression val R^2: 0.734
Lasso Regression val R^2: 0.734
Bagging Regression val R^2: 0.841
Random Forest val R^2: 0.845


## Before to export the model, let's do `cross validation` in the choosen model

In [96]:
from sklearn.model_selection import cross_validate

print(cross_validate(forest,X_train, y_train, cv=5)) # checking for overfitting

{'fit_time': array([0.03825378, 0.01896596, 0.02296329, 0.02291346, 0.01898193]), 'score_time': array([0.00699878, 0.00695038, 0.00299191, 0.00302863, 0.00299311]), 'test_score': array([0.85147033, 0.86315467, 0.83154635, 0.81825298, 0.89321974])}


## Export the best model for all data together

In [44]:
pickle.dump(forest, open('model.pkl','wb'))

# The best idea for the prediction of charges is separate a model for smokers and other to non-smokers. Let's try and check if we get better accurancy

In [59]:
health_smokers = health_corrected[health_corrected['smoker_Yes']==1]
features_smokers = health_smokers.drop(columns=['charges'])
labels_smokers = health_smokers['charges']

In [60]:
health_non_smokers = health_corrected[health_corrected['smoker_Yes']==0]
features_non_smokers = health_non_smokers.drop(columns=['charges'])
labels_non_smokers = health_non_smokers['charges']

## For smokers, a good idea to model is Decision tree

In [27]:
from sklearn.model_selection import train_test_split
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(features_smokers, labels_smokers, test_size=0.25, random_state=1)

In [38]:
from sklearn.tree import DecisionTreeRegressor
smoker_model = DecisionTreeRegressor()

model = smoker_model.fit(X_train_s, y_train_s)
model.score(X_test_s, y_test_s)

-0.5658218345420358

### Non-smokers

In [61]:
from sklearn.model_selection import train_test_split
X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(features_non_smokers, labels_non_smokers, test_size=0.25, random_state=1)

In [63]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

non_smoker_model = LinearRegression()
non_smoker_model.fit(X_train_ns,y_train_ns)
y_pred_ns = non_smoker_model.predict(X_test_ns)

print("test data accuracy was ",non_smoker_model.score(X_test_ns,y_test_ns))


test data accuracy was  0.31398157333140475


In [64]:
# cross validation

from sklearn.model_selection import cross_validate

print(cross_validate(non_smoker_model,X_test_ns, y_test_ns, cv=5)) # checking for overfitting

{'fit_time': array([0.00699377, 0.00797629, 0.00897574, 0.00400305, 0.00495434]), 'score_time': array([0.00397968, 0.00299263, 0.00199652, 0.00500536, 0.00099659]), 'test_score': array([0.05623915, 0.40087583, 0.35752694, 0.26131733, 0.39566462])}


In [65]:
from sklearn.metrics import r2_score

print('R-squared',r2_score(y_test_ns, y_pred_ns))

R-squared 0.31398157333140475
