# Libraries

In [1]:
# Data wrangling
import pandas as pd

# Preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn import metrics

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.dummy import DummyRegressor
from xgboost import XGBRegressor

# Pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Dataviz
import matplotlib.pyplot as plt
import seaborn as sns

# Serialize model
import pickle

## Libraries Settings

In [2]:
sns.set_theme(
    context='notebook',
    style='ticks',
    font_scale=.8,
    palette='colorblind',
    rc={
        'axes.grid': True,
        'grid.alpha': .2,
        'axes.labelpad': 20,
        'axes.titlepad': 30,
    }
)

# Functions

In [3]:
def assign_pipeline(model, preprocessor):
    model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    (model_name, model)
    ])
    return model_pipeline


def print_metrics(y, y_pred):
    dict_metrics = {
        'MAE': metrics.mean_absolute_error(y, y_pred),
        'MAPE': metrics.mean_absolute_percentage_error(y, y_pred),
        'RSMAE': metrics.root_mean_squared_error(y, y_pred),
        'r2': metrics.r2_score(y, y_pred)
    }

    return dict_metrics

# 1. Load Data

In [4]:
NUM_FEATURES = [
    'age',
    'bmi',
    'children'
    ]

CAT_FEATURES = [
    #'gender',
    'smoker',
    'region',
    'medical_history',
    'family_medical_history',
    'exercise_frequency',
    'occupation',
    'coverage_level'
]

TARGET = 'charges'

In [5]:
df = pd.read_csv('../data/processed/insurance_dataset.csv',
                 usecols=[*NUM_FEATURES, *CAT_FEATURES, TARGET])
df.head(5)

Unnamed: 0,age,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
0,46,21.45,5,yes,southeast,Diabetes,No history,Never,Blue collar,Premium,20460.307669
1,25,25.38,2,yes,northwest,Diabetes,High blood pressure,Occasionally,White collar,Premium,20390.899218
2,38,44.88,2,yes,southwest,No history,High blood pressure,Occasionally,Blue collar,Premium,20204.476302
3,25,19.89,0,no,northwest,No history,Diabetes,Rarely,White collar,Standard,11789.029843
4,49,38.21,3,yes,northwest,Diabetes,High blood pressure,Rarely,White collar,Standard,19268.309838


# 2. Modeling

In [6]:
models = [
    LinearRegression(),
    RandomForestRegressor(random_state=42),
    MLPRegressor(random_state=42),
    XGBRegressor(),
    DummyRegressor()
]

## 2.1. Pipeline

In [7]:
X, y = df[[*NUM_FEATURES, *CAT_FEATURES]], df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                    random_state=42)

In [8]:
# Preprocessing
preprocessor = make_column_transformer(
    (StandardScaler(), NUM_FEATURES),
    (OneHotEncoder(drop='if_binary'), CAT_FEATURES)
)

In [31]:
for model in models:
    print(model.__class__.__name__)

LinearRegression
RandomForestRegressor
MLPRegressor
XGBRegressor
DummyRegressor


In [9]:
# Cross validation with stratified k-folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_results = []

for model in models:
    model_name = model.__class__.__name__
    print(model_name)
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        (model_name, model)
    ])
    
    result = cross_validate(model_pipeline, X_train, y_train,
                            scoring=['r2', 'neg_mean_absolute_error',
                                     'neg_root_mean_squared_error',
                                     'neg_mean_absolute_percentage_error'],
                                     cv=kf, n_jobs=-1)
    df_cv = pd.DataFrame(result)
    df_cv = df_cv.apply(lambda x: f'{x.mean():.2f} ± {x.std():.2f}').to_frame(model_name).T
    cv_results.append(df_cv)
    
df_cv_results = pd.concat(cv_results).sort_values('test_r2', ascending=False)    
df_cv_results.columns = df_cv_results.columns.str.replace('test_','')
df_cv_results.columns = df_cv_results.columns.str.replace('neg_','')
df_cv_results = df_cv_results.rename_axis('model')
display(df_cv_results)

LinearRegression
RandomForestRegressor


4 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "e:\workspace\ciencia-dados\insurance\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\workspace\ciencia-dados\insurance\.venv\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "e:\workspace\ciencia-dados\insurance\.venv\lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "e:\workspace\ciencia-dados\insurance\.venv\lib\site-packages\

MLPRegressor
XGBRegressor
DummyRegressor


Unnamed: 0_level_0,fit_time,score_time,r2,mean_absolute_error,root_mean_squared_error,mean_absolute_percentage_error
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LinearRegression,3.77 ± 0.46,0.21 ± 0.07,0.98 ± 0.00,-499.46 ± 0.98,-576.87 ± 0.84,-0.03 ± 0.00
RandomForestRegressor,422.70 ± 52.89,43.49 ± 37.80,0.98 ± 0.00,-535.48 ± 1.41,-636.17 ± 1.24,-0.03 ± 0.00
MLPRegressor,81.11 ± 9.69,0.16 ± 0.04,0.98 ± 0.00,-499.59 ± 1.02,-577.09 ± 0.86,-0.03 ± 0.00
XGBRegressor,18.82 ± 0.29,0.36 ± 0.08,0.98 ± 0.00,-511.46 ± 1.27,-597.15 ± 1.35,-0.03 ± 0.00
DummyRegressor,1.45 ± 0.05,0.14 ± 0.03,-0.00 ± 0.00,-3588.57 ± 7.85,-4419.71 ± 9.51,-0.25 ± 0.00


In [30]:
models_dict

{'LinearRegression': Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('standardscaler',
                                                   StandardScaler(),
                                                   ['age', 'bmi', 'children']),
                                                  ('onehotencoder',
                                                   OneHotEncoder(drop='if_binary'),
                                                   ['smoker', 'region',
                                                    'medical_history',
                                                    'family_medical_history',
                                                    'exercise_frequency',
                                                    'occupation',
                                                    'coverage_level'])])),
                 ('DummyRegressor', LinearRegression())]),
 'RandomForestRegressor': Pipeline(steps=[('preprocessor',
                  Colu

In [10]:
models_dict = {model.__class__.__name__: assign_pipeline(model, preprocessor) for model in models}
for model in models_dict.values():
    model.fit(X_train, y_train)

# 3. Evaluation

## 3.1. Linear Regression

In [11]:
y_pred = models_dict['LinearRegression'].predict(X_test)
lr_metrics = print_metrics(y_test, y_pred)
lr_metrics

{'MAE': 500.36206594684364,
 'MAPE': 0.0323996501435989,
 'RSMAE': 577.7652866497053,
 'r2': 0.9828098241121286}

## 3.2. Random Forest Regressor

In [12]:
y_pred = models_dict['RandomForestRegressor'].predict(X_test)
rf_metrics = print_metrics(y_test, y_pred)
rf_metrics

{'MAE': 536.1139955335282,
 'MAPE': 0.034699673970479274,
 'RSMAE': 636.8149002087191,
 'r2': 0.9791164722664004}

## 3.3 MLP Regressor

In [13]:
y_pred = models_dict['MLPRegressor'].predict(X_test)
mlp_metrics = print_metrics(y_test, y_pred)
mlp_metrics

{'MAE': 500.4775943082231,
 'MAPE': 0.0323819280386908,
 'RSMAE': 578.0226378346256,
 'r2': 0.9827945068284605}

## 3.4. XGB Regressor

In [14]:
y_pred = models_dict['XGBRegressor'].predict(X_test)
xgb_metrics = print_metrics(y_test, y_pred)
xgb_metrics

{'MAE': 511.84905013913414,
 'MAPE': 0.033132948966546304,
 'RSMAE': 597.3039017282479,
 'r2': 0.9816275051734087}

## 3.5. Dummy Regressor

In [15]:
y_pred = models_dict['DummyRegressor'].predict(X_test)
dummy_metrics = print_metrics(y_test, y_pred)
dummy_metrics

{'MAE': 3577.732284197109,
 'MAPE': 0.24477026181452524,
 'RSMAE': 4406.681735794761,
 'r2': -1.8455755319557454e-06}

In [16]:
df_metrics = pd.DataFrame([lr_metrics, rf_metrics, mlp_metrics, xgb_metrics,
                           dummy_metrics], index=models_dict.keys()).sort_values('r2', ascending=False,)
display(df_metrics)

Unnamed: 0,MAE,MAPE,RSMAE,r2
LinearRegression,500.362066,0.0324,577.765287,0.98281
MLPRegressor,500.477594,0.032382,578.022638,0.982795
XGBRegressor,511.84905,0.033133,597.303902,0.981628
RandomForestRegressor,536.113996,0.0347,636.8149,0.979116
DummyRegressor,3577.732284,0.24477,4406.681736,-2e-06


## 3.6. Linear Regression with Selected Features

In [17]:
# Selected features according to mutual information
sel_features = [
    'smoker',
    'coverage_level',
    'medical_history',
    'family_medical_history',
    'occupation',
    'exercise_frequency'
]

In [18]:
# df with selected features
df_sel = pd.read_csv('../data/processed/insurance_dataset.csv',
                 usecols=[*sel_features, TARGET])
df_sel.head(5)

Unnamed: 0,smoker,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
0,yes,Diabetes,No history,Never,Blue collar,Premium,20460.307669
1,yes,Diabetes,High blood pressure,Occasionally,White collar,Premium,20390.899218
2,yes,No history,High blood pressure,Occasionally,Blue collar,Premium,20204.476302
3,no,No history,Diabetes,Rarely,White collar,Standard,11789.029843
4,yes,Diabetes,High blood pressure,Rarely,White collar,Standard,19268.309838


In [19]:
# Preprocessing
X, y = df[[*NUM_FEATURES, *CAT_FEATURES]], df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                    random_state=42)

preprocessor = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), sel_features)
)

In [20]:
# Pipeline
lr_sel = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('lr_sel', LinearRegression())
    ])
lr_sel_result = cross_validate(lr_sel, X_train, y_train,
                            scoring=['r2', 'neg_mean_absolute_error',
                                     'neg_root_mean_squared_error',
                                     'neg_mean_absolute_percentage_error'],
                                     cv=kf, n_jobs=-1)

In [21]:
df_lr_sel = pd.DataFrame(lr_sel_result)
df_lr_sel = df_lr_sel.apply(lambda x: f'{x.mean():.2f} ± {x.std():.2f}').to_frame('Linear Regression').T
df_lr_sel.columns = df_lr_sel.columns.str.replace('test_','')
df_lr_sel.columns = df_lr_sel.columns.str.replace('neg_','')
df_lr_sel

Unnamed: 0,fit_time,score_time,r2,mean_absolute_error,root_mean_squared_error,mean_absolute_percentage_error
Linear Regression,1.41 ± 0.15,0.11 ± 0.03,0.96 ± 0.00,-739.21 ± 1.91,-913.38 ± 1.98,-0.05 ± 0.00


In [22]:
lr_sel.fit(X_train, y_train)
y_pred = lr_sel.predict(X_test)
lr_sel_metrics = print_metrics(y_test, y_pred)
lr_sel_metrics

{'MAE': 738.0120358976932,
 'MAPE': 0.0478670507012029,
 'RSMAE': 911.8156515858747,
 'r2': 0.9571854369754007}

# 4. Hyperparameter Tuning

## 4.1. Linear Regression

In [26]:
lr_param_grid = {
    'LinearRegression__fit_intercept': [True, False],
    'LinearRegression__copy_X': [True, False],
    'LinearRegression__positive': [True, False],
}

In [29]:
models_dict['LinearRegression'].get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                    ['age', 'bmi', 'children']),
                                   ('onehotencoder',
                                    OneHotEncoder(drop='if_binary'),
                                    ['smoker', 'region', 'medical_history',
                                     'family_medical_history',
                                     'exercise_frequency', 'occupation',
                                     'coverage_level'])])),
  ('DummyRegressor', LinearRegression())],
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                  ['age', 'bmi', 'children']),
                                 ('onehotencoder',
                                  OneHotEncoder(drop='if_binary'),
                                  ['smoker', 'region', 'medical_history',
                     

In [27]:
lr_tuned = GridSearchCV(models_dict['LinearRegression'], lr_param_grid,
                        scoring='r2', n_jobs=-1)


In [28]:
X, y = df[[*NUM_FEATURES, *CAT_FEATURES]], df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                    random_state=42)
lr_tuned.fit(X_train, y_train)

ValueError: Invalid parameter 'LinearRegression' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['age', 'bmi', 'children']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(drop='if_binary'),
                                                  ['smoker', 'region',
                                                   'medical_history',
                                                   'family_medical_history',
                                                   'exercise_frequency',
                                                   'occupation',
                                                   'coverage_level'])])),
                ('DummyRegressor', LinearRegression())]). Valid parameters are: ['memory', 'steps', 'verbose'].

In [None]:
y_pred = lr_tuned.predict(X_test)
print_metrics(y_test, y_pred)

{'MAE': 500.3527829815191,
 'MAPE': 0.032399104592429166,
 'RSMAE': 577.7528493480927,
 'r2': 0.982810564195017}

## 4.2. XGBoost

In [None]:
xgb_param_grid = {
    'XGBRegressor__eta': [.01, .05, .1, .15, .2],
    'XGBRegressor__max_depth': [3 ,6, 10],
    'XGBRegressor__number_of_estimators': [50, 100],
    'XGBRegressor__gamma': [0, .5],
    'XGBRegressor__subsample': [.5, 1],
}

In [None]:
xgb_tuned = GridSearchCV(models_dict['XGBRegressor'], xgb_param_grid,
                        scoring='r2', n_jobs=-1)

In [None]:
xgb_tuned.fit(X_train, y_train)

Parameters: { "number_of_estimators" } are not used.



In [None]:
y_pred = xgb_tuned.predict(X_test)
print_metrics(y_test, y_pred)

{'MAE': 505.89465270988234,
 'MAPE': 0.03282224292839247,
 'RSMAE': 587.3659746032725,
 'r2': 0.9822337814622827}

In [None]:
lr_mode

27

In [None]:
lr_model = Pipeline(steps=[
    (StandardScaler(), NUM_FEATURES),
    (OneHotEncoder(drop='if_binary'), CAT_FEATURES),
    ('LR', LinearRegression())
])