# Libraries

In [32]:
# Data wrangling
import pandas as pd

# Preprocessing
from sklearn.preprocessing import OneHotEncoder

# Evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn import metrics

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.dummy import DummyRegressor
from xgboost import XGBRegressor

# Pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline

# Dataviz
import matplotlib.pyplot as plt
import seaborn as sns

# Serialize model
import pickle

## Libraries Settings

In [33]:
sns.set_theme(
    context='notebook',
    style='ticks',
    font_scale=.8,
    palette='colorblind',
    rc={
        'axes.grid': True,
        'grid.alpha': .2,
        'axes.labelpad': 20,
        'axes.titlepad': 30,
    }
)

# Functions

In [27]:
def assign_pipeline(model, preprocessor):
    model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
    ])
    return model_pipeline


def print_metrics(y, y_pred):
    dict_metrics = {
        'MAE': metrics.mean_absolute_error(y, y_pred),
        'MAPE': metrics.mean_absolute_percentage_error(y, y_pred),
        'RSMAE': metrics.root_mean_squared_error(y, y_pred),
        'r2': metrics.r2_score(y, y_pred)
    }

    return dict_metrics

# 1. Load Data

In [2]:
FEATURES = [
    'smoker',
    'coverage_level',
    'medical_history',
    'family_medical_history',
]

TARGET = 'charges'

In [3]:
df = pd.read_csv('../data/processed/insurance_dataset.csv',
                 usecols=[*FEATURES, TARGET])
df.head(5)

Unnamed: 0,smoker,medical_history,family_medical_history,coverage_level,charges
0,yes,Diabetes,No history,Premium,20460.307669
1,yes,Diabetes,High blood pressure,Premium,20390.899218
2,yes,No history,High blood pressure,Premium,20204.476302
3,no,No history,Diabetes,Standard,11789.029843
4,yes,Diabetes,High blood pressure,Standard,19268.309838


# 2. Modeling

In [4]:
models = [
    LinearRegression(),
    RandomForestRegressor(random_state=42),
    MLPRegressor(random_state=42),
    DummyRegressor(),
    XGBRegressor()
]

## 2.1. Pipeline

In [5]:
X, y = df[FEATURES], df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                    random_state=42)

In [6]:
# Preprocessing
preprocessor = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), FEATURES)
)

In [7]:
# Cross validation with stratified k-folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_results = []

for model in models:
    model_name = model.__class__.__name__
    print(model_name)
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    result = cross_validate(model_pipeline, X_train, y_train,
                            scoring=['r2', 'neg_mean_absolute_error',
                                     'neg_root_mean_squared_error',
                                     'neg_mean_absolute_percentage_error'],
                                     cv=kf, n_jobs=-1)
    df_cv = pd.DataFrame(result)
    df_cv = df_cv.apply(lambda x: f'{x.mean():.2f} ± {x.std():.2f}').to_frame(model_name).T
    cv_results.append(df_cv)
    
df_cv_results = pd.concat(cv_results).sort_values('test_r2', ascending=False)    
df_cv_results.columns = df_cv_results.columns.str.replace('test_','')
df_cv_results = df_cv_results.rename_axis('model')
display(df_cv_results)

LinearRegression
RandomForestRegressor
MLPRegressor
DummyRegressor
XGBRegressor


Unnamed: 0_level_0,fit_time,score_time,r2,neg_mean_absolute_error,neg_root_mean_squared_error,neg_mean_absolute_percentage_error
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LinearRegression,0.99 ± 0.08,0.07 ± 0.02,0.90 ± 0.00,-1145.30 ± 4.17,-1416.08 ± 4.16,-0.07 ± 0.00
RandomForestRegressor,295.34 ± 2.75,0.44 ± 0.07,0.90 ± 0.00,-1145.35 ± 4.23,-1416.14 ± 4.19,-0.07 ± 0.00
MLPRegressor,42.28 ± 8.25,0.10 ± 0.02,0.90 ± 0.00,-1145.34 ± 4.14,-1416.18 ± 4.14,-0.07 ± 0.00
XGBRegressor,19.64 ± 0.19,0.37 ± 0.03,0.90 ± 0.00,-1145.36 ± 4.22,-1416.14 ± 4.19,-0.07 ± 0.00
DummyRegressor,0.79 ± 0.05,0.07 ± 0.01,-0.00 ± 0.00,-3588.57 ± 7.85,-4419.71 ± 9.51,-0.25 ± 0.00


In [19]:
models_dict = {model.__class__.__name__: assign_pipeline(model, preprocessor) for model in models}
for model in models_dict.values():
    model.fit(X_train, y_train)

# 3. Evaluation

## 3.1. Linear Regression

In [43]:
y_pred = models_dict['LinearRegression'].predict(X_test)
lr_metrics = print_metrics(y_test, y_pred)
lr_metrics

{'MAE': 1144.1315116163576,
 'MAPE': 0.07446737146955482,
 'RSMAE': 1414.3061389901118,
 'r2': 0.8969935823883497}

## 3.2. Random Forest Regressor

In [44]:
y_pred = models_dict['RandomForestRegressor'].predict(X_test)
rf_metrics = print_metrics(y_test, y_pred)
rf_metrics

{'MAE': 1144.15638273732,
 'MAPE': 0.07446395090139091,
 'RSMAE': 1414.357987721011,
 'r2': 0.8969860297803578}

## 3.3 MLP Regressor

In [45]:
y_pred = models_dict['MLPRegressor'].predict(X_test)
mlp_metrics = print_metrics(y_test, y_pred)
mlp_metrics

{'MAE': 1144.1537955138217,
 'MAPE': 0.07446826672942013,
 'RSMAE': 1414.330715952368,
 'r2': 0.8969900023899827}

## 3.4. XGB Regressor

In [46]:
y_pred = models_dict['XGBRegressor'].predict(X_test)
xgb_metrics = print_metrics(y_test, y_pred)
xgb_metrics

{'MAE': 1144.157305514877,
 'MAPE': 0.0744647195018875,
 'RSMAE': 1414.358738823685,
 'r2': 0.8969859203680454}

## 3.5. Dummy Regressor

In [47]:
y_pred = models_dict['DummyRegressor'].predict(X_test)
dummy_metrics = print_metrics(y_test, y_pred)
dummy_metrics

{'MAE': 3577.732284197109,
 'MAPE': 0.24477026181452524,
 'RSMAE': 4406.681735794761,
 'r2': -1.8455755319557454e-06}

In [53]:
df_metrics = pd.DataFrame([lr_metrics, rf_metrics, mlp_metrics, xgb_metrics,
                           dummy_metrics], index=models_dict.keys()).sort_values('r2', ascending=False,)
display(df_metrics)

Unnamed: 0,MAE,MAPE,RSMAE,r2
LinearRegression,1144.131512,0.074467,1414.306139,0.896994
MLPRegressor,1144.153796,0.074468,1414.330716,0.89699
RandomForestRegressor,1144.156383,0.074464,1414.357988,0.896986
DummyRegressor,1144.157306,0.074465,1414.358739,0.896986
XGBRegressor,3577.732284,0.24477,4406.681736,-2e-06
