In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('clean.csv', index_col = 0)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,0,27.9,0,1,16884.924
1,18,1,33.77,1,0,1725.5523
2,28,1,33.0,3,0,4449.462
3,33,1,22.705,0,0,21984.47061
4,32,1,28.88,0,0,3866.8552


# Import Package

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression,ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

#### Datasets splitting

In [4]:
X = df.drop(columns="charges")
y = df.charges

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 5), (268, 5), (1070,), (268,))

#### Scalling

In [5]:
scaler = RobustScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

Penggunaan scaler untuk me-normalize data

In [6]:
pd.DataFrame(X_train_scaled).head()

Unnamed: 0,0,1,2,3,4
0,0.270833,-1.0,-1.237449,0.5,0.0
1,0.3125,-1.0,-0.710387,-0.5,0.0
2,0.520833,-1.0,-0.645259,-0.5,0.0
3,-0.020833,-1.0,0.495703,2.0,0.0
4,0.604167,-1.0,-1.054123,1.0,0.0


In [7]:
def evaluation_metrics(model, x_test, y_test):
    prediksi = model.predict(x_test)
    MAE = mean_absolute_error(y_test, prediksi)
    MSE = mean_squared_error(y_test, prediksi)
    RMSE = np.sqrt(MSE)
    r2 = r2_score(y_test, prediksi)
    print('Evaluation Metrics Model', model)
    print(f'MAE: {round(MAE, 3)}')
    print(f'MSE: {round(MSE, 3)}')
    print(f'RMSE: {round(RMSE, 3)}')
    print(f'R2_SCORE: {round(r2, 3)}')
    val = dict()
    val['MAE'] = MAE
    val['MSE'] = MSE
    val['RMSE'] = RMSE
    val['R2'] = r2
    return val

# Training Dataset

### Linear Regression

#### Base

In [8]:
LinReg = LinearRegression()

LinReg.fit(X_train_scaled, y_train)

LinRegAcc = evaluation_metrics(LinReg, X_test_scaled, y_test)

Evaluation Metrics Model LinearRegression()
MAE: 4253.334
MSE: 34088339.546
RMSE: 5838.522
R2_SCORE: 0.78


#### Hyperparameter Tuning

In [9]:
paramLinReg = {
    'fit_intercept': [True, False],
    'normalize': [True, False]
}

In [10]:
LinReg_param = GridSearchCV(LinearRegression(), paramLinReg, cv=5, n_jobs=-1, verbose=1)
LinReg_param.fit(X_train_scaled, y_train)
print(LinReg_param.score(X_test_scaled, y_test))
print(LinReg_param.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


0.7804275808343701
{'fit_intercept': True, 'normalize': True}


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    3.9s finished


In [11]:
LinReg_best = LinReg_param.best_estimator_

LinReg_best.fit(X_train_scaled, y_train)
LinReg_best_acc = evaluation_metrics(LinReg_best, X_test_scaled, y_test)

Evaluation Metrics Model LinearRegression(normalize=True)
MAE: 4253.334
MSE: 34088339.546
RMSE: 5838.522
R2_SCORE: 0.78


### ElasticNet

#### Base

In [12]:
en = ElasticNet()

en.fit(X_train_scaled, y_train)

enAcc = evaluation_metrics(en, X_test_scaled, y_test)

Evaluation Metrics Model ElasticNet()
MAE: 7839.834
MSE: 99728725.025
RMSE: 9986.427
R2_SCORE: 0.358


#### Hyperparameter Tuning

In [13]:
param_en = {'alpha':[0.001,0.01,0.02, 0.024, 0.025, 0.026, 0.03, 1.0],
               'fit_intercept':[True, False],
               'normalize':[True, False]}

In [14]:
en_param = GridSearchCV(en, param_en, cv=5, n_jobs=-1, verbose=1)
en_param.fit(X_train_scaled, y_train)
print(en_param.score(X_test_scaled, y_test))
print(en_param.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 32 candidates, totalling 160 fits
0.7802695681104839
{'alpha': 0.001, 'fit_intercept': True, 'normalize': False}


[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.5s finished


In [15]:
en_best = en_param.best_estimator_
en_best.fit(X_train_scaled, y_train)
en_best_acc = evaluation_metrics(en_best, X_test_scaled, y_test)

Evaluation Metrics Model ElasticNet(alpha=0.001)
MAE: 4261.085
MSE: 34112870.821
RMSE: 5840.622
R2_SCORE: 0.78


### RandomForestRegressor

#### Base

In [16]:
rfg =RandomForestRegressor()

rfg.fit(X_train_scaled, y_train)

rfgAcc = evaluation_metrics(rfg, X_test_scaled, y_test)

Evaluation Metrics Model RandomForestRegressor()
MAE: 2623.865
MSE: 20416583.356
RMSE: 4518.471
R2_SCORE: 0.868


#### Hyperparameter Tuning

In [17]:
param_rfg = {'n_estimators': [100, 250, 500, 750, 1000, 1200, 1400, 1600],
             'max_depth': [None, 10, 30, 50, 70, 90, 100],
             'max_features': ['auto', 0.3, 0.6, 0.8],
             'min_samples_split' : np.arange(1,20),
             'min_samples_leaf': np.arange(1,10),
             'bootstrap' : [True, False],
             'n_jobs' : [None, 1, 3, 5, 7, 9],
            }

In [18]:
rfg_param = RandomizedSearchCV(RandomForestRegressor(), param_rfg, cv=5, n_iter=30, n_jobs=-1, verbose=1, random_state=42)
rfg_param.fit(X_train_scaled, y_train)
print(rfg_param.score(X_test_scaled, y_test))
print(rfg_param.best_params_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   58.4s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  3.2min finished


0.8822109650540595
{'n_jobs': 3, 'n_estimators': 750, 'min_samples_split': 2, 'min_samples_leaf': 9, 'max_features': 0.8, 'max_depth': 10, 'bootstrap': True}


In [19]:
rfg_best = rfg_param.best_estimator_
rfg_best.fit(X_train_scaled, y_train)
rfg_best_acc = evaluation_metrics(rfg_best, X_test_scaled, y_test)

Evaluation Metrics Model RandomForestRegressor(max_depth=10, max_features=0.8, min_samples_leaf=9,
                      n_estimators=750, n_jobs=3)
MAE: 2588.258
MSE: 18353396.252
RMSE: 4284.086
R2_SCORE: 0.882


### XGBoostRegressor

#### Base

In [20]:
xgb = XGBRegressor()
xgb.fit(X_train_scaled, y_train)
xgbAcc = evaluation_metrics(xgb, X_test_scaled, y_test)

Evaluation Metrics Model XGBRegressor()
MAE: 2547.741
MSE: 18314411.499
RMSE: 4279.534
R2_SCORE: 0.882


### Hyperparameter Tuning

In [21]:
param_xgb = {'n_estimators': [100, 150, 200],
             'learning_rate': [0.01, 0.05, 0.1, 0.5, 1], 
             'max_depth': [3, 4, 5, 6, 7],
             'colsample_bytree': [0.4, 0.6, 0.7, 0.8, 1],
             'gamma': [0.0, 0.1, 0.2],
             'subsample': [0.4, 0.6, 0.8, 1],
             'reg_alpha': [0, 0.01, 0.1, 1, 10],
             'reg_lambda': [0, 0.01, 0.1, 1, 10]}

In [22]:
xgb_param = RandomizedSearchCV(XGBRegressor(), param_xgb, cv=5, n_iter=30, n_jobs=-1, verbose=1, random_state=42)
xgb_param.fit(X_train_scaled, y_train)
print(xgb_param.score(X_test_scaled, y_test))
print(xgb_param.best_params_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.7s


0.879294867761957
{'subsample': 0.4, 'reg_lambda': 10, 'reg_alpha': 0, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.0, 'colsample_bytree': 1}


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   11.5s finished


In [23]:
xgb_best = xgb_param.best_estimator_
xgb_best.fit(X_train_scaled, y_train)
xgb_best_acc = evaluation_metrics(xgb_best, X_test_scaled, y_test)

Evaluation Metrics Model XGBRegressor(gamma=0.0, reg_lambda=10, subsample=0.4)
MAE: 2450.515
MSE: 18739318.665
RMSE: 4328.893
R2_SCORE: 0.879


# Evaluation

In [24]:
MAE_default = [LinRegAcc['MAE'], enAcc['MAE'], rfgAcc['MAE'], xgbAcc['MAE']]

MSE_default = [LinRegAcc['MSE'], enAcc['MSE'], rfgAcc['MSE'], xgbAcc['MSE']]

RMSE_default = [LinRegAcc['RMSE'], enAcc['RMSE'], rfgAcc['RMSE'], xgbAcc['RMSE']]

R2_default = [LinRegAcc['R2'], enAcc['R2'], rfgAcc['R2'], xgbAcc['R2']]

MAE_HPT = [LinReg_best_acc['MAE'], en_best_acc['MAE'], rfg_best_acc['MAE'], xgb_best_acc['MAE']]

MSE_HPT = [LinReg_best_acc['MSE'], en_best_acc['MSE'], rfg_best_acc['MSE'], xgb_best_acc['MSE']]

RMSE_HPT = [LinReg_best_acc['RMSE'], en_best_acc['RMSE'], rfg_best_acc['RMSE'], xgb_best_acc['RMSE']]

R2_HPT = [LinReg_best_acc['R2'], en_best_acc['R2'], rfg_best_acc['R2'], xgb_best_acc['R2']]

best_model = pd.DataFrame({'MAE_default': MAE_default,
                           'MAE_HPT': MAE_HPT,
                          "MSE_default" : MSE_default,
                            "MSE_HPT" : MSE_HPT,
                          "RMSE_default": RMSE_default,
                           "RMSE_HPT": RMSE_HPT,
                          "R2_default" : R2_default,
                          "R2_HPT" : R2_HPT},
                           index = "LinearRegression,ElasticNet,RandomForest,XGBoost".split(','))

In [25]:
best_model["MAE_default"] = round(best_model["MAE_default"], 3)
best_model["MSE_default"] = round(best_model["MSE_default"], 3)
best_model["RMSE_default"] = round(best_model["RMSE_default"], 3)
best_model["R2_default"] = round(best_model["R2_default"], 3)
best_model['MAE_HPT'] = round(best_model["MAE_HPT"], 3)
best_model["MSE_HPT"] = round(best_model["MSE_HPT"], 3)
best_model["RMSE_HPT"] = round(best_model["RMSE_HPT"], 3)
best_model["R2_HPT"] = round(best_model["R2_HPT"], 3)

In [26]:
best_model

Unnamed: 0,MAE_default,MAE_HPT,MSE_default,MSE_HPT,RMSE_default,RMSE_HPT,R2_default,R2_HPT
LinearRegression,4253.334,4253.334,34088340.0,34088340.0,5838.522,5838.522,0.78,0.78
ElasticNet,7839.834,4261.085,99728730.0,34112870.0,9986.427,5840.622,0.358,0.78
RandomForest,2623.865,2588.258,20416580.0,18353400.0,4518.471,4284.086,0.868,0.882
XGBoost,2547.741,2450.515,18314410.0,18739320.0,4279.534,4328.893,0.882,0.879


Setelah dilakukan training pada dataset, didapatkan best score yaitu **0.88**, dengan nilai MSE dan RMSE yang lebih rendah maka diputuskan menggunakan model `RandomForest dengan best parameter`

# Save Model

In [27]:
import joblib

In [28]:
model = rfg_best
print(round((model.predict([[52,1,25,0,0]])[0]),2))

17339.16


In [29]:
joblib.dump(model, 'ModelJoblib')

['ModelJoblib']