In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline

In [2]:
df = pd.read_csv('insurance.csv')

In [3]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
df['charges_log'] = np.log(df['charges'])

In [6]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,charges_log
0,19,female,27.9,0,yes,southwest,16884.924,9.734176
1,18,male,33.77,1,no,southeast,1725.5523,7.453302
2,28,male,33.0,3,no,southeast,4449.462,8.400538
3,33,male,22.705,0,no,northwest,21984.47061,9.998092
4,32,male,28.88,0,no,northwest,3866.8552,8.260197


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X = df.drop('charges',axis=1)
y = df['charges']

In [273]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.48, random_state=101)

In [274]:
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [275]:
from catboost import CatBoostRegressor

In [276]:
cbr = CatBoostRegressor(loss_function='RMSE',iterations=2000,learning_rate=0.01,depth=2,random_state=10,cat_features=[1,4,5])

In [277]:
cbr.fit(X_train,y_train)

0:	learn: 12337.6867449	total: 6.58ms	remaining: 13.2s
1:	learn: 12233.0357691	total: 13.5ms	remaining: 13.5s
2:	learn: 12134.5726813	total: 20.5ms	remaining: 13.7s
3:	learn: 12035.5874502	total: 31.8ms	remaining: 15.9s
4:	learn: 11943.6750751	total: 48.6ms	remaining: 19.4s
5:	learn: 11843.5686540	total: 63.4ms	remaining: 21.1s
6:	learn: 11757.9732922	total: 78.8ms	remaining: 22.4s
7:	learn: 11663.0820079	total: 91.3ms	remaining: 22.7s
8:	learn: 11562.6902807	total: 114ms	remaining: 25.2s
9:	learn: 11463.5680733	total: 129ms	remaining: 25.6s
10:	learn: 11368.6733135	total: 142ms	remaining: 25.6s
11:	learn: 11287.2837785	total: 154ms	remaining: 25.5s
12:	learn: 11199.9967620	total: 167ms	remaining: 25.5s
13:	learn: 11120.4841926	total: 194ms	remaining: 27.5s
14:	learn: 11025.7747496	total: 211ms	remaining: 28s
15:	learn: 10934.2581305	total: 230ms	remaining: 28.5s
16:	learn: 10854.7920783	total: 244ms	remaining: 28.4s
17:	learn: 10777.7009965	total: 256ms	remaining: 28.2s
18:	learn: 106

<catboost.core.CatBoostRegressor at 0x2894a595640>

In [278]:
y_pred = cbr.predict(X_val)

In [279]:
from sklearn import metrics

In [280]:
print('MAE:', metrics.mean_absolute_error(y_val, y_pred))
print('MSE:', metrics.mean_squared_error(y_val, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_val, y_pred)))

MAE: 155.21509818118565
MSE: 97513.87759839102
RMSE: 312.27212107133585


In [281]:
y_pred = cbr.predict(X_test)

In [282]:
from sklearn import metrics

In [283]:
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE: 215.36456683402136
MSE: 310409.0309462307
RMSE: 557.1436358303222


# K-fold cross validation

In [145]:
from sklearn.metrics import make_scorer, mean_squared_error

In [146]:
from sklearn.model_selection import cross_val_score,KFold

In [147]:
def rmse_score(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [148]:
rmse_scorer = make_scorer(rmse_score, greater_is_better=False)

In [149]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [150]:
cv_scores = cross_val_score(cbr, X, y, cv=kf, scoring=rmse_scorer)

0:	learn: 11925.4084142	total: 4.51ms	remaining: 9.02s
1:	learn: 11829.3381076	total: 9.53ms	remaining: 9.52s
2:	learn: 11743.7366432	total: 16.7ms	remaining: 11.1s
3:	learn: 11649.7332240	total: 24.5ms	remaining: 12.2s
4:	learn: 11555.2832630	total: 29.4ms	remaining: 11.8s
5:	learn: 11456.0096945	total: 37ms	remaining: 12.3s
6:	learn: 11368.1251946	total: 44.7ms	remaining: 12.7s
7:	learn: 11273.7049547	total: 52ms	remaining: 12.9s
8:	learn: 11187.6865607	total: 60.2ms	remaining: 13.3s
9:	learn: 11103.1773810	total: 67.3ms	remaining: 13.4s
10:	learn: 11020.5269609	total: 74.3ms	remaining: 13.4s
11:	learn: 10934.5715591	total: 82.3ms	remaining: 13.6s
12:	learn: 10844.9138794	total: 90.2ms	remaining: 13.8s
13:	learn: 10754.7659548	total: 97.7ms	remaining: 13.9s
14:	learn: 10665.4398008	total: 105ms	remaining: 13.9s
15:	learn: 10585.8965974	total: 114ms	remaining: 14.1s
16:	learn: 10496.3785799	total: 122ms	remaining: 14.2s
17:	learn: 10422.9234046	total: 130ms	remaining: 14.4s
18:	learn:

In [151]:
print(f"Cross-validated RMSE scores: {cv_scores}")
print(f"Mean RMSE: {-np.mean(cv_scores)}")

Cross-validated RMSE scores: [-475.51836997 -216.84820525 -259.91659807 -404.07589901 -582.28378147]
Mean RMSE: 387.7285707559896


# Gradient Boosting

In [16]:
df['smoker_int'] = df['smoker'].map({'yes':1, 'no':0})

In [17]:
df['gender_int'] = df['sex'].map({'male':1, 'female':0})

In [18]:
df.pop('smoker')

0       yes
1        no
2        no
3        no
4        no
       ... 
1333     no
1334     no
1335     no
1336     no
1337    yes
Name: smoker, Length: 1205, dtype: object

In [19]:
df.pop('sex')

0       female
1         male
2         male
3         male
4         male
         ...  
1333      male
1334    female
1335    female
1336    female
1337    female
Name: sex, Length: 1205, dtype: object

In [20]:
df.pop('region')

0       southwest
1       southeast
2       southeast
3       northwest
4       northwest
          ...    
1333    northwest
1334    northeast
1335    southeast
1336    southwest
1337    northwest
Name: region, Length: 1205, dtype: object

In [21]:
X = df.drop('charges',axis=1)
y = df['charges']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.43, random_state=500)

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

In [24]:
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=450,criterion='squared_error'
                               ,max_depth=2, min_samples_split=8, min_samples_leaf=1,subsample=1)

In [25]:
gbr.fit(X_train,y_train)

In [26]:
y_pred = gbr.predict(X_test)

In [27]:
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE: 2332.9863296233348
MSE: 16579920.5365567
RMSE: 4071.8448566413604


# Extra Tree

In [28]:
from sklearn.ensemble import ExtraTreesRegressor

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=500)

In [30]:
extr = ExtraTreesRegressor(n_estimators=800,random_state=4,max_depth=6,bootstrap=True,warm_start=True,oob_score=True,
                           min_samples_split=4, min_samples_leaf=1)

In [31]:
extr.fit(X_train,y_train)

In [32]:
y_pred = extr.predict(X_test)

In [33]:
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE: 2247.2935795181825
MSE: 15673509.765955377
RMSE: 3958.9783740196635
