In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline

In [2]:
df = pd.read_csv('insurance.csv')

In [3]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [4]:
df.pop('region')

0       southwest
1       southeast
2       southeast
3       northwest
4       northwest
          ...    
1333    northwest
1334    northeast
1335    southeast
1336    southwest
1337    northwest
Name: region, Length: 1338, dtype: object

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 62.8+ KB


In [6]:
df['charges_log'] = np.log(df['charges'])

In [7]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,charges_log
0,19,female,27.9,0,yes,16884.924,9.734176
1,18,male,33.77,1,no,1725.5523,7.453302
2,28,male,33.0,3,no,4449.462,8.400538
3,33,male,22.705,0,no,21984.47061,9.998092
4,32,male,28.88,0,no,3866.8552,8.260197


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X = df.drop('charges',axis=1)
y = df['charges']

In [10]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

In [11]:
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Catboost Regressor

In [12]:
from catboost import CatBoostRegressor

In [13]:
cbr = CatBoostRegressor(loss_function='RMSE',iterations=100,learning_rate=0.1,depth=2,random_state=10,cat_features=[1,4])

In [14]:
cbr.fit(X_train,y_train)

0:	learn: 11077.6397644	total: 138ms	remaining: 13.7s
1:	learn: 10254.5576249	total: 139ms	remaining: 6.83s
2:	learn: 9507.0984526	total: 140ms	remaining: 4.53s
3:	learn: 8763.2498970	total: 141ms	remaining: 3.38s
4:	learn: 8068.7851085	total: 141ms	remaining: 2.68s
5:	learn: 7404.2879952	total: 142ms	remaining: 2.22s
6:	learn: 6800.2753079	total: 143ms	remaining: 1.89s
7:	learn: 6354.7137641	total: 143ms	remaining: 1.65s
8:	learn: 5920.9582181	total: 144ms	remaining: 1.45s
9:	learn: 5534.3595632	total: 144ms	remaining: 1.3s
10:	learn: 5116.7413456	total: 145ms	remaining: 1.17s
11:	learn: 4717.0737475	total: 146ms	remaining: 1.07s
12:	learn: 4349.1256081	total: 147ms	remaining: 983ms
13:	learn: 4073.0241070	total: 148ms	remaining: 906ms
14:	learn: 3824.2972866	total: 148ms	remaining: 840ms
15:	learn: 3550.8871624	total: 149ms	remaining: 782ms
16:	learn: 3305.9723823	total: 150ms	remaining: 731ms
17:	learn: 3073.7178679	total: 150ms	remaining: 685ms
18:	learn: 2868.5645546	total: 151ms	

<catboost.core.CatBoostRegressor at 0x229c4978770>

In [15]:
y_pred = cbr.predict(X_val)

In [16]:
from sklearn import metrics

In [17]:
print('MAE:', metrics.mean_absolute_error(y_val, y_pred))
print('MSE:', metrics.mean_squared_error(y_val, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_val, y_pred)))

MAE: 327.5618466836034
MSE: 384443.1853056424
RMSE: 620.0348258812907


In [18]:
y_pred = cbr.predict(X_test)

In [19]:
from sklearn import metrics

In [20]:
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE: 389.45038030531185
MSE: 396899.63928481494
RMSE: 629.999713718042


# K-fold cross validation

In [21]:
from sklearn.metrics import make_scorer, mean_squared_error

In [22]:
from sklearn.model_selection import cross_val_score,KFold

In [23]:
def rmse_score(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [24]:
rmse_scorer = make_scorer(rmse_score, greater_is_better=False)

In [25]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [26]:
cv_scores = cross_val_score(cbr, X, y, cv=kf, scoring=rmse_scorer)

0:	learn: 11140.5738312	total: 904us	remaining: 89.5ms
1:	learn: 10329.4155824	total: 2.28ms	remaining: 112ms
2:	learn: 9452.4636446	total: 3.25ms	remaining: 105ms
3:	learn: 8751.6674487	total: 4.16ms	remaining: 99.7ms
4:	learn: 8129.1046018	total: 5ms	remaining: 95.1ms
5:	learn: 7454.7806503	total: 6.08ms	remaining: 95.3ms
6:	learn: 6912.5858098	total: 6.9ms	remaining: 91.7ms
7:	learn: 6355.7809672	total: 7.83ms	remaining: 90.1ms
8:	learn: 5881.7850335	total: 8.59ms	remaining: 86.9ms
9:	learn: 5457.7662919	total: 9.48ms	remaining: 85.3ms
10:	learn: 5098.2558138	total: 10.3ms	remaining: 83.3ms
11:	learn: 4709.2063490	total: 11.3ms	remaining: 83.1ms
12:	learn: 4374.3900419	total: 12.2ms	remaining: 81.7ms
13:	learn: 4050.2883437	total: 13ms	remaining: 79.9ms
14:	learn: 3761.9603177	total: 13.9ms	remaining: 78.7ms
15:	learn: 3521.3744310	total: 14.7ms	remaining: 77ms
16:	learn: 3268.3585120	total: 15.5ms	remaining: 75.9ms
17:	learn: 3050.6699702	total: 16.4ms	remaining: 74.7ms
18:	learn: 

In [27]:
print(f"Cross-validated RMSE scores: {cv_scores}")
print(f"Mean RMSE: {-np.mean(cv_scores)}")

Cross-validated RMSE scores: [-710.02983071 -555.39390951 -529.5863975  -657.06429823 -772.2099934 ]
Mean RMSE: 644.8568858681476


# Gradient Boosting

In [28]:
df['smoker_int'] = df['smoker'].map({'yes':1, 'no':0})

In [29]:
df['gender_int'] = df['sex'].map({'male':1, 'female':0})

In [30]:
df.pop('smoker')

0       yes
1        no
2        no
3        no
4        no
       ... 
1333     no
1334     no
1335     no
1336     no
1337    yes
Name: smoker, Length: 1338, dtype: object

In [31]:
df.pop('sex')

0       female
1         male
2         male
3         male
4         male
         ...  
1333      male
1334    female
1335    female
1336    female
1337    female
Name: sex, Length: 1338, dtype: object

In [32]:
X = df.drop('charges',axis=1)
y = df['charges']

In [33]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [35]:
from sklearn.ensemble import GradientBoostingRegressor

In [36]:
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=450,criterion='squared_error'
                               ,max_depth=2, min_samples_split=8, min_samples_leaf=1,subsample=1)

In [37]:
gbr.fit(X_train,y_train)

In [38]:
y_pred = gbr.predict(X_val)

In [39]:
print('MAE:', metrics.mean_absolute_error(y_val, y_pred))
print('MSE:', metrics.mean_squared_error(y_val, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_val, y_pred)))

MAE: 121.73786742960964
MSE: 29829.10432322258
RMSE: 172.7110428525709


In [40]:
y_pred = gbr.predict(X_test)

In [41]:
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE: 126.55318110365496
MSE: 50400.980515066505
RMSE: 224.50162697643532


# K-fold cross validation

In [42]:
from sklearn.metrics import make_scorer, mean_squared_error

In [43]:
from sklearn.model_selection import cross_val_score,KFold

In [44]:
def rmse_score(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [45]:
rmse_scorer = make_scorer(rmse_score, greater_is_better=False)

In [46]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [47]:
cv_scores = cross_val_score(gbr, X, y, cv=kf, scoring=rmse_scorer)

In [48]:
print(f"Cross-validated RMSE scores: {cv_scores}")
print(f"Mean RMSE: {-np.mean(cv_scores)}")

Cross-validated RMSE scores: [-200.28739955 -187.26774061 -203.62416066 -255.42169159 -220.69393602]
Mean RMSE: 213.4589856848216


# Extra Tree

In [49]:
from sklearn.ensemble import ExtraTreesRegressor

In [110]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=42)

In [111]:
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [154]:
extr = ExtraTreesRegressor(n_estimators=400,random_state=4,max_depth=6,bootstrap=True,warm_start=True,oob_score=True,
                           min_samples_split=4, min_samples_leaf=1)

In [155]:
extr.fit(X_train,y_train)

In [156]:
y_pred = extr.predict(X_val)

In [157]:
print('MAE:', metrics.mean_absolute_error(y_val, y_pred))
print('MSE:', metrics.mean_squared_error(y_val, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_val, y_pred)))

MAE: 258.8362271854872
MSE: 116727.35247238193
RMSE: 341.6538489061435


In [158]:
y_pred = extr.predict(X_test)

In [159]:
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE: 258.6331659669034
MSE: 266413.9507489178
RMSE: 516.1530303591347


In [160]:
cv_scores = cross_val_score(extr, X, y, cv=kf, scoring=rmse_scorer)

In [161]:
print(f"Cross-validated RMSE scores: {cv_scores}")
print(f"Mean RMSE: {-np.mean(cv_scores)}")

Cross-validated RMSE scores: [-494.85426886 -351.916752   -371.44178323 -449.88940531 -702.69009704]
Mean RMSE: 474.1584612874809
