In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

Data source: https://github.com/stedy/Machine-Learning-with-R-datasets

# Import the insurance dataset

In [2]:
# List the contents of the data directory to confirm the dataset file is there.
os.listdir('data/')

['insurance.csv']

In [3]:
# Load the insurance dataset from the local data directory.
data = pd.read_csv(os.path.join('data', 'insurance.csv'))

In [4]:
# Preview the first five rows of the raw dataset.
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


Check for null values

In [5]:
# Count missing values per column.
data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
# Show column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [7]:
# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


Encode categorical data

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
# Replace each categorical column with integer codes.
# LabelEncoder assigns codes in sorted label order, so in this dataset
# sex: female=0, male=1 and smoker: no=0, yes=1.
# One fitted encoder is kept per column in `encoders`, so codes can be mapped
# back to labels later via encoders[col].inverse_transform(...).
encoders = {}
for col in ['sex', 'smoker', 'region']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    encoders[col] = le
# As before, `le` ends up bound to the encoder fitted on `region`.

In [10]:
# Preview the first rows after label encoding: sex, smoker and region now hold integer codes.
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


# Correlation of each feature with charges

In [11]:
# Correlation of every column with `charges`, sorted from strongest positive down.
# The label-encoded categorical columns use Spearman rank correlation; the
# remaining numeric columns use Pearson (the DataFrame.corr default).
encoded_cols = ['smoker', 'sex', 'region']
numeric_cols = [c for c in data.columns if c not in encoded_cols]  # still includes 'charges'

# Keeps the charges-vs-charges entry (1.0) as the reference row.
spearman_corr = data[['charges'] + encoded_cols].corr(method='spearman')['charges']
# Drop the self-correlation here so 'charges' is not listed twice.
pearson_corr = data[numeric_cols].corr()['charges'].drop(labels=['charges'])

# Series.append was removed in pandas 2.0; pd.concat is the replacement.
pd.concat([spearman_corr, pearson_corr]).sort_values(ascending=False)

charges     1.000000
smoker      0.663460
age         0.299008
bmi         0.198341
children    0.067998
sex         0.009490
region     -0.043531
Name: charges, dtype: float64

# Regression modelling

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, scale
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from numpy import asarray

In [13]:
# Modelling frame: every column except `region`.
data_s = data.drop(columns='region')
data_s.shape

(1338, 6)

In [14]:
# Fit a standardiser on all columns of data_s (the target `charges` included).
scaler = StandardScaler()
scaler.fit(data_s)

In [15]:
# Standardise the same frame the scaler was fitted on (data_s) and wrap the
# resulting array back into a DataFrame, keeping column names and row index.
scaled_data = pd.DataFrame(
    scaler.transform(data_s),
    columns=data_s.columns,
    index=data_s.index,
)

# Linear regression

In [16]:
# Ordinary least squares on the standardised columns. `charges` was scaled
# together with the features, so predictions are in standardised units.
x = scaled_data.drop(columns='charges')
y = scaled_data['charges']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

lr = LinearRegression()
lr.fit(x_train, y_train)

y_train_pred, y_test_pred = lr.predict(x_train), lr.predict(x_test)

# R^2 on the held-out split.
print(lr.score(x_test, y_test))

0.7952171980481992


# Random forest and XGBoost regressors (with degree-2 polynomial features)

In [17]:
import xgboost as xgb

In [35]:
# adding polynomial feature
X = data.drop(['charges', 'region'], axis=1)
Y = data.charges

quad = PolynomialFeatures(degree=2)
x_fit = quad.fit(X)
x_quad = x_fit.transform(X)

x_train, x_test, y_train, y_test = train_test_split(x_quad, Y, random_state=0)

forest = RandomForestRegressor(n_estimators=100, criterion='mse', random_state=1)

forest.fit(x_train, y_train)
forest_train_pred = forest.predict(x_train)
forest_test_pred = forest.predict(x_test)

print(f"random forest MSE train data: {mean_squared_error(y_train, forest_train_pred)}")
print(f"random forest MSE test data: {mean_squared_error(y_test, forest_test_pred)}")

print(f"random forest R2 train data: {r2_score(y_train, forest_train_pred)}")
print(f"random forest R2 test data: {r2_score(y_test, forest_test_pred)}")

xgb_reg = xgb.XGBRegressor(n_estimators=100, max_depth=2, learning_rate=0.2)
xgb_reg.fit(x_train, y_train)
xgb_train_pred = xgb_reg.predict(x_train)
xgb_test_pred = xgb_reg.predict(x_test)

print("\n\n")
print(f"xgb MSE train data: {mean_squared_error(y_train, xgb_train_pred)}")
print(f"xgb MSE test data: {mean_squared_error(y_test, xgb_test_pred)}")

print(f"xgb R2 train data: {r2_score(y_train, xgb_train_pred)}")
print(f"xgb R2 test data: {r2_score(y_test, xgb_test_pred)}")

random forest MSE train data: 3969212.165107516
random forest MSE test data: 20081745.32134489
random forest R2 train data: 0.9722224978790875
random forest R2 test data: 0.8724438182874924



xgb MSE train data: 15263252.754417975
xgb MSE test data: 16232329.829252696
xgb R2 train data: 0.8931840833591763
xgb R2 test data: 0.8968947180543743


# Predicting my own insurance cost

In [30]:
# Feature values for one person, using the label-encoded codes from above
# (sex: 0 = female, smoker: 1 = yes).
me = {
    'age': 26,
    'sex': 0,
    'bmi': 22,
    'children': 0,
    'smoker': 1,
}

In [34]:
# Build a one-row frame whose columns follow the training feature order
# (X.columns), so the prediction does not depend on the key order of `me`.
# A feature missing from `me` raises KeyError instead of shifting columns.
me_features = pd.DataFrame([me])[list(X.columns)]
# Apply the same polynomial expansion used for training before predicting.
print(f"predicted charge: {xgb_reg.predict(x_fit.transform(me_features))[0]}")

predicted charge: 15484.46875


In [33]:
# Dataset-wide average charge, for comparison with the prediction above.
data['charges'].mean()

13270.422265141257