## Insurance Company Charge Prediction Based on Machine Learning

### One of the most evil things anyone can do, but I (Carter Roberts) know how to do it


In [26]:
# import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [27]:
# read dataset into "data"
data = pd.read_csv("./insurance.csv")
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [28]:
# check for nulls
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [29]:
data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [30]:
# keep checking for those nulls by looking for invalid unique values in non-int columns
for name in data:
    if data[name].dtype == 'object':
        print(name, data[name].unique())
# there are none, so there's no invalid or missing entries

sex ['female' 'male']
smoker ['yes' 'no']
region ['southwest' 'southeast' 'northwest' 'northeast']


In [31]:
# hunt for duplicates
data.duplicated()
data.drop_duplicates()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [32]:
# check data size & indexes after dropping duplicates if there are any, seems there wasn't
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [33]:
# Hunt for outliers in 'age' using boxplot / IQR method
q1 = data['age'].quantile(0.25)
q3 = data['age'].quantile(0.75)
iqr = q3 - q1
min_val = q1 - (1.5 * iqr)
max_val = q3 + (1.5 * iqr)
print("maximum non-outlier for age is", max_val)
print("minimum non-outlier for age is", min_val)
# fix minimum to 0.0 because -9.0 doesn't make sense
min_val = 0.0
print("minimum non-outlier for age changed to", min_val)

# filter anything outside the outliers out of data
outliers = data[(data['age'] < min_val) | (data['age'] > max_val)]
data = data.drop(outliers.index)
print("data updated to remove outliers for age")

maximum non-outlier for age is 87.0
minimum non-outlier for age is -9.0
minimum non-outlier for age changed to 0.0
data updated to remove outliers for age


In [34]:
# check data size & indexes after dropping rows with outlier ages
# seems that there were no rows with outlier ages, which is correct given my glance through the csv
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [35]:
# Hunt for outliers in 'bmi' using boxplot / IQR method
q1 = data['bmi'].quantile(0.25)
q3 = data['bmi'].quantile(0.75)
iqr = q3 - q1
min_val = q1 - (1.5 * iqr)
max_val = q3 + (1.5 * iqr)
print("maximum non-outlier for bmi is", max_val)
print("minimum non-outlier for bmi is", min_val)

# filter anything outside the outliers out of data
outliers = data[(data['bmi'] < min_val) | (data['bmi'] > max_val)]
data = data.drop(outliers.index)
print("data updated to remove outliers for bmi")

maximum non-outlier for bmi is 47.290000000000006
minimum non-outlier for bmi is 13.7
data updated to remove outliers for bmi


In [36]:
# check data size & indexes after dropping rows with outlier bmis
# seems only 8 rows had outlier bmis; sensical, but how to prevent hollow indexes here, I don't know
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1329 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1329 non-null   int64  
 1   sex       1329 non-null   object 
 2   bmi       1329 non-null   float64
 3   children  1329 non-null   int64  
 4   smoker    1329 non-null   object 
 5   region    1329 non-null   object 
 6   charges   1329 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.1+ KB


In [37]:
# Hunt for outliers in 'children' using boxplot / IQR method
q1 = data['children'].quantile(0.25)
q3 = data['children'].quantile(0.75)
iqr = q3 - q1
min_val = q1 - (1.5 * iqr)
max_val = q3 + (1.5 * iqr)
print("maximum non-outlier for children is", max_val)
print("minimum non-outlier for children is", min_val)
# fix minimum to 0.0 because -3.0 doesn't make sense
min_val = 0.0
print("minimum non-outlier for children changed to", min_val)

# filter anything outside the outliers out of data
outliers = data[(data['children'] < min_val) | (data['children'] > max_val)]
data = data.drop(outliers.index)
print("data updated to remove outliers for children")

maximum non-outlier for children is 5.0
minimum non-outlier for children is -3.0
minimum non-outlier for children changed to 0.0
data updated to remove outliers for children


In [38]:
# check data size & indexes after dropping rows with outlier children
# seems correct
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1329 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1329 non-null   int64  
 1   sex       1329 non-null   object 
 2   bmi       1329 non-null   float64
 3   children  1329 non-null   int64  
 4   smoker    1329 non-null   object 
 5   region    1329 non-null   object 
 6   charges   1329 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.1+ KB


In [39]:
# with all outliers dropped, drop null index values in the dataset just to be sure
# but before that, I ran this with only data.info() to check the size & indexes of the data after 
# dropping outliers for children, and it seems correct
data.dropna(axis=0, how='any', inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1329 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1329 non-null   int64  
 1   sex       1329 non-null   object 
 2   bmi       1329 non-null   float64
 3   children  1329 non-null   int64  
 4   smoker    1329 non-null   object 
 5   region    1329 non-null   object 
 6   charges   1329 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.1+ KB


In [40]:
# encode non-numeric regional data using get_dummies
data = pd.get_dummies(data, columns=['region'], dtype=int)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1329 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1329 non-null   int64  
 1   sex               1329 non-null   object 
 2   bmi               1329 non-null   float64
 3   children          1329 non-null   int64  
 4   smoker            1329 non-null   object 
 5   charges           1329 non-null   float64
 6   region_northeast  1329 non-null   int64  
 7   region_northwest  1329 non-null   int64  
 8   region_southeast  1329 non-null   int64  
 9   region_southwest  1329 non-null   int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 114.2+ KB


In [41]:
# encode non-numeric biological sex data using get_dummies
data = pd.get_dummies(data, columns=['sex'], prefix='sex', dtype=int)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1329 entries, 0 to 1337
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1329 non-null   int64  
 1   bmi               1329 non-null   float64
 2   children          1329 non-null   int64  
 3   smoker            1329 non-null   object 
 4   charges           1329 non-null   float64
 5   region_northeast  1329 non-null   int64  
 6   region_northwest  1329 non-null   int64  
 7   region_southeast  1329 non-null   int64  
 8   region_southwest  1329 non-null   int64  
 9   sex_female        1329 non-null   int64  
 10  sex_male          1329 non-null   int64  
dtypes: float64(2), int64(8), object(1)
memory usage: 124.6+ KB


In [42]:
# use mapping to encode non-numeric smoker data
category = {'yes': 1, 'no': 0}
data['smoker'] = data['smoker'].map(category)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1329 entries, 0 to 1337
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1329 non-null   int64  
 1   bmi               1329 non-null   float64
 2   children          1329 non-null   int64  
 3   smoker            1329 non-null   int64  
 4   charges           1329 non-null   float64
 5   region_northeast  1329 non-null   int64  
 6   region_northwest  1329 non-null   int64  
 7   region_southeast  1329 non-null   int64  
 8   region_southwest  1329 non-null   int64  
 9   sex_female        1329 non-null   int64  
 10  sex_male          1329 non-null   int64  
dtypes: float64(2), int64(9)
memory usage: 124.6 KB


In [43]:
# scale data for age column
max_val = data['age'].max()
data['age'] = data['age'] / max_val
# scale data for bmi column
max_val = data['bmi'].max()
data['bmi'] = data['bmi'] / max_val
# scale data for children column
max_val = data['children'].max()
data['children'] = data['children'] / max_val

# show first 5 entries of data again
data.head()

Unnamed: 0,age,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest,sex_female,sex_male
0,0.296875,0.596791,0.0,1,16884.924,0,0,0,1,1,0
1,0.28125,0.722353,0.2,0,1725.5523,0,0,1,0,0,1
2,0.4375,0.705882,0.6,0,4449.462,0,0,1,0,0,1
3,0.515625,0.485668,0.0,0,21984.47061,0,1,0,0,0,1
4,0.5,0.617754,0.0,0,3866.8552,0,1,0,0,0,1


In [44]:
# now that everything is organized, scaled & checked, check indexes and entries one final time
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1329 entries, 0 to 1337
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1329 non-null   float64
 1   bmi               1329 non-null   float64
 2   children          1329 non-null   float64
 3   smoker            1329 non-null   int64  
 4   charges           1329 non-null   float64
 5   region_northeast  1329 non-null   int64  
 6   region_northwest  1329 non-null   int64  
 7   region_southeast  1329 non-null   int64  
 8   region_southwest  1329 non-null   int64  
 9   sex_female        1329 non-null   int64  
 10  sex_male          1329 non-null   int64  
dtypes: float64(4), int64(7)
memory usage: 124.6 KB


In [45]:
# define dependent and independent variables
X = data.drop(columns=['charges'])
y = data['charges']

In [46]:
# try linear regression on the data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

sum_test_error = 0
sum_train_error = 0
epochs = 100
for k in range(epochs):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    linReg = LinearRegression()
    linReg.fit(X_train, y_train)

    y_pred = linReg.predict(X_train)
    error = mean_squared_error(y_train, y_pred)
    error = error**0.5
    sum_train_error += error

    y_pred = linReg.predict(X_test)
    error = mean_squared_error(y_test, y_pred)
    error = error**0.5
    sum_test_error += error
print("sum of train error =", (sum_train_error / epochs), "sum of test error =", (sum_test_error / epochs))

sum of train error = 5998.692529979803 sum of test error = 6002.208542219112


In [57]:
# didn't work, try polynomial regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

sum_test_error = 0
sum_train_error = 0
polyf = PolynomialFeatures(degree = 3, include_bias = False)
regModel = LinearRegression()
epochs = 100
for k in range(epochs):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    X_train_poly = polyf.fit_transform(X_train)
    X_test_poly = polyf.fit_transform(X_test)
    regModel.fit(X_train_poly, y_train)
    
    y_pred = regModel.predict(X_train_poly)
    error = mean_squared_error(y_train, y_pred)
    error = error ** 0.5
    sum_train_error += error

    y_pred = regModel.predict(X_test_poly)
    error = mean_squared_error(y_test, y_pred)
    error = error ** 0.5
    sum_test_error += error
print("sum of train error =", (sum_train_error / epochs), "sum of test error =", (sum_test_error / epochs))

sum of train error = 4516.731704111608 sum of test error = 5015.721470845017


In [58]:
# didn't work, try random forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
sum_train_error = 0
sum_test_error = 0

for k in range(5, 20):
    rf = RandomForestRegressor(n_estimators = 200, max_depth = 10, max_features = k)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_train)
    error = mean_squared_error(y_train, y_pred)
    error = error ** 0.5
    sum_train_error += error

    y_pred = rf.predict(X_test)
    error = mean_squared_error(y_test, y_pred)
    error = error ** 0.5
    sum_test_error += error
print("sum of train error =", (sum_train_error / 15), "sum of test error =", (sum_test_error / 15))

sum of train error = 2086.2935336051896 sum of test error = 4880.912645249479
