In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("teertha/ushealthinsurancedataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\OLUSOLADE EMMANUEL\.cache\kagglehub\datasets\teertha\ushealthinsurancedataset\versions\1


In [3]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
os.listdir(path)

['insurance.csv']

In [5]:
data = pd.read_csv(os.path.join(path, 'insurance.csv'))

In [6]:
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [8]:
data['sex'] = data['sex'].map({'male':0, 'female':1})
data['smoker'] = data['smoker'].map({'no':0, 'yes':1})

In [9]:
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.900,0,1,southwest,16884.92400
1,18,0,33.770,1,0,southeast,1725.55230
2,28,0,33.000,3,0,southeast,4449.46200
3,33,0,22.705,0,0,northwest,21984.47061
4,32,0,28.880,0,0,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,northwest,10600.54830
1334,18,1,31.920,0,0,northeast,2205.98080
1335,18,1,36.850,0,0,southeast,1629.83350
1336,21,1,25.800,0,0,southwest,2007.94500


In [10]:
data = pd.get_dummies(data, columns=['region'], drop_first=True).astype(int)

In [11]:
data

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
0,19,1,27,0,1,16884,0,0,1
1,18,0,33,1,0,1725,0,1,0
2,28,0,33,3,0,4449,0,1,0
3,33,0,22,0,0,21984,1,0,0
4,32,0,28,0,0,3866,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,0,30,3,0,10600,1,0,0
1334,18,1,31,0,0,2205,0,0,0
1335,18,1,36,0,0,1629,0,1,0
1336,21,1,25,0,0,2007,0,0,1


In [12]:
data['children'].value_counts()

children
0    574
1    324
2    240
3    157
4     25
5     18
Name: count, dtype: int64

In [113]:
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score, f1_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasClassifier
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

In [114]:
X = data.drop('charges', axis=1)
y = data['charges']

In [115]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [116]:
cv = KFold(n_splits=5, shuffle=True, random_state=41)

In [117]:
#Create the models

In [118]:
linear_reg = Pipeline([
    ('scaler', StandardScaler()),
    ('linear', LinearRegression())
])

In [119]:
forest = RandomForestClassifier()

In [120]:
gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)

In [121]:
xgb = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42)

In [122]:
def ann_model():
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=X.shape[1]))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1))

    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [123]:
ann = Pipeline([
    ('scaler', StandardScaler()),
    ('keras', KerasClassifier(model=ann_model, epochs=50, batch_size=32, verbose=0))
])

In [124]:
#combine models
models = {
    'linear': linear_reg,
    'forest': forest,
    'gbr': gbr,
    'xgb': xgb
}

In [125]:
#define evaluation metrics
scoring = {
    'mae': 'neg_mean_absolute_error',
    'mse': 'neg_mean_squared_error',
    'r2': 'r2'
}

In [126]:
#perform cross-validation
result = {}
for name, model in models.items():
    cv_res = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring, return_train_score=False)
    summary = {
        metric: {
            'mean': cv_res[f'test_{metric}'].mean(),
            'std': cv_res[f'test_{metric}'].std()
        }
        for metric in scoring
    }
    result[name] = summary

In [129]:
for name, summary in result.items():
    print(f'\n === {name}===')
    for metric, (mean_,std_) in summary.items():
        print(f'{metric:7s} : {mean_:.6}~{std_}')


 === linear===


ValueError: Unknown format code 'f' for object of type 'str'