# Predicting medical expenses

## USA patients demographic information
### Uses the dataset *medical_expenses.csv*

This notebook is an example. It does not show an exhaustive detail on all CRISP-DM phases.

(c) Nuno António 2020-2023 - Rev. 1.01

### Load packages, model and the dataset to do prediction for

In [17]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn import preprocessing
import pickle

In [18]:
# Load the trained model
file = open("model.pickle", "rb")
lr_regr = pickle.load(file)
file.close()

In [19]:
# Load the dataset
# Load data 
ds = pd.read_csv('medical_expenses_new_patients.csv', sep=',')
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       10 non-null     int64  
 1   sex       10 non-null     object 
 2   bmi       10 non-null     float64
 3   children  10 non-null     int64  
 4   smoker    10 non-null     object 
 5   region    10 non-null     object 
dtypes: float64(1), int64(2), object(3)
memory usage: 608.0+ bytes


In [20]:
# Visualizing the dataset
ds.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region
0,38,male,31.83,0,no,northeast
1,48,female,37.84,0,no,northwest
2,31,male,41.88,2,no,northwest
3,27,female,22.74,0,no,southeast
4,29,female,20.9,0,yes,southwest
5,18,male,31.77,1,no,southeast
6,38,male,32.0,1,no,southeast
7,27,male,19.705,0,no,northwest
8,36,female,23.44,1,no,southeast
9,57,female,24.74,3,no,northwest


### Data preparation

In [21]:
# Create a modeling dataset from the original dataset
X = ds.copy(deep=True)

In [22]:
# Bin the children for children >= 3
binsInterval = [-1, 0, 1, 2, 999]
intervalLabels = ['0', '1',  '2', '3+']
X['children_binned'] = pd.cut(X['children'], bins = binsInterval, labels=intervalLabels)

In [23]:
# Bin the age
binsInterval = [0, 26, 38, 50, 99] # Create bins interval based on distribution/quartiles
intervalLabels = ['(0, 26]', '(27, 38]',  '(39, 50]', '(51, 99]']
X['age_binned'] = pd.cut(X['age'], bins = binsInterval, labels=intervalLabels)

In [24]:
# Feature engineering
# Create a feature with the ratio of the BMI by the BMI mean of people from the same age group
bmiAgeMean = X.groupby('age_binned')['bmi'].agg(meanBMI='mean') # calculate mean by age group
X['BMIByMeanBMIRatio'] = X['bmi'].divide(bmiAgeMean.meanBMI.loc[X.age_binned].values,0) # divide ADR by mean ADR of the day

In [25]:
# Encode boolean features into dummy variables
cols = ['sex', 'smoker']
X = pd.get_dummies(data=X,columns=cols,drop_first=True)

In [26]:
# Encode categorical values into dummy variables
cols = ['region', 'age_binned', 'children_binned']
ce_one_hot = ce.OneHotEncoder(cols = cols, use_cat_names=True)
X = ce_one_hot.fit_transform(X)

In [27]:
# Drop not used columns
cols = ['age', 'children']
X = X.drop(columns=cols)

In [28]:
# Normalize all columns
# MinMax scaler returns an array, so the dataframe must be recreated
# Warning! Take into consideration the type of normalization based on the values that were min and max on the training data
X = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(X.values), columns=X.columns,index=X.index)

In [29]:
# Summary statistics for all variables after data preparation - Transpose
X.describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bmi,10.0,0.404938,0.338797,0.0,0.144758,0.385569,0.552537,1.0
region_northeast,10.0,0.1,0.316228,0.0,0.0,0.0,0.0,1.0
region_northwest,10.0,0.4,0.516398,0.0,0.0,0.0,1.0,1.0
region_southeast,10.0,0.4,0.516398,0.0,0.0,0.0,1.0,1.0
region_southwest,10.0,0.1,0.316228,0.0,0.0,0.0,0.0,1.0
children_binned_0,10.0,0.5,0.527046,0.0,0.0,0.5,1.0,1.0
children_binned_1,10.0,0.3,0.483046,0.0,0.0,0.0,0.75,1.0
children_binned_2,10.0,0.1,0.316228,0.0,0.0,0.0,0.0,1.0
children_binned_3+,10.0,0.1,0.316228,0.0,0.0,0.0,0.0,1.0
"age_binned_(0, 26]",10.0,0.1,0.316228,0.0,0.0,0.0,0.0,1.0


### Do the prediction

In [30]:
# Run the prediction
y_pred = lr_regr.predict(X) 

Feature names must be in the same order as they were in fit.



In [31]:
# Show predictions
ds['estimated_expenses'] = y_pred
ds

Unnamed: 0,age,sex,bmi,children,smoker,region,estimated_expenses
0,38,male,31.83,0,no,northeast,6272.0
1,48,female,37.84,0,no,northwest,-6144.0
2,31,male,41.88,2,no,northwest,11776.0
3,27,female,22.74,0,no,southeast,1536.0
4,29,female,20.9,0,yes,southwest,23680.0
5,18,male,31.77,1,no,southeast,-3328.0
6,38,male,32.0,1,no,southeast,6144.0
7,27,male,19.705,0,no,northwest,-128.0
8,36,female,23.44,1,no,southeast,1792.0
9,57,female,24.74,3,no,northwest,18048.0


In [32]:
# Export results
ds.to_excel("export_results.xlsx")