# Experimentation

## Data Ingestion

In [1]:
import pandas as pd
import numpy as np
import os

import warnings

# Suppress these warning categories for the remainder of the session.
for _category in (RuntimeWarning, FutureWarning, UserWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=_category)


def _two_decimals(value):
    """Render a number with exactly two digits after the decimal point."""
    return '%.2f' % value


# pandas display settings: no truncation of columns, rows or cell contents,
# and floats shown with two decimals.
_DISPLAY_OPTIONS = {
    "display.max_columns": None,
    "display.max_rows": None,
    "display.max_colwidth": None,
    "display.float_format": _two_decimals,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Current working directory; an explicit call also works where IPython automagic is unavailable.
os.getcwd()

'C:\\Users\\djoguns\\Documents\\workspace_datahackerman\\insurance_premium_prediction'

In [3]:
# Name of the CSV file that is read into the notebook.
FILE_NAME = "insurance.csv"
# Folder, joined onto the working directory, that holds the CSV file.
DATA_FOLDER = "data"

In [4]:
# Current working directory; used as the base when building the data file path.
main_path = os.getcwd()
main_path

'C:\\Users\\djoguns\\Documents\\workspace_datahackerman\\insurance_premium_prediction'

In [5]:
# Full path to the dataset: <working dir>/<DATA_FOLDER>/<FILE_NAME>.
# os.path.join accepts any number of components, so one call is enough.
file_path = os.path.join(main_path, DATA_FOLDER, FILE_NAME)

In [6]:
file_path

'C:\\Users\\djoguns\\Documents\\workspace_datahackerman\\insurance_premium_prediction\\data\\insurance.csv'

In [7]:
%%time

insurance_data = pd.read_csv(file_path)

CPU times: total: 0 ns
Wall time: 12.5 ms


In [8]:
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.77,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.88,0,no,northwest,3866.86


In [9]:
insurance_data.shape

(1338, 7)

## Data Inspection

In [10]:
insurance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## Exploratory Data Analysis

- Statistical Analysis
- Plots
- Correlation

## Model Building

In [11]:
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.77,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.88,0,no,northwest,3866.86


### Train Test Split

In [12]:
# 0.3, 0.7

In [13]:
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

In [14]:
# y = mX + c

In [15]:
# Feature columns used as model inputs; "charges" is deliberately not in this list.
feature_columns = ["age", "sex", "bmi", "children", "smoker", "region"]
X = insurance_data[feature_columns]

In [16]:
type(X)

pandas.core.frame.DataFrame

In [17]:
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.7,0,no,northwest
4,32,male,28.88,0,no,northwest


In [18]:
# Target series: the "charges" column.
y = insurance_data["charges"]

In [19]:
type(y)

pandas.core.series.Series

In [20]:
y[0:5]

0   16884.92
1    1725.55
2    4449.46
3   21984.47
4    3866.86
Name: charges, dtype: float64

In [21]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [22]:
X_train.shape, X_test.shape

((896, 6), (442, 6))

### Training Model

In [23]:
# Split the whole frame (features and label together): 33% hold-out, fixed seed.
split_frames = train_test_split(
    insurance_data,
    test_size=0.33,
    random_state=42,
)
train_data, test_data = split_frames

In [24]:
train_data.shape, test_data.shape

((896, 7), (442, 7))

In [25]:
train_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1046,43,female,25.08,0,no,northeast,7325.05
682,39,male,35.3,2,yes,southwest,40103.89
1037,45,female,30.5,1,yes,northwest,39725.52
490,19,female,32.9,0,no,southwest,1748.77
39,60,male,39.9,0,yes,southwest,48173.36


In [26]:
test_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
764,45,female,25.18,2,no,northeast,9095.07
887,36,female,30.02,0,no,northwest,5272.18
890,64,female,26.89,0,yes,northwest,29330.98
1293,46,male,25.75,3,no,northwest,9301.89
259,19,male,31.92,0,yes,northwest,33750.29


In [27]:
# Name of the target column the predictor is trained on.
label = 'charges'

# Descriptive statistics of the target in the training split.
label_summary = train_data[label].describe()
print("Summary of class variable: \n", label_summary)

Summary of class variable: 
 count     896.00
mean    13379.69
std     12110.71
min      1121.87
25%      4819.84
50%      9556.30
75%     17092.92
max     62592.87
Name: charges, dtype: float64


In [28]:
%%time

# Directory AutoGluon writes the trained models to.
save_path = 'models'

# Build the predictor first, then train it; fit() hands back the trained predictor.
# Optional: pass presets="best_quality" to fit().
predictor = TabularPredictor(label=label, path=save_path)
predictor = predictor.fit(train_data)

Beginning AutoGluon training ...
AutoGluon will save models to "models\"
AutoGluon Version:  0.6.2
Python Version:     3.9.16
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Train Data Rows:    896
Train Data Columns: 6
Label Column: charges
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (62592.87309, 1121.8739, 13379.68825, 12110.71371)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    10688.15 MB
	Train Data (Original)  Memory Usage: 0.19 MB (0.0% of available memory)
	Inferring data type of each feature based on colu

CPU times: total: 1min 6s
Wall time: 1min 29s


In [29]:
predictor.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-4964.32,0.14,71.42,0.0,1.12,2,True,12
1,LightGBM,-5040.53,0.01,1.91,0.01,1.91,1,True,4
2,CatBoost,-5048.36,0.01,57.96,0.01,57.96,1,True,6
3,RandomForestMSE,-5103.44,0.09,1.39,0.09,1.39,1,True,5
4,XGBoost,-5164.96,0.02,1.85,0.02,1.85,1,True,9
5,LightGBMXT,-5173.92,0.01,3.72,0.01,3.72,1,True,3
6,ExtraTreesMSE,-5194.26,0.15,1.27,0.15,1.27,1,True,7
7,LightGBMLarge,-5337.11,0.02,3.56,0.02,3.56,1,True,11
8,NeuralNetFastAI,-5444.88,0.04,4.35,0.04,4.35,1,True,8
9,NeuralNetTorch,-5603.32,0.02,9.04,0.02,9.04,1,True,10


## Model Evaluation

In [30]:
test_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
764,45,female,25.18,2,no,northeast,9095.07
887,36,female,30.02,0,no,northwest,5272.18
890,64,female,26.89,0,yes,northwest,29330.98
1293,46,male,25.75,3,no,northwest,9301.89
259,19,male,31.92,0,yes,northwest,33750.29


In [31]:
# Ground-truth charges for the hold-out rows; this rebinds the y_test name
# first assigned by the earlier X/y train_test_split cell.
y_test = test_data[label]  # values to predict

In [32]:
y_test[0:5]

764     9095.07
887     5272.18
890    29330.98
1293    9301.89
259    33750.29
Name: charges, dtype: float64

In [33]:
# Feature-only view of the hold-out set: the label column is removed
# to prove we're not cheating when predicting.
test_data_nolab = test_data.drop(label, axis="columns")
test_data_nolab.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
764,45,female,25.18,2,no,northeast
887,36,female,30.02,0,no,northwest
890,64,female,26.89,0,yes,northwest
1293,46,male,25.75,3,no,northwest
259,19,male,31.92,0,yes,northwest


In [34]:
save_path

'models'

Now, load in the stored models.

In [35]:
predictor

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x288b4c618e0>

## Load the saved model

In [36]:
save_model_predictor = TabularPredictor.load(save_path)  # unnecessary, just demonstrates how to load previously-trained predictor from file

In [37]:
save_model_predictor

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x288d5e64e80>

## Predict the target

In [39]:
# Predict charges for the label-free hold-out rows with the predictor loaded from disk.
y_pred = save_model_predictor.predict(test_data_nolab)

In [40]:
y_pred[0:5]

764    10069.94
887     5864.03
890    28119.39
1293    9637.64
259    33401.29
Name: charges, dtype: float32

In [42]:
# Score the hold-out predictions against the true charges.
# auxiliary_metrics=True requests the additional metrics alongside the main one.
perf = save_model_predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: root_mean_squared_error on test data: -4496.912233850297
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "root_mean_squared_error": -4496.912233850297,
    "mean_squared_error": -20222219.638952468,
    "mean_absolute_error": -2456.856548138645,
    "r2": 0.8620064340583384,
    "pearsonr": 0.9286603196781178,
    "median_absolute_error": -1307.9873554687501
}


In [43]:
save_model_predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT,-4440.94,-5173.92,0.03,0.01,3.72,0.03,0.01,3.72,1,True,3
1,CatBoost,-4454.78,-5048.36,0.01,0.01,57.96,0.01,0.01,57.96,1,True,6
2,WeightedEnsemble_L2,-4496.91,-4964.32,0.27,0.14,71.42,0.01,0.0,1.12,2,True,12
3,NeuralNetFastAI,-4511.28,-5444.88,0.07,0.04,4.35,0.07,0.04,4.35,1,True,8
4,LightGBM,-4569.54,-5040.53,0.02,0.01,1.91,0.02,0.01,1.91,1,True,4
5,XGBoost,-4661.41,-5164.96,0.06,0.02,1.85,0.06,0.02,1.85,1,True,9
6,NeuralNetTorch,-4714.74,-5603.32,0.03,0.02,9.04,0.03,0.02,9.04,1,True,10
7,ExtraTreesMSE,-4744.93,-5194.26,0.18,0.15,1.27,0.18,0.15,1.27,1,True,7
8,RandomForestMSE,-4795.65,-5103.44,0.2,0.09,1.39,0.2,0.09,1.39,1,True,5
9,LightGBMLarge,-5017.06,-5337.11,0.07,0.02,3.56,0.07,0.02,3.56,1,True,11


## Feature Importance

In [44]:
%%time

save_model_predictor.feature_importance(train_data)

Computing feature importance via permutation shuffling for 6 features using 896 rows with 5 shuffle sets...
	11.29s	= Expected runtime (2.26s per shuffle set)
	2.82s	= Actual runtime (Completed 5 of 5 shuffle sets)


CPU times: total: 6 s
Wall time: 2.84 s


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
smoker,11681.26,584.07,0.0,5,12883.86,10478.66
bmi,4138.38,299.21,0.0,5,4754.46,3522.3
age,3344.1,96.61,0.0,5,3543.01,3145.18
children,487.7,66.86,0.0,5,625.37,350.03
region,369.66,25.98,0.0,5,423.15,316.17
sex,165.43,22.42,0.0,5,211.59,119.27


In [45]:
%%time

save_model_predictor.feature_importance(test_data)

Computing feature importance via permutation shuffling for 6 features using 442 rows with 5 shuffle sets...
	11.01s	= Expected runtime (2.2s per shuffle set)
	2.32s	= Actual runtime (Completed 5 of 5 shuffle sets)


CPU times: total: 5.47 s
Wall time: 2.34 s


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
smoker,10780.79,292.72,0.0,5,11383.51,10178.08
bmi,3190.94,179.46,0.0,5,3560.46,2821.42
age,2059.35,150.78,0.0,5,2369.82,1748.89
children,183.37,21.84,0.0,5,228.34,138.41
region,76.39,39.95,0.01,5,158.65,-5.88
sex,-42.33,18.62,1.0,5,-3.98,-80.67


### Bringing it all together

In [46]:
# Attach the predictions to the hold-out frame so actual and predicted charges sit side by side.
# NOTE(review): this adds a column to test_data in place; cells that pass test_data to the
# predictor (leaderboard, feature_importance) will see the extra column if re-run afterwards.
test_data["predicted_charges"] = y_pred

In [48]:
test_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,predicted_charges
764,45,female,25.18,2,no,northeast,9095.07,10069.94
887,36,female,30.02,0,no,northwest,5272.18,5864.03
890,64,female,26.89,0,yes,northwest,29330.98,28119.39
1293,46,male,25.75,3,no,northwest,9301.89,9637.64
259,19,male,31.92,0,yes,northwest,33750.29,33401.29


In [49]:
# Absolute gap between actual and predicted charges for every hold-out row.
charge_gap = test_data["charges"] - test_data["predicted_charges"]
test_data["error_values"] = charge_gap.abs()

In [50]:
test_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,predicted_charges,error_values
764,45,female,25.18,2,no,northeast,9095.07,10069.94,974.87
887,36,female,30.02,0,no,northwest,5272.18,5864.03,591.85
890,64,female,26.89,0,yes,northwest,29330.98,28119.39,1211.59
1293,46,male,25.75,3,no,northwest,9301.89,9637.64,335.74
259,19,male,31.92,0,yes,northwest,33750.29,33401.29,349.0


## Using input for prediction

In [51]:
test_data["sex"].unique()

array(['female', 'male'], dtype=object)

In [52]:
test_data["smoker"].unique()

array(['no', 'yes'], dtype=object)

In [53]:
test_data["region"].unique()

array(['northeast', 'northwest', 'southwest', 'southeast'], dtype=object)

## Creating Sample Input

In [62]:
# One hand-written record to predict on, keyed by the six feature names.
input_data_dict = dict(
    age=45,
    sex="female",
    bmi=25.175,
    children=2,
    smoker="no",
    region="northeast",
)

In [63]:
input_data_dict

{'age': 45,
 'sex': 'female',
 'bmi': 25.175,
 'children': 2,
 'smoker': 'no',
 'region': 'northeast'}

In [64]:
# Wrap the single record in a one-row DataFrame (one column per key).
input_data = pd.DataFrame.from_records([input_data_dict])

In [65]:
input_data

Unnamed: 0,age,sex,bmi,children,smoker,region
0,45,female,25.18,2,no,northeast


In [66]:
save_model_predictor.predict(input_data)

0   10069.94
Name: charges, dtype: float32

In [67]:
# Pull out the single predicted value by position. Plain [0] is a label lookup
# on a Series and only works while the input frame's index contains 0.
save_model_predictor.predict(input_data).iloc[0]

10069.939

## Streamlit

```python
# Display interactive widgets
st.button('Click me')
st.checkbox('I agree')
st.radio('Pick one', ['cats', 'dogs'])
st.selectbox('Pick one', ['cats', 'dogs'])
st.multiselect('Buy', ['milk', 'apples', 'potatoes'])
st.slider('Pick a number', 0, 100)
st.select_slider('Pick a size', ['S', 'M', 'L'])
st.text_input('First name')
st.number_input('Pick a number', 0, 10)
st.text_area('Text to translate')
st.date_input('Your birthday')
st.time_input('Meeting time')
st.file_uploader('Upload a CSV')
st.download_button('Download file', data)
st.camera_input("Take a picture")
st.color_picker('Pick a color')

# Use widgets' returned values in variables:
>>> for i in range(int(st.number_input('Num:'))):
>>>   foo()
>>> if st.sidebar.selectbox('I:',['f']) == 'f':
>>>   b()
>>> my_slider_val = st.slider('Quinn Mallory', 1, 88)
>>> st.write(my_slider_val)

# Disable widgets to remove interactivity:
>>> st.slider('Pick a number', 0, 100, disabled=True)
```

In [68]:
input_data_dict

{'age': 45,
 'sex': 'female',
 'bmi': 25.175,
 'children': 2,
 'smoker': 'no',
 'region': 'northeast'}

```python
## This is going to be a Streamlit App
# Collects the six model features through widgets; the variable names mirror
# the feature names used for the sample prediction input.

import streamlit as st

st.title('Insurance Premium Charge Prediction')

age = st.slider('age', 16, 100)
sex = st.selectbox("sex", options=["male", "female"])
# bmi is a float column in the dataset, so use float bounds to allow fractional values.
bmi = st.slider('bmi', 15.0, 100.0)
children = st.number_input('children', 0, 20)
# The data contains both 'yes' and 'no'; offer both so non-smokers can be scored.
smoker = st.selectbox('smoker', options=["yes", "no"])
region = st.selectbox('region', options=['northeast', 'northwest', 'southwest', 'southeast'])
```