## 0. Setting up Environment

### 0.1 Libraries

In [330]:
import pandas as pd
import numpy as np
import re

### 0.2 Functions

In [332]:

# Standardizes a string for use as a column name
def scrub_colnames(string):
    """Return a lower-case, underscore-separated version of ``string``.

    Spaces are replaced by underscores, the characters ``(``, ``$`` and ``)``
    are removed, and any trailing underscores are stripped.
    """
    snake = string.lower().replace(' ', '_')
    without_symbols = re.sub(r'[($)]', '', snake)
    return without_symbols.rstrip("_")

## 1. Reading in Data

### 1.1 Reading Raw Data

In [335]:
# Load the raw dataset; the relative path is resolved against the current working directory
dat_raw = pd.read_csv('Medicalpremium.csv')

### 1.2 Standardize Column Names

In [337]:
# Labels to be replaced. Despite the name, these are the *existing* labels
# that are looked up in `dat_raw`.
outcols = ['BloodPressureProblems',
           'AnyTransplants',
           'AnyChronicDiseases',
           'KnownAllergies',
           'HistoryOfCancerInFamily',
           'NumberOfMajorSurgeries',
           'PremiumPrice']

# Replacement labels, paired with `outcols` by position
incols = ['Blood_Pressure_Problems',
          'Any_Transplants',
          'Any_Chronic_Diseases',
          'Known_Allergies',
          'History_Of_Cancer_In_Family',
          'Number_Of_Major_Surgeries',
          'Premium_Price']

# Replace all column labels in outcols with their incols counterparts.
# `rename` returns a new frame, so `dat_raw` is left untouched and this cell can
# be re-run safely; errors='raise' fails with a KeyError if a label is missing.
dat = dat_raw.rename(columns=dict(zip(outcols, incols)), errors='raise')

# Column layout: untouched columns first, then the relabelled ones in `incols` order
untouched_cols = [label for label in dat.columns if label not in incols]
dat = dat[untouched_cols + incols]

dat.columns = dat.columns.map(scrub_colnames)

## 3. Predictive Modelling

### 3.1 AutoML using Lazy Predict (No Hyperparameter Tuning, No Feature Selection, No Cross-Validation)

In [340]:
# Load the model comparison table from disk.
# NOTE(review): presumably the saved Lazy Predict leaderboard named in the section heading — confirm.
models = pd.read_csv('data/lpmodels.csv')

### 3.3 Top Models as per AutoML

In [342]:

# Load the model results table from disk
results_df = pd.read_csv('data/results_df.csv')

# Print the heading and the table as plain text, without the row index
print("\nModel Performance Summary:", results_df.to_string(index=False), sep="\n")


Model Performance Summary:
 Unnamed: 0                Model                                                Best Params    RMSE   R²  Adjusted R²  CV RMSE  CV R²  CV Adjusted R²
          0     GradientBoosting {'model__learning_rate': 0.05, 'model__n_estimators': 100} 2832.32 0.81         0.80  3427.15   0.69            0.68
          1                 LGBM {'model__learning_rate': 0.05, 'model__n_estimators': 100} 2784.71 0.82         0.81  3555.40   0.66            0.66
          2             CatBoost          {'model__depth': 4, 'model__learning_rate': 0.05} 2891.82 0.80         0.79  3714.07   0.63            0.63
          3 HistGradientBoosting     {'model__learning_rate': 0.05, 'model__max_iter': 100} 2784.72 0.82         0.81  3556.03   0.66            0.66
          4         RandomForest       {'model__max_depth': 10, 'model__n_estimators': 100} 3003.53 0.79         0.78  3804.69   0.61            0.61


### 3.5 Best Model

asets.


### SHAP Analysis Commentary – Insurance Premium Prediction - HistGradientBoosting

This SHAP summary plot explains the impact of each feature on insurance premium predictions:

#### 1. `age`
- Higher `age` values (pink dots far to the right of the vertical 0 line) significantly increase predicted premiums.
- Lower `age` values (blue dots far to the left of the vertical 0 line) decrease predicted premiums.
- Clear positive correlation: **older individuals → higher insurance costs**.

#### 2. `any_transplants`
- Having undergone a transplant leads to a strong increase in predicted premiums (pink dots on far right).
- Those without a transplant (blue dots, i.e. low feature values, which do not reach the extremes) have a minimal or even slightly negative influence on premiums.

#### 3. `any_chronic_diseases`
- Individuals with chronic conditions (pink dots on right side of vertical 0 line) show increase in premiums.
- Those without a chronic condition (blue dots, i.e. low feature values, which do not reach the extremes) have a minimal or even slightly negative influence on premiums.

#### 4. `number_of_major_surgeries`
- More major surgeries tend to increase premiums slightly, but the effect is less consistent because low (blue) and high (pink) values overlap around 0.

#### 5. `blood_pressure_problems`
- Least influential among the top 5 features.
- Blood pressure problems (pink, high-value dots sitting slightly to the right of the vertical 0 line) contribute marginally to higher premiums. The blue and pink dots overlapping around 0 indicate little to no effect on the prediction.



### 3.8 Scatter Plot of Actual vs. Predicted Premium

## 4. Dashboard

### 4.1

In [349]:
import streamlit as st
import altair as alt
import plotly.express as px

# NOTE(review): commented-out call below; `best_model` is not defined in the visible cells.
#best_model.get_params()
# Transposed view of the model table.
# NOTE(review): `tmodels` is not used in the visible cells — confirm it is still needed.
tmodels = models.T
# Bare last expression: shows the `models` frame as the cell output
models

Unnamed: 0,Model,Adjusted R-Squared,R-Squared,RMSE,Time Taken
0,HistGradientBoostingRegressor,0.88,0.88,2227.06,0.15
1,LGBMRegressor,0.87,0.88,2255.57,0.04
2,RandomForestRegressor,0.86,0.87,2340.35,0.12
3,BaggingRegressor,0.86,0.86,2419.73,0.02
4,GradientBoostingRegressor,0.85,0.86,2482.45,0.06
5,ExtraTreesRegressor,0.83,0.84,2586.82,0.09
6,XGBRegressor,0.8,0.81,2857.32,0.07
7,ExtraTreeRegressor,0.73,0.74,3299.07,0.0
8,TransformedTargetRegressor,0.7,0.71,3495.95,0.0
9,LinearRegression,0.7,0.71,3495.95,0.0


In [360]:
def make_heatmap(input_df, input_y, input_x, input_color, input_color_theme):
    """Build an Altair rect-mark heatmap from ``input_df``.

    Parameters
    ----------
    input_df : data passed straight to ``alt.Chart``.
    input_y, input_x : field names encoded as ordinal (``:O``) on the y and x axes.
    input_color : field name whose ``max(...)`` aggregate, encoded as
        quantitative (``:Q``), drives the cell colour.
    input_color_theme : colour scheme name handed to ``alt.Scale(scheme=...)``.

    Returns
    -------
    The configured chart: width 900, no colour legend, thin black cell
    borders, and axis label/title font size 12. Height is not set here.
    """
    # Both axes carry an empty title; only the y axis sets labelAngle=0.
    y_axis = alt.Axis(title="", titleFontSize=18, titlePadding=15, titleFontWeight=900, labelAngle=0)
    x_axis = alt.Axis(title="", titleFontSize=18, titlePadding=15, titleFontWeight=900)

    cell_fill = alt.Color(
        f'max({input_color}):Q',
        legend=None,
        scale=alt.Scale(scheme=input_color_theme),
    )

    cells = alt.Chart(input_df).mark_rect().encode(
        y=alt.Y(f'{input_y}:O', axis=y_axis),
        x=alt.X(f'{input_x}:O', axis=x_axis),
        color=cell_fill,
        stroke=alt.value('black'),
        strokeWidth=alt.value(0.25),
    )

    sized = cells.properties(width=900)
    return sized.configure_axis(labelFontSize=12, titleFontSize=12)

# Page-level Streamlit settings. The titles name this notebook's subject
# (medical insurance premium models) rather than an unrelated dashboard.
st.set_page_config(
    page_title="Medical Insurance Premium Dashboard",
    page_icon="🏂",
    layout="wide",
    initial_sidebar_state="expanded")

alt.theme.enable("dark")

with st.sidebar:
    st.title('🏂 Medical Insurance Premium Dashboard')

    # Model names in reverse table order; the last entry (i.e. the first model
    # listed in `models`) is preselected.
    model_list = list(models.Model.unique())[::-1]

    selected_model = st.selectbox('Select a model', model_list, index=len(model_list)-1)
    df_selected_model = models[models.Model == selected_model]
    df_selected_model_sorted = df_selected_model.sort_values(by="RMSE", ascending=False)

    # NOTE(review): the chosen theme is not consumed anywhere in the visible cells
    # (make_heatmap is defined but never called) — confirm it is still needed.
    color_theme_list = ['blues', 'cividis', 'greens', 'inferno', 'magma', 'plasma', 'reds', 'rainbow', 'turbo', 'viridis']
    selected_color_theme = st.selectbox('Select a color theme', color_theme_list)

# NOTE(review): `(5)` is the integer 5, not a tuple, so this creates five
# equal-width columns of which only the first is used below — confirm the intended layout.
col = st.columns((5), gap='medium')

# Metric columns rendered as progress bars
progress_metrics = ("RMSE", "R-Squared", "Adjusted R-Squared", "Time Taken")

with col[0]:
    st.markdown('#### Model Performance')

    # Bars are scaled against the largest value over *all* models. Scaling against
    # only the selected model's row(s) would fill every bar completely and would
    # give max_value == 0 whenever the selected "Time Taken" is 0.0.
    column_config = {"Model": st.column_config.TextColumn("Model")}
    for metric in progress_metrics:
        column_config[metric] = st.column_config.ProgressColumn(
            metric,
            format="%f",
            min_value=0,
            max_value=float(models[metric].max()),
        )

    # List every configured column in column_order so none of the progress
    # columns is hidden; width is left at Streamlit's default.
    st.dataframe(df_selected_model_sorted,
                 column_order=("Model",) + progress_metrics,
                 hide_index=True,
                 column_config=column_config)

    with st.expander('About', expanded=True):
        st.write('''
            - Data: [Kaggle Medical Insurance Premium](<https://www.kaggle.com/datasets/tejashvi14/medical-insurance-premium-prediction/data>).
            - :orange[**Age**]: Age of customer.
            - :orange[**Height**]: Height of customer.
            - :orange[**Weight**]: Weight of customer.
            - :orange[**Diabetes**]: Whether the person has abnormal blood sugar levels.
            - :orange[**Blood Pressure Problems**]: Whether the person has abnormal blood pressure levels.
            - :orange[**Any Transplants**]: Any major organ transplants.
            - :orange[**Any Chronic Diseases**]: Whether customer suffers from chronic ailments like asthma, etc.
            - :orange[**Known Allergies**]: Whether the customer has any known allergies.
            - :orange[**History of Cancer**]: Whether any blood relative of the customer has had any form of cancer.
            - :orange[**Number of Major Surgeries**]: The number of major surgeries that the person has had.
            - :green[**Premium Price**]: Target variable for prediction to create a model that predicts the yearly medical cover cost
            ''')



In [464]:
# Model names alongside the RMSE measure. Selecting columns is deliberate:
# a row filter such as `Model == 'RMSE'` is always empty, because 'RMSE' is a
# metric column, not a model name.
df_selected_measure = results_df[['Model', 'RMSE']]

# Metric columns shown in the comparison table below
error_cols = ['RMSE', 'CV RMSE', 'R²', 'Adjusted R²', 'CV R²', 'CV Adjusted R²']

# Model names in reverse table order.
# NOTE(review): this rebinds the `model_list` name used by the dashboard sidebar cell.
model_list = list(results_df.Model)[::-1]
#results_df.set_index('Model')[error_cols]
results_df[error_cols]


Unnamed: 0,RMSE,CV RMSE,R²,Adjusted R²,CV R²,CV Adjusted R²
0,2832.32,3427.15,0.81,0.8,0.69,0.68
1,2784.71,3555.4,0.82,0.81,0.66,0.66
2,2891.82,3714.07,0.8,0.79,0.63,0.63
3,2784.72,3556.03,0.82,0.81,0.66,0.66
4,3003.53,3804.69,0.79,0.78,0.61,0.61


In [476]:
# Measure plotted on the y axis
select_measure = 'RMSE'

# One bar per model: model name on x (ordinal), the selected measure on y
# (quantitative), bars coloured by model with the legend suppressed.
bar_marks = alt.Chart(results_df).mark_bar()
bar_encoded = bar_marks.encode(
    x='Model:O',
    y=f'{select_measure}:Q',
    color=alt.Color('Model:N', legend=None),
)
rmse_chart = bar_encoded.properties(width=500, height=500)

rmse_chart.display()