In [None]:
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the insurance data set from the notebook's working directory.
df = pd.read_csv('insurance.csv')
# Preview only the first rows instead of rendering the whole frame.
df.head()

# DATA CLEANING

### Duplicates

In [None]:
# Count the rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

1


In [None]:
# Remove the duplicated rows from `df` itself (first occurrence is kept),
# then display the resulting (rows, columns) shape as a sanity check.
df.drop_duplicates(inplace = True)
df.shape

### Leading and Trailing Spaces

In [None]:
# Trim leading/trailing whitespace from every object-dtype (text) column.
text_columns = df.select_dtypes(include = ['object']).columns
for name in text_columns:
  df[name] = df[name].str.strip()

### Null Values

In [None]:
# Count the missing values in each column.
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


### Outlier Detection and Resolution

In [None]:
numerical_features = ['age', 'bmi', 'children', 'charges']

# Draw one box plot per feature, each with its own y-axis. `charges` is
# orders of magnitude larger than the other features, so plotting everything
# on a single shared axis flattens the remaining boxes and hides their spread.
fig, axes = plt.subplots(1, len(numerical_features), figsize=(15, 6))
for ax, feature in zip(axes, numerical_features):
    sns.boxplot(y=df[feature], ax=ax)
    ax.set_title(feature)
    ax.set_ylabel('')  # the subplot title already names the feature
fig.suptitle('Box Plot of All Numerical Features')
fig.tight_layout()
plt.show()

In [None]:
# Show the rows whose charges lie strictly above the 75th percentile.
percentile_75 = df['charges'].quantile(q=0.75)
above_q3 = df['charges'] > percentile_75
filtered_df = df.loc[above_q3]
filtered_df

In [None]:
# Summary statistics for every column, categorical ones included.
df.describe(include = 'all')

                age   sex          bmi  ...  smoker     region       charges
count   1337.000000  1337  1337.000000  ...    1337       1337   1337.000000
unique          NaN     2          NaN  ...       2          4           NaN
top             NaN  male          NaN  ...      no  southeast           NaN
freq            NaN   675          NaN  ...    1063        364           NaN
mean      39.222139   NaN    30.663452  ...     NaN        NaN  13279.121487
std       14.044333   NaN     6.100468  ...     NaN        NaN  12110.359656
min       18.000000   NaN    15.960000  ...     NaN        NaN   1121.873900
25%       27.000000   NaN    26.290000  ...     NaN        NaN   4746.344000
50%       39.000000   NaN    30.400000  ...     NaN        NaN   9386.161300
75%       51.000000   NaN    34.700000  ...     NaN        NaN  16657.717450
max       64.000000   NaN    53.130000  ...     NaN        NaN  63770.428010

[11 rows x 7 columns]


# EXPLORATORY DATA ANALYSIS

### Check Feature Distribution
Check if the data set is balanced

In [None]:
# Plot a histogram for every column of df, one figure per column.
for column in df.columns:
    # Create the figure inside the loop so every histogram gets the same size;
    # a figure created once before the loop is only used by the first plot,
    # because each plt.show() ends the current figure.
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.hist(df[column], bins = 50) # Adjust the number of bins as needed
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    plt.show()


From the histograms above, we can see that:

1. Sex and Region are well balanced across their categories, and BMI is approximately normally distributed
2. Age has more young people (in their 20s)
3. More people have no children
4. Smoker is unbalanced with more non-smokers

### Correlation

In [None]:
# Pairwise correlation between the numerical columns only.
correlation_matrix = df.corr(numeric_only = True)

# Render the matrix as an annotated heat map (two decimals per cell).
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
)
plt.title('Correlation Matrix Heatmap')
plt.show()

The numerical features in the data set generally have low correlation with each other. The highest correlation is between age and charges at 0.30, followed by BMI and charges at 0.20. Age and BMI have a correlation of 0.11.

### One-Hot Encoding

In [None]:
# Define categorical features
categorical_features = ['sex', 'smoker', 'region']
# One-hot encode them, reusing the list above so the column names are
# declared in exactly one place. Each category becomes its own indicator column.
df = pd.get_dummies(df, columns = categorical_features)

# DATA SPLIT

In [None]:
# Separate the regression target from the predictor columns.
y = df['charges']  # Target variable
X = df.drop(columns=['charges'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# DATA PREPROCESSING

### Standardization

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# MODEL TRAINING

### Linear Regression

In [None]:
# Baseline model: linear regression with default settings.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

### Decision Tree

In [None]:
# Seed the tree so repeated runs of the notebook give the same model and
# metrics; 42 matches the seed used for the data split and the grid search.
dt = DecisionTreeRegressor(random_state = 42)
dt.fit(X_train, y_train)

### Random Forest

In [None]:
# Seed the forest so its bootstrap samples and splits are reproducible;
# 42 matches the seed used for the data split and the grid search.
rf = RandomForestRegressor(random_state = 42)
rf.fit(X_train, y_train)

# PREDICTIONS

In [None]:
# Predict the held-out test set with each fitted model, in a fixed order.
y_pred_lr, y_pred_dt, y_pred_rf = (
    model.predict(X_test) for model in (lr, dt, rf)
)

# EVALUATION

In [None]:
# Score each model on the held-out test set: MAE, MSE and R-squared.
mae_lr, mse_lr, r2_lr = (
    mean_absolute_error(y_test, y_pred_lr),
    mean_squared_error(y_test, y_pred_lr),
    r2_score(y_test, y_pred_lr),
)
mae_dt, mse_dt, r2_dt = (
    mean_absolute_error(y_test, y_pred_dt),
    mean_squared_error(y_test, y_pred_dt),
    r2_score(y_test, y_pred_dt),
)
mae_rf, mse_rf, r2_rf = (
    mean_absolute_error(y_test, y_pred_rf),
    mean_squared_error(y_test, y_pred_rf),
    r2_score(y_test, y_pred_rf),
)

# Assemble the comparison table, one row per model.
model_comparison = pd.DataFrame(
    data=[
        ['Linear Regression', mae_lr, mse_lr, r2_lr],
        ['Decision Tree', mae_dt, mse_dt, r2_dt],
        ['Random Forest', mae_rf, mse_rf, r2_rf],
    ],
    columns=['Model', 'Mean Absolute Error', 'Mean Squared Error', 'R-squared'],
)

model_comparison

# RANDOM FOREST MODEL OPTIMIZATION

In [None]:
# Hyper-parameter values to try for the random forest (3 * 3 * 3 * 3 = 81 combinations).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated search scored by negative MSE (higher is better).
# n_jobs = -1 spreads the fits over all available cores, and verbose = 1 keeps
# the log to a short summary instead of one line per individual fit.
grid_search = GridSearchCV(estimator = RandomForestRegressor(random_state = 42),
                           param_grid = param_grid,
                           scoring = 'neg_mean_squared_error',
                           cv = 5,
                           n_jobs = -1,
                           verbose = 1)

In [None]:
# Run the cross-validated search over every parameter combination on the training split.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END ma

In [None]:
# Report the winning parameter combination and its mean cross-validated score.
# The score is a negative MSE (see the `scoring` argument), so values closer to zero are better.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
for label, value in (("Best Parameters:", best_params), ("Best Score:", best_score)):
    print(label, value)

Best Parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: -21499958.381350126


In [None]:
# Reuse the model GridSearchCV already refitted on the full training split
# with the best parameters (refit=True is its default). Building and fitting
# a fresh RandomForestRegressor with the same parameters and seed would only
# repeat that training.
best_rf_model = grid_search.best_estimator_
best_rf_model

In [None]:
# Predict the held-out test set with the tuned random forest.
y_pred_rf_best = best_rf_model.predict(X_test)

In [None]:
# The three metrics reported per model, applied in this order: MAE, MSE, R-squared.
metric_fns = (mean_absolute_error, mean_squared_error, r2_score)

# Score every model, including the tuned forest, on the held-out test set.
mae_lr, mse_lr, r2_lr = (fn(y_test, y_pred_lr) for fn in metric_fns)
mae_dt, mse_dt, r2_dt = (fn(y_test, y_pred_dt) for fn in metric_fns)
mae_rf, mse_rf, r2_rf = (fn(y_test, y_pred_rf) for fn in metric_fns)
mae_rf_best, mse_rf_best, r2_rf_best = (fn(y_test, y_pred_rf_best) for fn in metric_fns)

# Create a comparison table, one row per model
model_names = ['Linear Regression', 'Decision Tree', 'Random Forest', 'Random Forest Best']
model_comparison = pd.DataFrame({
    'Model': model_names,
    'Mean Absolute Error': [mae_lr, mae_dt, mae_rf, mae_rf_best],
    'Mean Squared Error': [mse_lr, mse_dt, mse_rf, mse_rf_best],
    'R-squared': [r2_lr, r2_dt, r2_rf, r2_rf_best],
})

model_comparison