# PROJECT: PREDICTING MEDICAL EXPENSES. 




## Required libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats.mstats import normaltest
from scipy.stats import boxcox
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (StandardScaler, PolynomialFeatures)
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import r2_score 
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
import matplotlib.pyplot as plt
import ipywidgets as widget
from ipywidgets import interact


## Dataset


In [None]:
# Location of the insurance CSV on the author's machine; adjust when running elsewhere.
csv_path = r"C:\Users\BAYEE\Desktop\Online Courses\IBM Machine Learning Engineer\1. Regression\final project_regression\insurance.csv"
data = pd.read_csv(csv_path, encoding='ISO-8859-1')

# Preview the first rows.
data.head()

# About the Data


The insurance dataset comprises 1,338 observations and 7 distinct features represented as columns. This dataset encompasses 4 numerical attributes, namely age, BMI, children, and expenses, along with 3 categorical attributes denoted as sex, smoker, and region.

# Objectives


The primary aim of this project is to predict the medical expenses of individuals, a task that aids medical insurance providers in determining their premium charges.

# Data Cleaning


Missing Values

In [None]:
# Column dtypes and non-null counts.
data.info()
# Missing entries per column (the original analysis found none).
data.isna().sum()

Outliers

In [None]:
# Column names grouped by kind: numeric columns and object-dtype (string) columns.
numeric_columns = list(data.select_dtypes(include=[np.number]).columns)
qualitative_columns = list(data.select_dtypes(include=[np.object_]).columns)

In [None]:
# Box plots, one per numeric column, to look for outliers visually.
fig, axes = plt.subplots(nrows=1, ncols=len(numeric_columns), figsize=(15, 6))

# Pair each subplot with the column it displays.
for ax, column in zip(axes, numeric_columns):
    ax.boxplot(data[column], vert=False)
    ax.set_title(f'Box Plot for {column}')
    ax.set_xlabel(column)

# Tidy the spacing between subplots, then render the figure.
plt.tight_layout()
plt.show()


In [None]:
# Use z-scores to detect outliers and replace them with the column mean.

treshold = 3  # absolute z-score above which a value is treated as an outlier
for column in numeric_columns:
    # Calculate the z-scores
    z_scores = stats.zscore(data[column])

    # Boolean mask of the rows whose |z-score| exceeds the threshold
    outliers = np.abs(z_scores) > treshold
    if not outliers.any():
        continue  # nothing to replace in this column

    # The mean is computed on the column as it is, outliers included.
    mean_value = data[column].mean()

    # The mean is a float. Cast the column explicitly before assigning so
    # integer columns do not depend on implicit upcasting during setitem,
    # which recent pandas versions deprecate.
    data[column] = data[column].astype(float)
    data.loc[outliers, column] = mean_value

# `data` now has the outliers of every numeric column replaced by that column's mean
data

Normality of the Dependent Variable

In [None]:
# Plot a column's distribution and test it for normality.
def normality(data, feature):
    """Plot the distribution of ``data[feature]`` and run a normality test on it.

    Draws a kernel density estimate and a normal Q-Q plot side by side, then
    prints the statistic and p-value returned by ``normaltest``.

    Parameters
    ----------
    data : object indexable by column name
        The table holding the column (a pandas DataFrame in this notebook).
    feature : str
        Name of the column to inspect.

    Returns
    -------
    None
        Results are shown as a figure and printed.
    """
    values = data[feature]

    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    sns.kdeplot(values)
    plt.subplot(1, 2, 2)
    stats.probplot(values, dist="norm", plot=plt)
    plt.show()

    # Test the column exactly as plotted. Any transformation (such as the
    # log_* columns) is the caller's job; taking a log here would make the
    # printed result describe a different variable than the one shown.
    statistic, pvalue = normaltest(values)
    print("Statistic = {}".format(statistic))
    print("pvalue = {}".format(pvalue))

In [None]:
## Log transformation of the numeric columns to make the data more normal.
## Each result is stored next to the original in a new "log_<column>" column.
for name in numeric_columns:
    data[f"log_{name}"] = np.log1p(data[name])

In [None]:
# Inspect the target before and after the log transform
# (the log transformed data looks more normal).
for feature in ('expenses', 'log_expenses'):
    normality(data, feature)

One Hot Encoding

In [None]:

# Names of the string (object dtype) columns that need encoding.
one_hot_encode_cols = [
    name for name, dtype in data.dtypes.items() if dtype == object
]

# Convert them to categoricals first so one hot encoding also works on split data (if desired).
for name in one_hot_encode_cols:
    data[name] = pd.Categorical(data[name])

# One hot encode, dropping the first level of each categorical.
data = pd.get_dummies(data, columns=one_hot_encode_cols, drop_first=True)
data

In [None]:
# Remove the untransformed numeric columns, keeping their log_* counterparts.
data.drop(columns=numeric_columns, inplace=True)
data

# 3. Linear Regression Models


In [None]:
## Target (log of expenses) and feature matrix (every other column)
y = data['log_expenses']
X = data.drop(columns='log_expenses')

## Hold out 30% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Standardise the features: fit the scaler on the training rows only,
## then apply that same fitted scaler to both the training and the test rows.
s = StandardScaler().fit(X_train)
X_train = s.transform(X_train)
X_test = s.transform(X_test)


In [None]:
# Fit four linear models with their default settings on the training data and
# score each one on the held-out test data with R-squared.

## Linear Regression
lm = LinearRegression()
lm.fit(X_train, y_train)
lm_predictions = lm.predict(X_test)
r2_lm = r2_score(y_test, lm_predictions)

## Ridge Regression
rr = Ridge()
rr.fit(X_train, y_train)
rr_predictions = rr.predict(X_test)
r2_rr = r2_score(y_test, rr_predictions)

## Lasso Regression
ls = Lasso()
ls.fit(X_train, y_train)
# Predict with the Lasso estimator itself so that r2_ls measures Lasso,
# not the Ridge model fitted above.
ls_predictions = ls.predict(X_test)
r2_ls = r2_score(y_test, ls_predictions)

## Elastic Net Regression
en = ElasticNet()
en.fit(X_train, y_train)
en_predictions = en.predict(X_test)
r2_en = r2_score(y_test, en_predictions)


In [None]:
# Report the test-set R-squared of each linear model.
print("R-squared results")
print("*******************************************************")
print(f"linear Regression = {r2_lm}")
print(f"ridge Regression = {r2_rr}")
print(f"lasso Regression = {r2_ls}")
print(f"elastic net Regression = {r2_en}")

Visual representation of actual and predicted values

In [None]:
def plot_dis(y, yhat):
    """Overlay the density curves of actual and fitted values.

    Parameters
    ----------
    y : array-like
        Actual target values (drawn in red).
    yhat : array-like
        Fitted / predicted values (drawn in blue).

    Returns
    -------
    None
        The figure is shown and then closed.
    """
    plt.figure()
    # kdeplot replaces the deprecated distplot(..., hist=False); both draw
    # only the kernel density estimate.
    ax1 = sns.kdeplot(y, color="r", label="Actual Value")
    sns.kdeplot(yhat, color="b", label="Fitted Values", ax=ax1)
    plt.legend()

    plt.title('Actual vs Fitted Values')

    plt.show()
    plt.close()

In [None]:
# Compare the test-set targets with the predictions made for those same rows
# (the predictions only cover the test split, not the full target `y`).
plot_dis(y_test, ls_predictions)

## Deep Learning - Artificial Neural Networks

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.metrics import MeanSquaredError
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow_addons.metrics import RSquare

# Uses X_train, X_test, y_train, y_test as they exist when this cell runs.

# Standardise the inputs: fit the scaler on the training data only, then
# apply the same fitted scaler to the test data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Take the input width from the data instead of hard-coding it, so the
# network still builds if the number of feature columns changes.
n_features = X_train.shape[1]

# Network: two hidden ReLU layers and a single linear output unit.
ann = Sequential([
  Dense(64, activation='relu', input_shape=(n_features,)),
  Dense(32, activation='relu'),
  Dense(1)
])

# Compile the model with Mean Squared Error loss and RSquare metric
ann.compile(optimizer='adam', loss='mean_squared_error', metrics=[RSquare()])

# Fit on the training data; the test data is passed as validation data so its
# loss is recorded at every epoch.
history = ann.fit(X_train, y_train, epochs=75, validation_data=(X_test, y_test))

# Evaluate the model on the testing data. The loss goes into its own name so
# it is not confused with the per-epoch `loss` history assigned below.
test_loss, r2 = ann.evaluate(X_test, y_test)

print("Mean Squared Error on Test Data:", test_loss)
print("R-squared on Test Data:", r2)

# Plot the training and validation loss at each epoch
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'y', label = 'Training loss')
plt.plot(epochs, val_loss, 'r', label = 'Validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()


# Insights and key findings


1. With the exception of Lasso Regression, the R-squared of the linear regression models is generally about 77%.
2. The R-squared of the Artificial Neural Networks (ANN) model is about 80%
3. Therefore, the ANN model was selected for this project.