<a href="https://colab.research.google.com/github/dibakar75/DeepCNNClassifierProject/blob/main/insurance_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install keras-tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
import tensorflow as tf
import pandas as pd
from tensorflow import keras
import keras_tuner as kt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import MeanSquaredLogarithmicError
from tensorflow.keras.layers import Dense, Dropout

In [3]:
# Location of the insurance CSV, then load it into a DataFrame
csv_path = '/content/sample_data/insurance_pred.csv'
df = pd.read_csv(csv_path)

In [4]:
# Display the freshly loaded frame to eyeball the raw columns and values
df

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


In [5]:
# Column dtypes and non-null counts, to check for missing values and categorical columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
# Positional split: the first six columns are the features, column index 6 is the target.
# NOTE(review): X and y do not appear to be read by any later cell shown here — confirm before relying on them.
X, y = df.iloc[:, :6], df.iloc[:, 6]

In [7]:
# Create dummy (one-hot) variables, dropping the first level of each category.
# drop_first expects a boolean; the string 'True' only worked because any
# non-empty string is truthy. dtype=int keeps the indicators as 0/1 integers
# regardless of the pandas default dummy dtype.
Sex = pd.get_dummies(df["sex"], drop_first=True, dtype=int)
Region = pd.get_dummies(df["region"], drop_first=True, dtype=int)
Smoker = pd.get_dummies(df["smoker"], drop_first=True, dtype=int)

In [8]:
# Append the dummy columns to the right of the original frame.
# Re-running this cell on its own appends the dummy columns again, because df is reassigned in place.
df = pd.concat(
    [df, Sex, Region, Smoker],
    axis="columns",
)

In [9]:
# Display the frame again to confirm the dummy columns were appended
df

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses,male,northwest,southeast,southwest,yes
0,19,female,27.9,0,yes,southwest,16884.92,0,0,0,1,1
1,18,male,33.8,1,no,southeast,1725.55,1,0,1,0,0
2,28,male,33.0,3,no,southeast,4449.46,1,0,1,0,0
3,33,male,22.7,0,no,northwest,21984.47,1,1,0,0,0
4,32,male,28.9,0,no,northwest,3866.86,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55,1,1,0,0,0
1334,18,female,31.9,0,no,northeast,2205.98,0,0,0,0,0
1335,18,female,36.9,0,no,southeast,1629.83,0,0,1,0,0
1336,21,female,25.8,0,no,southwest,2007.95,0,0,0,1,0


In [10]:
# Remove the original categorical columns now that their dummy encodings are present
df = df.drop(columns=['sex', 'region', 'smoker'])

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
print(f"Rows in train_set : {len(train_set)}\nRows in test_set : {len(test_set)}")

Rows in train_set : 1070
Rows in test_set : 268


In [12]:
# Shape (rows, columns) of the train partition, then of the test partition
for part in (train_set, test_set):
    print(part.shape)

(1070, 9)
(268, 9)


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of every column in the training partition
train_set.describe()

Unnamed: 0,age,bmi,children,expenses,male,northwest,southeast,southwest,yes
count,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0
mean,39.357009,30.56215,1.107477,13346.089869,0.51215,0.239252,0.264486,0.246729,0.205607
std,14.07396,6.043266,1.215983,12019.510759,0.500086,0.426827,0.441265,0.431309,0.404334
min,18.0,16.0,0.0,1121.87,0.0,0.0,0.0,0.0,0.0
25%,27.0,26.2,0.0,4897.665,0.0,0.0,0.0,0.0,0.0
50%,39.5,30.2,1.0,9575.44,1.0,0.0,0.0,0.0,0.0
75%,51.0,34.5,2.0,16746.655,1.0,0.0,1.0,0.0,0.0
max,64.0,53.1,5.0,62592.87,1.0,1.0,1.0,1.0,1.0


In [14]:
# Name of the column the model predicts.
# NOTE(review): despite the name "ind_var", 'expenses' is the dependent (target)
# variable; the identifier is kept because the following cell reads it.
ind_var = 'expenses'

In [15]:
# Separate the target column (named by ind_var) from the remaining feature columns
x_train = train_set.drop(columns=ind_var)
y_train = train_set[ind_var]
x_test = test_set.drop(columns=ind_var)
y_test = test_set[ind_var]

In [16]:
# Feature scaling: fit the scaler on the training features only, then apply
# that same fitted transform to both the training and the test features
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [18]:
## Hyperparameter tuning with KerasTuner

In [None]:
import pandas as pd

In [22]:
# Single loss object reused as both the training loss and the reported metric
msle = MeanSquaredLogarithmicError()


def model_builder(hp):
    """Build and compile a three-hidden-layer regression network for the tuner.

    Args:
        hp: hyperparameter container exposing ``Int`` and ``Choice``; this
            function is handed to the KerasTuner tuner, which supplies it.

    Returns:
        A compiled ``tf.keras.Sequential`` model with three ReLU hidden layers
        (each width searched from 25 to 512 in steps of 32), a single linear
        output unit, and an Adam optimizer whose learning rate is chosen from
        1e-2, 1e-3 or 1e-4. Loss and metric are both the module-level ``msle``.
    """
    net = tf.keras.Sequential()

    # Register all three width hyperparameters first, in a fixed order
    widths = [
        hp.Int(name, min_value=25, max_value=512, step=32)
        for name in ('units1', 'units2', 'units3')
    ]

    # One fully connected ReLU layer per searched width
    for width in widths:
        net.add(Dense(units=width, activation='relu'))

    # Single linear unit for the regression output
    net.add(Dense(1, kernel_initializer='normal', activation='linear'))

    lr = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss=msle,
        metrics=[msle],
    )
    return net

In [23]:
# Hyperband search over the hyperparameters declared in model_builder.
# The objective is given with an explicit direction so the tuner does not have
# to infer whether lower or higher is better, and a fixed seed makes the
# sampled trials repeatable across runs.
# NOTE(review): trial state is written under keras_tuner_dir/keras_tuner_demo;
# an existing directory may be picked up again on re-run — confirm whether
# overwrite=True is wanted for a fresh search.
tuner = kt.Hyperband(
    model_builder,
    objective=kt.Objective('val_mean_squared_logarithmic_error', direction='min'),
    max_epochs=10,
    seed=42,
    directory='keras_tuner_dir',
    project_name='keras_tuner_demo'
)

In [25]:
# Stop a trial once the validation loss has gone 5 epochs without improving
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
)

In [26]:
# Run the hyperparameter search on the scaled training data, validating on a
# 20% split of it and applying the early-stopping callback to every trial
tuner.search(
    x_train,
    y_train,
    epochs=10,
    validation_split=0.2,
    callbacks=[stop_early],
)

Trial 42 Complete [00h 00m 03s]
val_mean_squared_logarithmic_error: 0.17927835881710052

Best val_mean_squared_logarithmic_error So Far: 0.17503680288791656
Total elapsed time: 00h 02m 22s


In [27]:
## Finding the best hyperparameters
# Query the tuner once and reuse the result, instead of re-fetching the best
# hyperparameters on every loop iteration.
best_found = tuner.get_best_hyperparameters()[0]
for p in ['units1', 'units2', 'units3', 'learning_rate']:
  print(p, best_found.get(p))

units1 89
units2 217
units3 505
learning_rate 0.01


In [28]:
## Model evaluation
# Ask the tuner for its single top-ranked hyperparameter set and keep it
top_ranked = tuner.get_best_hyperparameters(num_trials=1)
best_hps = top_ranked[0]
best_hps

<keras_tuner.engine.hyperparameters.HyperParameters at 0x7f45365e7cd0>

In [29]:
# Build a new model from the best hyperparameters found by the search
model = tuner.hypermodel.build(best_hps)

In [30]:
# Train the rebuilt model for 50 epochs, validating on a 20% split of the
# training data; the returned History object is kept for later inspection
history = model.fit(
    x_train,
    y_train,
    epochs=50,
    validation_split=0.2,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [31]:
# Score the trained model on the held-out test set; the model was compiled
# with one loss and one metric, so the result holds those two values
eval_result = model.evaluate(x=x_test, y=y_test)
print(eval_result)

[0.15048184990882874, 0.14285314083099365]


In [32]:
# Print the layer-by-layer architecture and parameter counts of the trained model
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 89)                801       
                                                                 
 dense_5 (Dense)             (None, 217)               19530     
                                                                 
 dense_6 (Dense)             (None, 505)               110090    
                                                                 
 dense_7 (Dense)             (None, 1)                 506       
                                                                 
Total params: 130,927
Trainable params: 130,927
Non-trainable params: 0
_________________________________________________________________
