In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os 
import tensorflow as tf
import random

import warnings
warnings.filterwarnings('ignore')

In [2]:
# # setting seed for reproducible results
# def set_seed(seed_value=42):
#     os.environ['PYTHONHASHSEED']=str(seed_value)
#     random.seed(seed_value)
#     np.random.seed(seed_value)
#     tf.random.set_seed(seed_value)
# #     tf.random.uniform([1], seed=seed_value)

In [3]:
# not setting a seed because I want to compare the results with the previous non-seed models
# set_seed()

In [4]:
# Load the insurance dataset from the CSV that sits next to this notebook.
df = pd.read_csv(filepath_or_buffer='insurance.csv')

In [5]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
# I recently read an article explaining why one-hot encoding is often preferable
# to label encoding, which made me wonder how the model performance would differ
# if one-hot (OH) encoding were used. So I'm taking a sabbatical from NN-specific
# improvements for this notebook and will instead retrain the previous models
# with OH encoding.
# Disclaimer: I'm not expecting a huge difference between the model performances,
# but ideally some improvement should be seen.

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
# Work on an independent (deep) copy so the experiments below leave `df` untouched.
df1 = df.copy(deep=True)

In [9]:
# One-hot encode each categorical column with its own fitted encoder.
#
# The dense-output keyword is spelled `sparse=False` on older scikit-learn
# releases and `sparse_output=False` on newer ones (the old spelling was
# removed), so neither is passed here. Instead the encoder keeps its default
# sparse output and the result is densified with `.toarray()`, which works on
# every release. Densifying all three also keeps the results consistent:
# each `OH_*` variable is a plain 2-D NumPy array with one column per category.
OH_encoder_sex = OneHotEncoder(handle_unknown='ignore')
OH_sex = OH_encoder_sex.fit_transform(df[['sex']]).toarray()

OH_encoder_smoker = OneHotEncoder(handle_unknown='ignore')
OH_smoker = OH_encoder_smoker.fit_transform(df[['smoker']]).toarray()

OH_encoder_region = OneHotEncoder(handle_unknown='ignore')
OH_region = OH_encoder_region.fit_transform(df[['region']]).toarray()

In [10]:
# First five raw `sex` labels, for comparison with the encoded rows.
df['sex'].head(5)

0    female
1      male
2      male
3      male
4      male
Name: sex, dtype: object

In [11]:
# First five encoded rows (all columns) of the `sex` one-hot array.
OH_sex[:5, :]

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [12]:
# df1['OH_sex'] = pd.DataFrame(OH_sex, columns=OH_encoder_sex.get_feature_names())
# df1['OH_smoker'] = pd.DataFrame(OH_smoker, columns=OH_encoder_smoker.get_feature_names())
# df1['OH_region'] = pd.DataFrame(OH_region, columns=OH_encoder_region.get_feature_names())

In [13]:
# Hand-rolled encoding of `sex`, stored as one two-element list per row:
# 'male' -> [1, 0]; every other value -> [0, 1].
# NOTE: the slot order here is the reverse of the sklearn encoder's output,
# which puts 'female' in the first column.
df1['OH_sex_customFunc'] = pd.Series(
    [[1, 0] if label == 'male' else [0, 1] for label in df['sex']],
    index=df.index,
)

In [14]:
# Inspect the copy with the hand-rolled encoding column appended.
df1.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,OH_sex_customFunc
0,19,female,27.9,0,yes,southwest,16884.924,"[0, 1]"
1,18,male,33.77,1,no,southeast,1725.5523,"[1, 0]"
2,28,male,33.0,3,no,southeast,4449.462,"[1, 0]"
3,33,male,22.705,0,no,northwest,21984.47061,"[1, 0]"
4,32,male,28.88,0,no,northwest,3866.8552,"[1, 0]"


In [15]:
# Confirm the original frame was not modified by the experiment on `df1`.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [16]:
# Second independent (deep) copy of `df`, used for the `region` encoding below.
df2 = df.copy(deep=True)

In [17]:
# from https://stackoverflow.com/questions/58101126/using-scikit-learn-onehotencoder-with-a-pandas-dataframe

# One-hot encode `region` and splice the indicator columns back into the frame.
#
# The dense-output keyword is spelled `sparse=False` on older scikit-learn
# releases and `sparse_output=False` on newer ones (the old spelling was
# removed), so neither is passed here; the default sparse result is densified
# with `.toarray()`, which works on every release.
OH_encoder = OneHotEncoder(handle_unknown='ignore')

# The encoder returns a bare array, so the row index is lost; re-attach
# df2's index so the concat below aligns row-for-row. The indicator columns
# keep their default integer labels (0, 1, 2, ...), one per category.
OH_region = pd.DataFrame(
    OH_encoder.fit_transform(df2[['region']]).toarray(),
    index=df2.index,
)

# Drop the original categorical column (it is replaced by its one-hot columns).
num_regions = df2.drop(columns=['region'])

# Remaining features followed by the one-hot encoded region columns.
OH_Region = pd.concat([num_regions, OH_region], axis=1)

In [18]:
# Preview the frame with `region` replaced by its one-hot columns.
OH_Region.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,charges,0,1,2,3
0,19,female,27.9,0,yes,16884.924,0.0,0.0,0.0,1.0
1,18,male,33.77,1,no,1725.5523,0.0,0.0,1.0,0.0
2,28,male,33.0,3,no,4449.462,0.0,0.0,1.0,0.0
3,33,male,22.705,0,no,21984.47061,0.0,1.0,0.0,0.0
4,32,male,28.88,0,no,3866.8552,0.0,1.0,0.0,0.0


In [19]:
# So I read up on OHE and on alternatives such as frequency encoding.

# I learnt two things from this mini experiment (which took way longer than I think it should have):
# 1) OHE is not useful for binary features (a single 0/1 column already carries the same information)
# 2) OHE is terrible for a categorical feature with a huge number of categories (essentially, it blows up the dimensionality)
# [bonus] 3) OHE increases the feature count and, as a result, might introduce collinearity among those features (aka bad)

# As a result, I'm dropping the idea of OHE and sticking with label encoding (FOR THIS DATASET*)