# Feature Engineering - Insurance Challenge

## Carregamento dos dados e verificações iniciais

In [1]:
import pandas as pd
import numpy as np


In [2]:
# Absolute path to the raw insurance CSV read by the next cell.
# NOTE(review): the '/content/drive/MyDrive' prefix looks like a Google Colab
# Drive mount — confirm the drive is mounted before running, or point this at
# a local copy of the file.
dataset_path = '/content/drive/MyDrive/curso_machine-learning-python/datasets/insurance.csv'

In [3]:
# Load the raw CSV into the base DataFrame that every later cell reads from.
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [4]:
# Preview the first five rows (five is head()'s default).
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
# Summary statistics for the numeric columns (age, bmi, children, charges).
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [6]:
# Statistical mode (most frequent value) of each column.
df.mode()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,18,male,32.3,0,no,southeast,1639.5631


In [7]:
# Count missing values per column; every count in the output is zero.
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


In [8]:
# pandas documents isnull() as an alias of isna(), so this is the same check
# as the previous cell and yields the same per-column counts.
df.isna().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


Como podemos perceber pelas células anteriores, o dataset não tem valores ausentes (missing), e as estatísticas descritivas não indicam dados inconsistentes. Com isso, podemos partir para a geração das features.

In [9]:
# Display the DataFrame; pandas truncates the rendering to the first and last rows.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


## Feature Engineering

In [10]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df2 = df.copy()

In [11]:
# Check that the copy shows the same first rows as the original frame.
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [12]:
# Label encoding would impose an artificial order where none exists, so it is
# intentionally left disabled:
# df2['sex'] = df2['sex'].replace({'male': 1, 'female': 2})

In [13]:
# df2 is unchanged here: the 'sex' label encoding in the previous cell is commented out.
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [14]:
# Label encoding would impose an artificial order where none exists, so it is
# intentionally left disabled:
# df2['smoker'] = df2['smoker'].replace({'no': 0, 'yes': 1})

In [15]:
# df2 is unchanged here: the 'smoker' label encoding in the previous cell is commented out.
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [16]:
# Label encoding would impose an artificial order where none exists, so it is
# intentionally left disabled:
# df2['region'] = df2['region'].replace({'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4})

In [17]:
# df2 is unchanged here: the 'region' label encoding in the previous cell is
# commented out. Show ten rows to see more of the categorical values.
df2.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [18]:
# One-hot encode the categorical columns (sex, smoker, region) into 0/1 integer
# indicator columns; the numeric columns pass through unchanged.
# This reads `df`, not `df2`; the two hold the same data because every
# label-encoding step on df2 is commented out.
encoded_df = pd.get_dummies(data=df, dtype=int)

In [19]:
# Inspect the encoded frame: each category value now has its own 0/1 column.
encoded_df.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


In [20]:
# Move the target column ('charges') to the last position so the following
# cells can split features and target by position.
# The columns are selected by name rather than by a hard-coded list of integer
# positions, so the result stays correct if the number or order of the dummy
# columns ever changes. All other columns keep their relative order.
reordered_encoded_df = encoded_df[
    [column for column in encoded_df.columns if column != 'charges'] + ['charges']
]
reordered_encoded_df.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,charges
0,19,27.9,0,1,0,0,1,0,0,0,1,16884.924
1,18,33.77,1,0,1,1,0,0,0,1,0,1725.5523
2,28,33.0,3,0,1,1,0,0,0,1,0,4449.462
3,33,22.705,0,0,1,1,0,0,1,0,0,21984.47061
4,32,28.88,0,0,1,1,0,0,1,0,0,3866.8552


In [21]:
# Feature matrix: the first 11 columns (everything before the trailing
# 'charges' target) as a NumPy array.
all_features = reordered_encoded_df.iloc[:, :11].to_numpy()

In [22]:
# Inspect the feature matrix (NumPy abbreviates the middle rows and columns).
all_features

array([[19.  , 27.9 ,  0.  , ...,  0.  ,  0.  ,  1.  ],
       [18.  , 33.77,  1.  , ...,  0.  ,  1.  ,  0.  ],
       [28.  , 33.  ,  3.  , ...,  0.  ,  1.  ,  0.  ],
       ...,
       [18.  , 36.85,  0.  , ...,  0.  ,  1.  ,  0.  ],
       [21.  , 25.8 ,  0.  , ...,  0.  ,  0.  ,  1.  ],
       [61.  , 29.07,  0.  , ...,  1.  ,  0.  ,  0.  ]])

In [23]:
# Target vector: column 11 ('charges', placed last by the reordering step)
# as a 1-D NumPy array.
target = reordered_encoded_df.iloc[:, 11].to_numpy()

In [24]:
# Inspect the target vector (insurance charges).
target

array([16884.924 ,  1725.5523,  4449.462 , ...,  1629.8335,  2007.945 ,
       29141.3603])

### Criação de features padronizadas

In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
# Standardize every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows; if a train/test split happens
# downstream, fit on the training rows only to avoid leaking test statistics —
# confirm against the modelling notebook.
scaled_all_features = StandardScaler().fit(all_features).transform(all_features)

In [27]:
# Wrap the scaled matrix in a DataFrame (columns are positional, 0..10) to
# sanity-check the result.
# The std row reads 1.000374 rather than exactly 1 because describe() reports
# the sample standard deviation (ddof=1) while StandardScaler divides by the
# population standard deviation (ddof=0): sqrt(1338 / 1337) ≈ 1.000374.
scaled_all_features_df = pd.DataFrame(data=scaled_all_features)
scaled_all_features_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,-1.805565e-16,-2.124194e-16,-5.576008e-17,4.779435e-17,-1.234687e-16,-1.606421e-16,1.0620970000000002e-17,-2.5224800000000002e-17,1.062097e-16,-1.0620970000000002e-17,1.035544e-16
std,1.000374,1.000374,1.000374,1.000374,1.000374,1.000374,1.000374,1.000374,1.000374,1.000374,1.000374
min,-1.509965,-2.412011,-0.9086137,-0.9895908,-1.010519,-1.970587,-0.5074631,-0.5652669,-0.5664179,-0.6113237,-0.5664179
25%,-0.8691547,-0.7164063,-0.9086137,-0.9895908,-1.010519,0.5074631,-0.5074631,-0.5652669,-0.5664179,-0.6113237,-0.5664179
50%,-0.01474046,-0.0432088,-0.07876719,-0.9895908,0.9895908,0.5074631,-0.5074631,-0.5652669,-0.5664179,-0.6113237,-0.5664179
75%,0.8396738,0.6611572,0.7510793,1.010519,0.9895908,0.5074631,-0.5074631,-0.5652669,-0.5664179,1.635795,-0.5664179
max,1.765289,3.685522,3.240619,1.010519,0.9895908,0.5074631,1.970587,1.769076,1.765481,1.635795,1.765481


In [28]:
# Reduced feature set: age plus every one-hot column for sex, smoker and
# region (bmi and children are left out).
# Columns are selected by name. The earlier positional list [0, 3, ..., 9]
# stopped one short and silently dropped region_southwest (position 10), so
# the region encoding was incomplete compared with sex and smoker.
high_corr_features = reordered_encoded_df[
    [
        'age',
        'sex_female', 'sex_male',
        'smoker_no', 'smoker_yes',
        'region_northeast', 'region_northwest',
        'region_southeast', 'region_southwest',
    ]
].to_numpy()

In [29]:
# Standardize the reduced feature set with its own, freshly fitted scaler.
# NOTE(review): fitted on all rows — if a train/test split happens downstream,
# fit on the training rows only; confirm against the modelling notebook.
scaled_high_corr_features = StandardScaler().fit(high_corr_features).transform(high_corr_features)

Geramos 4 conjuntos de features distintos para avaliar os modelos. Esses conjuntos são:

- all_features: todas as features e já aplicando one-hot-encoding;
- scaled_all_features: todas as features, já aplicando one-hot-encoding e aplicando também padronização de escala;
- high_corr_features: apenas as features com maior correlação (Age, Sex, Smoker e Region) e já aplicando one-hot-encoding;
- scaled_high_corr_features: apenas as features com maior correlação (Age, Sex, Smoker e Region), já aplicando one-hot-encoding e também padronização de escala.

## Salvando as features e target para fornecer ao modelo

In [30]:
import pickle

In [31]:
# Serialize the target and the four feature sets, in this order, into a single
# pickle file. A reader has to call pickle.load() five times on the same file
# handle, in the same order, to get them back.
# The `with` block guarantees the file is flushed and closed even if one of
# the dump() calls raises.
# NOTE(review): the file name (including its spelling) is kept unchanged
# because downstream readers presumably open it by this exact name.
with open('proccessed_insurance_data.pkl', 'wb') as processed_insurance_data:
    pickle.dump(target, processed_insurance_data)
    pickle.dump(all_features, processed_insurance_data)
    pickle.dump(scaled_all_features, processed_insurance_data)
    pickle.dump(high_corr_features, processed_insurance_data)
    pickle.dump(scaled_high_corr_features, processed_insurance_data)