# Pré-processamento

In [1]:
# packages
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

## data review

In [2]:
# Read the cleaned heart dataset (comma-separated, UTF-8) into `df`.
df = pd.read_csv(
    '../data/heart_clean.csv',
    encoding='utf-8',
    sep=',',
)

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(917, 12)

In [5]:
# dtype of every column in the loaded frame.
df.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol       float64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [6]:
# Column summary: non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 917 entries, 0 to 916
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             917 non-null    int64  
 1   Sex             917 non-null    object 
 2   ChestPainType   917 non-null    object 
 3   RestingBP       917 non-null    int64  
 4   Cholesterol     917 non-null    float64
 5   FastingBS       917 non-null    int64  
 6   RestingECG      917 non-null    object 
 7   MaxHR           917 non-null    int64  
 8   ExerciseAngina  917 non-null    object 
 9   Oldpeak         917 non-null    float64
 10  ST_Slope        917 non-null    object 
 11  HeartDisease    917 non-null    int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 86.1+ KB


In [7]:
# Categorical columns: the subset of `df` whose dtype is object.
colunas_categoricas = df.select_dtypes(include=['object'])
colunas_categoricas.head()

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
0,M,ATA,Normal,N,Up
1,F,NAP,Normal,N,Flat
2,M,ATA,ST,N,Up
3,F,ASY,Normal,Y,Flat
4,M,NAP,Normal,N,Up


In [8]:
# Non-categorical columns: everything in `df` that is not object dtype.
# The HeartDisease target column is part of this subset.
colunas_nao_categoricas = df.select_dtypes(exclude=['object'])
colunas_nao_categoricas.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
0,40,140,289.0,0,172,0.0,0
1,49,160,180.0,0,156,1.0,1
2,37,130,283.0,0,98,0.0,0
3,48,138,214.0,0,108,1.5,1
4,54,150,195.0,0,122,0.0,0


## encoder

In [9]:
# Encoder instances used by the encoding cells below.
one_hot = OneHotEncoder(sparse_output=False)  # return a dense array rather than a sparse matrix
label = LabelEncoder()

In [10]:
# Integer-encode every categorical column.
# `label` is refit on each column in turn, so once this cell has run the
# instance only holds the classes of the last column it saw.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# values; OrdinalEncoder is the feature-matrix counterpart — consider switching.
colunas_com_label = colunas_categoricas.apply(label.fit_transform)

In [11]:
# Sanity check: first rows and shape of the label-encoded frame.
colunas_com_label.head(), colunas_com_label.shape

(   Sex  ChestPainType  RestingECG  ExerciseAngina  ST_Slope
 0    1              1           1               0         2
 1    0              2           1               0         1
 2    1              1           2               0         2
 3    0              0           1               1         1
 4    1              2           1               0         2,
 (917, 5))

In [12]:
# One-hot encode the integer-coded categorical columns.
# The encoder is fit on the label-encoded frame, so the generated feature names
# carry the integer codes (e.g. Sex_0 / Sex_1) rather than the original
# category strings.
one_hot_encoder = one_hot.fit_transform(colunas_com_label)  # dense 2-D array of 0.0 / 1.0
colunas_com_one_hot = pd.DataFrame(
    one_hot_encoder,
    columns=one_hot.get_feature_names_out(),
    # Reuse the source index: a frame built from a bare ndarray gets a fresh
    # 0..n-1 index, and the later pd.concat(axis=1) aligns rows on index
    # labels, not on position.
    index=colunas_com_label.index,
)

In [13]:
# Sanity check: first rows and shape of the one-hot encoded frame.
colunas_com_one_hot.head(), colunas_com_one_hot.shape

(   Sex_0  Sex_1  ChestPainType_0  ChestPainType_1  ChestPainType_2  \
 0    0.0    1.0              0.0              1.0              0.0   
 1    1.0    0.0              0.0              0.0              1.0   
 2    0.0    1.0              0.0              1.0              0.0   
 3    1.0    0.0              1.0              0.0              0.0   
 4    0.0    1.0              0.0              0.0              1.0   
 
    ChestPainType_3  RestingECG_0  RestingECG_1  RestingECG_2  \
 0              0.0           0.0           1.0           0.0   
 1              0.0           0.0           1.0           0.0   
 2              0.0           0.0           0.0           1.0   
 3              0.0           0.0           1.0           0.0   
 4              0.0           0.0           1.0           0.0   
 
    ExerciseAngina_0  ExerciseAngina_1  ST_Slope_0  ST_Slope_1  ST_Slope_2  
 0               1.0               0.0         0.0         0.0         1.0  
 1               1.0      

## merge

In [14]:
# Label-encoded categoricals side by side with the numeric columns
# (rows are matched on the index).
df_label = pd.concat(
    [colunas_com_label, colunas_nao_categoricas],
    axis='columns',
)
df_label.head()

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
0,1,1,1,0,2,40,140,289.0,0,172,0.0,0
1,0,2,1,0,1,49,160,180.0,0,156,1.0,1
2,1,1,2,0,2,37,130,283.0,0,98,0.0,0
3,0,0,1,1,1,48,138,214.0,0,108,1.5,1
4,1,2,1,0,2,54,150,195.0,0,122,0.0,0


In [15]:
# Confirm every column of the label-encoded frame is numeric.
df_label.dtypes

Sex                 int64
ChestPainType       int64
RestingECG          int64
ExerciseAngina      int64
ST_Slope            int64
Age                 int64
RestingBP           int64
Cholesterol       float64
FastingBS           int64
MaxHR               int64
Oldpeak           float64
HeartDisease        int64
dtype: object

In [16]:
# One-hot columns side by side with the numeric columns
# (rows are matched on the index).
df_one_hot = pd.concat(
    [colunas_com_one_hot, colunas_nao_categoricas],
    axis='columns',
)
df_one_hot.head()

Unnamed: 0,Sex_0,Sex_1,ChestPainType_0,ChestPainType_1,ChestPainType_2,ChestPainType_3,RestingECG_0,RestingECG_1,RestingECG_2,ExerciseAngina_0,...,ST_Slope_0,ST_Slope_1,ST_Slope_2,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,40,140,289.0,0,172,0.0,0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,49,160,180.0,0,156,1.0,1
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,37,130,283.0,0,98,0.0,0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,48,138,214.0,0,108,1.5,1
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,54,150,195.0,0,122,0.0,0


In [17]:
# Confirm every column of the one-hot frame is numeric.
df_one_hot.dtypes

Sex_0               float64
Sex_1               float64
ChestPainType_0     float64
ChestPainType_1     float64
ChestPainType_2     float64
ChestPainType_3     float64
RestingECG_0        float64
RestingECG_1        float64
RestingECG_2        float64
ExerciseAngina_0    float64
ExerciseAngina_1    float64
ST_Slope_0          float64
ST_Slope_1          float64
ST_Slope_2          float64
Age                   int64
RestingBP             int64
Cholesterol         float64
FastingBS             int64
MaxHR                 int64
Oldpeak             float64
HeartDisease          int64
dtype: object

## separação de variáveis

### alvo

In [21]:
# Target vector: the HeartDisease column as a NumPy array.
alvo = df['HeartDisease'].to_numpy()
alvo

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,

### previsores

In [27]:
# Predictors without any encoding: the raw frame minus the target column.
previsores = df.drop(columns=['HeartDisease'])
previsores.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up


In [28]:
# Predictors with label-encoded categoricals: df_label minus the target column.
previsores_encoded = df_label.drop(columns=['HeartDisease'])
previsores_encoded.head()

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
0,1,1,1,0,2,40,140,289.0,0,172,0.0
1,0,2,1,0,1,49,160,180.0,0,156,1.0
2,1,1,2,0,2,37,130,283.0,0,98,0.0
3,0,0,1,1,1,48,138,214.0,0,108,1.5
4,1,2,1,0,2,54,150,195.0,0,122,0.0


In [29]:
# Predictors with one-hot encoded categoricals: df_one_hot minus the target column.
previsores_OHE = df_one_hot.drop(columns=['HeartDisease'])
previsores_OHE.head()

Unnamed: 0,Sex_0,Sex_1,ChestPainType_0,ChestPainType_1,ChestPainType_2,ChestPainType_3,RestingECG_0,RestingECG_1,RestingECG_2,ExerciseAngina_0,ExerciseAngina_1,ST_Slope_0,ST_Slope_1,ST_Slope_2,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,40,140,289.0,0,172,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,49,160,180.0,0,156,1.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,37,130,283.0,0,98,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,48,138,214.0,0,108,1.5
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,54,150,195.0,0,122,0.0


In [30]:
# Standardise the one-hot predictor frame; the result is a NumPy array.
# The scaler is kept in a variable so its fitted statistics stay available
# afterwards (e.g. to transform new rows or to inverse-transform).
# NOTE(review): the scaler is fit on all rows of previsores_OHE, including the
# one-hot indicator columns. If a train/test split happens downstream, fit on
# the training rows only — confirm.
scaler = StandardScaler()
previsores_escalonados = scaler.fit_transform(previsores_OHE)
previsores_escalonados

array([[-0.51630861,  0.51630861, -1.08542493, ..., -0.55173333,
         1.38333943, -0.83150225],
       [ 1.9368261 , -1.9368261 , -1.08542493, ..., -0.55173333,
         0.75473573,  0.10625149],
       [-0.51630861,  0.51630861, -1.08542493, ..., -0.55173333,
        -1.52395266, -0.83150225],
       ...,
       [-0.51630861,  0.51630861,  0.92129817, ..., -0.55173333,
        -0.85606123,  0.29380223],
       [ 1.9368261 , -1.9368261 , -1.08542493, ..., -0.55173333,
         1.46191489, -0.83150225],
       [-0.51630861,  0.51630861, -1.08542493, ..., -0.55173333,
         1.42262716, -0.83150225]])

## export

In [32]:
# Write the target vector to disk as integers.
np.savetxt(
    '../data/heart_target.csv',
    alvo,
    fmt='%d',
    delimiter=',',
)

In [34]:
# Write the un-encoded predictors to disk, without the index column.
previsores.to_csv(
    '../data/heart_predictor.csv',
    index=False,
    sep=',',
)

In [35]:
# Write the label-encoded predictors to disk, without the index column.
previsores_encoded.to_csv(
    '../data/heart_predictor_le.csv',
    index=False,
    sep=',',
)

In [36]:
# Write the one-hot encoded predictors to disk, without the index column.
previsores_OHE.to_csv(
    '../data/heart_predictor_ohe.csv',
    index=False,
    sep=',',
)

In [40]:
# Write the scaled predictors to disk. The scaled array has no column names of
# its own, so it is labelled with the columns of the one-hot frame it was
# computed from.
pd.DataFrame(
    previsores_escalonados,
    columns=previsores_OHE.columns,
).to_csv('../data/heart_predictor_esc.csv', sep=',', index=False)