In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pickle

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Udemy/ML com Python/1 - Aprendizado Supervisionado: Classificacao/heart_tratado.csv', sep=';', encoding='utf-8')

In [4]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0


In [5]:
df.shape

(917, 12)

# Variáveis Categóricas

### Transformando as variáveis categóricas nominais em ordinais

In [6]:
# Realiza uma cópia do dataframe
df2 = pd.DataFrame.copy(df)

In [7]:
# Transformação das variáveis norminais
df2['Sex'].replace({'M': 0, 'F': 1}, inplace=True)
df2['ChestPainType'].replace({'TA': 0, 'ATA': 1, 'NAP': 2, 'ASY': 3}, inplace=True)
df2['RestingECG'].replace({'Normal': 0, 'ST': 1, 'LVH': 2}, inplace=True)
df2['ExerciseAngina'].replace({'N': 0, 'Y': 1}, inplace=True)
df2['ST_Slope'].replace({'Up': 0, 'Flat': 1, 'Down': 2}, inplace=True)

In [8]:
df2.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,1,140,289.0,0,0,172,0,0.0,0,0
1,49,1,2,160,180.0,0,0,156,0,1.0,1,1
2,37,0,1,130,283.0,0,1,98,0,0.0,0,0
3,48,1,3,138,214.0,0,0,108,1,1.5,1,1
4,54,0,2,150,195.0,0,0,122,0,0.0,0,0


In [9]:
df.dtypes

Unnamed: 0,0
Age,int64
Sex,object
ChestPainType,object
RestingBP,int64
Cholesterol,float64
FastingBS,int64
RestingECG,object
MaxHR,int64
ExerciseAngina,object
Oldpeak,float64


Sex:
- 0 = M
- 1 = F

ChestPainType:
- 0 = TA (Angina típica)
- 1 = ATA (Angina atípica)
- 2 = NAP (Dor não anginosa)
- 3 = ASY (Assintomático)

RestingECG:
- 0 = Normal
- 1 = ST (Anormalidade da onda ST)
- 2 = LVH (Hipertrofia ventricular esquerda)

ExerciseAngina:
- 0 = Não
- 1 = Sim

ST_Slope:
- 0 = Up
- 1 = Flat
- 2 = Down

# Separação das Variáveis e Escalonamento

In [10]:
# Separacao das variaveis preditoras e variavel alvo
y = df2['HeartDisease']
X = df2.drop('HeartDisease', axis=1)

In [11]:
# Forma alternativa de fazer a separacao

# y = df2.iloc[:, :11].values
# X = df2.iloc[:, 11].values

In [12]:
X.shape, y.shape

((917, 11), (917,))

## Analise das escalas das variaveis

Existe uma diferenca muito grande nas escalas dos valores para entre as variaveis, o que pode prejudicar a performance dos nossos modelos.

Para contornar isso, podemos aplicar algumas tecnicas de __padronizacao__ e __normalizacao__ dos dados.

Na _padronizacao_, utilizamos a media e o desvio-padrao como referencia.

Ja na _normalizacao_, utilizamos valores minimo e maximo como referencia.

In [13]:
df2.describe()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,53.509269,0.210469,2.251908,132.540894,244.635389,0.23337,0.604144,136.789531,0.40458,0.886696,0.63795,0.55289
std,9.437636,0.407864,0.931502,17.999749,53.347125,0.423206,0.806161,25.467129,0.491078,1.06696,0.60727,0.497466
min,28.0,0.0,0.0,80.0,85.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,0.0,2.0,120.0,214.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0
50%,54.0,0.0,3.0,130.0,244.635389,0.0,0.0,138.0,0.0,0.6,1.0,1.0
75%,60.0,0.0,3.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,1.0,1.0
max,77.0,1.0,3.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,2.0,1.0


In [14]:
# Padronizacao
X_padronizado = StandardScaler().fit_transform(X)

In [15]:
X_padronizado

array([[-1.43220634, -0.51630861, -1.34470119, ..., -0.82431012,
        -0.83150225, -1.05109458],
       [-0.47805725,  1.9368261 , -0.27058012, ..., -0.82431012,
         0.10625149,  0.59651863],
       [-1.75025603, -0.51630861, -1.34470119, ..., -0.82431012,
        -0.83150225, -1.05109458],
       ...,
       [ 0.37007527, -0.51630861,  0.80354095, ...,  1.21313565,
         0.29380223,  0.59651863],
       [ 0.37007527,  1.9368261 , -1.34470119, ..., -0.82431012,
        -0.83150225,  0.59651863],
       [-1.64423947, -0.51630861, -0.27058012, ..., -0.82431012,
        -0.83150225, -1.05109458]])

In [16]:
X_padronizado_df = pd.DataFrame(X_padronizado, columns=X.columns)

In [17]:
X_padronizado_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,-1.432206,-0.516309,-1.344701,0.414627,0.832075,-0.551733,-0.749818,1.383339,-0.82431,-0.831502,-1.051095
1,-0.478057,1.936826,-0.27058,1.52636,-1.212261,-0.551733,-0.749818,0.754736,-0.82431,0.106251,0.596519
2,-1.750256,-0.516309,-1.344701,-0.14124,0.719543,-0.551733,0.491306,-1.523953,-0.82431,-0.831502,-1.051095
3,-0.584074,1.936826,0.803541,0.303453,-0.574578,-0.551733,-0.749818,-1.131075,1.213136,0.575128,0.596519
4,0.052026,-0.516309,-0.27058,0.970493,-0.930931,-0.551733,-0.749818,-0.581047,-0.82431,-0.831502,-1.051095


In [18]:
X_padronizado_df.describe()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,1.859654e-16,7.748558e-18,1.046055e-16,7.767929e-16,-1.86934e-16,4.649135e-17,0.0,-5.114048e-16,-1.046055e-16,7.748558000000001e-17,-3.8742790000000005e-17
std,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546
min,-2.704405,-0.5163086,-2.418822,-2.920572,-2.994023,-0.5517333,-0.749818,-3.016886,-0.8243101,-3.269662,-1.051095
25%,-0.6900904,-0.5163086,-0.2705801,-0.6971063,-0.5745784,-0.5517333,-0.749818,-0.6596226,-0.8243101,-0.8315022,-1.051095
50%,0.05202558,-0.5163086,0.803541,-0.1412398,0.0,-0.5517333,-0.749818,0.04755658,-0.8243101,-0.26885,0.5965186
75%,0.688125,-0.5163086,0.803541,0.4146267,0.4194568,-0.5517333,0.491306,0.7547357,1.213136,0.5751284,0.5965186
max,2.490407,1.936826,0.803541,3.749826,6.721265,1.81247,1.73243,2.561971,1.213136,4.982571,2.244132


# LabelEncoder e OneHotEncoder

Anteriormente, realizamos a conversão de variáveis categóricas para numéricas de forma manual. Neste momento, iremos ver como fazer isso de forma automatizada.

In [19]:
# No "df" possuímos as variáveis categóricas, ou seja, antes de realizarmos a conversão
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0


In [20]:
X2 = df.drop('HeartDisease', axis=1)
X2

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up
...,...,...,...,...,...,...,...,...,...,...,...
912,45,M,TA,110,264.0,0,Normal,132,N,1.2,Flat
913,68,M,ASY,144,193.0,1,Normal,141,N,3.4,Flat
914,57,M,ASY,130,131.0,0,Normal,115,Y,1.2,Flat
915,57,F,ATA,130,236.0,0,LVH,174,N,0.0,Flat


In [21]:
# Testando a aplicação em uma variável.
X2['Sex'] = LabelEncoder().fit_transform(X2['Sex'])

In [22]:
# Vamos seguir para as demais variáveis categóricas
X2['ChestPainType'] = LabelEncoder().fit_transform(X2['ChestPainType'])
X2['RestingECG'] = LabelEncoder().fit_transform(X2['RestingECG'])
X2['ExerciseAngina'] = LabelEncoder().fit_transform(X2['ExerciseAngina'])
X2['ST_Slope'] = LabelEncoder().fit_transform(X2['ST_Slope'])

In [23]:
X2.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,1,140,289.0,0,1,172,0,0.0,2
1,49,0,2,160,180.0,0,1,156,0,1.0,1
2,37,1,1,130,283.0,0,2,98,0,0.0,2
3,48,0,0,138,214.0,0,1,108,1,1.5,1
4,54,1,2,150,195.0,0,1,122,0,0.0,2


O problema com essa abordagem é que são atribuídos valores às variáveis, porém que não necessariamente signifiquem um peso.

Como assim?

Na variável ST_Slope, foi atribuído o valor 2 para "up", 1 para "flat". Isso significa que o Up tem um peso maior ou é mais importante que o flat? Não! Porém, o algoritmo não sabe disso e, devido a isso, irá interpretar que o valor maior significa um peso maior.

Para lidar com isso, podemos realizar a criação de __variáveis "Dummy"__. Que são variáveis que recebem valores de 0 ou 1. Porém, precisamos tomar cuidado com a multicolinearidade ao aplicar essa técnica.

__Parâmetros do ColumnTransformer__:
- name: nome para a transformação
- transformer: tipo do estimador (OHE)
- columns: colunas que serão transformadas
- remainder: o que acontecerá com o restante das colunas não relacionadas. 1) drop = exclui-las; 2) passthrough = mante-las. O default é drop.
- sparse_threshold: parâmetro de classificação de matrizes esparsas. O default é 0.3
- n_jobs: número de trabalhos executados em paralelo. Default é None
- transformer_weights: define os pesos dos transformadores

In [24]:
# Aplicação do OHE
X3 = ColumnTransformer(transformers=[('OHE', OneHotEncoder(), ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'])],
                       remainder='passthrough').fit_transform(X2)

In [25]:
X3

array([[  0. ,   1. ,   0. , ...,   0. , 172. ,   0. ],
       [  1. ,   0. ,   0. , ...,   0. , 156. ,   1. ],
       [  0. ,   1. ,   0. , ...,   0. ,  98. ,   0. ],
       ...,
       [  0. ,   1. ,   1. , ...,   0. , 115. ,   1.2],
       [  1. ,   0. ,   0. , ...,   0. , 174. ,   0. ],
       [  0. ,   1. ,   0. , ...,   0. , 173. ,   0. ]])

In [26]:
X3.shape

(917, 20)

In [27]:
# Precisamos converter essa matriz em DataFrame
X3_df = pd.DataFrame(X3)
X3_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,40.0,140.0,289.0,0.0,172.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,49.0,160.0,180.0,0.0,156.0,1.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,37.0,130.0,283.0,0.0,98.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,48.0,138.0,214.0,0.0,108.0,1.5
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,54.0,150.0,195.0,0.0,122.0,0.0


## Vamos fazer novamente o escalonamento

In [28]:
X3_escalonado = StandardScaler().fit_transform(X3)

In [29]:
X3_escalonado_df = pd.DataFrame(X3_escalonado)
X3_escalonado_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.516309,0.516309,-1.085425,2.073784,-0.531524,-0.22981,-0.507826,0.815013,-0.490781,0.82431,-0.82431,-0.271607,-1.001091,1.149573,-1.432206,0.414627,0.832075,-0.551733,1.383339,-0.831502
1,1.936826,-1.936826,-1.085425,-0.48221,1.881384,-0.22981,-0.507826,0.815013,-0.490781,0.82431,-0.82431,-0.271607,0.99891,-0.869888,-0.478057,1.52636,-1.212261,-0.551733,0.754736,0.106251
2,-0.516309,0.516309,-1.085425,2.073784,-0.531524,-0.22981,-0.507826,-1.226974,2.037569,0.82431,-0.82431,-0.271607,-1.001091,1.149573,-1.750256,-0.14124,0.719543,-0.551733,-1.523953,-0.831502
3,1.936826,-1.936826,0.921298,-0.48221,-0.531524,-0.22981,-0.507826,0.815013,-0.490781,-1.213136,1.213136,-0.271607,0.99891,-0.869888,-0.584074,0.303453,-0.574578,-0.551733,-1.131075,0.575128
4,-0.516309,0.516309,-1.085425,-0.48221,1.881384,-0.22981,-0.507826,0.815013,-0.490781,0.82431,-0.82431,-0.271607,-1.001091,1.149573,0.052026,0.970493,-0.930931,-0.551733,-0.581047,-0.831502


# O que foi feito até agora:
y = variável alvo

X = Variáveis previsoras com as variáveis categóricas transformadas em numéricas manualmente, e sem escalonamento.

X_padronizado = Variáveis preditoras com variáveis transformadas em numéricas, e escalonada.

X2 = variáveis previsoras com variáveis categóricas transformadas em numéricas usando LabelEncoder

X3 = variáveis previsoras transformadas usando LabelEncoder e OneHotEncoder, sem escalonamento

X3_escalonado = variáveis previsoras transformadas pelo LabelEncoder e OHE, e escalonadas.

# Redução de Dimensionalidade

O objetivo da reducao de dimensionalidade eh selecionar as melhores caracteristicas para o treinamento do modelo, atraves da analise de correlacoes entre as variaveis.

### Analise dos Componentes Principais (PCA)
__Selecao de Caracteristicas__: Seleciona os melhores atributos e utiliza sem transformacoes.

__Extracao de Caracteristicas__: Encontra os relacionamentos das melhores caracteristicas e cria novas caracteristicas.

Eh um algoritmo de aprendizagem nao superivisionada, e aplica-se em dados linearmente separaveis.

In [None]:
# Aplicação do PCA
pca = PCA(n_components=4)
X_pca = pca.fit_transform(X2)

In [None]:
X_pca.shape

(917, 4)

In [None]:
X_pca

array([[  44.01218603,   36.15137592,   10.64727631,   -9.47126222],
       [ -63.9918455 ,   13.93849233,   31.69212503,   -5.30052573],
       [  38.53844277,  -33.91712222,  -12.48837374,  -21.53860158],
       ...,
       [-113.3467467 ,  -23.48595247,   -2.51064362,    1.14935806],
       [  -9.1141306 ,   35.90083188,    4.81814643,    9.1238555 ],
       [ -70.01342608,   35.68741686,   12.10984845,  -10.51663588]])

In [None]:
# Razão das variáveis preditoras
pca.explained_variance_ratio_

array([0.72847929, 0.17183534, 0.08122019, 0.01767209])

In [None]:
# Soma da razão das variáveis preditoras
pca.explained_variance_ratio_.sum()

0.9992069186607356

### Kernel PCA
É um algortmo de aprendizagem não supervisionada que aplica-se em dados não linearmente separáveis.

In [None]:
kpca = KernelPCA(n_components=4, kernel='rbf')
X_kpca = kpca.fit_transform(X2)

In [None]:
X_kpca.shape

(917, 4)

In [None]:
X_kpca

array([[-0.00249772, -0.00290225, -0.0027372 , -0.00191636],
       [-0.00249877, -0.00290359, -0.00273854, -0.00191743],
       [-0.00249785, -0.00290241, -0.00273736, -0.00191649],
       ...,
       [-0.00249774, -0.00290227, -0.00273723, -0.00191638],
       [-0.00249793, -0.00290252, -0.00273748, -0.00191659],
       [-0.00249772, -0.00290225, -0.0027372 , -0.00191637]])

### Análise do Discriminante Linear (LDA)
É um algoritmo de aprendizado supervisionado, pois utiliza nossa variável target para fazer a seleção.

É aplicado em situações com muitos atributos previsores e também quando o atributo target possui muitas classes.


# Salvar as Variáveis

In [None]:
# Criação do arquivo
arq1 = open('heart.pkl', 'wb')

In [None]:
# Salvar a variável no arquivo
pickle.dump(y, arq1)

In [None]:
# Fechar o arquivo
arq1.close()

In [31]:
# Criando arquivos e salvando as variáveis.

arq2 = open('heart2.pkl', 'wb')
pickle.dump(X, arq2)
arq2.close()

arq3 = open('heart3.pkl', 'wb')
pickle.dump(X_padronizado, arq3)
arq3.close

arq4 = open('heart4.pkl', 'wb')
pickle.dump(X2, arq4)
arq4.close()

arq5 = open('heart5.pkl', 'wb')
pickle.dump(X3, arq5)
arq5.close()

arq6 = open('heart6.pkl', 'wb')
pickle.dump(X3_escalonado, arq6)
arq6.close()

In [30]:
X_padronizado.shape

(917, 11)

# Divisão entre treino e teste

### Parâmetros do train_test_split
- arrays: nome das variáveis alvo e previsoras
- test_size: tamanho em porcentagem do split
- train_size: tamanho em porcentagem do treinamento
- random_state = estado aleatório
- shuffle = embaralhamento aleatório dos dados.
- stratify = divide os dados de forma estratificada.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X3_escalonado_df, y, test_size=.3, random_state=0)

In [None]:
X_train.shape, X_test.shape

((641, 20), (276, 20))

In [None]:
y_train.shape, y_test.shape

((641,), (276,))