# Redes Neurais com Classificador

Este projeto tem por objetivo desenvolver um algoritmo de Machine Learning para prever a tendência de uma pessoa desenvolver algum tipo de doença cardíaca com base em alguns fatores clínicos e laboratoriais.

In [3]:
# dados
import pandas as pd
import numpy as np

df = pd.read_csv('heart_tratado.csv', sep=';', encoding='iso 8859-1')
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
912,45,M,TA,110,264.0,0,Normal,132,N,1.2,Flat,1
913,68,M,ASY,144,193.0,1,Normal,141,N,3.4,Flat,1
914,57,M,ASY,130,131.0,0,Normal,115,Y,1.2,Flat,1
915,57,F,ATA,130,236.0,0,LVH,174,N,0.0,Flat,1


In [4]:
# verificando tamanho de colunas e registros
df.shape

(917, 12)

In [5]:
# verificando tipos de dados
df.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol       float64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [6]:
# verificando valores ausentes
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [7]:
# verificando dados duplicados
df.duplicated().any()

False

In [8]:
# verificando info do dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 917 entries, 0 to 916
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             917 non-null    int64  
 1   Sex             917 non-null    object 
 2   ChestPainType   917 non-null    object 
 3   RestingBP       917 non-null    int64  
 4   Cholesterol     917 non-null    float64
 5   FastingBS       917 non-null    int64  
 6   RestingECG      917 non-null    object 
 7   MaxHR           917 non-null    int64  
 8   ExerciseAngina  917 non-null    object 
 9   Oldpeak         917 non-null    float64
 10  ST_Slope        917 non-null    object 
 11  HeartDisease    917 non-null    int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 86.1+ KB


In [14]:
# selecionar colunas categóricas
colunas_categoricas = df.select_dtypes(include=['category', 'object'])
colunas_categoricas

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
0,M,ATA,Normal,N,Up
1,F,NAP,Normal,N,Flat
2,M,ATA,ST,N,Up
3,F,ASY,Normal,Y,Flat
4,M,NAP,Normal,N,Up
...,...,...,...,...,...
912,M,TA,Normal,N,Flat
913,M,ASY,Normal,N,Flat
914,M,ASY,Normal,Y,Flat
915,F,ATA,LVH,N,Flat


In [19]:
# transformando dados categoricos em numericos
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_array = encoder.fit_transform(colunas_categoricas)
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(colunas_categoricas.columns))
encoded_df

Unnamed: 0,Sex_0,Sex_1,ChestPainType_0,ChestPainType_1,ChestPainType_2,ChestPainType_3,RestingECG_0,RestingECG_1,RestingECG_2,ExerciseAngina_0,ExerciseAngina_1,ST_Slope_0,ST_Slope_1,ST_Slope_2
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
913,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
914,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
915,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [20]:
# concatenando com o DataFrame original (sem as colunas categóricas)
df2 = pd.concat([df.drop(columns=colunas_categoricas.columns), encoded_df], axis=1)
df2

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_0,Sex_1,ChestPainType_0,...,ChestPainType_2,ChestPainType_3,RestingECG_0,RestingECG_1,RestingECG_2,ExerciseAngina_0,ExerciseAngina_1,ST_Slope_0,ST_Slope_1,ST_Slope_2
0,40,140,289.0,0,172,0.0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,160,180.0,0,156,1.0,1,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,130,283.0,0,98,0.0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,48,138,214.0,0,108,1.5,1,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,54,150,195.0,0,122,0.0,0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912,45,110,264.0,0,132,1.2,1,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
913,68,144,193.0,1,141,3.4,1,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
914,57,130,131.0,0,115,1.2,1,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
915,57,130,236.0,0,174,0.0,1,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [26]:
# separando dados
X = df2.drop('HeartDisease', axis=1).values
y = df2.HeartDisease.values

In [29]:
# verificando dados 
X, y

(array([[ 40., 140., 289., ...,   0.,   0.,   1.],
        [ 49., 160., 180., ...,   0.,   1.,   0.],
        [ 37., 130., 283., ...,   0.,   0.,   1.],
        ...,
        [ 57., 130., 131., ...,   0.,   1.,   0.],
        [ 57., 130., 236., ...,   0.,   1.,   0.],
        [ 38., 138., 175., ...,   0.,   0.,   1.]]),
 array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
        1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0,

In [33]:
# separando bases de treino e teste
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((641, 20), (276, 20))

In [58]:
# padronizando dados
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled.shape, X_test_scaled.shape


((641, 20), (276, 20))

In [59]:
# modelos
from sklearn.neural_network import MLPClassifier

# dados não normalizados
modelo = MLPClassifier(hidden_layer_sizes=(7), activation='relu', solver='adam', max_iter =800,
                              tol=0.0001, random_state = 3)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

# dados normalizados
modelo_scaled = MLPClassifier(hidden_layer_sizes=(7), activation='relu', solver='adam', max_iter =800,
                              tol=0.0001, random_state = 3)
modelo_scaled.fit(X_train_scaled, y_train)
y_pred_scaled = modelo_scaled.predict(X_test_scaled)

In [63]:
# aplicando validação cruzada
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits = 30, shuffle=True, random_state = 5)

modelo_cross = MLPClassifier(hidden_layer_sizes=(7), activation='relu', solver='adam', max_iter =800,
                              tol=0.0001, random_state = 3)

resultado = cross_val_score(modelo_cross,X, y, cv = kfold)
resultado_scaled = cross_val_score(modelo_cross, scaler.fit_transform(X), y, cv=kfold)

In [65]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print(f'''
Modelo não normalizado: 
-------------------------------------------------------------------------------

    - Acurácia treino: {accuracy_score(y_train, modelo.predict(X_train)) * 100:.2f}%
    - Acurácia teste: {accuracy_score(y_test, y_pred) * 100:.2f}%
    - Acurácia média CV: {resultado.mean()*100:.2f}%
    - Matriz treino: {confusion_matrix(y_train, modelo.predict(X_train)).tolist()}
    - Matriz teste: {confusion_matrix(y_test, y_pred).tolist()}
    
    Classificação modelo:
    {classification_report(y_test, y_pred)}
    
Modelo normalizado: 
-------------------------------------------------------------------------------

    - Acurácia treino: {accuracy_score(y_train, modelo_scaled.predict(X_train_scaled)) * 100:.2f}%
    - Acurácia teste: {accuracy_score(y_test, y_pred_scaled) * 100:.2f}%
    - Acurácia média CV: {resultado_scaled.mean()*100:.2f}%
    - Matriz treino: {confusion_matrix(y_train, modelo_scaled.predict(X_train_scaled)).tolist()}
    - Matriz teste: {confusion_matrix(y_test, y_pred_scaled).tolist()}

    Classificação modelo:
    {classification_report(y_test, y_pred_scaled)}
''')



Modelo não normalizado: 
-------------------------------------------------------------------------------

    - Acurácia treino: 85.65%
    - Acurácia teste: 83.70%
    - Acurácia média CV: 85.83%
    - Matriz treino: [[260, 43], [49, 289]]
    - Matriz teste: [[94, 13], [32, 137]]
    
    Classificação modelo:
                  precision    recall  f1-score   support

           0       0.75      0.88      0.81       107
           1       0.91      0.81      0.86       169

    accuracy                           0.84       276
   macro avg       0.83      0.84      0.83       276
weighted avg       0.85      0.84      0.84       276

    
Modelo normalizado: 
-------------------------------------------------------------------------------

    - Acurácia treino: 88.46%
    - Acurácia teste: 85.14%
    - Acurácia média CV: 84.73%
    - Matriz treino: [[262, 41], [33, 305]]
    - Matriz teste: [[93, 14], [27, 142]]

    Classificação modelo:
                  precision    recall  f1-s