In [1]:
import pandas as pd

csv_path = "heart.csv"
heartDisease = pd.read_csv(csv_path)

In [2]:
heartDisease.head()
#RestingBP = Resting Blood Pressur
#FastingBS = Fasting blood sugar
#ExerciseAngina = Heart Pain during exercise

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

encoder = OneHotEncoder(sparse_output=False)
heartDisease_categorical = heartDisease[['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']]
heartDiseaseEncoded = encoder.fit_transform(heartDisease_categorical)
heartDiseaseEncoded_df = pd.DataFrame(heartDiseaseEncoded, columns=encoder.get_feature_names_out(heartDisease_categorical.columns))
heartDiseaseToTrain = pd.concat([heartDisease.drop(heartDisease_categorical, axis=1), heartDiseaseEncoded_df], axis=1)
print(heartDiseaseToTrain)

train_set, test_set = train_test_split(heartDiseaseToTrain, test_size=0.2 ,random_state=42)

     Age  RestingBP  Cholesterol  FastingBS  MaxHR  Oldpeak  HeartDisease  \
0     40        140          289          0    172      0.0             0   
1     49        160          180          0    156      1.0             1   
2     37        130          283          0     98      0.0             0   
3     48        138          214          0    108      1.5             1   
4     54        150          195          0    122      0.0             0   
..   ...        ...          ...        ...    ...      ...           ...   
913   45        110          264          0    132      1.2             1   
914   68        144          193          1    141      3.4             1   
915   57        130          131          0    115      1.2             1   
916   57        130          236          0    174      0.0             1   
917   38        138          175          0    173      0.0             0   

     Sex_F  Sex_M  ChestPainType_ASY  ...  ChestPainType_NAP  \
0      0.0 

In [4]:
from sklearn.ensemble import RandomForestRegressor

fr = RandomForestRegressor()
train_heart_disease = train_set.drop("HeartDisease", axis=1)
train_label_heart_disease = train_set["HeartDisease"].copy()

In [5]:
from sklearn.model_selection import cross_val_score

fr.fit(train_heart_disease, train_label_heart_disease)

In [6]:
from sklearn.metrics import accuracy_score
import numpy as np

test_heart_disease = test_set.drop("HeartDisease", axis=1)
test_label_heart_disease = test_set["HeartDisease"].copy()
predictions = fr.predict(test_heart_disease)
predictions = np.around(predictions)
accuracy = accuracy_score(predictions, test_label_heart_disease)

In [7]:
print(accuracy)

0.8641304347826086


In [8]:
score = cross_val_score(fr, train_heart_disease, train_label_heart_disease, cv=5)
print(score) #Overfitting nos dados

[0.53451259 0.56905    0.45662168 0.55641526 0.48695342]


In [None]:
#Fazendo GridSearch
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],         # Número de árvores
    'max_depth': [10, 20, None],            # Profundidade máxima das árvores
    'min_samples_split': [2, 5, 10],        # Mínimo de amostras para dividir um nó
    'min_samples_leaf': [1, 2, 4],          # Mínimo de amostras em uma folha
    'max_features': ['auto', 'sqrt', 'log2'],  # Máximo de características por divisão
    'bootstrap': [True, False]              # Se usar bootstrap
}

gridSearch = GridSearchCV(estimator=fr, param_grid=param_grid, cv=5, scoring="neg_mean_squared_error")
gridSearch.fit(train_heart_disease, train_label_heart_disease)

print("Melhores parâmetros encont‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’rados:", gridSearch.best_params_)

newRF = gridSearch.best_estimator_

In [None]:
newPredict = newRF.predict(test_heart_disease)
newPredict = np.around(newPredict)
accuracy = accuracy_score(newPredict, test_label_heart_disease)
print(accuracy)

In [None]:
newScore = cross_val_score(newRF, train_heart_disease, train_label_heart_disease, cv=5)
print(newScore) #Leve aumentada no score usando os melhores parâmetros

In [None]:
#Testando KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

In [None]:
knn.fit(train_heart_disease, train_label_heart_disease)

In [None]:
knnPredictions = knn.predict(test_heart_disease)

In [None]:
knnScore = accuracy_score(knnPredictions, test_label_heart_disease)
print(knnScore)

In [None]:
knnCrossScore = cross_val_score(knn, train_heart_disease, train_label_heart_disease, cv=5)
print(knnCrossScore) #Menos overfitting

In [None]:
knn_param_grid = {
    'n_neighbors': [10, 15, 20],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 30, 50],
    'metric' : ['euclidean', 'manhattan', 'minkowski']
}

knnGridSearch = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=5, scoring="neg_mean_squared_error")
knnGridSearch.fit(train_heart_disease, train_label_heart_disease)
print(knnGridSearch.best_params_)

In [None]:
best_knn = knnGridSearch.best_estimator_

In [None]:
bestKnnPredictions = best_knn.predict(test_heart_disease)
bestKnnAccuracy = accuracy_score(bestKnnPredictions, test_label_heart_disease)
print(bestKnnAccuracy) #Leve melhora

In [None]:
bestKnnCrossScore = cross_val_score(best_knn, train_heart_disease, train_label_heart_disease, cv=5)
print(bestKnnCrossScore)
#Conclusão: KNN apesar de ter uma accuracy menor no teste, obteve uma melhor pontuação no cross-validation

In [None]:
#Fazendo o escalonamento de algumas colunas

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

colunas_escalonadas = ["RestingBP", "Cholesterol", "MaxHR"]
transformer = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), colunas_escalonadas)
    ],
    force_int_remainder_cols=False
)

pipelineKnn = Pipeline(steps=[
     ('preprocessamento', transformer),
     ('best_knn', best_knn)
])

pipelineKnn.fit(train_heart_disease, train_label_heart_disease)

In [None]:
new_best_knn_predicts = pipelineKnn.predict(test_heart_disease)
new_best_knn_score = accuracy_score(bestKnnPredictions, test_label_heart_disease)
print(new_best_knn_score)

In [None]:
newBestKnnCrossScore = cross_val_score(new, train_heart_disease, train_label_heart_disease, cv=5)
print(newBestKnnCrossScore)