In [1]:
import pandas as pd
import numpy as np

In [4]:
new_dataset = pd.read_csv('../data/heart.csv')

In [5]:
new_dataset.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [6]:
new_dataset.shape

(918, 12)

In [7]:
new_dataset.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [8]:
new_dataset['HeartDisease'].value_counts()

HeartDisease
1    508
0    410
Name: count, dtype: int64

In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [10]:
new_dataset.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

## 1.

In [13]:
new_dataset = new_dataset.dropna()

le = LabelEncoder()
for column in new_dataset.columns:
    if new_dataset[column].dtype == object:
        new_dataset[column] = le.fit_transform(new_dataset[column])

X_train, X_test, Y_train, Y_test = train_test_split(new_dataset.drop('HeartDisease', axis=1), new_dataset['HeartDisease'], test_size=0.25, stratify=new_dataset['HeartDisease'], random_state=42)
X_train.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
637,43,1,0,115,303,0,1,181,0,1.2,1
541,76,1,2,104,113,0,0,120,0,3.5,0
570,56,1,0,128,223,0,2,119,1,2.0,0
611,62,1,3,135,139,0,2,137,0,0.2,2
685,61,1,0,120,260,0,1,140,1,3.6,1


In [12]:
Y_test

311    1
52     0
466    1
310    0
735    1
      ..
566    0
442    1
416    1
428    1
271    0
Name: HeartDisease, Length: 230, dtype: int64

## 2.

In [18]:
models = {'Dummy': DummyClassifier(strategy="most_frequent"),
          'Decision Tree': DecisionTreeClassifier(),
          'KNN': KNeighborsClassifier()}

def test_model(name, model, traindata, testdata, trainlabels, testlabels):
  model.fit(traindata, trainlabels)
  trainpred = model.predict(traindata)
  testpred = model.predict(testdata)
  print(f" ===== {name} =====")
  print(f"Accuracy on train {accuracy_score(trainlabels, trainpred)}")
  print(f"F1score on train {f1_score(trainlabels, trainpred)}")
  print(f"Accuracy on test {accuracy_score(testlabels, testpred)}")
  print(f"F1score on test {f1_score(testlabels, testpred)}")
  print("Confusion matrix on test")
  print(confusion_matrix(testlabels, testpred))

for name, model in models.items():
  test_model(name, model, X_train, X_test, Y_train, Y_test)

 ===== Dummy =====
Accuracy on train 0.5537790697674418
F1score on train 0.7128157156220767
Accuracy on test 0.5521739130434783
F1score on test 0.711484593837535
Confusion matrix on test
[[  0 103]
 [  0 127]]
 ===== Decision Tree =====
Accuracy on train 1.0
F1score on train 1.0
Accuracy on test 0.8043478260869565
F1score on test 0.8178137651821862
Confusion matrix on test
[[ 84  19]
 [ 26 101]]
 ===== KNN =====
Accuracy on train 0.7790697674418605
F1score on train 0.8025974025974026
Accuracy on test 0.691304347826087
F1score on test 0.7215686274509804
Confusion matrix on test
[[67 36]
 [35 92]]


## 3.

In [19]:
from sklearn.model_selection import cross_val_score

In [20]:
Y = new_dataset["HeartDisease"]
X = new_dataset.drop("HeartDisease", axis=1)

In [21]:
for name, model in models.items():
  # Uso X e Y interi perchè farà lui la divisione in test e train
  result = cross_val_score(model, X, Y, cv=10)
  print(f" ===== {name} =====")
  print(f"Media {result.mean()}")
  print(result)

 ===== Dummy =====
Media 0.553368370759675
[0.55434783 0.55434783 0.55434783 0.55434783 0.55434783 0.55434783
 0.55434783 0.55434783 0.54945055 0.54945055]
 ===== Decision Tree =====
Media 0.7787386526516962
[0.76086957 0.81521739 0.7826087  0.81521739 0.82608696 0.79347826
 0.80434783 0.75       0.7032967  0.73626374]
 ===== KNN =====
Media 0.7035236502627807
[0.65217391 0.69565217 0.85869565 0.80434783 0.72826087 0.60869565
 0.7826087  0.66304348 0.61538462 0.62637363]


## 4.

In [22]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
columns_to_scale = ['RestingBP', 'Cholesterol', 'MaxHR']
new_dataset[columns_to_scale] = scaler.fit_transform(new_dataset[columns_to_scale])

In [23]:
new_dataset

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,0.70,0.479270,0,1,0.788732,0,0.0,2,0
1,49,0,2,0.80,0.298507,0,1,0.676056,0,1.0,1,1
2,37,1,1,0.65,0.469320,0,2,0.267606,0,0.0,2,0
3,48,0,0,0.69,0.354892,0,1,0.338028,1,1.5,1,1
4,54,1,2,0.75,0.323383,0,1,0.436620,0,0.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,3,0.55,0.437811,0,1,0.507042,0,1.2,1,1
914,68,1,0,0.72,0.320066,1,1,0.570423,0,3.4,1,1
915,57,1,0,0.65,0.217247,0,1,0.387324,1,1.2,1,1
916,57,0,1,0.65,0.391376,0,0,0.802817,0,0.0,1,1


In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(new_dataset.drop('HeartDisease', axis=1), new_dataset['HeartDisease'], test_size=0.25, stratify=new_dataset['HeartDisease'], random_state=42)

decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, Y_train)
Y_pred = decision_tree_model.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
print("Accuratezza del modello Decision Tree con le feature più correlate:", accuracy)

Accuratezza del modello Decision Tree con le feature più correlate: 0.808695652173913


## 5.

In [31]:
new_dataset.corr()['HeartDisease'].sort_values(ascending=False)#[0:6].index.tolist()

HeartDisease      1.000000
ExerciseAngina    0.494282
Oldpeak           0.403951
Sex               0.305445
Age               0.282039
FastingBS         0.267291
RestingBP         0.107589
RestingECG        0.057384
Cholesterol      -0.232741
ChestPainType    -0.386828
MaxHR            -0.400421
ST_Slope         -0.558771
Name: HeartDisease, dtype: float64

In [25]:
correlation_matrix = new_dataset.corr()
top_correlated_features = correlation_matrix['HeartDisease'].sort_values(ascending=False)[0:6].index.tolist()
new_df = new_dataset[['HeartDisease'] + top_correlated_features]

X_train, X_test, Y_train, Y_test = train_test_split(new_df.drop('HeartDisease', axis=1), new_df['HeartDisease'], test_size=0.25, stratify=new_df['HeartDisease'], random_state=42)

decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, Y_train)
Y_pred = decision_tree_model.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
print("Accuratezza del modello Decision Tree con le feature più correlate:", accuracy)

Accuratezza del modello Decision Tree con le feature più correlate: 0.7


## 6.

In [33]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20]
}

decision_tree_model = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator=decision_tree_model, param_grid=param_grid, cv=5, scoring='accuracy')

new_dataset = pd.read_csv("../data/heart.csv")
new_dataset = new_dataset.dropna()

le = LabelEncoder()
for column in new_dataset.columns:
    if new_dataset[column].dtype == object:
        new_dataset[column] = le.fit_transform(new_dataset[column])

X_train, X_test, Y_train, Y_test = train_test_split(new_dataset.drop('HeartDisease', axis=1), new_dataset['HeartDisease'], test_size=0.25, stratify=new_dataset['HeartDisease'], random_state=42)

grid_search.fit(X_train, Y_train)
print("Migliori parametri:", grid_search.best_params_)

best_decision_tree_model = grid_search.best_estimator_
Y_pred = best_decision_tree_model.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuratezza del modello Decision Tree con i migliori parametri:", accuracy)

Migliori parametri: {'criterion': 'entropy', 'max_depth': 5}
Accuratezza del modello Decision Tree con i migliori parametri: 0.8347826086956521
