In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Heart Disease Prediction 

In this example we'll work with the famous "UCI Heart Disease" dataset. This dataset contains a set of attributes related to patients potentially affected by cardiovascular disease (CVD). CVD is one of the biggest causes of mortality, but it's estimated that up to 90% of CVD may be preventable; an early diagnosis can be essential in most cases, and AI can help achieve this goal.

## Binary Classification
Classification is one of the most common supervised learning tasks. In a lot of ML applications we need a model that is able to distinguish between two classes, i.e. a binary classifier.
With the help of the most common ML libraries (like sklearn) it's really easy to train a binary classifier; however, evaluating its performance is not so easy. We will explore model training and evaluation, along with concepts like the confusion matrix, precision, recall, etc.

In [None]:
# Load the raw heart-disease CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
data = pd.read_csv('../../data/raw/heart.csv')

In [None]:
# Preview the first rows to check which columns and values were loaded
data.head()

Attribute meaning:

1. age: age in years
2. sex: sex (1 = male; 0 = female)
3.  cp: chest pain type
    * Value 1: typical angina
    * Value 2: atypical angina
    * Value 3: non-anginal pain
    * Value 4: asymptomatic
4. trestbps: resting blood pressure (in mm Hg on admission to the hospital)
5. chol: serum cholesterol in mg/dl
6. fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
7. restecg: resting electrocardiographic results
    * Value 0: normal
    * Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    * Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
8. thalach: maximum heart rate achieved
9. exang: exercise induced angina (1 = yes; 0 = no)
10. oldpeak = ST depression induced by exercise relative to rest
11. slope: the slope of the peak exercise ST segment
    * Value 1: upsloping
    * Value 2: flat
    * Value 3: downsloping
12. ca: number of major vessels (0-3) colored by fluoroscopy
13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
14. target: target variable, this field refers to the presence of heart disease in the patient.



In [None]:
# Summarize each column's dtype and non-null count to spot missing values
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

The mean age is about 54 years; older adults are more likely than younger people to suffer from cardiovascular disease.

In [None]:
# Count how many rows fall into each target class
data["target"].value_counts()

The dataset is balanced.

A little bit dirty dataset <3

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
train_set, test_set = train_test_split(
    data,
    random_state=5,
    test_size=0.2,
)

In [None]:
# Plot a histogram of every numeric column in the training set.
# The trailing semicolon suppresses the array-of-Axes repr so only the figure is displayed.
train_set.hist(bins=50, figsize=(20, 15));

The features have different scales, so it's a good idea to perform standard scaling.

In [None]:
# Preprocessing pipeline:
#   * numeric columns     -> impute missing values with the column median, then standardize
#   * categorical columns -> one-hot encode
cat_attr = ["sex", "cp", "fbs", "restecg", "exang", "slope"]
# NOTE(review): "thal" is described in the attribute list as nominal codes
# (normal / fixed defect / reversible defect) but is scaled here as a number,
# which imposes an ordering on it. Consider moving it to cat_attr -- confirm.
num_attr = ["age", "trestbps", "chol", "thalach", "oldpeak", "ca", "thal"]

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler())
])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attr),
    # handle_unknown="ignore": a category that never appeared in the rows used for
    # fitting is encoded as all zeros at transform time instead of raising an error.
    # The output for categories seen during fitting is unchanged.
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attr)
])

In [None]:
# Separate the feature columns from the label column of the training split
x_train = train_set.drop(columns="target")
y_train = train_set["target"]

In [None]:
# Fit the preprocessing on the training features and transform them in one step
x_train_pr = full_pipeline.fit_transform(x_train)

# Model Training and Evaluation

Let's train a simple binary classifier as baseline, a Stochastic Gradient Descent classifier (SGD)

In [None]:
# Baseline: linear classifier trained with stochastic gradient descent (seed fixed for reproducibility)
sgd_clf = SGDClassifier(random_state=42)

Usually a good way to evaluate a model is to use cross-validation

In [None]:
# 3-fold cross-validated accuracy of the SGD baseline, averaged over the folds
scores = cross_val_score(
    estimator=sgd_clf,
    X=x_train_pr,
    y=y_train,
    scoring="accuracy",
    cv=3,
)
np.mean(scores)

It's not a great value. I think it's a better idea to evaluate our model using other tools like the confusion matrix

In [None]:
# Out-of-fold predictions: each row is predicted by a model fitted on the other folds
preds = cross_val_predict(
    estimator=sgd_clf,
    X=x_train_pr,
    y=y_train,
    cv=3,
)

In [None]:
# Confusion matrix of the out-of-fold SGD predictions (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true=y_train, y_pred=preds)
ConfusionMatrixDisplay(confusion_matrix=cm).plot()

Let's see precision, recall and F1 score

In [None]:
# Precision, recall and F1 of the SGD baseline, computed on the out-of-fold predictions
print("Precision: ", precision_score(y_true=y_train, y_pred=preds))
print("Recall: ", recall_score(y_true=y_train, y_pred=preds))
print("F1 score: ", f1_score(y_true=y_train, y_pred=preds))

In [None]:
# Another useful metric is the ROC AUC score.
# ROC AUC ranks samples by a continuous score, so it must be given decision scores
# rather than hard 0/1 predictions (hard labels collapse the curve to a single
# operating point). Out-of-fold decision-function values are used here.
sgd_scores = cross_val_predict(
    sgd_clf, x_train_pr, y_train, cv=3, method="decision_function"
)
print("roc auc score:", roc_auc_score(y_train, sgd_scores))

I think it's a better idea to try a more powerful model like a RandomForestClassifier

In [None]:
# Random forest evaluated the same way as the baseline: 3-fold out-of-fold predictions
rf_clf = RandomForestClassifier(random_state=42)
rf_preds = cross_val_predict(
    estimator=rf_clf,
    X=x_train_pr,
    y=y_train,
    cv=3,
)

In [None]:
# Confusion matrix of the out-of-fold random forest predictions
cm = confusion_matrix(y_true=y_train, y_pred=rf_preds)
ConfusionMatrixDisplay(confusion_matrix=cm).plot()

In [None]:
# Print the cross-validated metrics of the random forest
print("Precision: ", precision_score(y_train, rf_preds))
print("Recall: ", recall_score(y_train, rf_preds))
print("F1 score: ", f1_score(y_train, rf_preds))

# ROC AUC needs a continuous score, not hard 0/1 predictions: use the out-of-fold
# predicted probability of the positive class (second column of predict_proba).
rf_scores = cross_val_predict(
    rf_clf, x_train_pr, y_train, cv=3, method="predict_proba"
)[:, 1]
print("roc auc score:", roc_auc_score(y_train, rf_scores))

Slightly better than the SGD classifier. Let's train the random forest classifier on the full training set and evaluate it on the test set.

In [None]:
# Final model: a fresh random forest fitted on the whole preprocessed training set
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X=x_train_pr, y=y_train)

In [None]:
# Separate the feature columns from the label column of the held-out test split
x_test = test_set.drop(columns="target")
y_test = test_set["target"]

In [None]:
# Apply the preprocessing already fitted on the training set (transform only, no refit),
# then predict the test labels with the final forest
x_test_pr = full_pipeline.transform(X=x_test)
final_preds = forest_clf.predict(X=x_test_pr)

In [None]:
# Print the final metrics on the held-out test set
print("Precision: ", precision_score(y_test, final_preds))
print("Recall: ", recall_score(y_test, final_preds))
print("F1 score: ", f1_score(y_test, final_preds))

# ROC AUC needs a continuous score, not hard 0/1 predictions: use the predicted
# probability of the positive class (second column of predict_proba).
final_scores = forest_clf.predict_proba(x_test_pr)[:, 1]
print("roc auc score:", roc_auc_score(y_test, final_scores))

This was a simple example of how to evaluate a classifier, and the result is good! Cross-validation is a good method for model evaluation, but since we split the training set into 3 folds, each model was trained on less data, which limited its performance. With the full training set we reach a good result!