# Data Processing

In [1]:
# import library
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# import data
df = pd.read_csv('heart_clean.csv')
df.head()

# note:
# heart_clean.csv is heart.csv that has been cleaned in heart_diagnosis_EDA.ipynb

Unnamed: 0,age,sex,chest_pain,systolic,cholesterol,fasting_blood_sugar,restECG,max_heart_rate,exercise_induced_angina,st_depression,slope,number_vessel,thallium_stress,diagnose
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# correlation between features and target
df.corr()['diagnose'].sort_values(ascending=False)[1:]

max_heart_rate             0.426655
chest_pain                 0.423425
slope                      0.337825
restECG                    0.131716
fasting_blood_sugar       -0.004680
cholesterol               -0.076541
systolic                  -0.148922
age                       -0.225453
sex                       -0.285322
thallium_stress           -0.364399
exercise_induced_angina   -0.425085
st_depression             -0.428804
number_vessel             -0.467158
Name: diagnose, dtype: float64

# Feature Engineering

In [4]:
# feature selection

# all features has moderate correlation level, except the fasting_blood sugar
# cholesterol has low correlation with diagnose, but EDA says it matters
df = df.drop(columns='fasting_blood_sugar')

In [5]:
# one hot encoding non-binary nominal data

# restECG
df = pd.get_dummies(data=df, columns=['restECG'], prefix_sep='_')

# thallium_stress
df = pd.get_dummies(data=df, columns=['thallium_stress'], prefix_sep='_')

df.head()

Unnamed: 0,age,sex,chest_pain,systolic,cholesterol,max_heart_rate,exercise_induced_angina,st_depression,slope,number_vessel,diagnose,restECG_0,restECG_1,restECG_2,thallium_stress_1,thallium_stress_2,thallium_stress_3
0,63,1,3,145,233,150,0,2.3,0,0,1,1,0,0,1,0,0
1,37,1,2,130,250,187,0,3.5,0,0,1,0,1,0,0,1,0
2,41,0,1,130,204,172,0,1.4,2,0,1,1,0,0,0,1,0
3,56,1,1,120,236,178,0,0.8,2,0,1,0,1,0,0,1,0
4,57,0,0,120,354,163,1,0.6,2,0,1,0,1,0,0,1,0


In [6]:
# correlation between features and target
df.corr()['diagnose'].sort_values(ascending=False)[1:]

thallium_stress_2          0.530032
max_heart_rate             0.426655
chest_pain                 0.423425
slope                      0.337825
restECG_1                  0.170030
restECG_2                 -0.068235
cholesterol               -0.076541
thallium_stress_1         -0.105799
systolic                  -0.148922
restECG_0                 -0.154302
age                       -0.225453
sex                       -0.285322
exercise_induced_angina   -0.425085
st_depression             -0.428804
number_vessel             -0.467158
thallium_stress_3         -0.489046
Name: diagnose, dtype: float64

In [7]:
# feature scaling

# we need to scale some features by using Robust Scaler to minimize the effect of outliers
from sklearn.preprocessing import RobustScaler
df['systolic'] = RobustScaler().fit_transform(df[['systolic']])
df['cholesterol'] = RobustScaler().fit_transform(df[['cholesterol']])
df['max_heart_rate'] = RobustScaler().fit_transform(df[['max_heart_rate']])

df.head()

Unnamed: 0,age,sex,chest_pain,systolic,cholesterol,max_heart_rate,exercise_induced_angina,st_depression,slope,number_vessel,diagnose,restECG_0,restECG_1,restECG_2,thallium_stress_1,thallium_stress_2,thallium_stress_3
0,63,1,3,0.75,-0.14786,-0.075758,0,2.3,0,0,1,1,0,0,1,0,0
1,37,1,2,0.0,0.116732,1.045455,0,3.5,0,0,1,0,1,0,0,1,0
2,41,0,1,0.0,-0.599222,0.590909,0,1.4,2,0,1,1,0,0,0,1,0
3,56,1,1,-0.5,-0.101167,0.772727,0,0.8,2,0,1,0,1,0,0,1,0
4,57,0,0,-0.5,1.735409,0.318182,1,0.6,2,0,1,0,1,0,0,1,0


# Data Processing into Machine Learning

In [8]:
# data splitting into fetaure and target
X = df.drop(columns='diagnose')
y = df['diagnose']

In [9]:
# data splitting into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=0.75, random_state=99)
df_train = pd.concat([X_train, y_train], axis=1)

In [10]:
# checking the balance of values
y_train.value_counts()

# the data is imbalance, there is gap between the number of heart disease and non-heart disease
# majority data: diagnosed heart disease
# minority data: diagnosed non-heart disease

1    120
0    102
Name: diagnose, dtype: int64

In [11]:
# do SMOTE to synthesize data of the minority
import imblearn
from imblearn.over_sampling import SMOTE

sm = SMOTE()
X_sm, y_sm = sm.fit_sample(X_train, y_train)
df_sm = pd.concat([X_sm, y_sm], axis=1)

# check if the data has been balanced
y_sm.value_counts()

1    120
0    120
Name: diagnose, dtype: int64

# Machine Learning

In [12]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In this machine learning process we need to focus on the recall_score value. We just do not want those with heart-disease risk is missed from observation, because it can risks patient's life.

## Logistic Regression (base)

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
LR = LogisticRegression()
LR.fit(X_sm, y_sm)
y_pred_LR = LR.predict(X_test)

In [15]:
acc_LR = accuracy_score(y_test, y_pred_LR)
pre_LR = precision_score(y_test, y_pred_LR)
rec_LR = recall_score(y_test, y_pred_LR)
f1_LR = f1_score(y_test, y_pred_LR)
print(classification_report(y_test, y_pred_LR))

              precision    recall  f1-score   support

           0       0.89      0.74      0.81        34
           1       0.80      0.93      0.86        40

    accuracy                           0.84        74
   macro avg       0.85      0.83      0.83        74
weighted avg       0.85      0.84      0.84        74



In [16]:
cm_LR = confusion_matrix(y_test, y_pred_LR, labels = [1,0])
df_LR = pd.DataFrame(data = cm_LR , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_LR

Unnamed: 0,Pred 1,Pred 0
Act 1,37,3
Act 0,9,25


### Logistic Regression (Hyperparameter Tuning)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [54]:
LR_tuning = LogisticRegression()
para_LR = {'C': [1, 0.001, 0.15, 0.01, 0.1],
            'penalty': ['l2', 'l1', 'elasticnet']}
LR_tuning = GridSearchCV(estimator = LR_tuning, param_grid = para_LR, cv = 3, n_jobs = -1 , verbose = 1, scoring = 'recall')

In [55]:
LR_tuning.fit(X_sm, y_sm)
LR_tuned = LR_tuning.best_estimator_
y_pred_LR_tuned = LR_tuned.predict(X_test)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    1.4s finished


In [56]:
acc_LR_tuned = accuracy_score(y_test, y_pred_LR_tuned)
pre_LR_tuned = precision_score(y_test, y_pred_LR_tuned)
rec_LR_tuned = recall_score(y_test, y_pred_LR_tuned)
f1_LR_tuned = f1_score(y_test, y_pred_LR_tuned)
print(classification_report(y_test, y_pred_LR_tuned))

              precision    recall  f1-score   support

           0       0.89      0.71      0.79        34
           1       0.79      0.93      0.85        40

    accuracy                           0.82        74
   macro avg       0.84      0.82      0.82        74
weighted avg       0.83      0.82      0.82        74



In [57]:
cm_LR_tuned = confusion_matrix(y_test, y_pred_LR_tuned, labels = [1,0])
df_LR_tuned = pd.DataFrame(data = cm_LR_tuned , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_LR_tuned

Unnamed: 0,Pred 1,Pred 0
Act 1,37,3
Act 0,10,24


### K-Nearest Neighbors (Base Model)

In [22]:
from sklearn.neighbors import KNeighborsClassifier

In [23]:
KNN = KNeighborsClassifier()
KNN.fit(X_sm, y_sm)
y_pred_KNN = KNN.predict(X_test)

In [24]:
acc_KNN = accuracy_score(y_test, y_pred_KNN)
pre_KNN = precision_score(y_test, y_pred_KNN)
rec_KNN = recall_score(y_test, y_pred_KNN)
f1_KNN = f1_score(y_test, y_pred_KNN)
print(classification_report(y_test, y_pred_KNN))

              precision    recall  f1-score   support

           0       0.84      0.62      0.71        34
           1       0.73      0.90      0.81        40

    accuracy                           0.77        74
   macro avg       0.79      0.76      0.76        74
weighted avg       0.78      0.77      0.76        74



In [25]:
cm_KNN = confusion_matrix(y_test, y_pred_KNN, labels = [1,0])
df_KNN = pd.DataFrame(data = cm_KNN, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_KNN

Unnamed: 0,Pred 1,Pred 0
Act 1,36,4
Act 0,13,21


## KNN Classifier - Hyperparameter Tuning

In [26]:
from sklearn.model_selection import GridSearchCV

In [32]:
KNN_tuning = KNeighborsClassifier()
para_KNN = {'n_neighbors': [5, 3 , 10, 20],
            'weights': ['uniform', 'distance'],
            'p': [2,1, 0.2],
            'algorithm': ['auto', 'kd_tree']}
KNN_tuning = GridSearchCV(estimator = KNN_tuning, param_grid = para_KNN, cv = 3, n_jobs = -1 , verbose = 1, scoring = 'recall')

In [33]:
KNN_tuning.fit(X_sm, y_sm)
KNN_tuned = KNN_tuning.best_estimator_
y_pred_KNN_tuned = KNN_tuned.predict(X_test)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:    0.4s finished


In [34]:
acc_KNN_tuned = accuracy_score(y_test, y_pred_KNN_tuned)
rec_KNN_tuned = recall_score(y_test, y_pred_KNN_tuned)
pre_KNN_tuned = precision_score(y_test, y_pred_KNN_tuned)
f1_KNN_tuned = f1_score(y_test, y_pred_KNN_tuned)
print(classification_report(y_test, y_pred_KNN_tuned))

              precision    recall  f1-score   support

           0       0.82      0.53      0.64        34
           1       0.69      0.90      0.78        40

    accuracy                           0.73        74
   macro avg       0.76      0.71      0.71        74
weighted avg       0.75      0.73      0.72        74



In [35]:
cm_KNN_tuned = confusion_matrix(y_test, y_pred_KNN_tuned, labels = [1,0])
df_KNN_tuned = pd.DataFrame(data = cm_KNN_tuned , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_KNN_tuned

Unnamed: 0,Pred 1,Pred 0
Act 1,36,4
Act 0,16,18


### Random Forest (Base Model)

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
RFC = RandomForestClassifier(random_state=99)
RFC.fit(X_sm, y_sm)
y_pred_RFC = RFC.predict(X_test)

In [38]:
acc_RFC = accuracy_score(y_test, y_pred_RFC)
pre_RFC = precision_score(y_test, y_pred_RFC)
rec_RFC = recall_score(y_test, y_pred_RFC)
f1_RFC = f1_score(y_test, y_pred_RFC)
print(classification_report(y_test, y_pred_RFC))

              precision    recall  f1-score   support

           0       0.87      0.76      0.81        34
           1       0.82      0.90      0.86        40

    accuracy                           0.84        74
   macro avg       0.84      0.83      0.83        74
weighted avg       0.84      0.84      0.84        74



In [39]:
cm_RFC = confusion_matrix(y_test, y_pred_RFC, labels = [1,0])
df_RFC = pd.DataFrame(data = cm_RFC, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_RFC

Unnamed: 0,Pred 1,Pred 0
Act 1,36,4
Act 0,8,26


### Random Forest (Hyperparameter Tuning)

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [41]:
RFC_tuning = RandomForestClassifier(random_state=99)
para_RFC = {'n_estimators': [100, 50, 250, 500],
           'max_depth': [None, 5, 10],
           'min_samples_split': [2, 5, 10],
           'min_samples_leaf': [1, 7, 15]}
RFC_tuning = RandomizedSearchCV(estimator = RFC_tuning, param_distributions = para_RFC, cv = 3, n_jobs = -1 , verbose = 2, scoring = 'recall')

In [42]:
RFC_tuning.fit(X_sm, y_sm)
RFC_tuned = RFC_tuning.best_estimator_
y_pred_RFC_tuned = RFC_tuned.predict(X_test)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    3.4s finished


In [43]:
acc_RFC_tuned = accuracy_score(y_test, y_pred_RFC_tuned)
pre_RFC_tuned = precision_score(y_test, y_pred_RFC_tuned)
rec_RFC_tuned = recall_score(y_test, y_pred_RFC_tuned)
f1_RFC_tuned = f1_score(y_test, y_pred_RFC_tuned)
print(classification_report(y_test, y_pred_RFC_tuned))

              precision    recall  f1-score   support

           0       0.83      0.71      0.76        34
           1       0.78      0.88      0.82        40

    accuracy                           0.80        74
   macro avg       0.80      0.79      0.79        74
weighted avg       0.80      0.80      0.80        74



In [45]:
cm_RFC_tuned = confusion_matrix(y_test, y_pred_RFC_tuned, labels = [1,0])
df_RFC_tuned = pd.DataFrame(data = cm_RFC_tuned , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_RFC_tuned

Unnamed: 0,Pred 1,Pred 0
Act 1,35,5
Act 0,10,24


## XtremeGradientBoosting (base model)

In [46]:
from sklearn.ensemble import GradientBoostingClassifier

In [48]:
XGB = GradientBoostingClassifier()
XGB.fit(X_sm, y_sm)
y_pred_XGB = XGB.predict(X_test)

In [49]:
acc_XGB = accuracy_score(y_test, y_pred_XGB)
pre_XGB = precision_score(y_test, y_pred_XGB)
rec_XGB = recall_score(y_test, y_pred_XGB)
f1_XGB = f1_score(y_test, y_pred_XGB)
print(classification_report(y_test, y_pred_XGB))

              precision    recall  f1-score   support

           0       0.81      0.65      0.72        34
           1       0.74      0.88      0.80        40

    accuracy                           0.77        74
   macro avg       0.78      0.76      0.76        74
weighted avg       0.78      0.77      0.77        74



In [51]:
cm_XGB = confusion_matrix(y_test, y_pred_XGB, labels = [1,0])
df_XGB = pd.DataFrame(data = cm_XGB, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_XGB

Unnamed: 0,Pred 1,Pred 0
Act 1,35,5
Act 0,12,22


# Summary

In [61]:
summary = {
    "LR": [acc_LR, rec_LR, pre_LR, f1_LR],
    "LR_tuned": [acc_LR_tuned, rec_LR_tuned, pre_LR_tuned, f1_LR_tuned],
    "KNN": [acc_KNN, rec_KNN, pre_KNN, f1_KNN],
    "KNN_tuned": [acc_KNN_tuned, rec_KNN_tuned, pre_KNN_tuned, f1_KNN_tuned],
    "RandomForest": [acc_RFC, rec_RFC, pre_RFC, f1_RFC],
    "RandomForest_tuned": [acc_RFC_tuned, rec_RFC_tuned, pre_RFC_tuned, f1_RFC_tuned],
    "XGB": [acc_XGB, rec_XGB, pre_XGB, f1_XGB]}

summary = pd.DataFrame(data=summary, index=['Accuracy', 'Recall', 'Precision', 'F1-Score'])
summary

Unnamed: 0,LR,LR_tuned,KNN,KNN_tuned,RandomForest,RandomForest_tuned,XGB
Accuracy,0.837838,0.824324,0.77027,0.72973,0.837838,0.797297,0.77027
Recall,0.925,0.925,0.9,0.9,0.9,0.875,0.875
Precision,0.804348,0.787234,0.734694,0.692308,0.818182,0.777778,0.744681
F1-Score,0.860465,0.850575,0.808989,0.782609,0.857143,0.823529,0.804598


Summary:
- The best model is Logistic Regression with recall value of 92.5% and 83.8%  accuracy
- With the best model, there is only 3 people who is missed from prediction.