# Libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

RERUN = True

# Data

In [3]:
df = pd.read_csv("heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [5]:
df.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

## Scale Numerical features (Z-scroing)

In [7]:
numeric_df = df.select_dtypes(np.number)
scaled_features = StandardScaler().fit_transform(numeric_df.values)
scaled_df = pd.DataFrame(scaled_features, columns=numeric_df.columns)

## Create dummy variables for categorical features

In [8]:
cat_df = df.select_dtypes(exclude=np.number)
dummy_df = pd.get_dummies(
    cat_df, drop_first=True
)  # Drop first dummy variable as a base

In [9]:
features = pd.concat([scaled_df, dummy_df], axis=1)
features["HeartDisease"] = df[
    "HeartDisease"
]  # Undo standard scaling for target variable

In [10]:
features.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,-7.304735000000001e-17,1.084221e-16,-6.482953e-16,-1.009843e-15,4.370747e-16,-2.024524e-16,0.553377,0.78976,0.188453,0.221133,0.050109,0.601307,0.1939,0.404139,0.501089,0.430283
std,1.000545,1.000545,1.000545,1.000545,1.000545,1.000545,0.497414,0.407701,0.391287,0.415236,0.218289,0.489896,0.395567,0.490992,0.500271,0.495386
min,-2.706015,-7.154995,-1.818435,-0.5513413,-3.018469,-3.271482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.6906294,-0.6699346,-0.2337038,-0.5513413,-0.6605778,-0.8324324,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.05188098,-0.1295128,0.2213632,-0.5513413,0.04678968,-0.2695748,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
75%,0.6883185,0.4109089,0.6238346,-0.5513413,0.7541571,0.5747115,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0
max,2.491558,3.653439,3.697252,1.813758,2.561874,4.983762,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Train & Test Split

In [11]:
x = features.drop(columns=["HeartDisease"])
y = features["HeartDisease"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

# Model Comparison

#  KNN

In [13]:
# KNN Classifier object
knn = KNeighborsClassifier()
# Hyperparameters
hyper_params = dict(n_neighbors=list(range(1, 100)))

In [14]:
# Grid Search Cross-validation
if RERUN:
    clf = GridSearchCV(knn, param_grid=hyper_params, scoring="roc_auc", cv=10, n_jobs=-1)
    best_model = clf.fit(x_train, y_train)
    print("Best n_neighbors:", best_model.best_estimator_.get_params()["n_neighbors"])
    print(f"Best Score: {best_model.best_score_}")

Best n_neighbors: 17
Best Score: 0.9170913074907382


In [15]:
best_param = {"n_neighbors": 17}
clf = KNeighborsClassifier(**best_param).fit(x_train, y_train)
predictions = clf.predict(x_test)
print(classification_report(y_test, predictions))
print(roc_auc_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       109
           1       0.91      0.87      0.89       167

    accuracy                           0.87       276
   macro avg       0.86      0.87      0.87       276
weighted avg       0.87      0.87      0.87       276

0.8699115530407077


## Logistic Regression

In [16]:
# Logistic Regression object
lr = LogisticRegression()

# Hyperparameters
solvers = ["liblinear"]
penalty = ["l2", "l1"]
c = np.arange(0.01, 100, 0.1).tolist()

# Hyperparameter space
hyper_params = dict(penalty=penalty, C=c, solver=solvers, random_state=[1])

In [17]:
# Grid Search Cross-validation
if RERUN:
    clf = GridSearchCV(
        lr, param_grid=hyper_params, scoring="roc_auc", cv=10, n_jobs=-1
    )
    best_model = clf.fit(x_train, y_train)
    print(f"Best Hyperparameters: {best_model.best_params_}")
    print(f"Best Score: {best_model.best_score_}")

Best Hyperparameters: {'C': 0.7100000000000001, 'penalty': 'l2', 'random_state': 1, 'solver': 'liblinear'}
Best Score: 0.9234234209812959


In [None]:
best_param = {'C': 0.7100000000000001, 'penalty': 'l2',
              'random_state': 1, 'solver': 'liblinear'}
clf = LogisticRegression(**best_param).fit(x_train, y_train)
predictions = clf.predict(x_test)
print(classification_report(y_test, predictions))
print(roc_auc_score(y_test, predictions))

## Decision Tree

In [18]:
# Hyperparameters
criterion = ["gini", "entropy"]
max_depths = np.linspace(1, 32, 32, endpoint=True)
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)
max_features = list(range(1, x_train.shape[1]))

# Hyperparameter space
hyper_params = dict(
    criterion=criterion,
    max_depth=max_depths,
    min_samples_leaf=min_samples_leafs,
    min_samples_split=min_samples_splits,
    max_features=max_features,
)

In [19]:
# Random Search Cross-validation, it yields the same result as GridSearch CV but with much better efficiency
if RERUN:
    seed = np.random.seed(1)
    rng = np.random.RandomState(1)
    dt = DecisionTreeClassifier(random_state=rng)
    clf = RandomizedSearchCV(dt, n_iter=1000, random_state=rng,
                            param_distributions =hyper_params, scoring="roc_auc", cv=10, n_jobs=-1)
    best_model = clf.fit(x_train, y_train)
    print(f"Best Hyperparameters: {best_model.best_params_}")
    print(f"Best Score: {best_model.best_score_}")

Best Hyperparameters: {'min_samples_split': 0.2, 'min_samples_leaf': 0.1, 'max_features': 12, 'max_depth': 7.0, 'criterion': 'gini'}
Best Score: 0.8733241167434717


In [22]:
best_param = {'min_samples_split': 0.2, 'min_samples_leaf': 0.1,
              'max_features': 12, 'max_depth': 7.0, 'criterion': 'gini'}

clf = DecisionTreeClassifier(**best_param).fit(x_train, y_train)
predictions = clf.predict(x_test)
print(classification_report(y_test, predictions))
print(roc_auc_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78       109
           1       0.85      0.87      0.86       167

    accuracy                           0.83       276
   macro avg       0.82      0.82      0.82       276
weighted avg       0.83      0.83      0.83       276

0.8194528374443774


## Random Forest

In [20]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=0, stop=2000, num=100)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 120, num=10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 20, 50, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 5, 15]

hyper_params = dict(
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_depth=max_depth
)

In [21]:
if RERUN:
    seed = np.random.seed(1)
    rng = np.random.RandomState(1)
    rf = RandomForestClassifier(random_state=rng)
    clf = RandomizedSearchCV(rf, n_iter=100, random_state=1,
                            param_distributions =hyper_params, scoring="roc_auc", cv=10, n_jobs=-1)
    best_model = clf.fit(x_train, y_train)
    print(f"Best Hyperparameters: {best_model.best_params_}")
    print(f"Best Score: {best_model.best_score_}")

Best Hyperparameters: {'n_estimators': 121, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 14}
Best Score: 0.927499954820638


10 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\nileb\anaconda3\envs\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\nileb\anaconda3\envs\venv\lib\site-packages\sklearn\ensemble\_forest.py", line 384, in fit
    self._validate_estimator()
  File "C:\Users\nileb\anaconda3\envs\venv\lib\site-packages\sklearn\ensemble\_base.py", line 138, in _validate_estimator
    raise ValueError(
ValueError: n_estimators must be greater than zero, got 0.

 0.92269676 0.92386022 0.91554441 0.88910075 0.9178444  0.923

In [23]:
# Best parameter based on Randomised Hyperparameter CV search
best_param = {'n_estimators': 121, 'min_samples_split': 2,
              'min_samples_leaf': 2, 'max_depth': 14}
clf = RandomForestClassifier(**best_param).fit(x_train, y_train)
predictions = clf.predict(x_test)
print(classification_report(y_test, predictions))
print(roc_auc_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86       109
           1       0.91      0.91      0.91       167

    accuracy                           0.89       276
   macro avg       0.89      0.89      0.89       276
weighted avg       0.89      0.89      0.89       276

0.88628248090974


## LDA

## QDA

## Support Vector Machine

## Extreme Gradient Boosting (XGBOOST)