In [33]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
import xgboost as xgb

In [2]:
# Read heart.csv into the raw DataFrame (path is relative to the kernel's working directory).
df = pd.read_csv(filepath_or_buffer="heart.csv")

In [3]:
# Show the first five rows as a quick look at the columns and their values.
df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
# Standardize the numeric feature columns (zero mean, unit variance per column).
# The target is excluded: it is a class label, not a feature, and the raw labels
# are attached to the feature table in a later cell.
# NOTE(review): the scaler is fit on every row, before the train/test split made
# further down, so test-set statistics influence the scaled features. Consider
# fitting it on the training rows only (e.g. inside a Pipeline).
numeric_df = df.select_dtypes(np.number).drop(
    columns=["HeartDisease"], errors="ignore"
)
scaled_features = StandardScaler().fit_transform(numeric_df.values)
# Carry df's index over so the column-wise concat with the dummy columns aligns
# row-for-row even if df does not use the default RangeIndex.
scaled_df = pd.DataFrame(
    scaled_features, columns=numeric_df.columns, index=numeric_df.index
)

In [5]:
# One-hot encode the non-numeric columns. The first level of every column is
# dropped so that it acts as the reference category.
cat_df = df.select_dtypes(exclude=[np.number])
dummy_df = pd.get_dummies(data=cat_df, drop_first=True)

In [6]:
# Assemble the modelling table: scaled numeric columns beside the dummy columns,
# with HeartDisease set to the raw labels from df (replacing any standardized
# copy that came along with the numeric columns).
features = pd.concat(objs=[scaled_df, dummy_df], axis="columns").assign(
    HeartDisease=df["HeartDisease"]
)

In [11]:
# Separate the target from the predictors, then hold out 30% of the rows for
# testing. random_state is pinned so the split is reproducible.
y = features["HeartDisease"]
x = features.drop(columns="HeartDisease")

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

## Discriminant Analysis (LDA/QDA)

In [16]:
# LDA: tune only the solver, with 10-fold cross-validation on the training
# split and scoring='recall' as the selection metric.
grid = dict(solver=["svd", "lsqr", "eigen"])

lda_mod = LinearDiscriminantAnalysis()
clf = GridSearchCV(
    estimator=lda_mod,
    param_grid=grid,
    scoring="recall",
    cv=10,
    n_jobs=-1,
)
clf.fit(X_train, y_train)
best_lda = clf  # fit() returns the search object itself; best_lda is another name for clf

# The search selected the 'svd' solver. Shrinkage is only available with
# 'lsqr' and 'eigen', so there is no shrinkage parameter left to tune.
print(best_lda.best_estimator_.get_params())

{'n_components': None, 'priors': None, 'shrinkage': None, 'solver': 'svd', 'store_covariance': False, 'tol': 0.0001}




In [17]:
# Test-set report for the tuned LDA.
# Recall: 0.90 (class 0), 0.87 (class 1); weighted average 0.88.
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       109
           1       0.93      0.87      0.90       167

    accuracy                           0.88       276
   macro avg       0.88      0.89      0.88       276
weighted avg       0.89      0.88      0.88       276



In [19]:
# QDA with default settings (no hyper-parameter search).
qda_mod = QuadraticDiscriminantAnalysis().fit(X_train, y_train)

# Test-set recall: 0.91 (class 0), 0.85 (class 1); weighted average 0.87.
y_pred = qda_mod.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.80      0.91      0.85       109
           1       0.93      0.85      0.89       167

    accuracy                           0.87       276
   macro avg       0.87      0.88      0.87       276
weighted avg       0.88      0.87      0.87       276



## SVM

In [40]:
# SVM: exhaustive search over C, kernel and gamma (4 x 3 x 6 = 72 candidates),
# scored by weighted recall with 10-fold cross-validation on the training split.
# NOTE(review): the LDA search uses scoring='recall' while this one uses
# 'recall_weighted' -- confirm the difference is intentional.
svm_mod = SVC()

grid = dict(
    C=[.01, .1, 1, 10],
    kernel=['poly', 'rbf', 'sigmoid'],
    gamma=['auto', 10, 1, .1, .01, .001],
)

# Alternative kernel-only grid (disabled); note that it includes 'linear',
# which the grid above does not search:
# grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

clf_svm = GridSearchCV(
    estimator=svm_mod,
    param_grid=grid,
    scoring='recall_weighted',
    cv=10,
    n_jobs=-1,
    verbose=2,
)
clf_svm.fit(X_train, y_train)
best_svm = clf_svm  # fit() returns the search object itself; best_svm is another name for clf_svm
print(best_svm.best_estimator_.get_params())

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 580 tasks      | elapsed:   22.1s


{'C': 1, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:   25.1s finished


In [41]:
y_pred = clf_svm.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85       109
           1       0.91      0.89      0.90       167

    accuracy                           0.88       276
   macro avg       0.87      0.87      0.87       276
weighted avg       0.88      0.88      0.88       276



## XGBoost

In [None]:
# XGBoost
# TODO: fit and evaluate an XGBoost model here (this cell is still empty).
