Grab the dataset from Kaggle

In [11]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV inside the Kaggle dataset, and the handle of the dataset itself.
file_path = "Heart_Disease_Prediction.csv"
dataset_handle = "neurocipher/heartdisease"

# Load that CSV through kagglehub's pandas adapter.
df = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_handle, file_path)

df.head()


Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence


In [10]:
# Summary statistics for the numeric columns of the loaded frame.
df.describe()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,0.677778,3.174074,131.344444,249.659259,0.148148,1.022222,149.677778,0.32963,1.05,1.585185,0.67037,4.696296
std,9.109067,0.468195,0.95009,17.861608,51.686237,0.355906,0.997891,23.165717,0.470952,1.14521,0.61439,0.943896,1.940659
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0
50%,55.0,1.0,3.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0
75%,61.0,1.0,4.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


In [12]:


# Raw target column; the preview shows the string values "Presence" / "Absence".
labels_str = df["Heart Disease"]
labels_str



0      Presence
1       Absence
2      Presence
3       Absence
4       Absence
         ...   
265     Absence
266     Absence
267     Absence
268     Absence
269    Presence
Name: Heart Disease, Length: 270, dtype: str

In [13]:
# Encode the string target as integers: 0 = Absence, 1 = Presence.
label_codes = {'Absence': 0, 'Presence': 1}
labels = labels_str.map(label_codes)
labels.head()

0    1
1    0
2    1
3    0
4    0
Name: Heart Disease, dtype: int64

In [14]:
# Distinct encoded values; anything besides 0 and 1 (e.g. NaN) would point to an unmapped label string.
labels.unique()

array([1, 0])

This confirms `labels` is binary 

<b>Logistic Regression with One Input Feature</b>


We use the feature 'Cholesterol'.

In [8]:
# Single-feature experiment: the Cholesterol column is the only predictor.
features = df["Cholesterol"]
features.head()


0    322
1    564
2    261
3    263
4    269
Name: Cholesterol, dtype: int64

In [15]:
# Short aliases used by the modelling cells: x = input feature(s), y = encoded 0/1 target.
x = features
y = labels

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# One shared scaler instance; later cells call fit_transform on it, which re-fits it each time.
scaler = StandardScaler()

import pandas as pd

In [19]:
# The scaler works on 2-D input, so the single column is reshaped to (n_samples, 1).
# Note: the scaler is fit on every row here, i.e. before any train/test split.
cholesterol_2d = x.values.reshape(-1, 1)
cholesterol_scaled = scaler.fit_transform(cholesterol_2d)
standardized_data = pd.DataFrame(cholesterol_scaled, columns=['cholesterol'])
standardized_data.head()

Unnamed: 0,cholesterol
0,1.402212
1,6.093004
2,0.219823
3,0.258589
4,0.37489


Now we split the data so that some of it is used for training and the rest for testing.

In [22]:
from sklearn.model_selection import train_test_split



# Hold out 20% of the rows for testing; the fixed seed makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    standardized_data,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Report the shape of each of the four pieces.
for caption, split in (
    ("x_train shape", x_train),
    ("x_test shape:", x_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, split.shape)

x_train shape (216, 1)
x_test shape: (54, 1)
y_train shape: (216,)
y_test shape: (54,)


We keep scikit-learn's default objective for `LogisticRegression`, which is the log-loss (cross-entropy) function with an L2 penalty.

The log-loss function measures how far the predicted probabilities diverge from the actual labels. The lower its value, the better the model; a hypothetical perfect model would have a log loss of 0.

In [23]:
# x_train was scaled with statistics computed over every row, test rows included
# (the scaler is fit before the split). Re-fitting a StandardScaler on the training
# rows inside a pipeline removes that leakage: standardising is unaffected by an
# earlier affine rescaling of the same column, so this matches scaling the raw
# training data, and `model.predict` / `model.score` reuse the training statistics.
model = make_pipeline(StandardScaler(), LogisticRegression(random_state=42)).fit(x_train, y_train)

The model has been trained; now we can test its accuracy.

In [24]:
# Mean accuracy of the fitted model on the held-out test split.
scores = model.score(x_test, y_test)
scores

0.6111111111111112

In [25]:
# Hard class predictions (0 or 1) for the held-out rows.
pred_labels = model.predict(x_test)

from sklearn.metrics import confusion_matrix

# Rows are the true class and columns the predicted class, in label order 0, 1.
cm = confusion_matrix(y_true=y_test, y_pred=pred_labels)
print(cm)

[[28  5]
 [16  5]]


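According to the confusion matrix (rows are the true class, columns the predicted class, with 0 = Absence and 1 = Presence), the model correctly identified 28 patients without heart disease (true negatives) and 5 patients with heart disease (true positives), but produced 5 false positives and 16 false negatives.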

In [26]:
from numpy import mean, std
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.metrics import classification_report

# Scores for the positive class (label 1).
f1 = f1_score(y_test, pred_labels)
recall = recall_score(y_test, pred_labels)
precision = precision_score(y_test, pred_labels)

# Per-class scores (one entry per label), computed once and summarised below.
f1_per_class = f1_score(y_test, pred_labels, average=None)
recall_per_class = recall_score(y_test, pred_labels, average=None)
precision_per_class = precision_score(y_test, pred_labels, average=None)

# Unweighted mean of the per-class scores.
f1_avg = mean(f1_per_class)
recall_avg = mean(recall_per_class)
precision_avg = mean(precision_per_class)

# Spread of the per-class scores (numpy's default, population standard deviation).
f1_sd = std(f1_per_class)
recall_sd = std(recall_per_class)
precision_sd = std(precision_per_class)

for caption, value in (
    ('\nf1:\t\t', f1),
    ('recall\t\t', recall),
    ('precision\t', precision),
    ('\nf1_avg:\t\t', f1_avg),
    ('recall_avg\t', recall_avg),
    ('precision_avg\t', precision_avg),
    ('\nf1_sd:\t\t', f1_sd),
    ('recall_sd\t', recall_sd),
    ('precision_sd\t', precision_sd),
):
    print(caption, value)

print('\n', classification_report(y_test, pred_labels))



f1:		 0.3225806451612903
recall		 0.23809523809523808
precision	 0.5

f1_avg:		 0.5249266862170088
recall_avg	 0.5432900432900433
precision_avg	 0.5681818181818181

f1_sd:		 0.2023460410557185
recall_sd	 0.3051948051948052
precision_sd	 0.06818181818181818

               precision    recall  f1-score   support

           0       0.64      0.85      0.73        33
           1       0.50      0.24      0.32        21

    accuracy                           0.61        54
   macro avg       0.57      0.54      0.52        54
weighted avg       0.58      0.61      0.57        54



Thus, we can evaluate our model using precision, recall and f1-score.

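We can use cross-validation to get a more reliable estimate of these metrics: every row is predicted by a model that never saw it during training, so the result does not depend on one particular train/test split.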

In [27]:
from sklearn.model_selection import cross_val_predict

# standardized_data was scaled using every row, so each fold's held-out rows
# influenced the features its model was trained on. Putting the scaler inside the
# estimator makes cross_val_predict re-fit it on the training folds only
# (standardising already-standardised data is equivalent to scaling the raw fold).
model = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))

# Out-of-fold prediction for every row, using 10-fold cross-validation.
cv_pred_labels = cross_val_predict(model, standardized_data, labels, cv=10)


# Overall accuracy and confusion matrix of the out-of-fold predictions.
accuracy = accuracy_score(labels, cv_pred_labels)
cm = confusion_matrix(labels, cv_pred_labels)

# Scores for the positive class (label 1).
f1 = f1_score(labels, cv_pred_labels)
recall = recall_score(labels, cv_pred_labels)
precision = precision_score(labels, cv_pred_labels)

# Per-class scores (one entry per label), computed once and summarised below.
f1_per_class = f1_score(labels, cv_pred_labels, average=None)
recall_per_class = recall_score(labels, cv_pred_labels, average=None)
precision_per_class = precision_score(labels, cv_pred_labels, average=None)

# Unweighted mean of the per-class scores.
f1_avg = mean(f1_per_class)
recall_avg = mean(recall_per_class)
precision_avg = mean(precision_per_class)

# Spread of the per-class scores (numpy's default, population standard deviation).
f1_sd = std(f1_per_class)
recall_sd = std(recall_per_class)
precision_sd = std(precision_per_class)

print('accuracy:\t', accuracy)

print(cm)

for caption, value in (
    ('\nf1:\t\t', f1),
    ('recall\t\t', recall),
    ('precision\t', precision),
    ('\nf1_avg:\t\t', f1_avg),
    ('recall_avg\t', recall_avg),
    ('precision_avg\t', precision_avg),
    ('\nf1_sd:\t\t', f1_sd),
    ('recall_sd\t', recall_sd),
    ('precision_sd\t', precision_sd),
):
    print(caption, value)

print('\n', classification_report(labels, cv_pred_labels))



accuracy:	 0.5407407407407407
[[128  22]
 [102  18]]

f1:		 0.225
recall		 0.15
precision	 0.45

f1_avg:		 0.4493421052631579
recall_avg	 0.5016666666666667
precision_avg	 0.5032608695652174

f1_sd:		 0.2243421052631579
recall_sd	 0.3516666666666667
precision_sd	 0.05326086956521739

               precision    recall  f1-score   support

           0       0.56      0.85      0.67       150
           1       0.45      0.15      0.23       120

    accuracy                           0.54       270
   macro avg       0.50      0.50      0.45       270
weighted avg       0.51      0.54      0.47       270



We get a final cross-validated accuracy of 0.54.

<b>Logistic Regression with Multiple Input Features</b>


We now use all features except `Heart Disease` as the input features.

In [28]:
# Every column except the target. Index.difference returns the names sorted,
# so the feature columns end up in alphabetical order rather than file order.
feature_columns = df.columns.difference(["Heart Disease"])
features = df[feature_columns]
features.head()

Unnamed: 0,Age,BP,Chest pain type,Cholesterol,EKG results,Exercise angina,FBS over 120,Max HR,Number of vessels fluro,ST depression,Sex,Slope of ST,Thallium
0,70,130,4,322,2,0,0,109,3,2.4,1,2,3
1,67,115,3,564,2,0,0,160,0,1.6,0,2,7
2,57,124,2,261,0,0,0,141,0,0.3,1,1,7
3,64,128,4,263,0,1,0,105,1,0.2,1,2,7
4,74,120,2,269,2,1,0,121,1,0.2,0,1,3


In [29]:
x = features

# Take the column labels from `x` itself instead of a hand-typed list: a hard-coded
# list silently mislabels features if the column set or its order ever changes.
# Lower-casing the existing labels yields the same lower-case names, and carrying
# the index over keeps the rows aligned with the labels.
# Note: the scaler is fit on every row here, i.e. before any train/test split.
standardized_data = pd.DataFrame(
    scaler.fit_transform(x),
    columns=x.columns.str.lower(),
    index=x.index,
)
standardized_data.head()

Unnamed: 0,age,bp,chest pain type,cholesterol,ekg results,exercise angina,fbs over 120,max hr,number of vessels fluro,st depression,sex,slope of st,thallium
0,1.712094,-0.07541,0.870928,1.402212,0.981664,-0.701222,-0.417029,-1.759208,2.472682,1.181012,0.6895,0.676419,-0.875706
1,1.38214,-0.916759,-0.183559,6.093004,0.981664,-0.701222,-0.417029,0.446409,-0.711535,0.481153,-1.450327,0.676419,1.189277
2,0.282294,-0.41195,-1.238045,0.219823,-1.026285,-0.701222,-0.417029,-0.375291,-0.711535,-0.656118,0.6895,-0.954234,1.189277
3,1.052186,-0.18759,0.870928,0.258589,-1.026285,1.426081,-0.417029,-1.932198,0.349871,-0.7436,0.6895,0.676419,1.189277
4,2.152032,-0.63631,-1.238045,0.37489,0.981664,1.426081,-0.417029,-1.240239,0.349871,-0.7436,-1.450327,-0.954234,-0.875706


In [31]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    standardized_data,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Report the shape of each of the four pieces.
for caption, split in (
    ("x_train shape", x_train),
    ("x_test shape:", x_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, split.shape)

x_train shape (216, 13)
x_test shape: (54, 13)
y_train shape: (216,)
y_test shape: (54,)


In [32]:
# x_train was scaled with statistics computed over every row, test rows included
# (the scaler is fit before the split). Re-fitting a StandardScaler on the training
# rows inside a pipeline removes that leakage: standardising is unaffected by an
# earlier affine rescaling of the same columns, so this matches scaling the raw
# training data, and `model.predict` / `model.score` reuse the training statistics.
model = make_pipeline(StandardScaler(), LogisticRegression(random_state=42)).fit(x_train, y_train)

The model is now ready. We can test it to see how it does in comparison to the single-feature regression.

In [33]:
# Mean accuracy of the fitted model on the held-out test split.
scores = model.score(x_test, y_test)
scores

0.9074074074074074

In [34]:
# Hard class predictions (0 or 1) for the held-out rows.
pred_labels = model.predict(x_test)

# Rows are the true class and columns the predicted class, in label order 0, 1.
cm = confusion_matrix(y_true=y_test, y_pred=pred_labels)
print(cm)

[[31  2]
 [ 3 18]]


In [35]:
from sklearn.metrics import classification_report

# Scores for the positive class (label 1).
f1 = f1_score(y_test, pred_labels)
recall = recall_score(y_test, pred_labels)
precision = precision_score(y_test, pred_labels)

# Per-class scores (one entry per label), computed once and summarised below.
f1_per_class = f1_score(y_test, pred_labels, average=None)
recall_per_class = recall_score(y_test, pred_labels, average=None)
precision_per_class = precision_score(y_test, pred_labels, average=None)

# Unweighted mean of the per-class scores.
f1_avg = mean(f1_per_class)
recall_avg = mean(recall_per_class)
precision_avg = mean(precision_per_class)

# Spread of the per-class scores (numpy's default, population standard deviation).
f1_sd = std(f1_per_class)
recall_sd = std(recall_per_class)
precision_sd = std(precision_per_class)

for caption, value in (
    ('\nf1:\t\t', f1),
    ('recall\t\t', recall),
    ('precision\t', precision),
    ('\nf1_avg:\t\t', f1_avg),
    ('recall_avg\t', recall_avg),
    ('precision_avg\t', precision_avg),
    ('\nf1_sd:\t\t', f1_sd),
    ('recall_sd\t', recall_sd),
    ('precision_sd\t', precision_sd),
):
    print(caption, value)

print('\n', classification_report(y_test, pred_labels))


f1:		 0.8780487804878049
recall		 0.8571428571428571
precision	 0.9

f1_avg:		 0.9017109574080815
recall_avg	 0.8982683982683983
precision_avg	 0.9058823529411765

f1_sd:		 0.023662176920276667
recall_sd	 0.041125541125541176
precision_sd	 0.00588235294117645

               precision    recall  f1-score   support

           0       0.91      0.94      0.93        33
           1       0.90      0.86      0.88        21

    accuracy                           0.91        54
   macro avg       0.91      0.90      0.90        54
weighted avg       0.91      0.91      0.91        54



Using cross-validation for a more reliable estimate of the model's performance:

In [36]:
# standardized_data was scaled using every row, so each fold's held-out rows
# influenced the features its model was trained on. Putting the scaler inside the
# estimator makes cross_val_predict re-fit it on the training folds only
# (standardising already-standardised data is equivalent to scaling the raw fold).
model = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))

# Out-of-fold prediction for every row, using 10-fold cross-validation.
cv_pred_labels = cross_val_predict(model, standardized_data, labels, cv=10)

In [37]:
# Overall accuracy and confusion matrix of the out-of-fold predictions.
accuracy = accuracy_score(labels, cv_pred_labels)
cm = confusion_matrix(labels, cv_pred_labels)

# Scores for the positive class (label 1).
f1 = f1_score(labels, cv_pred_labels)
recall = recall_score(labels, cv_pred_labels)
precision = precision_score(labels, cv_pred_labels)

# Per-class scores (one entry per label), computed once and summarised below.
f1_per_class = f1_score(labels, cv_pred_labels, average=None)
recall_per_class = recall_score(labels, cv_pred_labels, average=None)
precision_per_class = precision_score(labels, cv_pred_labels, average=None)

# Unweighted mean of the per-class scores.
f1_avg = mean(f1_per_class)
recall_avg = mean(recall_per_class)
precision_avg = mean(precision_per_class)

# Spread of the per-class scores (numpy's default, population standard deviation).
f1_sd = std(f1_per_class)
recall_sd = std(recall_per_class)
precision_sd = std(precision_per_class)

print('accuracy:\t', accuracy)

print(cm)

for caption, value in (
    ('\nf1:\t\t', f1),
    ('recall\t\t', recall),
    ('precision\t', precision),
    ('\nf1_avg:\t\t', f1_avg),
    ('recall_avg\t', recall_avg),
    ('precision_avg\t', precision_avg),
    ('\nf1_sd:\t\t', f1_sd),
    ('recall_sd\t', recall_sd),
    ('precision_sd\t', precision_sd),
):
    print(caption, value)

print('\n', classification_report(labels, cv_pred_labels))

accuracy:	 0.8407407407407408
[[132  18]
 [ 25  95]]

f1:		 0.8154506437768241
recall		 0.7916666666666666
precision	 0.8407079646017699

f1_avg:		 0.8376927485985097
recall_avg	 0.8358333333333333
precision_avg	 0.8407361479059805

f1_sd:		 0.022242104821685704
recall_sd	 0.04416666666666669
precision_sd	 2.81833042106161e-05

               precision    recall  f1-score   support

           0       0.84      0.88      0.86       150
           1       0.84      0.79      0.82       120

    accuracy                           0.84       270
   macro avg       0.84      0.84      0.84       270
weighted avg       0.84      0.84      0.84       270



We get a final cross-validated accuracy of 0.84.
For the positive class (heart disease present), precision and recall are 0.84 and 0.79 respectively.

Clearly, using multiple features provides us a much better model in terms of accuracy as well as all other metrics.