Importing the Dependencies

In [44]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

Data Collection and Processing

In [45]:
# Read the heart-disease CSV into a pandas DataFrame
DATA_PATH = 'heart_disease_uci.csv'
heartdata = pd.read_csv(DATA_PATH)

In [46]:
# Preview the first five rows of the dataset
heartdata.head(n=5)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [47]:
# Preview the last five rows of the dataset
heartdata.tail(n=5)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0
919,920,62,Male,VA Long Beach,atypical angina,120.0,254.0,False,lv hypertrophy,93.0,True,0.0,,,,1


In [48]:
# Number of rows and columns in the dataset. A bare `heartdata.shape` is only
# echoed when it is the last expression of a cell, so print it explicitly.
print(heartdata.shape)

# Column names, non-null counts and dtypes
heartdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [49]:
# Count the missing values in each column
heartdata.isna().sum()

Unnamed: 0,0
id,0
age,0
sex,0
dataset,0
cp,0
trestbps,59
chol,30
fbs,90
restecg,2
thalch,55


In [50]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
summary_stats = heartdata.describe()
summary_stats

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,861.0,890.0,865.0,858.0,309.0,920.0
mean,460.5,53.51087,132.132404,199.130337,137.545665,0.878788,0.676375,0.995652
std,265.725422,9.424685,19.06607,110.78081,25.926276,1.091226,0.935653,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,175.0,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,690.25,60.0,140.0,268.0,157.0,1.5,1.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0


In [51]:
# Collapse the multi-level `num` target into a binary label:
#   num == 0 -> 0 (no heart disease)
#   num  > 0 -> 1 (stages 1-4 of heart disease)
has_disease = heartdata['num'].gt(0)
heartdata['heart_disease_binary'] = has_disease.astype('int64')

# Class balance of the binary target
heartdata['heart_disease_binary'].value_counts()

Unnamed: 0_level_0,count
heart_disease_binary,Unnamed: 1_level_1
1,509
0,411


In [52]:
# Separate the binary target from the features. The original multi-level
# `num` column is dropped as well, since the target is derived from it.
# NOTE(review): `id` stays in the feature set; it looks like a plain row
# identifier -- confirm whether it should be excluded before modelling.
target_column = 'heart_disease_binary'
y = heartdata[target_column]
x = heartdata.drop(columns=['num', target_column])
print(y)

0      0
1      1
2      1
3      0
4      0
      ..
915    1
916    0
917    1
918    0
919    1
Name: heart_disease_binary, Length: 920, dtype: int64


1. Splitting the Data into Training and Test Data

In [53]:
# Hold out a fraction of 0.2 (20%) of the rows as test data. `stratify=y`
# keeps the class ratio the same in both splits and `random_state` makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [54]:
# Shapes of the full feature frame, the training split and the test split
print(*(frame.shape for frame in (x, x_train, x_test)))

(920, 15) (736, 15) (184, 15)


2. Data Preprocessing

In [55]:

# Rebuild the raw feature frames from `x` using the row labels of the split
# targets. `x_train` / `x_test` are rebound to NumPy arrays at the end of this
# cell, so deriving the inputs from `x` keeps the cell re-runnable, and the
# explicit copies avoid writing into slices of the original DataFrame.
x_train_encoded = x.loc[y_train.index].copy()
x_test_encoded = x.loc[y_test.index].copy()

# One fitted LabelEncoder per categorical column, kept so each column's
# label -> integer mapping stays available after the loop.
encoders = {}
for col in x_train_encoded.columns:
    col_dtype = x_train_encoded[col].dtype
    if col_dtype == 'object' or col_dtype == 'bool':
        # Missing categories become their own 'unknown' level. Everything is
        # cast to str so boolean and string labels can be encoded together.
        train_values = x_train_encoded[col].fillna('unknown').astype(str)
        test_values = x_test_encoded[col].fillna('unknown').astype(str)

        # Fit on the labels of both splits so a category that only occurs in
        # the test split does not raise at transform time.
        encoder = LabelEncoder()
        encoder.fit(pd.concat([train_values, test_values]))
        encoders[col] = encoder

        x_train_encoded[col] = encoder.transform(train_values)
        x_test_encoded[col] = encoder.transform(test_values)

# Fill the remaining missing numeric values with the per-column mean
imputer = SimpleImputer(strategy='mean')  # or 'median', 'most_frequent'

# Fit the imputer on the training split only, then reuse its statistics on the
# test split so no test information is used when computing the means.
x_train = imputer.fit_transform(x_train_encoded)
x_test = imputer.transform(x_test_encoded)


3. Model Training

In [56]:
# Fit a logistic-regression classifier on the preprocessed training split.
# The features are not scaled in this notebook, so the iteration cap is set
# to 5000 to give the solver more room to converge.
model = LogisticRegression(max_iter=5000)
model.fit(X=x_train, y=y_train)

4. Model Evaluation

Accuracy Score

In [57]:
# Score the model on the same rows it was fitted on (training accuracy)
x_train_prediction = model.predict(X=x_train)
training_data_accuracy = accuracy_score(
    y_true=y_train,
    y_pred=x_train_prediction,
)

In [58]:
# Report the training accuracy
train_accuracy_label = "Accuracy on Training data:"
print(train_accuracy_label, training_data_accuracy)

Accuracy on Training data: 0.8383152173913043


In [59]:
# Evaluate the fitted model on the held-out test split.
# (The metric helpers are imported in the notebook's import cell.)
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)
print("Test Accuracy:", test_data_accuracy)

# Rows are the true classes, columns the predicted classes
print("Confusion Matrix:")
print(confusion_matrix(y_test, x_test_prediction))

print("\nClassification Report:")
print(classification_report(y_test, x_test_prediction))

# ROC AUC is computed from scores rather than hard labels, so pass the
# predicted probability of the positive class (column 1 of predict_proba).
x_test_positive_proba = model.predict_proba(x_test)[:, 1]
print("\nROC AUC:", roc_auc_score(y_test, x_test_positive_proba))

Test Accuracy: 0.842391304347826
Confusion Matrix:
[[65 17]
 [12 90]]

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.79      0.82        82
           1       0.84      0.88      0.86       102

    accuracy                           0.84       184
   macro avg       0.84      0.84      0.84       184
weighted avg       0.84      0.84      0.84       184


ROC AUC: 0.8864179818268771


In [60]:
# Report the test accuracy
test_accuracy_label = "Accuracy on Test data:"
print(test_accuracy_label, test_data_accuracy)

Accuracy on Test data: 0.842391304347826
