Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

Data Collection and Processing

In [None]:
#loading the csv data to a Pandas DataFrame
heartdata = pd.read_csv('/content/heart_disease_uci.csv')

In [None]:
#print first 5 rows of the dataset
heartdata.head()

#print last 5 rows of the dataset
heartdata.tail()


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0
919,920,62,Male,VA Long Beach,atypical angina,120.0,254.0,False,lv hypertrophy,93.0,True,0.0,,,,1


In [None]:
##number of rows and columns in the dataset
heartdata.shape

#getting some info about the data
heartdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [None]:
#checking for missing values
heartdata.isnull().sum()

Unnamed: 0,0
id,0
age,0
sex,0
dataset,0
cp,0
trestbps,59
chol,30
fbs,90
restecg,2
thalch,55


In [None]:
#statistical measures about the data
heartdata.describe()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,861.0,890.0,865.0,858.0,309.0,920.0
mean,460.5,53.51087,132.132404,199.130337,137.545665,0.878788,0.676375,0.995652
std,265.725422,9.424685,19.06607,110.78081,25.926276,1.091226,0.935653,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,175.0,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,690.25,60.0,140.0,268.0,157.0,1.5,1.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0


In [None]:
#checking distribution of num variable
#target [0=no heart disease; 1,2,3,4 = stages of heart disease]
heartdata['heart_disease_binary'] = heartdata['num'].apply(lambda x: 1 if x > 0 else 0)
#heartdata['num'].value_counts()
heartdata['heart_disease_binary'].value_counts()

Unnamed: 0_level_0,count
heart_disease_binary,Unnamed: 1_level_1
1,509
0,411


In [None]:
x = heartdata.drop(columns=['num', 'heart_disease_binary'], axis=1)
y = heartdata['heart_disease_binary']
print(y)

0      0
1      1
2      1
3      0
4      0
      ..
915    1
916    0
917    1
918    0
919    1
Name: heart_disease_binary, Length: 920, dtype: int64


Splitting the Data into Training and Test Data

In [None]:
#test_size is the percentage of data being used as test data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)

In [None]:
print(x.shape, x_train.shape, x_test.shape)

(920, 15) (736, 15) (184, 15)


Model Training

Logistic Regression

In [None]:

encoder = LabelEncoder()
# Iterate through columns and transform object type columns
for col in x_train.columns:
    if x_train[col].dtype == 'object':
        # Replace NaN values with a placeholder, for example 'unknown'
        x_train[col] = x_train[col].fillna('unknown')
        x_test[col] = x_test[col].fillna('unknown')

        # Combine unique values from both train and test sets before fitting
        all_values = pd.concat([x_train[col], x_test[col]]).unique()

        # Convert boolean values to strings before fitting LabelEncoder
        all_values = [str(value) for value in all_values]  # This line is the fix

        encoder.fit(all_values)

        # Transform the column in BOTH training and testing data using the encoder
        x_train[col] = encoder.transform(x_train[col].astype(str)) # Also cast to string here
        x_test[col] = encoder.transform(x_test[col].astype(str)) # Also cast to string here

# Impute missing numerical values using SimpleImputer
imputer = SimpleImputer(strategy='mean')  # or 'median', 'most_frequent'

# Apply the imputer to numerical features in both training and testing data
x_train = imputer.fit_transform(x_train) # Fit and transform on training data
x_test = imputer.transform(x_test) # Only transform on testing data


In [None]:
#training the Logistic Regression model with Training data
model = LogisticRegression()
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [None]:
#accuracy on training data
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)

In [None]:
print("Accuracy on Training data:", training_data_accuracy)

Accuracy on Training data: 0.8260869565217391


In [None]:
#accuracy on test data
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)

In [None]:
print("Accuracy on Test data:", test_data_accuracy)

Accuracy on Test data: 0.8206521739130435
