In [1]:
# Import libraries for data manipulation, visualization, and machine learning

# Data manipulation libraries
import pandas as pd  # For handling and manipulating dataframes
import numpy as np  # For numerical operations

# Visualization libraries
import matplotlib.pyplot as plt  # For data visualization using Matplotlib
import seaborn as sns  # For advanced data visualization, built on top of Matplotlib

# Machine learning libraries
from sklearn.model_selection import train_test_split  # For splitting the dataset into training and testing sets
from sklearn.linear_model import LogisticRegression  # For applying logistic regression model
from sklearn.metrics import accuracy_score, classification_report  # For evaluating model performance

# Libraries for data preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder  # For encoding categorical variables
from sklearn.compose import ColumnTransformer  # For applying transformations to specific columns
from sklearn.pipeline import Pipeline  # For creating a machine learning pipeline
from sklearn.impute import SimpleImputer  # For handling missing values in the dataset

# Library for saving and loading models
import joblib  # For saving trained models to a file


In [2]:
# Load the dataset
df = pd.read_csv(r'C:\Users\debku\OneDrive\Desktop\heart_disease_prediction\heart_disease_prediction_model\data\heart_disease_uci.csv')


In [3]:
# View the first 10 rows of the dataset
df.head(10)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
5,6,56,Male,Cleveland,atypical angina,120.0,236.0,False,normal,178.0,False,0.8,upsloping,0.0,normal,0
6,7,62,Female,Cleveland,asymptomatic,140.0,268.0,False,lv hypertrophy,160.0,False,3.6,downsloping,2.0,normal,3
7,8,57,Female,Cleveland,asymptomatic,120.0,354.0,False,normal,163.0,True,0.6,upsloping,0.0,normal,0
8,9,63,Male,Cleveland,asymptomatic,130.0,254.0,False,lv hypertrophy,147.0,False,1.4,flat,1.0,reversable defect,2
9,10,53,Male,Cleveland,asymptomatic,140.0,203.0,True,lv hypertrophy,155.0,True,3.1,downsloping,0.0,reversable defect,1


In [4]:
# View the last 10 rows of the dataset
df.tail(10)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
910,911,51,Female,VA Long Beach,asymptomatic,114.0,258.0,True,lv hypertrophy,96.0,False,1.0,upsloping,,,0
911,912,62,Male,VA Long Beach,asymptomatic,160.0,254.0,True,st-t abnormality,108.0,True,3.0,flat,,,4
912,913,53,Male,VA Long Beach,asymptomatic,144.0,300.0,True,st-t abnormality,128.0,True,1.5,flat,,,3
913,914,62,Male,VA Long Beach,asymptomatic,158.0,170.0,False,st-t abnormality,138.0,True,0.0,,,,1
914,915,46,Male,VA Long Beach,asymptomatic,134.0,310.0,False,normal,126.0,False,0.0,,,normal,2
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0
919,920,62,Male,VA Long Beach,atypical angina,120.0,254.0,False,lv hypertrophy,93.0,True,0.0,,,,1


In [5]:
# Check the data types of each column and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [6]:
# Display basic statistical summary (mean, min, max, standard deviation, etc.) for numerical columns
df.describe()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,861.0,890.0,865.0,858.0,309.0,920.0
mean,460.5,53.51087,132.132404,199.130337,137.545665,0.878788,0.676375,0.995652
std,265.725422,9.424685,19.06607,110.78081,25.926276,1.091226,0.935653,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,175.0,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,690.25,60.0,140.0,268.0,157.0,1.5,1.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0


In [7]:
# Check the number of rows and columns in the dataset
print(df.shape)

(920, 16)


In [8]:
# Display column names
print(df.columns)

Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')


In [9]:
# Check for any missing values in each column
print(df.isnull().sum())

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


In [10]:
# Get a count of unique values in each column (useful for categorical data)
print(df.nunique())

id          920
age          50
sex           2
dataset       4
cp            4
trestbps     61
chol        217
fbs           2
restecg       3
thalch      119
exang         2
oldpeak      53
slope         3
ca            4
thal          3
num           5
dtype: int64


In [11]:
# Map categorical text data to numeric values
df['sex'] = df['sex'].map({'Male': 1, 'Female': 0})
df['fbs'] = df['fbs'].map({'TRUE': 1, 'FALSE': 0})
df['exang'] = df['exang'].map({'TRUE': 1, 'FALSE': 0})
df['cp'] = df['cp'].map({'typical angina': 0, 'atypical angina': 1, 'non-anginal': 2, 'asymptomatic': 3})
df['restecg'] = df['restecg'].map({'normal': 0, 'lv hypertrophy': 1, 'ST-T wave abnormality': 2})
df['slope'] = df['slope'].map({'upsloping': 0, 'flat': 1, 'downsloping': 2})
df['thal'] = df['thal'].map({'normal': 0, 'fixed defect': 1, 'reversable defect': 2})


In [12]:
# Save the cleaned data
df.to_csv('data/cleaned_heart_disease_data.csv', index=False)


In [13]:
# Convert the 'num' column to binary (0 for no disease, 1 for disease)
df['target'] = df['num'].apply(lambda x: 1 if x > 0 else 0)


In [14]:
# Select features and target
X = df.drop(columns=['num', 'id', 'target', 'dataset'])
y = df['target']

In [15]:
# Drop columns with all NaN values
X = X.drop(columns=['fbs', 'exang'], errors='ignore')

In [16]:
# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()


In [17]:
# Create preprocessing pipelines for numeric and categorical data
numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [18]:
# Combine preprocessing for both numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [19]:
# Create a pipeline with preprocessing and logistic regression model
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', LogisticRegression(max_iter=1000, random_state=42))])


In [20]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,restecg,thalch,oldpeak,slope,ca,thal
880,62,1,3,,170.0,,120.0,3.0,,,
457,54,1,2,150.0,,0.0,122.0,0.0,,,
797,51,1,2,,339.0,0.0,,,,,
25,50,0,2,120.0,219.0,0.0,158.0,1.6,1.0,0.0,0.0
84,52,1,1,120.0,325.0,0.0,172.0,0.2,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
106,59,1,3,140.0,177.0,0.0,162.0,0.0,0.0,1.0,2.0
270,61,1,3,140.0,207.0,1.0,138.0,1.9,0.0,1.0,2.0
860,75,1,3,160.0,310.0,0.0,112.0,2.0,2.0,,2.0
435,53,0,1,140.0,216.0,0.0,142.0,2.0,1.0,,


In [22]:
X_test

Unnamed: 0,age,sex,cp,trestbps,chol,restecg,thalch,oldpeak,slope,ca,thal
319,36,1,1,120.0,166.0,0.0,180.0,0.0,,,
377,45,1,1,140.0,224.0,0.0,122.0,0.0,,,
538,48,1,3,160.0,329.0,0.0,92.0,1.5,1.0,,
296,59,1,3,164.0,176.0,1.0,90.0,1.0,1.0,2.0,1.0
531,40,0,3,150.0,392.0,0.0,130.0,2.0,1.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...
382,46,1,1,140.0,275.0,0.0,165.0,0.0,,,
97,60,0,3,150.0,258.0,1.0,157.0,2.6,1.0,2.0,2.0
906,61,1,2,120.0,337.0,0.0,98.0,0.0,,,
467,55,1,3,120.0,270.0,0.0,140.0,0.0,,,


In [23]:
y_train

880    1
457    0
797    1
25     0
84     0
      ..
106    1
270    1
860    0
435    0
102    0
Name: target, Length: 736, dtype: int64

In [24]:
y_test

319    0
377    0
538    1
296    1
531    1
      ..
382    0
97     1
906    1
467    0
732    1
Name: target, Length: 184, dtype: int64

In [25]:
# Train the model
model_pipeline.fit(X_train, y_train)

In [26]:
# Evaluate the model
y_pred = model_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.81


In [27]:
# Save the model
joblib.dump(model_pipeline, 'heart_disease_risk_model.pkl')

['heart_disease_risk_model.pkl']

In [28]:
# Load the entire pipeline
model_pipeline = joblib.load('heart_disease_risk_model.pkl')

In [29]:
# Get the feature names expected by the pipeline
expected_columns = model_pipeline.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out()
print("Expected Columns:", expected_columns)

Expected Columns: ['age' 'sex' 'cp' 'trestbps' 'chol' 'restecg' 'thalch' 'oldpeak' 'slope'
 'ca' 'thal']


In [32]:
# Test with new input data including all expected columns
input_data = pd.DataFrame({
    'age': [25],
    'sex': [0],
    'cp': [0],
    'trestbps': [110],
    'chol': [150],
    'thalch': [180],
    'restecg': [0],
    'oldpeak': [0.0],
    'slope': [0],
    'ca': [0],
    'thal': [0]
})

# Make prediction using the model pipeline
prediction = model_pipeline.predict(input_data)

# Print the prediction result
if prediction[0] == 0:
    print('The Person does not have Heart Disease')
else:
    print('The Person has Heart Disease')


The Person does not have Heart Disease


In [33]:
# Test with another input data
input_data = pd.DataFrame({
    'age': [62],
    'sex': [0],
    'cp': [0],
    'trestbps': [140],
    'chol': [268],
    'thalch': [160],
    'restecg': [1],
    'oldpeak': [3.6],
    'slope': [2],
    'ca': [2],
    'thal': [0]
})

# Make prediction using the model pipeline
prediction = model_pipeline.predict(input_data)

# Print the prediction result
if prediction[0] == 0:
    print('The Person does not have Heart Disease')
else:
    print('The Person has Heart Disease')

The Person has Heart Disease


In [34]:
import pandas as pd

# Test with multiple patient data
input_data = pd.DataFrame({
    'age': [45, 63, 67, 67, 37, 41, 56, 62, 57, 63],
    'sex': [0, 1, 1, 1, 1, 0, 1, 0, 0, 1],
    'cp': [1, 0, 3, 3, 2, 1, 1, 3, 1, 3],
    'trestbps': [130, 145, 160, 120, 130, 130, 120, 140, 180, 130],
    'chol': [250, 233, 286, 229, 250, 204, 236, 268, 354, 254],
    'thalch': [180, 150, 108, 129, 187, 172, 178, 160, 163, 147],
    'restecg': [1, 1, 1, 1, 0, 1, 0, 1, 0, 1],
    'oldpeak': [3.5, 2.3, 1.5, 2.6, 3.5, 1.4, 0.8, 3.6, 0.6, 1.4],
    'slope': [2, 2, 1, 1, 2, 0, 0, 2, 0, 1],
    'ca': [2, 0, 3, 2, 0, 0, 0, 2, 0, 1],
    'thal': [0, 0, 2, 2, 0, 0, 0, 2, 0, 1]
})

# Make prediction using the model pipeline
predictions = model_pipeline.predict(input_data)

# Print the prediction results for each patient
for i, prediction in enumerate(predictions):
    result = 'The Person does not have Heart Disease' if prediction == 0 else 'The Person has Heart Disease'
    print(f'Patient {i+1}: {result}')


Patient 1: The Person has Heart Disease
Patient 2: The Person does not have Heart Disease
Patient 3: The Person has Heart Disease
Patient 4: The Person has Heart Disease
Patient 5: The Person has Heart Disease
Patient 6: The Person does not have Heart Disease
Patient 7: The Person does not have Heart Disease
Patient 8: The Person has Heart Disease
Patient 9: The Person does not have Heart Disease
Patient 10: The Person has Heart Disease


In [35]:
# Existing model pipeline loaded from 'heart_disease_risk_model.pkl'
# We'll use this model to predict heart disease for multiple new patients.

# Define data for more new patients
new_patient_data = pd.DataFrame({
    'age': [25, 60, 45, 70, 50, 34, 67, 54, 62, 41, 55, 38, 46, 59, 61],
    'sex': [0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1],
    'cp': [0, 1, 3, 2, 1, 2, 0, 1, 2, 0, 3, 1, 2, 0, 1],
    'trestbps': [110, 140, 130, 150, 120, 115, 135, 145, 155, 125, 135, 140, 138, 142, 126],
    'chol': [150, 230, 180, 250, 200, 210, 190, 220, 240, 170, 160, 225, 215, 205, 245],
    'thalch': [180, 150, 160, 140, 170, 155, 145, 160, 130, 175, 165, 135, 170, 158, 150],
    'restecg': [0, 1, 0, 2, 1, 0, 1, 0, 2, 0, 1, 1, 2, 0, 1],
    'oldpeak': [0.0, 1.5, 1.0, 2.5, 0.5, 0.2, 1.8, 1.0, 3.0, 0.3, 0.7, 1.2, 2.0, 0.6, 1.3],
    'slope': [0, 1, 2, 2, 1, 2, 0, 1, 2, 0, 1, 2, 2, 0, 1],
    'ca': [0, 2, 1, 3, 0, 0, 1, 2, 3, 1, 0, 2, 2, 1, 2],
    'thal': [0, 1, 2, 2, 1, 0, 1, 2, 2, 0, 1, 2, 1, 0, 2]
})

# Make predictions using the model pipeline
predictions = model_pipeline.predict(new_patient_data)

# Print the prediction results for each patient
for i, prediction in enumerate(predictions):
    if prediction == 0:
        print(f"Patient {i+1}: The Person does not have Heart Disease")
    else:
        print(f"Patient {i+1}: The Person has Heart Disease")


Patient 1: The Person does not have Heart Disease
Patient 2: The Person has Heart Disease
Patient 3: The Person has Heart Disease
Patient 4: The Person has Heart Disease
Patient 5: The Person does not have Heart Disease
Patient 6: The Person does not have Heart Disease
Patient 7: The Person does not have Heart Disease
Patient 8: The Person has Heart Disease
Patient 9: The Person has Heart Disease
Patient 10: The Person does not have Heart Disease
Patient 11: The Person has Heart Disease
Patient 12: The Person has Heart Disease
Patient 13: The Person has Heart Disease
Patient 14: The Person does not have Heart Disease
Patient 15: The Person has Heart Disease
