Data Prepared

In [1]:
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Neural Network

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import confusion_matrix, classification_report



In [2]:
# Read dataset
df=pd.read_csv('https://raw.githubusercontent.com/cwa312/Project4/refs/heads/main/Resources/heart.csv')
df


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


## our target:
1 means positive or *Precsence*  0 means negative or Abscence





# **description**



* cp = chestpain
* trestbps=resting blood pressure
* chol =serum cholestoral
* fbs= fasting blood sugar > 120 mg/dl
* thalach=maximum heart rate achieved
* exang =exercise induced angina
* oldpeak=ST depression induced by exercise relative to rest
* restecg=Resting electrocardiographic results
* ca =number of major vessels colored by fluoroscopy
* thal =thalium stress test result
* target presence of heart disease
* slope = slope of the peak excercise st segment




 **Data preprocessing**

In [3]:
# display the summary of the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [4]:
# define the continous feature(Numercal feature)
numerical_feature=["age","trestbps","chol","fbs","thalach","oldpeak"]

In [5]:
# identify the catagorical variable using list comperhensive
categorical_feature = [feature for feature in df.columns if feature not in numerical_feature]
df_categorical = df[categorical_feature]
df_categorical

Unnamed: 0,sex,cp,restecg,exang,slope,ca,thal,target
0,1,3,0,0,0,0,1,1
1,1,2,1,0,0,0,2,1
2,0,1,0,0,2,0,2,1
3,1,1,1,0,2,0,2,1
4,0,0,1,1,2,0,2,1
...,...,...,...,...,...,...,...,...
298,0,0,1,1,1,0,3,0
299,1,3,1,0,1,0,3,0
300,1,0,1,0,1,2,3,0
301,1,0,1,1,1,1,3,0


In [6]:
# remove target from categorical feature
no_target=[target for target in df_categorical if target!='target']
category_df =df[no_target]
category_df

Unnamed: 0,sex,cp,restecg,exang,slope,ca,thal
0,1,3,0,0,0,0,1
1,1,2,1,0,0,0,2
2,0,1,0,0,2,0,2
3,1,1,1,0,2,0,2
4,0,0,1,1,2,0,2
...,...,...,...,...,...,...,...
298,0,0,1,1,1,0,3
299,1,3,1,0,1,0,3
300,1,0,1,0,1,2,3
301,1,0,1,1,1,1,3


In [7]:
#Convert the identified category features to object data type
category_df=category_df.astype('object')


In [8]:
# check the dtypes after it is converted
df.dtypes

Unnamed: 0,0
age,int64
sex,int64
cp,int64
trestbps,int64
chol,int64
fbs,int64
restecg,int64
thalach,int64
exang,int64
oldpeak,float64


**Explore data**

In [9]:
# get the summary statistics for numerical variable
df[numerical_feature].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,303.0,54.366337,9.082101,29.0,47.5,55.0,61.0,77.0
trestbps,303.0,131.623762,17.538143,94.0,120.0,130.0,140.0,200.0
chol,303.0,246.264026,51.830751,126.0,211.0,240.0,274.5,564.0
fbs,303.0,0.148515,0.356198,0.0,0.0,0.0,0.0,1.0
thalach,303.0,149.646865,22.905161,71.0,133.5,153.0,166.0,202.0
oldpeak,303.0,1.039604,1.161075,0.0,0.0,0.8,1.6,6.2


In [10]:
# get the summary statistics for categorical variable
df[categorical_feature].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sex,303.0,0.683168,0.466011,0.0,0.0,1.0,1.0,1.0
cp,303.0,0.966997,1.032052,0.0,0.0,1.0,2.0,3.0
restecg,303.0,0.528053,0.52586,0.0,0.0,1.0,1.0,2.0
exang,303.0,0.326733,0.469794,0.0,0.0,0.0,1.0,1.0
slope,303.0,1.39934,0.616226,0.0,1.0,1.0,2.0,2.0
ca,303.0,0.729373,1.022606,0.0,0.0,0.0,1.0,4.0
thal,303.0,2.313531,0.612277,0.0,2.0,2.0,3.0,3.0
target,303.0,0.544554,0.498835,0.0,0.0,1.0,1.0,1.0


In [11]:
# check for missing value in the dataset
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


In [12]:
# check outlier for the numerical variable
q1=df[numerical_feature].quantile(.25)
q3=df[numerical_feature].quantile(.75)
IQR=q3-q1
lower_bound=q1-1.5*IQR
upper_bound =q3+1.5*IQR
out_lier_count =(df[numerical_feature]<lower_bound) | (df[numerical_feature]>upper_bound)
out_lier_count.sum()

Unnamed: 0,0
age,0
trestbps,9
chol,5
fbs,45
thalach,1
oldpeak,5


In [13]:
# encode the categorical variable using get_dummies()

cat_encode =pd.get_dummies(df[no_target],dtype=int)
cat_encode

Unnamed: 0,sex,cp,restecg,exang,slope,ca,thal
0,1,3,0,0,0,0,1
1,1,2,1,0,0,0,2
2,0,1,0,0,2,0,2
3,1,1,1,0,2,0,2
4,0,0,1,1,2,0,2
...,...,...,...,...,...,...,...
298,0,0,1,1,1,0,3
299,1,3,1,0,1,0,3
300,1,0,1,0,1,2,3
301,1,0,1,1,1,1,3


In [14]:
# concate the categorical and numerical variable and target
concate_df=pd.concat([cat_encode,df[numerical_feature],df['target']],axis=1)
concate_df.head()

Unnamed: 0,sex,cp,restecg,exang,slope,ca,thal,age,trestbps,chol,fbs,thalach,oldpeak,target
0,1,3,0,0,0,0,1,63,145,233,1,150,2.3,1
1,1,2,1,0,0,0,2,37,130,250,0,187,3.5,1
2,0,1,0,0,2,0,2,41,130,204,0,172,1.4,1
3,1,1,1,0,2,0,2,56,120,236,0,178,0.8,1
4,0,0,1,1,2,0,2,57,120,354,0,163,0.6,1


our target is target positive

In [15]:
# Split our preprocessed data into our features and target arrays
y=concate_df['target'].values
x=concate_df.drop(columns=['target']).values

**Scale the dataset**

In [16]:
# # Splitting data into train and test sets
x_train,x_test,y_train,y_test =train_test_split(x,y,random_state=0,stratify=y)



# KNN Model

In [18]:


# Create a KNN pipeline
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: Standardize the features
    ('knn', KNeighborsClassifier())  # Step 2: KNN classifier
])

# Define the parameter grid for KNN
knn_param_grid = {
    'knn__n_neighbors':list(range(1,10)),    # Number of neighbors
    'knn__weights': ['uniform', 'distance'], # Weight function used in prediction
    'knn__metric': ['euclidean', 'manhattan', 'minkowski'] # Distance metric
}

def tune_hyperparameters(pipeline, param_grid, x_train, y_train):
    # Create a GridSearchCV object
    grid_search = GridSearchCV(pipeline, param_grid, refit=True, verbose=2, cv=5)

    # Perform Grid Search and fit the model
    grid_search.fit(x_train,y_train)

    # Get the best model and hyperparameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    return best_model, best_params

# Tune hyperparameters for KNN
best_knn, best_knn_hyperparams = tune_hyperparameters(knn_pipeline, knn_param_grid, x_train, y_train)
print('KNN Optimal Hyperparameters: \n', best_knn_hyperparams)

# Evaluate the best KNN model
y_pred = best_knn.predict(x_test)



Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END knn__metric=euclidean, knn__n_neighbors=1, knn__weights=uniform; total time=   0.0s
[CV] END knn__metric=euclidean, knn__n_neighbors=1, knn__weights=uniform; total time=   0.0s
[CV] END knn__metric=euclidean, knn__n_neighbors=1, knn__weights=uniform; total time=   0.0s
[CV] END knn__metric=euclidean, knn__n_neighbors=1, knn__weights=uniform; total time=   0.0s
[CV] END knn__metric=euclidean, knn__n_neighbors=1, knn__weights=uniform; total time=   0.0s
[CV] END knn__metric=euclidean, knn__n_neighbors=1, knn__weights=distance; total time=   0.0s
[CV] END knn__metric=euclidean, knn__n_neighbors=1, knn__weights=distance; total time=   0.0s
[CV] END knn__metric=euclidean, knn__n_neighbors=1, knn__weights=distance; total time=   0.0s
[CV] END knn__metric=euclidean, knn__n_neighbors=1, knn__weights=distance; total time=   0.0s
[CV] END knn__metric=euclidean, knn__n_neighbors=1, knn__weights=distance; total time=   0.0s
[CV

 Check for overfitting

In [19]:
# print the classification report for the test
print(classification_report(y_test,y_pred,target_names=["Absence","Presence"]))

              precision    recall  f1-score   support

     Absence       0.90      0.74      0.81        35
    Presence       0.81      0.93      0.86        41

    accuracy                           0.84        76
   macro avg       0.85      0.83      0.84        76
weighted avg       0.85      0.84      0.84        76



In [20]:
# print classification report for the train dataset
print(classification_report(y_train,best_knn.predict(x_train),target_names=["Absence","Presence"]))

              precision    recall  f1-score   support

     Absence       1.00      1.00      1.00       103
    Presence       1.00      1.00      1.00       124

    accuracy                           1.00       227
   macro avg       1.00      1.00      1.00       227
weighted avg       1.00      1.00      1.00       227



#Note:
the data is not  overfitted as we see that the recall on both train and test are not big different.

#pattern:
93% the model identified most of  the heart disease.

In [21]:
# Pass the variable on the test clssfication report
out_put=classification_report(y_test,best_knn.predict(x_test),output_dict=True,target_names=["Absence","Presence"])
out_put

{'Absence': {'precision': 0.896551724137931,
  'recall': 0.7428571428571429,
  'f1-score': 0.8125,
  'support': 35.0},
 'Presence': {'precision': 0.8085106382978723,
  'recall': 0.926829268292683,
  'f1-score': 0.8636363636363636,
  'support': 41.0},
 'accuracy': 0.8421052631578947,
 'macro avg': {'precision': 0.8525311812179017,
  'recall': 0.8348432055749129,
  'f1-score': 0.8380681818181819,
  'support': 76.0},
 'weighted avg': {'precision': 0.8490558751978994,
  'recall': 0.8421052631578947,
  'f1-score': 0.8400867224880383,
  'support': 76.0}}

In [22]:
# convert to dataframe
KNN_report_df = pd.DataFrame(out_put).transpose()
KNN_report_df

Unnamed: 0,precision,recall,f1-score,support
Absence,0.896552,0.742857,0.8125,35.0
Presence,0.808511,0.926829,0.863636,41.0
accuracy,0.842105,0.842105,0.842105,0.842105
macro avg,0.852531,0.834843,0.838068,76.0
weighted avg,0.849056,0.842105,0.840087,76.0


In [23]:
# filter the only recall that have positive result
knn_recal = KNN_report_df[KNN_report_df.index=='Presence']
knn_recal

Unnamed: 0,precision,recall,f1-score,support
Presence,0.808511,0.926829,0.863636,41.0


# SVM Model

In [24]:
# Define the parameter grid for SVM


In [25]:
# Create a GridSearchCV object



In [27]:
# Get the best parameters



Best parameters: {'C': 10, 'gamma': 1, 'kernel': 'linear'}


In [28]:
# make prediction on test dataset


check for overfitting by printing classification report on both train and test and check the difference on recall

In [56]:
# print classification report on test dataset



In [57]:
# print classification report on train dataset.


In [58]:
# convert the test classification report to dictionary


In [59]:
# convert the test classification report to dataframe


In [60]:
# filter out the positive result from datafram


#Random Forest

In [61]:
# Define the parameter grid for Random Forest




In [62]:
# Create a GridSearchCV object

# fit the train data to the model


In [63]:
# Get the best parameters


# Make predictions using the best model


 ** DT MODEL**

**LogisticRegression**