## Import Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

## Import Models

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

## Data Collection and Preprocessing

In [3]:
# load data to a pandas dataframe
heart_data = pd.read_csv("heart.csv")
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
# shape of data
heart_data.shape

(303, 14)

In [6]:
# basic info
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [7]:
# distribution of target
heart_data.target.value_counts()

1    165
0    138
Name: target, dtype: int64

### Splitting Features and Target

In [8]:
X = heart_data.drop(columns="target", axis=1)
Y = heart_data['target']

## Train, Test and Split

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2, stratify=Y, random_state=3)

In [11]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(242, 13)
(61, 13)
(242,)
(61,)


### Comparing Performance of model

In [15]:
# list of models

models = [LogisticRegression(max_iter=1000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier()]

In [16]:
def compare_models():
    for model in models:
        model.fit(X_train, Y_train)
        test_data_prediction = model.predict(X_test)
        accuracy = accuracy_score(Y_test, test_data_prediction)
        print(f"Accuracy Score for {model} is: {accuracy}")

In [17]:
compare_models()

Accuracy Score for LogisticRegression(max_iter=1000) is: 0.7704918032786885
Accuracy Score for SVC(kernel='linear') is: 0.7704918032786885
Accuracy Score for KNeighborsClassifier() is: 0.6557377049180327
Accuracy Score for RandomForestClassifier() is: 0.7868852459016393


## Cross Validation

In [22]:
def compute_cv():
    for model in models:
        cv_score = cross_val_score(model, X, Y, cv=5)
        print(f"Cross Validation accuracies for {model} is: {cv_score}")
        print(f"Average CV Score for {model} : {round((sum(cv_score)/len(cv_score))*100,2)}")
        print("_"*50)

In [23]:
compute_cv()

Cross Validation accuracies for LogisticRegression(max_iter=1000) is: [0.80327869 0.86885246 0.85245902 0.86666667 0.75      ]
Average CV Score for LogisticRegression(max_iter=1000) : 82.83
__________________________________________________
Cross Validation accuracies for SVC(kernel='linear') is: [0.81967213 0.8852459  0.80327869 0.86666667 0.76666667]
Average CV Score for SVC(kernel='linear') : 82.83
__________________________________________________
Cross Validation accuracies for KNeighborsClassifier() is: [0.60655738 0.6557377  0.57377049 0.73333333 0.65      ]
Average CV Score for KNeighborsClassifier() : 64.39
__________________________________________________
Cross Validation accuracies for RandomForestClassifier() is: [0.83606557 0.90163934 0.81967213 0.8        0.76666667]
Average CV Score for RandomForestClassifier() : 82.48
__________________________________________________
