In [54]:
# import basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [55]:
# import the dataset
# 'heart.csv' is a relative path, so it is resolved against the kernel's
# current working directory.
df = pd.read_csv('heart.csv')

In [56]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


# Preprocessing

In [57]:
# Count missing values per column (isnull is the pandas alias of isna).
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [58]:
# creating train, dev set
# df carries the default 0..n-1 index from read_csv, so positional slicing
# selects the same rows as the inclusive label slice 0..900.
train = df.iloc[:901]  # training set: first 901 rows
dev = df.iloc[901:]    # development set to test overfitting: remaining rows

In [59]:
# (rows, columns) of the training pool and of the development hold-out.
train.shape, dev.shape

((901, 14), (124, 14))

In [60]:
# Class balance of the label column inside the training pool.
train['target'].value_counts()

target
1    464
0    437
Name: count, dtype: int64

In [61]:
# Class balance of the label column inside the development set.
dev['target'].value_counts()

target
1    62
0    62
Name: count, dtype: int64

In [62]:
# creating dependent and independent matrix of features
# 'target' is the last column of the frame, so dropping it by name keeps
# exactly the leading feature columns.
x = train.drop(columns='target')  # feature matrix
y = train['target']               # label vector

# Modeling

In [63]:
# create training and test sets
from sklearn.model_selection import train_test_split

# Hold out 25% of the training pool for testing; the fixed seed keeps the
# partition identical across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=31,
)

In [64]:
# Row counts of the four pieces returned by the split, in the same order.
tuple(len(part) for part in (x_train, x_test, y_train, y_test))

(675, 226, 675, 226)

In [65]:
# (rows, feature columns) of the training feature matrix.
x_train.shape

(675, 13)

# Logistic Regression

### Train set

In [66]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with an iteration cap of 1000 and a fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
log_clf = LogisticRegression(random_state=31, max_iter=1000).fit(x_train, y_train)

# Mean accuracy on the held-out test split.
print(log_clf.score(x_test, y_test))

0.8938053097345132


In [67]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the logistic model on the test split.
y_preds = log_clf.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_preds))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88       100
           1       0.90      0.91      0.91       126

    accuracy                           0.89       226
   macro avg       0.89      0.89      0.89       226
weighted avg       0.89      0.89      0.89       226



In [68]:
# confusion matrix
from sklearn.metrics import confusion_matrix

# Built from the most recent y_preds; rows are true labels and columns are
# predicted labels (scikit-learn convention).
conf_class = confusion_matrix(y_true=y_test, y_pred=y_preds)
print(conf_class)

[[ 87  13]
 [ 11 115]]


### dev set (to test overfitting)

In [69]:
# Separate the development frame into features and labels.
devx = dev.drop(columns='target')
devy = dev.target

In [70]:
# Logistic model on the development set: keep the predictions for the report
# cell and display the mean accuracy.
dev_preds = log_clf.predict(X=devx)
log_clf.score(X=devx, y=devy)

0.782258064516129

In [71]:
# Per-class metrics for the current dev_preds against the development labels.
print(classification_report(y_true=devy, y_pred=dev_preds))

              precision    recall  f1-score   support

           0       0.82      0.73      0.77        62
           1       0.75      0.84      0.79        62

    accuracy                           0.78       124
   macro avg       0.79      0.78      0.78       124
weighted avg       0.79      0.78      0.78       124



# KNN

### Train set

In [72]:
from sklearn.neighbors import KNeighborsClassifier

# Parameters taken from grid search best params.
# fit() returns the estimator itself, so construction and training are chained.
knn_clf = KNeighborsClassifier(
    n_neighbors=2,
    p=2,
    leaf_size=10,
    algorithm='auto',
).fit(x_train, y_train)

# Mean accuracy on the held-out test split.
print(knn_clf.score(x_test, y_test))

0.8805309734513275


In [73]:
# Per-class metrics of the KNN model on the test split (overwrites y_preds).
y_preds = knn_clf.predict(X=x_test)
print(classification_report(y_true=y_test, y_pred=y_preds))

              precision    recall  f1-score   support

           0       0.79      0.99      0.88       100
           1       0.99      0.79      0.88       126

    accuracy                           0.88       226
   macro avg       0.89      0.89      0.88       226
weighted avg       0.90      0.88      0.88       226



### dev set (to test overfitting)

In [74]:
# KNN on the development set: keep the predictions for the report cell and
# display the mean accuracy.
dev_preds = knn_clf.predict(X=devx)
knn_clf.score(X=devx, y=devy)

0.8790322580645161

In [75]:
# Per-class metrics for the current dev_preds against the development labels.
print(classification_report(y_true=devy, y_pred=dev_preds))

              precision    recall  f1-score   support

           0       0.82      0.97      0.89        62
           1       0.96      0.79      0.87        62

    accuracy                           0.88       124
   macro avg       0.89      0.88      0.88       124
weighted avg       0.89      0.88      0.88       124



# SVC

### Train set

In [76]:
from sklearn import svm

# Support-vector classifier with library-default hyperparameters and a fixed seed.
# NOTE(review): the features are passed in unscaled; SVC is typically
# scale-sensitive, so a StandardScaler pipeline may be worth trying -- confirm.
svc_clf = svm.SVC(random_state=7).fit(x_train, y_train)

# Mean accuracy on the held-out test split.
print(svc_clf.score(x_test, y_test))

0.7256637168141593


In [77]:
# Per-class metrics of the SVC model on the test split (overwrites y_preds).
y_preds = svc_clf.predict(X=x_test)
print(classification_report(y_true=y_test, y_pred=y_preds))

              precision    recall  f1-score   support

           0       0.71      0.65      0.68       100
           1       0.74      0.79      0.76       126

    accuracy                           0.73       226
   macro avg       0.72      0.72      0.72       226
weighted avg       0.72      0.73      0.72       226



### dev set (to test overfitting)

In [78]:
# SVC on the development set: keep the predictions for the report cell and
# display the mean accuracy.
dev_preds = svc_clf.predict(X=devx)
svc_clf.score(X=devx, y=devy)

0.7016129032258065

In [79]:
# Per-class metrics for the current dev_preds against the development labels.
print(classification_report(y_true=devy, y_pred=dev_preds))

              precision    recall  f1-score   support

           0       0.77      0.58      0.66        62
           1       0.66      0.82      0.73        62

    accuracy                           0.70       124
   macro avg       0.71      0.70      0.70       124
weighted avg       0.71      0.70      0.70       124



# Random Forest

### Train set

In [80]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyperparameters; the seed fixes the
# randomness so results repeat across runs.
rand_clf = RandomForestClassifier(random_state=31).fit(x_train, y_train)

# Mean accuracy on the held-out test split.
print(rand_clf.score(x_test, y_test))

0.9867256637168141


In [81]:
# Per-class metrics of the random forest on the test split (overwrites y_preds).
y_preds = rand_clf.predict(X=x_test)
print(classification_report(y_true=y_test, y_pred=y_preds))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       100
           1       0.99      0.98      0.99       126

    accuracy                           0.99       226
   macro avg       0.99      0.99      0.99       226
weighted avg       0.99      0.99      0.99       226



### dev set (to test overfitting)

In [82]:
# Random forest on the development set: keep the predictions for the report
# cell and display the mean accuracy.
dev_preds = rand_clf.predict(X=devx)
rand_clf.score(X=devx, y=devy)

0.9758064516129032

In [83]:
# Per-class metrics for the current dev_preds against the development labels.
print(classification_report(y_true=devy, y_pred=dev_preds))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98        62
           1       0.97      0.98      0.98        62

    accuracy                           0.98       124
   macro avg       0.98      0.98      0.98       124
weighted avg       0.98      0.98      0.98       124



In [84]:
import pickle

In [85]:
# Relative path of the file the trained model is pickled to.
filename="trained_heart_model.sav"

In [86]:
# Serialise the fitted random forest to disk.
# The context manager flushes and closes the file handle even if pickling
# raises; the previous bare open() call never closed it.
with open(filename, 'wb') as model_file:
    pickle.dump(rand_clf, model_file)

In [87]:
# Loading the model

In [88]:
# Restore the persisted classifier from disk.
# The context manager closes the file handle once unpickling finishes (or
# fails); the previous bare open() call never closed it.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('trained_heart_model.sav', 'rb') as model_file:
    loadmodel = pickle.load(model_file)

Testing output

In [89]:
# One patient record, listed in the order of the training feature columns:
# age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal
input_data = (
    53, 1, 0, 140, 203, 1, 0,
    155, 1, 3.1, 0, 0, 3,
)

In [90]:
# Convert the record tuple into a one-dimensional NumPy array.
numpy_array=np.asarray(input_data)

In [91]:
# Turn the flat record into a single-row 2-D array (1 sample x n features),
# the layout predict() expects; -1 lets NumPy infer the column count.
input_data_reshape = np.reshape(numpy_array, (1, -1))

In [92]:
# Show the single-row array that will be fed to the model.
print(input_data_reshape)

[[ 53.    1.    0.  140.  203.    1.    0.  155.    1.    3.1   0.    0.
    3. ]]


In [93]:
# The forest was fitted on a DataFrame, so it stores the training column
# names in feature_names_in_. Re-attach those names to the raw array before
# predicting: passing a bare ndarray makes scikit-learn warn that X does not
# have valid feature names. Values are matched to columns by position.
prediction = loadmodel.predict(
    pd.DataFrame(input_data_reshape, columns=loadmodel.feature_names_in_)
)



In [94]:
# Show the predicted class label(s); one entry per input row.
print(prediction)

[0]


In [95]:
# Map the predicted label of the single input row to a message:
# label 0 prints the negative message, any other label the positive one.
print(
    "The person does not have heart attack"
    if prediction[0] == 0
    else "The person have heart attack"
)

The person does not have heart attack
