# Heart prediction

This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them. In particular, the Cleveland database is the only one that has been used by ML researchers to
this date. The "goal" field refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4.

Attribute Information:

age,,
sex,,
chest pain type (4 values),,
resting blood pressure,,
serum cholestoral in mg/dl,,
fasting blood sugar > 120 mg/dl,,
resting electrocardiographic results (values 0,1,2),,
maximum heart rate achieved,,
exercise induced angina,,
oldpeak = ST depression induced by exercise relative to rest,,
the slope of the peak exercise ST segment,,
number of major vessels (0-3) colored by flourosopy,,
thal: 3 = normal; 6 = fixed defect; 7 = reversable defect.

In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [4]:
data=pd.read_csv("heart.csv")
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [5]:
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [6]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [7]:
data.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0


In [9]:
# checking the shape of dataset

data.shape

(303, 14)

In [10]:
# taking information about data

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [12]:
# getting description about the data

data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [13]:
# checking thwe values of target variables 

data['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [32]:
names_col=['age','sex','chest pain','rest BP','cholestrol','fasting blood sugar ','resting ECG','max heart rate','exe induced angina','ST depression','slope','ca','thal','target']

In [33]:
data=pd.read_csv('heart.csv',names=names_col)
data

Unnamed: 0,age,sex,chest pain,rest BP,cholestrol,fasting blood sugar,resting ECG,max heart rate,exe induced angina,ST depression,slope,ca,thal,target
0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
1,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
2,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
3,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
4,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
300,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
301,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
302,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [34]:
data=data.iloc[1:]
data

Unnamed: 0,age,sex,chest pain,rest BP,cholestrol,fasting blood sugar,resting ECG,max heart rate,exe induced angina,ST depression,slope,ca,thal,target
1,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
2,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
3,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
4,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
5,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
300,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
301,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
302,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


 in target column 
 0---- healthy heart
1---- defective heart

# distributing the data

In [36]:
x=data.drop(columns=['target'])
y=data['target']

In [37]:
x

Unnamed: 0,age,sex,chest pain,rest BP,cholestrol,fasting blood sugar,resting ECG,max heart rate,exe induced angina,ST depression,slope,ca,thal
1,63,1,3,145,233,1,0,150,0,2.3,0,0,1
2,37,1,2,130,250,0,1,187,0,3.5,0,0,2
3,41,0,1,130,204,0,0,172,0,1.4,2,0,2
4,56,1,1,120,236,0,1,178,0,0.8,2,0,2
5,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,57,0,0,140,241,0,1,123,1,0.2,1,0,3
300,45,1,3,110,264,0,1,132,0,1.2,1,0,3
301,68,1,0,144,193,1,1,141,0,3.4,1,2,3
302,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [38]:
y

1      1
2      1
3      1
4      1
5      1
      ..
299    0
300    0
301    0
302    0
303    0
Name: target, Length: 303, dtype: object

# now splitting the data 

In [40]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=2)

In [41]:
x.shape

(303, 13)

In [42]:
y.shape

(303,)

# training  the data

In [43]:
model=LogisticRegression()
model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [44]:
y_pred=model.predict(x_test)

In [46]:
print('accuracy',metrics.accuracy_score(y_test,y_pred))

accuracy 0.9210526315789473


# predicting the new data

In [60]:
input_data=(56,1,1,120,236,0,1,178,0,0.8,2,0,2)

In [61]:
# now changing the input to numpy array

input_to_numpy=np.asarray(input_data)

In [62]:
input_to_numpy

array([ 56. ,   1. ,   1. , 120. , 236. ,   0. ,   1. , 178. ,   0. ,
         0.8,   2. ,   0. ,   2. ])

In [66]:
# now changing the dimension

reshaped_input=input_to_numpy.reshape(1,-1)

In [67]:
reshaped_input

array([[ 56. ,   1. ,   1. , 120. , 236. ,   0. ,   1. , 178. ,   0. ,
          0.8,   2. ,   0. ,   2. ]])

In [68]:
prediction=model.predict(reshaped_input)

In [69]:
print(prediction)

['1']
