In [37]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [38]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


sex --> 0(female) & 1(male)
CP --> Chest Pain types(4)
trestbps --> resting blood pressure
chol --> Serume cholestrol (mg)
fbs --> Fasting Blood Sugar>200 mg/dl
restecg --> restinf elcetrocardiographic result (values - 0,1,2)
thalach --> maximum heart rate achieved
exangta --> exercise induced angina
oldpeak --> ST depression induced by thallium stress test
slope --> the slope of the peak exercise ST segment
ca --> number of major vessele(0-3) colored by floursopy
thal --> 0 = normal; 1 = fixed defect; 2 = reversable defect
target --> 0 = no disease and 1 = disease.


In [39]:
df.shape

(1025, 14)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [41]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [42]:
df.duplicated().sum()

723

In [43]:
main_df = df.drop_duplicates()
main_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,68,0,2,120,211,0,0,115,0,1.5,1,0,2,1
733,44,0,2,108,141,0,1,175,0,0.6,1,0,2,1
739,52,1,0,128,255,0,1,161,1,0.0,2,1,3,0
843,59,1,3,160,273,0,0,125,0,0.0,2,0,2,0


In [44]:
main_df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0
mean,54.42053,0.682119,0.963576,131.602649,246.5,0.149007,0.52649,149.569536,0.327815,1.043046,1.397351,0.718543,2.31457,0.543046
std,9.04797,0.466426,1.032044,17.563394,51.753489,0.356686,0.526027,22.903527,0.470196,1.161452,0.616274,1.006748,0.613026,0.49897
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.25,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.5,1.0,1.0,130.0,240.5,0.0,1.0,152.5,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.75,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [45]:
main_df['target'].value_counts()

target
1    164
0    138
Name: count, dtype: int64

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
X = main_df.drop(['target'] , axis=1)
Y = main_df['target']

In [48]:
X_train , x_test , Y_train , y_test = train_test_split(X, Y , test_size=0.2 , random_state=20)

In [59]:
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
281,35,1,1,122,192,0,1,174,0,0.0,2,0,2
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
479,58,1,0,128,216,0,0,131,1,2.2,1,3,3
25,61,0,0,145,307,0,0,146,1,1.0,1,0,3
652,66,0,3,150,226,0,1,114,0,2.6,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,63,1,0,140,187,0,0,144,1,4.0,2,2,3
566,50,0,0,110,254,0,0,159,0,0.0,2,0,2
328,70,1,0,130,322,0,0,109,0,2.4,1,3,2
337,54,1,2,125,273,0,0,152,0,0.5,0,1,2


In [50]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()

In [51]:
LR.fit(X_train , Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [63]:
y_pred = LR.predict(x_test)
y_pred

array([0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0], dtype=int64)

In [65]:
from sklearn.metrics import r2_score
r2_score(y_pred , y_test)

0.7208237986270023

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test , y_pred)

0.9344262295081968

In [None]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [None]:
#External input 1st convert into np array
#reshape is required when you want to predict only one values 
input = (44,	0	,2,	108	,141,	0,	1,	175,	0,	0.6,	1	,0	,2)
reshape = np.asarray(input)


In [None]:
print(LR.predict([reshape]))
Prediction = LR.predict([reshape])

if Prediction[0] == 0:
    print("DOES NOT HAVE HEART DISEAS")
else:
    print("YES YOU HAVE....")

[1]
YES YOU HAVE....


