Importing the Dependencies

In [76]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')

Data Collection and Processing

In [77]:
# loading the csv data to a Pandas DataFrame
data = pd.read_csv('/content/heart_2020_cleaned.csv')

In [78]:
# print first 5 rows of the dataset
data.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [79]:
# print last 5 rows of the dataset
data.tail()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No
319794,No,46.56,No,No,No,0.0,0.0,No,Female,80 or older,Hispanic,No,Yes,Good,8.0,No,No,No


In [80]:
# number of rows and columns in the dataset
data.shape

(319795, 18)

In [81]:
# getting some info about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [82]:
data['Race'].unique()

array(['White', 'Black', 'Asian', 'American Indian/Alaskan Native',
       'Other', 'Hispanic'], dtype=object)

In [83]:
data['GenHealth'].unique()

array(['Very good', 'Fair', 'Good', 'Poor', 'Excellent'], dtype=object)

In [84]:
data['Smoking']=data['Smoking'].map({'Yes':1,'No':0})
data['AlcoholDrinking']=data['AlcoholDrinking'].map({'Yes':1,'No':0})
data['Stroke']=data['Stroke'].map({'Yes':1,'No':0})
data['DiffWalking']=data['DiffWalking'].map({'Yes':1,'No':0})
data['Diabetic']=data['Diabetic'].map({'Yes':1,'No':0})
data['Asthma']=data['Asthma'].map({'Yes':1,'No':0})
data['KidneyDisease']=data['KidneyDisease'].map({'Yes':1,'No':0})
data['SkinCancer']=data['SkinCancer'].map({'Yes':1,'No':0})
data['PhysicalActivity']=data['PhysicalActivity'].map({'Yes':1,'No':0})
data['Sex']=data['Sex'].map({'Male':1,'Female':0})
data['Race']=data['Race'].map({'White':5, 'Black':4, 'Asian':3, 'American Indian/Alaskan Native':2,'Other':1, 'Hispanic':0})
data['GenHealth']=data['GenHealth'].map({'Very good':4, 'Fair':3, 'Good':2, 'Poor':1, 'Excellent':0})
data['HeartDisease']=data['HeartDisease'].map({'Yes':1, 'No':0})

In [85]:
data.replace('80 or older','80-85',inplace=True)
data.replace('80 or older','80-85',inplace=True)

In [86]:
data.dtypes

HeartDisease          int64
BMI                 float64
Smoking               int64
AlcoholDrinking       int64
Stroke                int64
PhysicalHealth      float64
MentalHealth        float64
DiffWalking           int64
Sex                   int64
AgeCategory          object
Race                  int64
Diabetic            float64
PhysicalActivity      int64
GenHealth             int64
SleepTime           float64
Asthma                int64
KidneyDisease         int64
SkinCancer            int64
dtype: object

**To convert the categorical to numerical we use the LabelEncoder in sklearn**

In [87]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['Sex']=le.fit_transform(data['Sex'])
data['AgeCategory']=le.fit_transform(data['AgeCategory'])
data['Race']=le.fit_transform(data['Race'])
data['Diabetic']=le.fit_transform(data['Diabetic'])
data['GenHealth']=le.fit_transform(data['GenHealth'])

In [88]:
# checking for missing values
data.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [89]:
# statistical measures about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  int64  
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  int64  
 3   AlcoholDrinking   319795 non-null  int64  
 4   Stroke            319795 non-null  int64  
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  int64  
 8   Sex               319795 non-null  int64  
 9   AgeCategory       319795 non-null  int64  
 10  Race              319795 non-null  int64  
 11  Diabetic          319795 non-null  int64  
 12  PhysicalActivity  319795 non-null  int64  
 13  GenHealth         319795 non-null  int64  
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  int64  
 16  KidneyDisease     31

In [90]:
# checking the distribution of Target Variable
data['HeartDisease'].value_counts()

HeartDisease
0    292422
1     27373
Name: count, dtype: int64

0 --> Healthy Heart

1 --> Defective Heart

Splitting the Features and Target

In [91]:
X = data.drop(columns='HeartDisease', axis=1)
Y = data['HeartDisease']

In [92]:
print(X)

          BMI  Smoking  AlcoholDrinking  Stroke  PhysicalHealth  MentalHealth  \
0       16.60        1                0       0             3.0          30.0   
1       20.34        0                0       1             0.0           0.0   
2       26.58        1                0       0            20.0          30.0   
3       24.21        0                0       0             0.0           0.0   
4       23.71        0                0       0            28.0           0.0   
...       ...      ...              ...     ...             ...           ...   
319790  27.41        1                0       0             7.0           0.0   
319791  29.84        1                0       0             0.0           0.0   
319792  24.24        0                0       0             0.0           0.0   
319793  32.81        0                0       0             0.0           0.0   
319794  46.56        0                0       0             0.0           0.0   

        DiffWalking  Sex  A

In [93]:
print(Y)

0         0
1         0
2         0
3         0
4         0
         ..
319790    1
319791    0
319792    0
319793    0
319794    0
Name: HeartDisease, Length: 319795, dtype: int64


Splitting the Data into Training data & Test Data

In [94]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

In [95]:
print(X.shape, X_train.shape, X_test.shape)

(319795, 17) (255836, 17) (63959, 17)


Model Training

Logistic Regression

In [96]:
model = LogisticRegression()

In [98]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

Naive Bayes

In [103]:
gnb = GaussianNB()

In [104]:
# training the Gaussian Naive Bayes model with Training data
gnb.fit(X_train,Y_train)

Model Evaluation

Accuracy Score

In [99]:
# accuracy on training data in Logistic Regression
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [100]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9146718991854157


In [101]:
# accuracy on test data in Logistic Regression
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [102]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.9156959927453525


In [105]:
# accuracy on training data in Naive Bayes
X_tr_predict_nb = gnb.predict(X_train)
training_data_accuracy_nb = accuracy_score(X_tr_predict_nb, Y_train)

In [106]:
print('Accuracy on Training data : ', training_data_accuracy_nb)

Accuracy on Training data :  0.846499319876796


In [107]:
# accuracy on test data in Logistic Regression
X_test_predict_nb = gnb.predict(X_test)
test_data_accuracy_nb = accuracy_score(X_test_predict_nb, Y_test)

In [108]:
print('Accuracy on Test data : ', test_data_accuracy_nb)

Accuracy on Test data :  0.8459794555887366


So, It's better to work with Logistic Regression Model in this case.

In [111]:
data.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3.0,30.0,0,0,7,5,1,1,4,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,12,5,0,1,4,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,9,5,1,1,3,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,11,5,0,0,2,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,4,5,0,1,4,8.0,0,0,0


Building a Predictive System

In [112]:
input_data = (16.6,1,0,0,3.0,30.0,0,0,7,5,1,1,4,5.0,1,0,1)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not have a Heart Disease')
else:
  print('The Person has Heart Disease')

[0]
The Person does not have a Heart Disease


**Saving The Trained Model**

In [113]:
import pickle

In [114]:
file = 'trained_model_heartDisease.sav'
pickle.dump(model, open(file, 'wb'))

In [115]:
#loading the saved model
loaded_model = pickle.load(open('trained_model_heartDisease.sav', 'rb'))

In [117]:
input_data = (16.6,1,0,0,3.0,30.0,0,0,7,5,1,1,4,5.0,1,0,1)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)


prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[0]
The person is not diabetic
