In [None]:
'''
Analyse the heartDisease.csv dataset to explore the machine learning algorithms and build decision tree models to predict the 
disease.
'''

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("HeartDisease.csv")
df

Unnamed: 0,age,gender,chest_pain,rest_bps,cholestrol,fasting_blood_sugar,rest_ecg,thalach,exer_angina,old_peak,slope,ca,thalassemia,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  303 non-null    int64  
 1   gender               303 non-null    int64  
 2   chest_pain           303 non-null    int64  
 3   rest_bps             303 non-null    int64  
 4   cholestrol           303 non-null    int64  
 5   fasting_blood_sugar  303 non-null    int64  
 6   rest_ecg             303 non-null    int64  
 7   thalach              303 non-null    int64  
 8   exer_angina          303 non-null    int64  
 9   old_peak             303 non-null    float64
 10  slope                303 non-null    int64  
 11  ca                   303 non-null    int64  
 12  thalassemia          303 non-null    int64  
 13  target               303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [4]:
df["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

In [6]:
x = df.iloc[:,:-1]
x

Unnamed: 0,age,gender,chest_pain,rest_bps,cholestrol,fasting_blood_sugar,rest_ecg,thalach,exer_angina,old_peak,slope,ca,thalassemia
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [7]:
y = df.iloc[:,-1]
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [8]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size = 0.3, random_state = 42)

In [9]:
def mymodel(model):
    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)
    acc = accuracy_score(ytest, ypred)
    cr = classification_report(ytest, ypred)
    print(f"Accuracy is {acc} \n\n Classfication Report \n {cr}")

In [10]:
lr = LogisticRegression()
mymodel(lr)

Accuracy is 0.8131868131868132 

 Classfication Report 
               precision    recall  f1-score   support

           0       0.80      0.78      0.79        41
           1       0.82      0.84      0.83        50

    accuracy                           0.81        91
   macro avg       0.81      0.81      0.81        91
weighted avg       0.81      0.81      0.81        91



In [11]:
dt = DecisionTreeClassifier()
mymodel(dt)

Accuracy is 0.7362637362637363 

 Classfication Report 
               precision    recall  f1-score   support

           0       0.68      0.78      0.73        41
           1       0.80      0.70      0.74        50

    accuracy                           0.74        91
   macro avg       0.74      0.74      0.74        91
weighted avg       0.74      0.74      0.74        91



In [13]:
dt.score(xtrain, ytrain)

1.0

In [14]:
dt.score(xtest, ytest)

0.7362637362637363

In [16]:
acc_list = []
depth_list = []
for i in range(1,20):
    dt1 = DecisionTreeClassifier(max_depth = i)
    dt1.fit(xtrain, ytrain)
    ypred = dt1.predict(xtest)
    acc = accuracy_score(ytest, ypred)
    
    acc_list.append(acc)
    depth_list.append(i)
    
    print(f"The accuracy for max_depth = {i} is {acc}")

The accuracy for max_depth = 1 is 0.6923076923076923
The accuracy for max_depth = 2 is 0.6923076923076923
The accuracy for max_depth = 3 is 0.7692307692307693
The accuracy for max_depth = 4 is 0.7362637362637363
The accuracy for max_depth = 5 is 0.7582417582417582
The accuracy for max_depth = 6 is 0.7582417582417582
The accuracy for max_depth = 7 is 0.7252747252747253
The accuracy for max_depth = 8 is 0.7362637362637363
The accuracy for max_depth = 9 is 0.7252747252747253
The accuracy for max_depth = 10 is 0.7142857142857143
The accuracy for max_depth = 11 is 0.7252747252747253
The accuracy for max_depth = 12 is 0.7252747252747253
The accuracy for max_depth = 13 is 0.7472527472527473
The accuracy for max_depth = 14 is 0.7142857142857143
The accuracy for max_depth = 15 is 0.7362637362637363
The accuracy for max_depth = 16 is 0.7582417582417582
The accuracy for max_depth = 17 is 0.7362637362637363
The accuracy for max_depth = 18 is 0.7472527472527473
The accuracy for max_depth = 19 is 0.

In [19]:
max_acc = acc_list[0]
for i in range(1,len(acc_list)):
    if acc_list[i] > max_acc:
        max_acc = acc_list[i]

max_index = acc_list.index(max_acc)
print(f"We observed that, Highest accuracy for our model is we get when max_depth is {max_index + 1}")

We observed that, Highest accuracy for our model is we get when max_depth is 3


In [24]:
dt2 = DecisionTreeClassifier(max_depth = 3)
mymodel(dt2)

Accuracy is 0.7692307692307693 

 Classfication Report 
               precision    recall  f1-score   support

           0       0.73      0.78      0.75        41
           1       0.81      0.76      0.78        50

    accuracy                           0.77        91
   macro avg       0.77      0.77      0.77        91
weighted avg       0.77      0.77      0.77        91



In [25]:
dt2.score(xtrain, ytrain)

0.8679245283018868

In [33]:
acc_list = []
split_list = []
for i in range(2,100):
    dt3 = DecisionTreeClassifier(min_samples_split = i)
    dt3.fit(xtrain, ytrain)
    ypred = dt3.predict(xtest)
    acc = accuracy_score(ytest, ypred)
    
    acc_list.append(acc)
    split_list.append(i)
    
    print(f"The accuracy for min_samples_split = {i} is {acc}")

The accuracy for min_samples_split = 2 is 0.7252747252747253
The accuracy for min_samples_split = 3 is 0.7142857142857143
The accuracy for min_samples_split = 4 is 0.7362637362637363
The accuracy for min_samples_split = 5 is 0.7252747252747253
The accuracy for min_samples_split = 6 is 0.7582417582417582
The accuracy for min_samples_split = 7 is 0.7472527472527473
The accuracy for min_samples_split = 8 is 0.7472527472527473
The accuracy for min_samples_split = 9 is 0.7252747252747253
The accuracy for min_samples_split = 10 is 0.7142857142857143
The accuracy for min_samples_split = 11 is 0.7142857142857143
The accuracy for min_samples_split = 12 is 0.7142857142857143
The accuracy for min_samples_split = 13 is 0.7142857142857143
The accuracy for min_samples_split = 14 is 0.7142857142857143
The accuracy for min_samples_split = 15 is 0.7142857142857143
The accuracy for min_samples_split = 16 is 0.7142857142857143
The accuracy for min_samples_split = 17 is 0.7142857142857143
The accuracy for

In [34]:
max_acc = acc_list[0]
for i in range(1,len(acc_list)):
    if acc_list[i] > max_acc:
        max_acc = acc_list[i]

max_index = acc_list.index(max_acc)
print(f"We observed that, Highest accuracy for our model is we get when min_samples_split is {max_index + 2}")

We observed that, Highest accuracy for our model is we get when min_samples_split is 20


In [35]:
dt4 = DecisionTreeClassifier(min_samples_split = 20)
mymodel(dt4)

Accuracy is 0.7692307692307693 

 Classfication Report 
               precision    recall  f1-score   support

           0       0.71      0.83      0.76        41
           1       0.84      0.72      0.77        50

    accuracy                           0.77        91
   macro avg       0.77      0.77      0.77        91
weighted avg       0.78      0.77      0.77        91



In [36]:
dt4.score(xtrain, ytrain)

0.8867924528301887

In [47]:
acc_list = []
leaf_list = []
for i in range(1,50):
    dt5 = DecisionTreeClassifier(min_samples_leaf = i)
    dt5.fit(xtrain, ytrain)
    ypred = dt5.predict(xtest)
    acc = accuracy_score(ytest, ypred)
    
    acc_list.append(acc)
    leaf_list.append(i)
    
    print(f"The accuracy for min_samples_leaf = {i} is {acc}")

The accuracy for min_samples_leaf = 1 is 0.7252747252747253
The accuracy for min_samples_leaf = 2 is 0.6813186813186813
The accuracy for min_samples_leaf = 3 is 0.7472527472527473
The accuracy for min_samples_leaf = 4 is 0.7692307692307693
The accuracy for min_samples_leaf = 5 is 0.7142857142857143
The accuracy for min_samples_leaf = 6 is 0.7142857142857143
The accuracy for min_samples_leaf = 7 is 0.7142857142857143
The accuracy for min_samples_leaf = 8 is 0.7142857142857143
The accuracy for min_samples_leaf = 9 is 0.7692307692307693
The accuracy for min_samples_leaf = 10 is 0.8021978021978022
The accuracy for min_samples_leaf = 11 is 0.8021978021978022
The accuracy for min_samples_leaf = 12 is 0.8021978021978022
The accuracy for min_samples_leaf = 13 is 0.8021978021978022
The accuracy for min_samples_leaf = 14 is 0.8021978021978022
The accuracy for min_samples_leaf = 15 is 0.8021978021978022
The accuracy for min_samples_leaf = 16 is 0.7802197802197802
The accuracy for min_samples_leaf

In [48]:
max_acc = acc_list[0]
for i in range(1,len(acc_list)):
    if acc_list[i] > max_acc:
        max_acc = acc_list[i]

max_index = acc_list.index(max_acc)
print(f"We observed that, Highest accuracy for our model is we get when min_samples_leaf is {max_index + 1}")

We observed that, Highest accuracy for our model is we get when min_samples_leaf is 10


In [49]:
dt6 = DecisionTreeClassifier(min_samples_leaf = 10)
mymodel(dt6)

Accuracy is 0.8021978021978022 

 Classfication Report 
               precision    recall  f1-score   support

           0       0.77      0.80      0.79        41
           1       0.83      0.80      0.82        50

    accuracy                           0.80        91
   macro avg       0.80      0.80      0.80        91
weighted avg       0.80      0.80      0.80        91



In [50]:
dt6.score(xtrain, ytrain)

0.8726415094339622

In [51]:
dt7 = DecisionTreeClassifier(max_depth = 3, min_samples_split = 20,min_samples_leaf = 10)
mymodel(dt7)

Accuracy is 0.8021978021978022 

 Classfication Report 
               precision    recall  f1-score   support

           0       0.79      0.76      0.77        41
           1       0.81      0.84      0.82        50

    accuracy                           0.80        91
   macro avg       0.80      0.80      0.80        91
weighted avg       0.80      0.80      0.80        91



In [52]:
dt7.score(xtrain, ytrain)

0.8632075471698113