In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [3]:
# Load the heart-failure dataset from the CSV file located next to the notebook.
csv_path = "heartfailure.csv"
df = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [6]:
# Encode the categorical columns as numeric features.
# NOTE: this cell rebinds `df` and drops the original categorical columns, so it
# must run exactly once after the load cell; re-run the load cell before
# re-running this one.

# Two-valued columns become a single 0/1 column each.
sex_dummies = df['Sex'].map({'M': 1, 'F': 0})
exercise_angina_dummies = df['ExerciseAngina'].map({'Y': 1, 'N': 0})

# `.map` silently yields NaN for any value missing from the mapping; fail loudly instead.
assert sex_dummies.notna().all(), "Sex: expected only 'M'/'F'"
assert exercise_angina_dummies.notna().all(), "ExerciseAngina: expected only 'Y'/'N'"

# Multi-valued columns are one-hot encoded (one column per category value).
# dtype is pinned to uint8: pandas >= 2.0 defaults to bool, which `df.describe()` omits.
resting_ecg_dummies = pd.get_dummies(df['RestingECG'], dtype='uint8')
st_slope_dummies = pd.get_dummies(df['ST_Slope'], dtype='uint8')
chest_pain_type_dummies = pd.get_dummies(df['ChestPainType'], dtype='uint8')

# Replace the original categorical columns with their encoded counterparts.
df = pd.concat(
    [
        df.drop(columns=['Sex', 'ExerciseAngina', 'RestingECG', 'ST_Slope', 'ChestPainType']),
        sex_dummies,
        exercise_angina_dummies,
        resting_ecg_dummies,
        st_slope_dummies,
        chest_pain_type_dummies,
    ],
    axis=1,
)

In [7]:
# Preview the first five rows of the frame as it stands at this point in the notebook.
df.head(n=5)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex,ExerciseAngina,LVH,Normal,ST,Down,Flat,Up,ASY,ATA,NAP,TA
0,40,140,289,0,172,0.0,0,1,0,0,1,0,0,0,1,0,1,0,0
1,49,160,180,0,156,1.0,1,0,0,0,1,0,0,1,0,0,0,1,0
2,37,130,283,0,98,0.0,0,1,0,0,0,1,0,0,1,0,1,0,0
3,48,138,214,0,108,1.5,1,0,1,0,1,0,0,1,0,1,0,0,0
4,54,150,195,0,122,0.0,0,1,0,0,1,0,0,0,1,0,0,1,0


In [8]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   RestingBP       918 non-null    int64  
 2   Cholesterol     918 non-null    int64  
 3   FastingBS       918 non-null    int64  
 4   MaxHR           918 non-null    int64  
 5   Oldpeak         918 non-null    float64
 6   HeartDisease    918 non-null    int64  
 7   Sex             918 non-null    int64  
 8   ExerciseAngina  918 non-null    int64  
 9   LVH             918 non-null    uint8  
 10  Normal          918 non-null    uint8  
 11  ST              918 non-null    uint8  
 12  Down            918 non-null    uint8  
 13  Flat            918 non-null    uint8  
 14  Up              918 non-null    uint8  
 15  ASY             918 non-null    uint8  
 16  ATA             918 non-null    uint8  
 17  NAP             918 non-null    uin

In [9]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the columns.
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex,ExerciseAngina,LVH,Normal,ST,Down,Flat,Up,ASY,ATA,NAP,TA
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377,0.78976,0.404139,0.204793,0.601307,0.1939,0.068627,0.501089,0.430283,0.540305,0.188453,0.221133,0.050109
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414,0.407701,0.490992,0.40377,0.489896,0.395567,0.252957,0.500271,0.495386,0.498645,0.391287,0.415236,0.218289
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Count missing (NaN) entries per column.
# NOTE(review): the describe() output reports a minimum of 0 for RestingBP and
# Cholesterol; those zeros may be placeholders for missing measurements and
# would not be counted here - confirm against the data source.
df.isna().sum()

Age               0
RestingBP         0
Cholesterol       0
FastingBS         0
MaxHR             0
Oldpeak           0
HeartDisease      0
Sex               0
ExerciseAngina    0
LVH               0
Normal            0
ST                0
Down              0
Flat              0
Up                0
ASY               0
ATA               0
NAP               0
TA                0
dtype: int64

TRAIN/TEST SPLIT AND MODEL TRAINING


In [11]:
# Separate the target column (y) from the feature matrix (x).
y = df['HeartDisease']
x = df.drop(columns=['HeartDisease'])

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=40,
)

In [16]:
# Fit a random forest on the training split.
# The seed is fixed (same value as the train/test split) so that the fitted
# model, and therefore the predictions and metrics below, are reproducible
# under Restart & Run All. Without it every run yields slightly different scores.
model = RandomForestClassifier(random_state=40)
model.fit(x_train, y_train)

In [17]:
# Predict the class label for every row of the held-out test features,
# then display the resulting array as the cell output.
y_pred = model.predict(X=x_test)
y_pred

array([1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0])

In [18]:
# Per-class precision / recall / F1 on the held-out test set.
# `classification_report` returns a preformatted string, so it is printed
# rather than left as the cell's last expression.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
    )
)

              precision    recall  f1-score   support

           0       0.81      0.81      0.81        75
           1       0.87      0.87      0.87       109

    accuracy                           0.85       184
   macro avg       0.84      0.84      0.84       184
weighted avg       0.85      0.85      0.85       184

