In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

### Data Preprocessing

In [2]:
# Load the heart-disease dataset; the path is resolved relative to the
# notebook's working directory.
data_path = "heart.csv"
df = pd.read_csv(data_path)

In [3]:
# Preview the first rows of the frame exactly as read from the CSV
# (categorical columns still hold their string labels at this point).
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
def _encode_binary(column, mapping):
    """Map a two-valued categorical column to integers, failing loudly on surprises.

    ``Series.map`` with a dict turns every value that is not a key into NaN
    without complaint. That happens for an unexpected category label and also
    when this cell is re-run after ``df`` has already been encoded (the column
    then holds 0/1 instead of the original letters).

    Parameters
    ----------
    column : pandas.Series
        Column holding the raw category labels.
    mapping : dict
        Raw label -> integer code.

    Returns
    -------
    pandas.Series
        The encoded column, with the same name and index as ``column``.

    Raises
    ------
    ValueError
        If any value in ``column`` is not a key of ``mapping``.
    """
    encoded = column.map(mapping)
    unmapped = column[encoded.isna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {column.name!r} has values outside {sorted(mapping)}: "
            f"{list(unmapped)}. If this cell was already run, reload the data first."
        )
    return encoded


# Two-level columns become a single 0/1 column that keeps the original name.
sex_dummies = _encode_binary(df['Sex'], {'M': 1, 'F': 0})
exercise_angina_dummies = _encode_binary(df['ExerciseAngina'], {'Y': 1, 'N': 0})

# Multi-level columns become one indicator column per level; each new column
# is named after the level itself (e.g. 'Normal', 'Flat', 'ASY').
resting_ecg_dummies = pd.get_dummies(df['RestingECG'])
st_slope_dummies = pd.get_dummies(df['ST_Slope'])
chest_pain_type_dummies = pd.get_dummies(df['ChestPainType'])

# Replace the original categorical columns with their encoded versions.
# Note that this rebinds `df`, so the raw frame is only recoverable by
# re-running the cell that reads the CSV.
df = pd.concat(
    [
        df.drop(
            columns=['Sex', 'ExerciseAngina', 'RestingECG', 'ST_Slope', 'ChestPainType']
        ),
        sex_dummies,
        exercise_angina_dummies,
        resting_ecg_dummies,
        st_slope_dummies,
        chest_pain_type_dummies,
    ],
    axis=1,
)


In [5]:
# Preview the frame again now that the categorical columns have been
# replaced by their encoded counterparts.
df.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex,ExerciseAngina,LVH,Normal,ST,Down,Flat,Up,ASY,ATA,NAP,TA
0,40,140,289,0,172,0.0,0,1,0,False,True,False,False,False,True,False,True,False,False
1,49,160,180,0,156,1.0,1,0,0,False,True,False,False,True,False,False,False,True,False
2,37,130,283,0,98,0.0,0,1,0,False,False,True,False,False,True,False,True,False,False
3,48,138,214,0,108,1.5,1,0,1,False,True,False,False,True,False,True,False,False,False
4,54,150,195,0,122,0.0,0,1,0,False,True,False,False,False,True,False,False,True,False


In [6]:
# Print per-column dtype and non-null count for the encoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   RestingBP       918 non-null    int64  
 2   Cholesterol     918 non-null    int64  
 3   FastingBS       918 non-null    int64  
 4   MaxHR           918 non-null    int64  
 5   Oldpeak         918 non-null    float64
 6   HeartDisease    918 non-null    int64  
 7   Sex             918 non-null    int64  
 8   ExerciseAngina  918 non-null    int64  
 9   LVH             918 non-null    bool   
 10  Normal          918 non-null    bool   
 11  ST              918 non-null    bool   
 12  Down            918 non-null    bool   
 13  Flat            918 non-null    bool   
 14  Up              918 non-null    bool   
 15  ASY             918 non-null    bool   
 16  ATA             918 non-null    bool   
 17  NAP             918 non-null    bool   
 18  TA              918 non-null    bool   

In [7]:
# Summary statistics. With default arguments describe() reports numeric
# columns only, so the boolean indicator columns do not appear in the table.
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex,ExerciseAngina
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377,0.78976,0.404139
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414,0.407701,0.490992
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0,0.0,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0,1.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0,1.0,0.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0,1.0,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0,1.0,1.0


In [8]:
# Count missing entries per column. This only detects NaN: the describe()
# summary reports a minimum of 0 for RestingBP and Cholesterol, which may be
# placeholder values rather than real measurements -- TODO confirm.
df.isna().sum()

Age               0
RestingBP         0
Cholesterol       0
FastingBS         0
MaxHR             0
Oldpeak           0
HeartDisease      0
Sex               0
ExerciseAngina    0
LVH               0
Normal            0
ST                0
Down              0
Flat              0
Up                0
ASY               0
ATA               0
NAP               0
TA                0
dtype: int64

### Train/Test Split

In [9]:
# Separate the label column from the predictors.
target_column = 'HeartDisease'
y = df[target_column]
x = df.drop(columns=[target_column])

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical on every run.
split_options = {"test_size": 0.2, "random_state": 42}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

### Model Training

In [11]:
# Fit a random forest on the training split.
# The forest draws bootstrap samples and candidate features at random, so
# without a seed the fitted model (and every metric reported below) changes on
# each run. Seed it with the same value used for the train/test split so that
# Restart & Run All reproduces the results.
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

In [12]:
# Predict a class label for every row of the held-out test split.
y_pred = model.predict(x_test)

In [13]:
# Display the raw predictions (one 0/1 label per test row).
y_pred

array([0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 1], dtype=int64)

### Evaluating the Model

In [14]:
# Per-class precision, recall and F1 on the held-out split. The report is a
# pre-formatted multi-line string, so it is printed rather than displayed.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.85      0.88      0.87        77
           1       0.91      0.89      0.90       107

    accuracy                           0.89       184
   macro avg       0.88      0.89      0.88       184
weighted avg       0.89      0.89      0.89       184

