In [1]:
#importing modules

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
#Loading the dataframe

df = pd.read_csv('heart.csv')


In [3]:
#df.dtypes

In [4]:
#converting all categorical data to numeric data by creating dummies using one hot encoding method


dummy_variables = pd.get_dummies(df[["sex","dataset","cp","fbs","restecg","exang","slope","thal"]])


In [5]:
#merge dummy variables with the main dataframe
df = pd.concat([df, dummy_variables], axis=1)

In [6]:

#dropping original categorical data
df.drop(["sex","dataset","cp","fbs","restecg","exang","slope","thal"], axis = 1, inplace=True)

In [7]:
df[["trestbps","thalch","oldpeak","ca","chol"]].describe()


Unnamed: 0,trestbps,thalch,oldpeak,ca,chol
count,861.0,865.0,858.0,309.0,890.0
mean,132.132404,137.545665,0.878788,0.676375,199.130337
std,19.06607,25.926276,1.091226,0.935653,110.78081
min,0.0,60.0,-2.6,0.0,0.0
25%,120.0,120.0,0.0,0.0,175.0
50%,130.0,140.0,0.5,0.0,223.0
75%,140.0,157.0,1.5,1.0,268.0
max,200.0,202.0,6.2,3.0,603.0


In [8]:
mean_trestbps = df["trestbps"].mean()
df["trestbps"].replace(np.nan, mean_trestbps, inplace = True)

mean_thalch = df["thalch"].mean()
df["thalch"].replace(np.nan, mean_thalch, inplace = True)


median_oldpeak = df["oldpeak"].median()
df["oldpeak"].replace(np.nan, median_oldpeak, inplace = True)

median_chol = df["chol"].median()
df["chol"].replace(np.nan, median_chol, inplace = True)

df["ca"].replace(np.nan,0.0, inplace = True)

In [9]:
#df.dropna(subset = ["trestbps","thalch","oldpeak","chol","ca"], axis = 0, inplace = True)

In [10]:
#Splitting data into training and testing set

heart_training_set, heart_test_set = train_test_split(df, test_size=0.20,random_state=42)

heart_training_data, heart_training_target = heart_training_set[['id', 'age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca',
       'sex_Female', 'sex_Male', 'dataset_Cleveland', 'dataset_Hungary',
       'dataset_Switzerland', 'dataset_VA Long Beach', 'cp_asymptomatic',
       'cp_atypical angina', 'cp_non-anginal', 'cp_typical angina',
       'fbs_False', 'fbs_True', 'restecg_lv hypertrophy', 'restecg_normal',
       'restecg_st-t abnormality', 'exang_False', 'exang_True',
       'slope_downsloping', 'slope_flat', 'slope_upsloping',
       'thal_fixed defect', 'thal_normal', 'thal_reversable defect']], heart_training_set["num"]

heart_test_data, heart_test_target = heart_test_set[['id', 'age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca',
       'sex_Female', 'sex_Male', 'dataset_Cleveland', 'dataset_Hungary',
       'dataset_Switzerland', 'dataset_VA Long Beach', 'cp_asymptomatic',
       'cp_atypical angina', 'cp_non-anginal', 'cp_typical angina',
       'fbs_False', 'fbs_True', 'restecg_lv hypertrophy', 'restecg_normal',
       'restecg_st-t abnormality', 'exang_False', 'exang_True',
       'slope_downsloping', 'slope_flat', 'slope_upsloping',
       'thal_fixed defect', 'thal_normal', 'thal_reversable defect']], heart_test_set["num"]


In [11]:
#df.columns[df.isna().any()].tolist()


In [12]:
clf = RandomForestClassifier(n_estimators=150)
clf.fit(heart_training_data, heart_training_target)
heart_test_target_predict=clf.predict(heart_test_data)

In [13]:
matrix = confusion_matrix(heart_test_target,heart_test_target_predict)
print(matrix)

[[70  3  2  0  0]
 [14 32  5  3  0]
 [ 6 10  5  4  0]
 [ 6 12  1  6  1]
 [ 1  0  1  1  1]]


In [14]:
acc = accuracy_score(heart_test_target,heart_test_target_predict)
print('The accuracy is', acc)

The accuracy is 0.6195652173913043


In [15]:
print(classification_report(heart_test_target, heart_test_target_predict))

              precision    recall  f1-score   support

           0       0.72      0.93      0.81        75
           1       0.56      0.59      0.58        54
           2       0.36      0.20      0.26        25
           3       0.43      0.23      0.30        26
           4       0.50      0.25      0.33         4

    accuracy                           0.62       184
   macro avg       0.51      0.44      0.46       184
weighted avg       0.58      0.62      0.59       184

