In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/heart-failure-prediction/heart.csv


# Data importing
Let's see what kind of data we are going to analyze

In [2]:
data = pd.read_csv("/kaggle/input/heart-failure-prediction/heart.csv")
data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [3]:
data.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [4]:
data.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

# Preprocessing 

In [5]:
target = pd.DataFrame()
target = data.HeartDisease
data = data.drop("HeartDisease",axis=1)

In [6]:
data.ChestPainType.unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [7]:
data.RestingECG.unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [8]:
data.ExerciseAngina.unique()

array(['N', 'Y'], dtype=object)

In [9]:
data.ST_Slope.unique()

array(['Up', 'Flat', 'Down'], dtype=object)

*OK,it looks like we need to convert the category data into dummy variables; using "get_dummies" is suitable.

In [10]:
X_0 = pd.get_dummies(data.Sex,prefix="Sex")
X_1 = pd.get_dummies(data.ChestPainType,prefix="CPT")
X_2 = pd.get_dummies(data.RestingECG,prefix="RECG")
X_3 = pd.get_dummies(data.ExerciseAngina,prefix="EA")
X_4 = pd.get_dummies(data.ST_Slope,prefix="ST_S")

data = data.join([X_0,X_1,X_2,X_3,X_4])

Delete any unnecessary category data that has already been converted.

In [11]:
data = data.drop(["Sex","ChestPainType","RestingECG","ExerciseAngina","ST_Slope"],axis=1)

It is well known that cholesterol levels can be divided into "dangerous", "border" and "normal" ranges. Therefore, we will create a new column for categorical data according to the numerical value.

In [12]:
data["normal_chol"] = 0
data["border_chol"] = 0
data["dangerous_chol"] = 0

for i in range(len(data.index)):
    if (data.Cholesterol.iloc[i] <= 239) and (data.Cholesterol.iloc[i] >=200):
        data["border_chol"].iloc[i] = 1
    elif (data.Cholesterol.iloc[i] < 200):
        data["normal_chol"].iloc[i] =1
    else:
        data["dangerous_chol"].iloc[i] = 1        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Ok, let's look at the data at this point.

In [13]:
data

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,CPT_ASY,CPT_ATA,...,RECG_Normal,RECG_ST,EA_N,EA_Y,ST_S_Down,ST_S_Flat,ST_S_Up,normal_chol,border_chol,dangerous_chol
0,40,140,289,0,172,0.0,0,1,0,1,...,1,0,1,0,0,0,1,0,0,1
1,49,160,180,0,156,1.0,1,0,0,0,...,1,0,1,0,0,1,0,1,0,0
2,37,130,283,0,98,0.0,0,1,0,1,...,0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,1,0,...,1,0,0,1,0,1,0,0,1,0
4,54,150,195,0,122,0.0,0,1,0,0,...,1,0,1,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,0,1,0,0,...,1,0,1,0,0,1,0,0,0,1
914,68,144,193,1,141,3.4,0,1,1,0,...,1,0,1,0,0,1,0,1,0,0
915,57,130,131,0,115,1.2,0,1,1,0,...,1,0,0,1,0,1,0,1,0,0
916,57,130,236,0,174,0.0,1,0,0,1,...,0,0,1,0,0,1,0,0,1,0


# Training and Predicting

In this time, I would like to use "Voting Clasifier" by using "Random Forest Regression","Extra Trees Regression" ,"Liner SVC" and "Neural Network" to try to predict heart failure.I hope it works :D

First of all, we will split the data into training and test data.

In [14]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(data,target,random_state=42,test_size=0.3)

Let's import the libraries we need.

In [15]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

rnd_clf = RandomForestClassifier(n_estimators=100,random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100,random_state=42)
svm_clf = SVC(max_iter=100,tol=20,random_state=42)
mlp_clf = MLPClassifier(random_state=42)

In [16]:
estimators = [rnd_clf,extra_trees_clf,svm_clf,mlp_clf]

for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train,y_train)

Training the RandomForestClassifier(random_state=42)
Training the ExtraTreesClassifier(random_state=42)
Training the SVC(max_iter=100, random_state=42, tol=20)
Training the MLPClassifier(random_state=42)


estimator_results = [estimator.score(X_test,y_test) for estimator in estimators]
print(estimator_results)

# Voting method

Prediction using Random Forest seems to be the best. On the other hand, SVM looks pretty poor.

Let's take a look at the results of letting them vote.

In [17]:
from sklearn.ensemble import VotingClassifier

named_estimators = [
    ("random_forest_clf",rnd_clf),
    ("Extra_trees_clf",extra_trees_clf),
    ("SVM",svm_clf),
    ("MLP",mlp_clf)
]

voting_clf = VotingClassifier(named_estimators)

In [18]:
voting_clf.fit(X_train,y_train)

VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(random_state=42)),
                             ('Extra_trees_clf',
                              ExtraTreesClassifier(random_state=42)),
                             ('SVM',
                              SVC(max_iter=100, random_state=42, tol=20)),
                             ('MLP', MLPClassifier(random_state=42))])

In [19]:
voting_clf.score(X_test,y_test)

0.894927536231884

# Sumarry
We checked basic data preprocessing methods, the implementation of well-known techniques, and the improvement of prediction accuracy through voting. Thank you.