In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

This notebook predicts whether an individual is at risk of heart disease. The data comes from the Kaggle Heart Failure Prediction Dataset, which contains 11 features and one target column (HeartDisease):

Age: age of the patient [years]
Sex: sex of the patient [M: Male, F: Female]
ChestPainType: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
RestingBP: resting blood pressure [mm Hg]
Cholesterol: serum cholesterol [mg/dl]
FastingBS: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
RestingECG: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
MaxHR: maximum heart rate achieved [Numeric value between 60 and 202]
ExerciseAngina: exercise-induced angina [Y: Yes, N: No]
Oldpeak: ST depression induced by exercise relative to rest [numeric value]
ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
HeartDisease: output class [1: heart disease, 0: Normal]




In [3]:
# Location of the Kaggle Heart Failure Prediction CSV.
# NOTE(review): this is an absolute, machine-specific path -- point it at your
# own copy of heart.csv before running the notebook elsewhere.
HEART_CSV_PATH = "C:/Users/Fnu Aymen/Documents/Machine Learning Data/decisiontree/heart.csv"

# Read the raw dataset and preview the first rows.
df = pd.read_csv(HEART_CSV_PATH)
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


The Sex, ChestPainType, RestingECG, ExerciseAngina, and ST_Slope variables are categorical, so we have to one-hot encode them.

In [4]:
# Names of the categorical columns that need one-hot encoding.
cat_variables = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Swap each categorical column for its indicator columns, named
# "<column>_<category>"; every column not listed in `columns` is kept unchanged.
# NOTE: this rebinds `df`, so re-running the cell without reloading the data
# fails because the original categorical columns are already gone.
df = pd.get_dummies(df, columns=cat_variables, prefix=cat_variables)

In [5]:
# Preview the first five rows of the frame after one-hot encoding.
df.head(n=5)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,False,True,False,...,False,False,False,True,False,True,False,False,False,True
1,49,160,180,0,156,1.0,1,True,False,False,...,True,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,0,False,True,False,...,False,False,False,False,True,True,False,False,False,True
3,48,138,214,0,108,1.5,1,True,False,True,...,False,False,False,True,False,False,True,False,True,False
4,54,150,195,0,122,0.0,0,False,True,False,...,True,False,False,True,False,True,False,False,False,True


In [6]:
# Feature list: every column of df except the target column 'HeartDisease'.
# Compare names with `!=` rather than `not in 'HeartDisease'`: `in` against a
# string is a substring test, so it would also silently drop any column whose
# name happens to be a substring of "HeartDisease" (e.g. "Heart" or "Disease").
features = [col for col in df.columns if col != 'HeartDisease']

In [7]:
# Split the data into training, cross-validation and test sets (60/20/20):
# first hold out 40% of the rows, then halve that hold-out into the
# cross-validation set and the test set. Both splits use random_state=1 so
# they are reproducible.
X_train, X_cvtest, y_train, y_cvtest = train_test_split(
    df[features],
    df['HeartDisease'],
    test_size=0.40,
    random_state=1,
)
X_cv, X_test, y_cv, y_test = train_test_split(
    X_cvtest,
    y_cvtest,
    test_size=0.50,
    random_state=1,
)

In [11]:
# Gradient-boosted classifier: at most 500 boosting rounds at learning rate 0.1,
# stopping early once the evaluation metric has not improved for 10
# consecutive rounds.
xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.1,
    verbosity=1,
    random_state=1,
    early_stopping_rounds=10,
)

# The cross-validation split is the evaluation set monitored for early stopping;
# the test split is left untouched here.
xgb_model.fit(X_train, y_train, eval_set=[(X_cv, y_cv)])

# Boosting round that achieved the best score on the evaluation set.
xgb_model.best_iteration

[0]	validation_0-logloss:0.63635
[1]	validation_0-logloss:0.59795
[2]	validation_0-logloss:0.56533
[3]	validation_0-logloss:0.53928
[4]	validation_0-logloss:0.51747
[5]	validation_0-logloss:0.50362
[6]	validation_0-logloss:0.48984
[7]	validation_0-logloss:0.47889
[8]	validation_0-logloss:0.47096
[9]	validation_0-logloss:0.46275
[10]	validation_0-logloss:0.45728
[11]	validation_0-logloss:0.44875
[12]	validation_0-logloss:0.44244
[13]	validation_0-logloss:0.43795
[14]	validation_0-logloss:0.43342
[15]	validation_0-logloss:0.43065
[16]	validation_0-logloss:0.42834
[17]	validation_0-logloss:0.42697
[18]	validation_0-logloss:0.42358
[19]	validation_0-logloss:0.42249
[20]	validation_0-logloss:0.41945
[21]	validation_0-logloss:0.41914
[22]	validation_0-logloss:0.41837
[23]	validation_0-logloss:0.41958
[24]	validation_0-logloss:0.42120
[25]	validation_0-logloss:0.42300
[26]	validation_0-logloss:0.42546
[27]	validation_0-logloss:0.42707
[28]	validation_0-logloss:0.42820
[29]	validation_0-loglos

22