In [19]:
### 

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

In [3]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [8]:
relevant_cols = ['RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

mean = df[relevant_cols].mean()
std_dev = df[relevant_cols].std()

z_score = (df[relevant_cols] - mean) / std_dev

df_clean = df[(z_score.abs() < 3).all(axis=1)]


In [13]:
encoded_cols = df_clean.copy()

binary = ['Sex', 'ExerciseAngina']
le = LabelEncoder()
for col in binary:
    encoded_cols[col] = le.fit_transform(encoded_cols[col])

encoded_cols.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [14]:
# dummy vars to encode remaining cols

encoded_cols = pd.get_dummies(encoded_cols, columns=['ChestPainType', 'RestingECG', 'ST_Slope'], drop_first=True)

encoded_cols.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,0,True,False,False,True,False,False,True
1,49,0,160,180,0,156,0,1.0,1,False,True,False,True,False,True,False
2,37,1,130,283,0,98,0,0.0,0,True,False,False,False,True,False,True
3,48,0,138,214,0,108,1,1.5,1,False,False,False,True,False,True,False
4,54,1,150,195,0,122,0,0.0,0,False,True,False,True,False,False,True


In [15]:
# feature scaling

X = encoded_cols.drop('HeartDisease', axis=1)
y = encoded_cols['HeartDisease']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify=y, random_state=10)

In [23]:
# SVM 

svm_cv = cross_val_score(SVC(), X, y, cv=5)
svm_cv.mean()

np.float64(0.6895779019242706)

In [24]:
# Bagged SVM ---> does not show much improvement as bagging is meant for models with high
#                 variance

svm_bagged = BaggingClassifier(
    estimator=SVC(),
    n_estimators=100,
    max_samples=0.8,
    random_state=10,
    oob_score=True
)

score_svm_bag = cross_val_score(svm_bagged, X, y, cv=5)
score_svm_bag.mean()

np.float64(0.6906828057107386)

In [25]:
# Decision Tree

dt = DecisionTreeClassifier()

score_dt = cross_val_score(dt, X, y, cv=5)
score_dt.mean()

np.float64(0.7251396648044693)

In [27]:
# Tree Bagging

bagged_dt = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,
    random_state=10,
    oob_score=True
)

bagged_dt_score = cross_val_score(bagged_dt, X, y, cv=5)
bagged_dt_score.mean()

np.float64(0.801893234016139)

In [28]:
### Random Fores ---> implemented internally as a bagged tree

score_rf = cross_val_score(RandomForestClassifier(), X, y, cv=5)
score_rf.mean()

np.float64(0.8186157666045932)