1. Load heart disease dataset in pandas dataframe
2. Remove outliers using Z score. Usual guideline is to remove anything that has Z score > 3 formula or Z score < -3
3. Convert text columns to numbers using label encoding and one hot encoding
4. Apply scaling
5. Build a classification model using support vector machine. Use stand alone model as well as Bagging model and check if you see any difference in the performance.
6. Now use decision tree classifier. Use standalone model as well as Bagging and check if you notice any difference in performance
7. Comparing performance of svm and decision tree classifier figure out where it makes most sense to use bagging and why. Use internet to figure out in what conditions bagging works the best.

In [41]:
import numpy as np
import pandas as pd

In [42]:
df = pd.read_csv("heart_disease.csv")
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [43]:
# Checking if there are Null values in data
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [44]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [45]:
df.shape

(918, 12)

In [46]:
# Removing the Outliers
col_for_outliers = ['RestingBP','Cholesterol','FastingBS','MaxHR','Oldpeak']

for col in col_for_outliers:
    df=df[df[col]<=df[col].mean()+3*df[col].std()]
    df=df[df[col]>=df[col].mean()-3*df[col].std()]
    print(df.shape)

(910, 12)
(907, 12)
(907, 12)
(906, 12)
(899, 12)


In [47]:
df.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [48]:
df.ChestPainType.unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [49]:
df.RestingECG.unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [50]:
df.ST_Slope.unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [51]:
df.Sex.replace(
    {'F':0,'M':1},
    inplace=True
)
df.RestingECG.replace(
    {'Normal':1, 'ST':2,'LVH':3},
    inplace =True
)
df.ST_Slope.replace(
    {'Down':1, 'Flat':2,'Up':3},
    inplace=True
)
df.ExerciseAngina.replace(
    {'N':0, 'Y':1},
    inplace=True
)
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,1,172,0,0.0,3,0
1,49,0,NAP,160,180,0,1,156,0,1.0,2,1
2,37,1,ATA,130,283,0,2,98,0,0.0,3,0
3,48,0,ASY,138,214,0,1,108,1,1.5,2,1
4,54,1,NAP,150,195,0,1,122,0,0.0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,TA,110,264,0,1,132,0,1.2,2,1
914,68,1,ASY,144,193,1,1,141,0,3.4,2,1
915,57,1,ASY,130,131,0,1,115,1,1.2,2,1
916,57,0,ATA,130,236,0,3,174,0,0.0,2,1


In [53]:
df=pd.get_dummies(df, drop_first=True)
df

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,1,140,289,0,1,172,0,0.0,3,0,1,0,0
1,49,0,160,180,0,1,156,0,1.0,2,1,0,1,0
2,37,1,130,283,0,2,98,0,0.0,3,0,1,0,0
3,48,0,138,214,0,1,108,1,1.5,2,1,0,0,0
4,54,1,150,195,0,1,122,0,0.0,3,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,1,132,0,1.2,2,1,0,0,1
914,68,1,144,193,1,1,141,0,3.4,2,1,0,0,0
915,57,1,130,131,0,1,115,1,1.2,2,1,0,0,0
916,57,0,130,236,0,3,174,0,0.0,2,1,1,0,0


In [55]:
y=df.HeartDisease.values
X=df.drop('HeartDisease', axis=1)

In [58]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_x=scaler.fit_transform(X)
scaled_x

array([[-1.42815446,  0.515943  ,  0.46590022, ...,  2.06332497,
        -0.5349047 , -0.22955001],
       [-0.47585532, -1.93819859,  1.63471366, ..., -0.48465463,
         1.86949191, -0.22955001],
       [-1.7455875 ,  0.515943  , -0.1185065 , ...,  2.06332497,
        -0.5349047 , -0.22955001],
       ...,
       [ 0.3706328 ,  0.515943  , -0.1185065 , ..., -0.48465463,
        -0.5349047 , -0.22955001],
       [ 0.3706328 , -1.93819859, -0.1185065 , ...,  2.06332497,
        -0.5349047 , -0.22955001],
       [-1.63977649,  0.515943  ,  0.34901888, ..., -0.48465463,
         1.86949191, -0.22955001]])

In [59]:
from sklearn.model_selection import train_test_split 
x_train, x_test, y_train, y_test = train_test_split(scaled_x, y, test_size=0.2, stratify=y, random_state=42)

Building a SVM model

In [64]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(x_train,y_train)
svm.score(x_test, y_test)*100

89.44444444444444

In [62]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(SVC(), scaled_x,y, cv=5)*100
score

array([84.44444444, 83.88888889, 82.22222222, 80.55555556, 76.53631285])

In [63]:
score.mean()

81.52948479205462

In [67]:
from sklearn.ensemble import BaggingClassifier
bag_model_svm=BaggingClassifier(base_estimator=SVC(), n_estimators=100, max_samples=0.8, oob_score=True, random_state=42)
bag_model_svm.fit(x_train,y_train)
bag_model_svm.oob_score_*100

85.67454798331016

In [69]:
bag_model_svm.score(x_test,y_test)*100

88.88888888888889

In [72]:
score=cross_val_score(bag_model_svm, scaled_x,y, cv=5)*100
score

array([83.33333333, 83.88888889, 83.33333333, 81.11111111, 77.09497207])

In [73]:
score.mean()

81.75232774674116

Building Decision Tree model

In [75]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
dt.score(x_test,y_test)*100

83.88888888888889

In [76]:
score = cross_val_score(DecisionTreeClassifier(), scaled_x, y, cv=5)*100
score

array([75.        , 75.        , 76.66666667, 66.66666667, 64.80446927])

In [77]:
score.mean()

71.62756052141528

In [78]:
bag_model_dt=BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, max_samples=0.8, oob_score=True, random_state=42)
bag_model_dt.fit(x_train, y_train)
bag_model_dt.oob_score_*100

82.89290681502086

In [79]:
bag_model_dt.score(x_test,y_test)*100

85.55555555555556

In [80]:
score = cross_val_score(bag_model_dt, scaled_x, y, cv=5)*100
score

array([84.44444444, 82.22222222, 81.11111111, 78.88888889, 72.06703911])

In [81]:
score.mean()

79.74674115456237

In [83]:
from sklearn.ensemble import RandomForestClassifier
scores = cross_val_score(RandomForestClassifier(), scaled_x, y, cv=5)
scores.mean()*100

81.6374922408442

Due to bagging there is increase in the accuracy of the decision tree model, but there is decrease in the accuracy of svm model slightly. </br>
As a technique, bagging works particularly well with algorithms that are less stable. One that are more stable or subject to high amounts of bias do not provide as much benefit as there’s less variation within the dataset of the model