# Ensemble learning - Bagging

In [1]:
import pandas as pd

# Read the diabetes dataset and preview its first five rows.
df = pd.read_csv('diabetes.csv')
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Number of missing (NaN) entries in each column
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [3]:
# Summary statistics for every column.
# NOTE(review): the minimum is 0 for Glucose, BloodPressure, SkinThickness, Insulin and BMI;
# such zeros look like placeholders for missing measurements rather than real values
# (TODO confirm against the dataset description). Insulin's maximum is also far above
# its upper quartile, so the data is not obviously free of outliers.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [4]:
# Count the rows in each target class to check for class imbalance
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [6]:
# Minority-to-majority class ratio, computed from the data instead of being typed in
# by hand so it stays correct if the dataset changes.
class_counts = df.Outcome.value_counts()
ratio = class_counts.min() / class_counts.max()
ratio  # there is some imbalance (about 1:2) but it is not major. Major -> 10:1 or 100:1

0.536

In [8]:
# Features are every column except the target; the target is the Outcome column.
X = df.drop(columns='Outcome')
y = df['Outcome']

In [9]:
# Standardise the features: they live on different ranges (Glucose runs from 0 to 199,
# DiabetesPedigreeFunction from 0.078 to 2.42), so bring them onto a common scale.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_scaled[:3]

array([[ 0.63994726,  0.84832379,  0.14964075,  0.90726993, -0.69289057,
         0.20401277,  0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575,  0.53090156, -0.69289057,
        -0.68442195, -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, -1.28821221, -0.69289057,
        -1.10325546,  0.60439732, -0.10558415]])

In [10]:
from sklearn.model_selection import train_test_split
# `stratify=y` keeps the class proportions equal in the train and test splits, which
# matters because the classes are imbalanced.
# Split the raw features first and fit the scaler on the training rows only, so that
# no statistics of the test rows leak into the data the model is trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=10
)
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

In [11]:
# (rows, columns) of the training feature matrix
X_train.shape

(614, 8)

In [12]:
# (rows, columns) of the test feature matrix
X_test.shape

(154, 8)

In [13]:
# Class counts inside the training split, to check that stratification kept the balance
y_train.value_counts()

0    400
1    214
Name: Outcome, dtype: int64

In [14]:
# Minority-to-majority ratio measured on the training labels, computed from `y_train`
# rather than hard-coded so it always reflects the actual split.
train_counts = y_train.value_counts()
ratio = train_counts.min() / train_counts.max()
ratio  # same ratio as before the split, so stratification preserved the class balance

0.535

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a single decision tree as the baseline.
# The tree is seeded so the scores are the same on every Restart & Run All.
scores = cross_val_score(DecisionTreeClassifier(random_state=0), X, y, cv=5)
scores

array([0.67532468, 0.64935065, 0.7012987 , 0.79738562, 0.7124183 ])

In [16]:
# Average of the cross-validation fold scores
scores.mean()

0.7123334182157712

In [20]:
from sklearn.ensemble import BaggingClassifier

# Bag 100 decision trees, each fitted on a random sample of 80% of the training rows.
# The estimator is passed positionally: its keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 (the old name was removed in 1.4), and the
# positional form works on both sides of that rename.
bag_model = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,
    oob_score=True,
    random_state=0,
)
bag_model.fit(X_train, y_train)
bag_model.oob_score_  # out-of-bag score: each row is evaluated only by trees that did not train on it

0.760586319218241

In [21]:
# Score of the fitted bagging model on the held-out test split
bag_model.score(X_test,y_test)

0.8051948051948052

In [22]:
# Cross-validate the same bagged-tree configuration on the full dataset.
# The estimator is passed positionally because the `base_estimator` keyword was renamed
# to `estimator` in scikit-learn 1.2 and removed in 1.4; positional works on both.
bag_model = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,
    oob_score=True,
    random_state=0,
)

scores = cross_val_score(bag_model, X, y, cv=5)
scores.mean()

0.7578728461081402

In [23]:
from sklearn.ensemble import RandomForestClassifier

# 5-fold cross-validation of a random forest.
# The forest is seeded so the reported mean is reproducible between runs.
scores = cross_val_score(RandomForestClassifier(random_state=0), X, y, cv=5)
scores.mean()

0.7669977081741788

Download the heart disease dataset `heart.csv` from the Exercise folder and do the following (dataset credit: https://www.kaggle.com/fedesoriano/heart-failure-prediction):

- Load heart disease dataset in pandas dataframe
- Remove outliers using the Z score. The usual guideline is to remove anything with a Z score > 3 or a Z score < -3
- Convert text columns to numbers using label encoding and one hot encoding
- Apply scaling
- Build a classification model using support vector machine. Use standalone model as well as Bagging model and check if you see any difference in the performance.
- Now use decision tree classifier. Use standalone model as well as Bagging and check if you notice any difference in performance
- Compare the performance of the SVM and the decision tree classifier, and figure out where it makes the most sense to use bagging and why. Use the internet to find out under which conditions bagging works best.

In [27]:
# Read the heart-disease dataset and preview its first five rows.
heart = pd.read_csv('heart.csv')
heart.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [29]:
from scipy.stats import zscore
import numpy as np
# Columns that are screened for extreme values
columns_to_remove_outliers = ['RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak', 'FastingBS']
# Standard score of every screened value, computed column by column
z_scores = zscore(heart[columns_to_remove_outliers])
# A row counts as an outlier when any screened column is more than 3 standard
# deviations away from that column's mean
abs_z = np.abs(z_scores)
outliers = (abs_z > 3).any(axis=1)
# Keep only the rows that were not flagged
data = heart.loc[~outliers]
data.shape

(899, 12)

In [30]:
# List the distinct values found in each column
for column, values in data.items():
    print(f"Unique values for {column}:", values.unique())
    print()

Unique values for Age: [40 49 37 48 54 39 45 58 42 38 43 60 36 44 53 52 51 56 41 32 65 35 59 50
 47 31 46 57 55 63 66 34 33 61 29 62 28 30 74 68 72 64 69 67 73 70 77 75
 76 71]

Unique values for Sex: ['M' 'F']

Unique values for ChestPainType: ['ATA' 'NAP' 'ASY' 'TA']

Unique values for RestingBP: [140 160 130 138 150 120 110 136 115 100 124 113 125 145 112 132 170 142
 118 135 180 108 155 128 106  92 122  98 105 133  95  80 137 185 165 126
 152 116 144 154 134 104 139 131 141 178 146 158 123 102  96 143 172 156
 114 127 101 174  94 148 117 129 164]

Unique values for Cholesterol: [289 180 283 214 195 339 237 208 207 284 211 164 204 234 273 196 201 248
 267 223 184 288 215 209 260 468 188 518 167 224 172 186 254 306 250 177
 227 230 294 264 259 175 318 216 340 233 205 245 194 270 213 365 342 253
 277 202 297 225 246 412 265 182 218 268 163 100 206 238 139 263 291 229
 307 210 329 147  85 269 275 179 392 466 129 255 276 282 338 160 156 272
 240 393 161 228 292 388 241 166 247 331 341 2

In [31]:
# One-hot encode the text columns. `drop_first=True` drops one level per feature so the
# indicator columns are not redundant; `dtype=int` keeps the indicators as 0/1 integers
# (pandas >= 2.0 would otherwise return True/False columns).
categorical_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True, dtype=int)
data.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,0,1,0,1,0,1,0,0,0,1


In [32]:
# Features are every column except the target; the target is the HeartDisease column.
X = data.drop(columns='HeartDisease')
y = data.HeartDisease

In [33]:
# Standardise every feature column of the heart dataset
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_scaled

array([[-1.42815446,  0.46590022,  0.84963584, ..., -0.8229452 ,
        -0.99888827,  1.13469459],
       [-0.47585532,  1.63471366, -0.16812204, ..., -0.8229452 ,
         1.00111297, -0.88129441],
       [-1.7455875 , -0.1185065 ,  0.79361247, ..., -0.8229452 ,
        -0.99888827,  1.13469459],
       ...,
       [ 0.3706328 , -0.1185065 , -0.62564622, ...,  1.21514774,
         1.00111297, -0.88129441],
       [ 0.3706328 , -0.1185065 ,  0.35476274, ..., -0.8229452 ,
         1.00111297, -0.88129441],
       [-1.63977649,  0.34901888, -0.21480818, ..., -0.8229452 ,
        -0.99888827,  1.13469459]])

In [38]:
from sklearn.svm import SVC

# 5-fold cross-validation of a standalone support vector classifier.
# NOTE(review): X_scaled was standardised on all rows before the folds are formed, so
# each validation fold has influenced the scaling; wrapping the scaler and the model
# in a Pipeline would give a leak-free estimate.
svm_model = SVC()
scores = cross_val_score(svm_model, X_scaled, y, cv=5)
scores.mean()

0.8319615145872129

In [39]:
# Bag 100 support vector classifiers, each fitted on a random sample of 80% of the rows.
# The estimator is passed positionally because the `base_estimator` keyword was renamed
# to `estimator` in scikit-learn 1.2 and removed in 1.4; positional works on both.
bag_model = BaggingClassifier(
    SVC(),
    n_estimators=100,
    max_samples=0.8,
    random_state=0,
)

scores = cross_val_score(bag_model, X_scaled, y, cv=5)
scores.mean()

0.8319553072625698

Using bagging with an SVM doesn't make much difference in terms of model accuracy. Bagging is effective when we have a high-variance, unstable model such as a decision tree.

In [40]:
# 5-fold cross-validation of a single seeded decision tree as the baseline
tree_model = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(tree_model, X_scaled, y, cv=5)
scores.mean()

0.7262756052141527

In [41]:
# Bag 100 seeded decision trees, each fitted on a random sample of 90% of the rows.
# The estimator is passed positionally because the `base_estimator` keyword was renamed
# to `estimator` in scikit-learn 1.2 and removed in 1.4; positional works on both.
bag_model = BaggingClassifier(
    DecisionTreeClassifier(random_state=0),
    n_estimators=100,
    max_samples=0.9,
    oob_score=True,
    random_state=0,
)

scores = cross_val_score(bag_model, X_scaled, y, cv=5)
scores.mean()

0.798578522656735

In [42]:
from sklearn.ensemble import RandomForestClassifier

# 5-fold cross-validation of a random forest on the heart features.
# The forest is seeded so the reported mean is reproducible between runs.
scores = cross_val_score(RandomForestClassifier(random_state=0), X, y, cv=5)
scores.mean()

0.824171322160149

Under the hood, a random forest uses bagging, sampling not only the data rows but also the columns (features).