In [1]:
'''
Ensemble Learning: Bagging Tutorial
We will use the Indian diabetes dataset to predict
whether a person has diabetes based on features such as blood pressure,
skin thickness, age, etc. We will train a standalone model first and then use the bagging
ensemble technique to check how it can improve the performance of the model.
'''

'\nEnsemble Learning: Bagging Tutorial\nWe will use the Indian diabetes dataset to predict\nwhether a person has diabetes based on features such as blood pressure,\nskin thickness, age, etc. We will train a standalone model first and then use the bagging\nensemble technique to check how it can improve the performance of the model.\n'

In [2]:
import pandas as pd

# Read the diabetes records from the CSV file located next to this notebook.
df = pd.read_csv(filepath_or_buffer="diabetes.csv")
# Preview the first five rows to sanity-check the columns.
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Count missing (NaN/None) entries per column; a column of zeros means nothing is null.
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
# NOTE(review): the min row is 0.0 for Glucose, BloodPressure, SkinThickness, Insulin and BMI
# even though the null check found no missing values - these zeros presumably stand in for
# missing measurements; confirm before relying on those columns.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
# Display the frame; the rendered output is truncated (first and last rows with an elided middle).
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [6]:
# Frequency of each class label in the Outcome column.
# Here 0 appears 500 times and 1 appears 268 times, so the classes are imbalanced.
df["Outcome"].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [7]:

# Feature matrix: every column except the label.
X = df.drop(columns="Outcome")
# Target vector: the binary Outcome column.
y = df["Outcome"]

In [8]:
# Standardise each feature column as a precaution (StandardScaler defaults: zero mean, unit variance).
from sklearn.preprocessing import StandardScaler

# Learn the per-column statistics from the full feature matrix, then apply them to it.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
# Peek at the first three standardised rows.
X_scaled[:3]

array([[ 0.63994726,  0.84832379,  0.14964075,  0.90726993, -0.69289057,
         0.20401277,  0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575,  0.53090156, -0.69289057,
        -0.68442195, -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, -1.28821221, -0.69289057,
        -1.10325546,  0.60439732, -0.10558415]])

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the RAW features first so that the scaler fitted below never sees a test row.
# stratify=y keeps the proportion of 0s and 1s (the targets) practically the same in the
# train and test splits. It decides which rows go to which split, so it shapes
# X_train/X_test as well as y_train/y_test.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=10
)

# Fit the standardisation on the training rows only and reuse those statistics for the
# test rows. Fitting the scaler on all of X would leak test-set means/variances into the
# training features.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

'''
stratify=y tells train_test_split to split the data so that 
the train and test sets have the same class proportions as the original dataset.

eg:
if your dataset is imbalanced (e.g., many 0s, few 1s or vice versa) in class(target), 
so there will be a mismatch between the proporsions of 0s and 1s between train data and test data.
hence a  normal split might give Biased model or Unreliable test accuracy
'''

'\nstratify=y tells train_test_split to split the data so that \nthe train and test sets have the same class proportions as the original dataset.\n\neg:\nif your dataset is imbalanced (e.g., many 0s, few 1s or vice versa) in class(target), \nso there will be a mismatch between the proporsions of 0s and 1s between train data and test data.\nhence a  normal split might give Biased model or Unreliable test accuracy\n'

In [10]:

# (rows, features) of the training split - 80% of the rows, given test_size=0.2.
X_train.shape

(614, 8)

In [11]:

# (rows, features) of the test split - the remaining 20% of the rows.
X_test.shape

(154, 8)

In [12]:
# Class balance of the training labels; because the split was stratified on y it should
# mirror the full dataset (500 vs 268, i.e. roughly 65% / 35%).
y_train.value_counts()

Outcome
0    400
1    214
Name: count, dtype: int64

In [13]:

# Class balance of the test labels; expected to show the same roughly 65% / 35% ratio
# as the full dataset because the split was stratified on y.
y_test.value_counts()

Outcome
0    100
1     54
Name: count, dtype: int64

In [14]:
# Baseline: a single decision tree scored with k-fold cross-validation.
# cv=5 -> five train/validate rounds, producing one accuracy value per fold.
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# random_state pins the tree's internal randomness (feature order / tie-breaking at splits),
# so the fold scores are the same every time the cell is re-run.
scores = cross_val_score(DecisionTreeClassifier(random_state=0), X, y, cv=5)
scores

array([0.67532468, 0.65584416, 0.67532468, 0.78431373, 0.7124183 ])

In [None]:
# Accuracy estimate for the standalone model:
# average the per-fold accuracies returned by the k-fold cross-validation.
scores.mean()

0.7006451065274595

In [None]:
# Ensemble of decision trees built with BaggingClassifier, evaluated via the out-of-bag score.

'''
what is oob_score
here each tree is built using a random sample of the data, so some datapoints are naturally left out 
for that tree—these are the out-of-bag (OOB) datapoints. For every data point, the model gathers predictions 
from all the trees that did not see that datapoint during training we take all their predictions.
It then compares this combined prediction with the true label(y value ,ie,truth). After doing this for all data points, 
the model calculates the overall accuracy of these OOB predictions, and that value becomes bag_model.oob_score_.
by saying oob = True, we are activating this feature
'''

from sklearn.ensemble import BaggingClassifier

bag_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,  # ensemble size: 100 trees
    max_samples=0.8,  # each tree's bootstrap sample draws 80% as many rows as the training set
    oob_score=True,  # also score every row using only the trees that did not sample it
    random_state=0,  # fixed seed so the sampling is repeatable
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

# Out-of-bag accuracy estimate (about 0.76, i.e. 76%, in the recorded output).
bag_model.oob_score_

0.760586319218241

In [None]:
# Accuracy on the held-out test split (rows that were never used for fitting).
# In the recorded run this is higher (about 0.81) than the OOB estimate (about 0.76).
# NOTE(review): presumably because each OOB prediction uses only the trees that did not
# sample that row, while test predictions use all 100 trees, and the test split is small
# (154 rows) - confirm before drawing conclusions from the gap.
bag_model.score(X_test, y_test)

0.8051948051948052

In [None]:
# Doing Ensemble using Bagging Classifier with cross validation.
#
# A separate name is used so that the fitted `bag_model` from the hold-out experiment is
# not replaced by an unfitted estimator (cross_val_score fits clones, never the object
# passed in), which keeps the earlier scoring cell re-runnable.
# oob_score is left off here: cross_val_score only reports per-fold accuracy, so an OOB
# estimate would be computed for every fold and then discarded.
bag_cv_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,  # ensemble size: 100 trees
    max_samples=0.8,  # each tree's bootstrap sample draws 80% as many rows as its training fold
    random_state=0,  # fixed seed so the fold scores are repeatable
)
scores = cross_val_score(bag_cv_model, X, y, cv=5)
scores

array([0.75324675, 0.72727273, 0.74675325, 0.82352941, 0.73856209])

In [19]:

scores.mean()
# Mean cross-validated accuracy (not a test-set score). In the recorded run, bagging
# (about 0.76) improves on the standalone decision tree (about 0.70) under the same 5-fold setup.

0.7578728461081402

In [20]:
# Train using Random Forest: 50 trees, scored with the same 5-fold cross-validation.
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the forest's bootstrap and feature sampling so the mean accuracy
# is the same every time the cell is re-run.
scores = cross_val_score(RandomForestClassifier(n_estimators=50, random_state=0), X, y, cv=5)
scores.mean()

0.7630336983278161