Modify the Bagging scratch code in our lecture such that:
- Calculate for oob evaluation for each bootstrapped dataset, and also the average score
- Change the code to "without replacement"
- Put everything into a class <code>Bagging</code>.  It should have at least two methods, <code>fit(X_train, y_train)</code>, and <code>predict(X_test)</code>
- Modify the code from above to randomize features.  Set the number of features to be used in each tree to be <code>sqrt(n)</code>, and then select a subset of features for each tree.  This can be easily done by setting our DecisionTreeClassifier <code>max_features</code> to 'sqrt'

#### st122645

In [23]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import random
import math
from scipy import stats
from sklearn.metrics import classification_report, accuracy_score


iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y,
                test_size=0.3, shuffle=True, random_state=42)


In [113]:
class Bagging:
    def __init__(self, B=5, bootstrap_ratio=1, tree_params={}, without_replacement=True):
        # Bootstrap Times
        self.B = B
        # Bootstrap Ratio
        self.bootstrap_ratio = bootstrap_ratio
        self.tree_params = tree_params
        self.models = [DecisionTreeClassifier(**tree_params) for _ in range(self.B)]
        self.without_replacement = without_replacement
        self.avg_oob_score = 0

    def fit(self, X_train, y_train):
        m, n = X_train.shape
        
        sample_size = int(self.bootstrap_ratio * len(X_train))
        
        xsamples = np.zeros((self.B, sample_size, n))
        ysamples = np.zeros((self.B, sample_size))

        xoobs = []
        yoobs = []

        for i in range(self.B):
            oob_idx = []
            idxe = []

            for j in range(sample_size):
                idx = random.randrange(m)
                if (self.without_replacement == True):
                    while idx in idxe:
                        idx = random.randrange(m)
                idxe.append(idx)
                oob_idx.append(idx)
                xsamples[i, j, :] = X_train[idx]
                ysamples[i, j] = y_train[idx]
                mask = np.zeros((m), dtype=bool)
                mask[oob_idx] = True

                xoobs.append(X_train[~mask])
                yoobs.append(y_train[~mask])


                oob_score = 0


            for i, model in enumerate(self.models):
                _X = xsamples[i, :]
                _y = ysamples[i, :]
                model.fit(_X, _y)

                _X_oob = np.asarray(xoobs[i]) #X test
                _y_oob = np.asarray(yoobs[i]) #y test
                yhat = model.predict(_X_oob)

                oob_score += accuracy_score(_y_oob, yhat)
                print(f"Tree {i}", accuracy_score(_y_oob, yhat))
            self.avg_oob_score = oob_score / len(self.models)
            print("======Average out of bag score======")
            print(self.avg_oob_score)

    def predict(self, X_test):
        predictions = np.zeros((self.B, X_test.shape[0]))
        for i, model in enumerate(self.models):
            yhat = model.predict(X_test)
            predictions[i, :] = yhat

        return stats.mode(predictions)[0][0]

clfmodel = Bagging(tree_params =  {'max_depth': 2, 'criterion':'gini', 'min_samples_split': 5, 'max_features':'sqrt'})

clfmodel.fit(X_train, y_train)

predict = clfmodel.predict(X_test)

print(classification_report(y_test, predict))

Tree 0 0.9423076923076923
Tree 1 0.2912621359223301
Tree 2 0.29411764705882354
Tree 3 0.297029702970297
Tree 4 0.3
0.42494343565182857
Tree 0 0.9423076923076923
Tree 1 0.941747572815534
Tree 2 0.29411764705882354
Tree 3 0.297029702970297
Tree 4 0.3
0.5550405230304694
Tree 0 0.9423076923076923
Tree 1 0.941747572815534
Tree 2 0.9411764705882353
Tree 3 0.297029702970297
Tree 4 0.3
0.6844522877363517
Tree 0 0.9423076923076923
Tree 1 0.941747572815534
Tree 2 0.9411764705882353
Tree 3 0.9207920792079208
Tree 4 0.3
0.8092047629838766
Tree 0 0.9230769230769231
Tree 1 0.941747572815534
Tree 2 0.8137254901960784
Tree 3 0.8217821782178217
Tree 4 0.94
0.8880664328612713
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      0.92      0.96        13
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
weighte