<a href="https://colab.research.google.com/github/gnitnaw/Programs/blob/master/Chapter8/BaggingExample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!python --version

Python 3.7.13


# Example 8.3
## Import modules

In [2]:
import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import r2_score, mean_squared_error
print(np.__version__)

1.21.5


## Set random seed

In [3]:
np.random.seed(100)

## Original Program

In [4]:
# create regression problem
n_points = 1000 # points
x, y =  make_friedman1(n_samples=n_points, n_features=15, 
                       noise=1.0, random_state=100)

# split to train/test set
x_train, x_test, y_train, y_test = \
        train_test_split(x, y, test_size=0.33, random_state=100)

# training
regTree = DecisionTreeRegressor(random_state=100)
regTree.fit(x_train,y_train)

# test
yhat = regTree.predict(x_test)

# Bagging construction
n_estimators=500
bag = np.empty((n_estimators), dtype=object)
bootstrap_ds_arr = np.empty((n_estimators), dtype=object)
for i in range(n_estimators):
    # sample bootsraped dataset
    ids = np.random.choice(range(0,len(x_test)),size=len(x_test),
                     replace=True)
    x_boot = x_train[ids]
    y_boot = y_train[ids]
    bootstrap_ds_arr[i] = np.unique(ids)
    
    bag[i] = DecisionTreeRegressor()
    bag[i].fit(x_boot,y_boot)

# bagging prediction
yhatbag = np.zeros(len(y_test))   
for i in range(n_estimators): 
    yhatbag = yhatbag + bag[i].predict(x_test)
        
yhatbag = yhatbag/n_estimators

# out of bag loss estimation
oob_pred_arr = np.zeros(len(x_train))
for i in range(len(x_train)):
    x = x_train[i].reshape(1, -1)
    C = []
    for b in range(n_estimators):
        if(np.isin(i, bootstrap_ds_arr[b])==False):
            C.append(b)
    for pred in bag[C]:       
        oob_pred_arr[i] = oob_pred_arr[i] + (pred.predict(x)/len(C))        

L_oob = r2_score(y_train, oob_pred_arr)

print("DecisionTreeRegressor R^2 score = ",r2_score(y_test, yhat),  
      "\nBagging R^2 score = ", r2_score(y_test, yhatbag),
      "\nBagging OOB R^2 score = ",L_oob)

DecisionTreeRegressor R^2 score =  0.5652585102808715 
Bagging R^2 score =  0.7594897731192692 
Bagging OOB R^2 score =  0.7755235429272777


## Compare loss of N-fold cross-validation and OOB

In [5]:
n = 1000
Bmax = 500
X, Y =  make_friedman1(n_samples=n, n_features=15, noise=1.0, random_state=100)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=100)
clf = DecisionTreeRegressor(random_state=0)
cv_results = cross_validate(clf, X_train, Y_train, cv=Y_train.shape[0], scoring ='neg_mean_squared_error', return_train_score=True)

error_B = np.zeros(Bmax)
OOB_B = np.zeros(Bmax)
# Bagging construction
n_estimators=Bmax
bag = np.empty((n_estimators), dtype=object)
bootstrap_ds_arr = np.empty((n_estimators), dtype=object)
for i in range(n_estimators):
    # sample bootsraped dataset
    ids = np.random.choice(range(0,len(X_train)),size=len(X_train),
                     replace=True)
    
    x_boot = X_train[ids]
    y_boot = Y_train[ids]
    bootstrap_ds_arr[i] = np.unique(ids)
    bag[i] = DecisionTreeRegressor()
    bag[i].fit(x_boot,y_boot)

# bagging prediction
yhatbag = np.zeros(len(Y_test))   
for i in range(n_estimators): 
    yhatbag = yhatbag + bag[i].predict(X_test)
    error_B[i] = mean_squared_error(Y_test, yhatbag/(i+1))

# out of bag loss estimation
oob_pred_arr = np.zeros(len(X_train))
for i in range(len(X_train)):
    x = X_train[i].reshape(1, -1)
    C = []
    for b in range(n_estimators):
        if(np.isin(i, bootstrap_ds_arr[b])==False):
            C.append(b)
    #print(len(C))
    for pred in  bag[C]:       
        oob_pred_arr[i] = oob_pred_arr[i] + (pred.predict(x)/len(C))

LL_oob = mean_squared_error(Y_train, oob_pred_arr)
print("cross_validate loss = ", np.mean(cv_results['test_score']), 
      "\nBagging OOB MSE score = ",LL_oob)
print(error_B)

cross_validate loss =  -9.25439805855385 
Bagging OOB MSE score =  4.339658951166955
[12.57943888  9.69760299  8.37439952  7.23119866  6.89097184  6.70714173
  6.41927994  6.27012489  6.18052977  6.09244196  6.05690628  5.98188085
  5.85176005  5.6993827   5.69934466  5.63317978  5.58116661  5.4542918
  5.44618434  5.50870398  5.42924013  5.44626065  5.52056038  5.42594414
  5.38024633  5.34380359  5.27987388  5.28907612  5.34522285  5.33414652
  5.31258058  5.28481526  5.23779702  5.23209615  5.22672713  5.21694649
  5.20577009  5.19319425  5.17886283  5.19247903  5.16110621  5.18340333
  5.17720803  5.17695094  5.15135618  5.1577897   5.12670755  5.11420038
  5.09963584  5.11308145  5.10711712  5.11340031  5.08058492  5.08207168
  5.08203463  5.09064725  5.06085549  5.0524102   5.05710643  5.04816921
  5.04524883  5.05552588  5.04555752  5.06363624  5.07610039  5.07787476
  5.10143426  5.09460889  5.10719633  5.09745801  5.0784167   5.07588832
  5.06568152  5.06418016  5.04815061  5.