In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import sys
import matplotlib.pyplot as plt
from sklearn import datasets, model_selection
# Apply seaborn's default styling and set the default figure size to 10 x 10.
sns.set(rc={'figure.figsize':(10,10)})

Leo Breiman [first described bagging in 1996](https://link.springer.com/content/pdf/10.1023%2FA%3A1018054314350.pdf).

### Get some data

In [2]:
# Load the Boston housing data into a frame: one column per feature plus the
# regression target.
# NOTE: datasets.load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn to run.
boston_data = datasets.load_boston()
df_boston = pd.DataFrame(
    data=boston_data.data,
    columns=boston_data.feature_names,
).assign(target=boston_data.target)

# Hold out 20% of the rows for testing. The split is random and unseeded, so
# the rows shown below (and every error reported later) vary between runs.
train, test = model_selection.train_test_split(df_boston, test_size=0.2)

# Separate each partition into a feature matrix and a target vector.
train_X, train_y = train.drop(columns='target').values, train['target'].values
test_X, test_y = test.drop(columns='target').values, test['target'].values

train.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
126,0.38735,0.0,25.65,0.0,0.581,5.613,95.6,1.7572,2.0,188.0,19.1,359.29,27.26,15.7
496,0.2896,0.0,9.69,0.0,0.585,5.39,72.9,2.7986,6.0,391.0,19.2,396.9,21.14,19.7
43,0.15936,0.0,6.91,0.0,0.448,6.211,6.5,5.7209,3.0,233.0,17.9,394.46,7.44,24.7
194,0.01439,60.0,2.93,0.0,0.401,6.604,18.8,6.2196,1.0,265.0,15.6,376.7,4.38,29.1
140,0.2909,0.0,21.89,0.0,0.624,6.174,93.6,1.6119,4.0,437.0,21.2,388.08,24.16,14.0


### Regression Tree Algorithm

In [3]:
def compute_sum_split_variance(xs, y, v):
    '''Score a candidate split by the spread of the targets on each side.

        xs - 1D array of scalars (one feature value per sample)
        y  - array of targets, indexed by the same positions as xs
        v  - scalar to split on: samples with xs <= v form the left side,
             samples with xs > v form the right side

    Returns the sum, over both sides, of the squared deviations of y from
    that side's mean. An empty side contributes 0.'''
    def _spread(side):
        # Sum of squared deviations from the mean; an empty side has none.
        if len(side) == 0:
            return 0
        centered = side - side.mean()
        return (centered ** 2).sum()

    goes_left = xs <= v
    goes_right = xs > v
    return _spread(y[goes_left]) + _spread(y[goes_right])

def node(i, s, p, c, l, r):
    '''Bundle the fields of one tree node into a dict.

        i - stored under 'internal': True for a branching node, False for a leaf
        s - stored under 'split': the threshold value compared against
        p - stored under 'p': index of the feature the threshold applies to
        c - stored under 'c': the value a leaf predicts
        l - stored under 'l': subtree for samples at or below the threshold
        r - stored under 'r': subtree for samples above the threshold'''
    return dict(internal=i, split=s, p=p, c=c, l=l, r=r)

def splitf(X, y, max_leaf_n):
    '''Recursively grow a regression tree and return its root node.

        X          - 2D array indexed as X[row, feature]
        y          - 1D array of targets, one per row of X
        max_leaf_n - a node holding this many rows or fewer becomes a leaf

    Every value observed in every feature column is tried as a threshold and
    scored with compute_sum_split_variance; the lowest score wins, ties going
    to the first candidate tried. Rows with feature <= threshold go to the
    left subtree, rows with feature > threshold go to the right subtree.

    Returns a dict built by node(): a leaf carries the mean of y in 'c', an
    internal node carries the threshold in 'split', the feature index in 'p'
    and the subtrees in 'l' and 'r'.'''
    if X.shape[0] <= max_leaf_n:
        return node(False, None, None, y.mean(), None, None)

    lowest_var, best_p_idx, best_split = sys.float_info.max, None, None
    for p_idx in range(X.shape[1]):
        # The column is the same for every candidate of this feature, so
        # slice it once instead of once per candidate.
        column = X[:, p_idx]
        for split in column:
            var = compute_sum_split_variance(column, y, split)
            if var < lowest_var:
                lowest_var, best_p_idx, best_split = var, p_idx, split

    if best_p_idx is None:
        # No candidate scored below the initial sentinel (for example X has
        # no columns, or every score was NaN), so there is nothing to split on.
        return node(False, None, None, y.mean(), None, None)

    left_idxs = X[:, best_p_idx] <= best_split
    right_idxs = X[:, best_p_idx] > best_split
    if not left_idxs.any() or not right_idxs.any():
        # The winning threshold puts every row on one side, i.e. no candidate
        # scored strictly better than not splitting at all. That happens when
        # all targets are equal, but also when rows share identical features
        # with different targets, so it must not be asserted away: there is
        # simply no benefit to branching more.
        return node(False, None, None, y.mean(), None, None)

    l = splitf(X[left_idxs], y[left_idxs], max_leaf_n)
    r = splitf(X[right_idxs], y[right_idxs], max_leaf_n)
    return node(True, best_split, best_p_idx, None, l, r)

def predict(x, model):
    '''Return the value the tree `model` predicts for the sample `x`.

        x     - indexable of feature values for one sample
        model - a node dict as built by node(); the root of the tree

    Starting at the root, follow 'l' when the sample's value for feature 'p'
    is <= the node's 'split' and 'r' otherwise, until a node whose 'internal'
    flag is false is reached; that node's 'c' is the prediction.'''
    current = model
    while current['internal']:
        branch = 'l' if x[current['p']] <= current['split'] else 'r'
        current = current[branch]
    return current['c']

In [4]:
# Fit a single regression tree; nodes holding `max_leaf_n` rows or fewer
# become leaves.
max_leaf_n = 2
tree = splitf(train_X, train_y, max_leaf_n)

# Root-mean-squared error on the rows the tree was fitted to.
preds = [predict(row, tree) for row in train_X]
print('training error: {}'.format(np.sqrt(np.square(train_y - preds).sum() / len(train_y))))

# Root-mean-squared error on the held-out rows.
preds = [predict(row, tree) for row in test_X]
print('test error: {}'.format(np.sqrt(np.square(test_y - preds).sum() / len(test_y))))

training error: 0.6138359344771579
test error: 4.781918045730021


### Bagging

In [5]:
def bagged_predict(x, trees):
    '''Average the predictions of every tree in `trees` for the sample `x`.

        x     - feature values for one sample, as accepted by predict()
        trees - iterable of tree root nodes'''
    per_tree = [predict(x, t) for t in trees]
    return np.mean(per_tree)
    
def bagged_trees(X, y, max_leaf_n, b, seed=None):
    '''Fit `b` regression trees, each on its own bootstrap sample of (X, y).

        X          - 2D array indexed as X[row, feature]
        y          - 1D array of targets, one per row of X
        max_leaf_n - leaf-size threshold forwarded to splitf
        b          - number of trees to fit
        seed       - optional seed for the bootstrap draws. When None (the
                     default) the global np.random state is used, so results
                     differ between runs unless the caller seeded it.

    Each bootstrap sample draws as many row indices as X has rows, uniformly
    and with replacement. Returns the list of fitted tree roots.'''
    n = X.shape[0]
    # The module-level np.random functions and a RandomState instance expose
    # the same randint(), so one code path serves both cases.
    sampler = np.random if seed is None else np.random.RandomState(seed)
    trees = []
    for _ in range(b):
        b_sample = sampler.randint(0, n, n)
        trees.append(splitf(X[b_sample], y[b_sample], max_leaf_n))
    return trees

In [7]:
# Fit `b` trees, each on a bootstrap sample of the training rows, using the
# same leaf-size threshold as before.
max_leaf_n = 2
b = 100
trees = bagged_trees(train_X, train_y, max_leaf_n, b)

# Root-mean-squared error of the tree-averaged predictions on the held-out rows.
preds = [bagged_predict(row, trees) for row in test_X]
print('test error: {}'.format(np.sqrt(np.square(test_y - preds).sum() / len(test_y))))

test error: 3.3628760457411464
