In [1]:
import numpy as np

## Bootstrap

In [2]:
class Bootstrap:
    '''Creates bootstrap resamples (rows drawn with replacement) of a dataset.'''

    def create_dataset(self, X, y, seed=None):
        '''Create one bootstrapped dataset.

        Draws ``len(X)`` row indices uniformly with replacement and applies
        the same indices to both ``X`` and ``y`` so rows stay paired.

        Parameters
        ----------
        X : indexable by an integer index array (e.g. ``np.ndarray``)
            Feature rows.
        y : indexable by an integer index array (e.g. ``np.ndarray``)
            Targets, indexed with the same row indices as ``X``.
        seed : int or None, default None
            If not None, NumPy's global RNG is re-seeded with this value
            before sampling. If None, the current global RNG state is used.

        Returns
        -------
        (X_resampled, y_resampled)
            ``X[indices]`` and ``y[indices]`` for the sampled indices.
        '''
        # Compare against None rather than truthiness so that seed=0 is
        # honoured instead of being silently treated as "no seed".
        if seed is not None:
            np.random.seed(seed)
        n_rows = len(X)
        indices = np.arange(n_rows)
        indices = np.random.choice(indices, size=n_rows, replace=True)
        return X[indices], y[indices]

## Create Data

In [3]:
# Seed NumPy's global RNG so the random data generated in the following cells is reproducible.
np.random.seed(10)

In [4]:
# Toy feature matrix: 50 random integers in [0, 10) arranged as 10 rows x 5 columns.
X = np.random.randint(0,10,50).reshape(10,5)
X

array([[9, 4, 0, 1, 9],
       [0, 1, 8, 9, 0],
       [8, 6, 4, 3, 0],
       [4, 6, 8, 1, 8],
       [4, 1, 3, 6, 5],
       [3, 9, 6, 9, 1],
       [9, 4, 2, 6, 7],
       [8, 8, 9, 2, 0],
       [6, 7, 8, 1, 7],
       [1, 4, 0, 8, 5]])

In [5]:
# Toy binary target: 10 draws from Binomial(n=1, p=0.5), i.e. one 0/1 label per row of X.
y = np.random.binomial(1, 0.5, 10)
y

array([1, 0, 1, 1, 1, 0, 1, 0, 0, 1])

### Bootstrap Example

In [6]:
bs = Bootstrap()
# NOTE(review): this rebinds X and y to the resampled arrays, so the original
# data is no longer reachable and re-running this cell resamples the resample.
# No seed is passed, so the draw depends on the current global RNG state.
X, y = bs.create_dataset(X,y)

In [7]:
# Resampled rows; duplicates appear because sampling is done with replacement.
X

array([[9, 4, 0, 1, 9],
       [9, 4, 0, 1, 9],
       [9, 4, 2, 6, 7],
       [1, 4, 0, 8, 5],
       [0, 1, 8, 9, 0],
       [6, 7, 8, 1, 7],
       [1, 4, 0, 8, 5],
       [0, 1, 8, 9, 0],
       [8, 6, 4, 3, 0],
       [6, 7, 8, 1, 7]])

In [8]:
# Targets selected with the same row indices as the resampled X.
y

array([1, 1, 1, 1, 0, 0, 1, 0, 1, 0])

## Bagging

In [9]:
class Bagging(Bootstrap):
    '''Refits an estimator on repeated bootstrap resamples and records its coefficients.

    Attributes
    ----------
    estimator : object or None
        The estimator passed to the most recent ``simulate`` call.
    seed : int or None
        Seed that will be used for the next resample; None means the
        global RNG state is used as-is.
    coefficients_ : list
        One ``coef_`` snapshot per resample from the most recent
        ``simulate`` call.
    '''

    def __init__(self):
        self.estimator = None
        # Initialise here so simulate() can be called without a seed.
        self.seed = None
        self.coefficients_ = []

    def simulate(self, estimator, X, y, n_datasets, seed=None):
        '''Fit ``estimator`` on ``n_datasets`` bootstrap resamples of ``(X, y)``.

        Parameters
        ----------
        estimator : object
            Must provide ``fit(X, y)`` and expose ``coef_`` after fitting.
            The same instance is refit on every resample.
        X, y : indexable by an integer index array
            Data forwarded to ``create_dataset``.
        n_datasets : int
            Number of bootstrap resamples to draw and fit.
        seed : int or None, default None
            If not None, resample ``i`` is drawn with seed ``seed + i``.
            If None, the seed stored on the instance (if any) is reused;
            when that is also None the global RNG state is used unseeded.

        Results are stored in ``self.coefficients_``; nothing is returned.
        '''
        # `is not None` so that seed=0 is accepted as a real seed.
        if seed is not None:
            self.seed = seed
        self.coefficients_ = []
        self.estimator = estimator
        for _ in range(n_datasets):
            X_, y_ = self.create_dataset(X, y, seed=self.seed)
            self.estimator.fit(X_, y_)
            # Snapshot the coefficients: the estimator is reused, and one that
            # updates coef_ in place would otherwise alias every stored entry.
            self.coefficients_.append(np.copy(self.estimator.coef_))
            # Advance the seed so each resample differs; skip when unseeded.
            if self.seed is not None:
                self.seed += 1

### Example: Sklearn vs Bagging

#### Sklearn

In [10]:
# Ordinary least-squares linear regression from scikit-learn; this instance is
# fitted directly in the next cell and later passed to Bagging.simulate.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [11]:
# Single fit on the current X, y. These names were rebound to the bootstrap
# sample in the "Bootstrap Example" cell, so this is not a fit on the original data.
lr.fit(X,y)
print(lr.coef_)

[ 0.05415476  0.04757173 -0.13439324  0.03477829 -0.04744752]


#### Bagging

In [12]:
# Refit `lr` on 100 bootstrap resamples of X, y; the seed starts at 42 and is
# incremented by 1 after each resample.
bg = Bagging()
bg.simulate(lr, X, y, n_datasets=100, seed=42)

In [13]:
# Coefficient vectors from the first 10 resample fits.
bg.coefficients_[:10]

[array([ 0.03065716, -0.0095945 , -0.11071628,  0.04298948,  0.03107712]),
 array([ 0.03202357,  0.00323003, -0.10583687,  0.05571963,  0.03346221]),
 array([ 0.01504485,  0.09281765, -0.10585113,  0.0013086 , -0.02779966]),
 array([ 0.03890223,  0.04224749, -0.1397625 ,  0.01358421, -0.05403209]),
 array([ 0.05415476,  0.04757173, -0.13439324,  0.03477829, -0.04744752]),
 array([ 0.01504485,  0.09281765, -0.10585113,  0.0013086 , -0.02779966]),
 array([ 0.0225789 ,  0.04256523, -0.084117  ,  0.0519456 ,  0.04574701]),
 array([ 0.05415476,  0.04757173, -0.13439324,  0.03477829, -0.04744752]),
 array([ 0.03202357,  0.00323003, -0.10583687,  0.05571963,  0.03346221]),
 array([ 0.02997327, -0.04066438, -0.09418353,  0.0071274 ,  0.01730941])]

#### Mean of Coefficients

In [14]:
# Average each coefficient over the resample fits (axis 0 runs over resamples).
np.mean(bg.coefficients_, axis=0)

array([ 0.03915596,  0.0205273 , -0.11609587,  0.02856614, -0.01729375])

#### Standard Error of Coefficients

In [15]:
# Standard deviation of each coefficient over the resample fits.
# np.std defaults to ddof=0 (divides by N, not N - 1).
np.std(bg.coefficients_, axis=0)

array([ 0.01944324,  0.03295672,  0.02520118,  0.01921173,  0.03869426])