In [3]:
# simulate drawing a bootstrap sample from 10 items.
import numpy as np

N = 10                       # population size; the bootstrap sample has the same size
original = np.arange(0, N)   # the items themselves: 0, 1, ..., N-1

# Resample N items *with replacement*: some items repeat, others are left out.
bootstrap = np.random.choice(original, N, replace=True)

# Display the population and its bootstrap resample together.
(original, bootstrap)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([9, 0, 1, 4, 3, 3, 2, 2, 2, 3]))

In [5]:
# demo: empirically verify that a bootstrap sample contains about 63.2% (1 - 1/e) of the distinct original items
import numpy as np

def bootstrap_unique_fraction(N=1000, trials=5000, rng=None):
    """Estimate the mean fraction of distinct items in a bootstrap sample.

    Each trial draws ``N`` indices from ``range(N)`` with replacement and
    records ``(number of distinct indices) / N``. The mean of those
    fractions over all trials is returned.

    Parameters
    ----------
    N : int, default 1000
        Population size, which is also the size of every bootstrap sample.
        Must be at least 1.
    trials : int, default 5000
        Number of independent bootstrap samples to average over. Must be at
        least 1.
    rng : optional
        Source of randomness exposing ``choice(a, size=..., replace=...)``,
        e.g. a ``numpy.random.Generator`` or ``numpy.random.RandomState``.
        When omitted, NumPy's global random state is used, so
        ``np.random.seed`` still controls the result.

    Returns
    -------
    numpy.float64
        Mean fraction of distinct items across the trials.

    Raises
    ------
    ValueError
        If ``N`` or ``trials`` is smaller than 1.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (N == 0)
    # or a silent NaN from averaging an empty list (trials == 0).
    if N < 1:
        raise ValueError(f"N must be at least 1, got {N}")
    if trials < 1:
        raise ValueError(f"trials must be at least 1, got {trials}")

    # The np.random module and Generator/RandomState objects all expose
    # ``choice``, so either can serve as the sampler.
    sampler = np.random if rng is None else rng

    fracs = [
        len(np.unique(sampler.choice(N, size=N, replace=True))) / N
        for _ in range(trials)
    ]
    return np.mean(fracs)

# Run with the defaults (N=1000, trials=5000); the mean should land close to 0.632.
bootstrap_unique_fraction()

0.6324734

In [6]:
# a minimal random-forest-style simulation: decision trees fit on bootstrap samples, with out-of-bag (OOB) indices tracked per tree
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def bootstrap_sample(X, y, rng=None):
    """Draw one bootstrap resample of a dataset.

    ``len(X)`` row indices are drawn uniformly with replacement, and the same
    indices are applied to both ``X`` and ``y`` so rows and labels stay paired.

    Parameters
    ----------
    X : array-like
        Samples, indexed along the first axis. Lists are converted to arrays.
    y : array-like
        Targets; must have the same length as ``X``.
    rng : optional
        Source of randomness exposing ``choice(a, size=..., replace=...)``,
        e.g. a ``numpy.random.Generator`` or ``numpy.random.RandomState``.
        When omitted, NumPy's global random state is used, so
        ``np.random.seed`` still controls the draw.

    Returns
    -------
    tuple
        ``(X[idx], y[idx], idx)`` where ``idx`` is the array of drawn indices.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length.
    """
    # Integer-array indexing below needs ndarrays; asanyarray leaves existing
    # arrays (and ndarray subclasses) untouched and converts plain sequences.
    X = np.asanyarray(X)
    y = np.asanyarray(y)

    N = len(X)
    if len(y) != N:
        # Otherwise a longer y is silently truncated and a shorter y raises an
        # IndexError only when an out-of-range index happens to be drawn.
        raise ValueError(
            f"X and y must have the same length, got {N} and {len(y)}"
        )

    sampler = np.random if rng is None else rng
    idx = sampler.choice(N, size=N, replace=True)
    return X[idx], y[idx], idx

def random_forest(X, y, n_trees=100, max_features=None):
    """Fit an ensemble of decision trees on bootstrap samples of ``(X, y)``.

    For every tree a bootstrap sample is drawn with ``bootstrap_sample``, a
    ``DecisionTreeClassifier`` is fit on it, and the indices of the rows that
    were *not* drawn (the out-of-bag rows) are recorded.

    Parameters
    ----------
    X : array-like
        Training samples, indexed along the first axis.
    y : array-like
        Training labels, aligned with ``X``.
    n_trees : int, default 100
        Number of trees to fit.
    max_features : optional
        Forwarded unchanged to ``DecisionTreeClassifier(max_features=...)``.
        The default ``None`` matches the classifier's own default, so existing
        callers get the same trees as before (plain bagging). Passing e.g.
        ``"sqrt"`` enables per-split feature subsampling, which is what
        distinguishes a random forest from bagged trees.

    Returns
    -------
    tuple
        ``(trees, oob_sets)``: the list of fitted trees and, in the same
        order, a list of sorted arrays holding each tree's out-of-bag row
        indices.
    """
    # The full index range is the same for every tree, so build it once.
    all_indices = np.arange(len(X))

    trees = []
    oob_sets = []

    for _ in range(n_trees):
        Xb, yb, idx = bootstrap_sample(X, y)

        tree = DecisionTreeClassifier(max_features=max_features)
        tree.fit(Xb, yb)
        trees.append(tree)

        # OOB rows = every index that never appeared in this bootstrap draw.
        oob_sets.append(np.setdiff1d(all_indices, idx))

    return trees, oob_sets

In [21]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
# Synthetic "moons" dataset: 100 samples, noise 0.20, fixed random_state.
X, y = make_moons(
    n_samples=100,
    noise=0.20,
    random_state=42,
)

# Fit a small 10-tree ensemble and display the trees with their OOB index sets.
random_forest(X, y, n_trees=10)

([DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier()],
 [array([ 0,  3,  5, 15, 17, 19, 20, 22, 24, 25, 29, 32, 36, 39, 40, 41, 42,
         44, 50, 55, 56, 57, 58, 62, 63, 67, 70, 75, 76, 79, 81, 82, 84, 85,
         86, 87, 91, 92, 93, 97, 98]),
  array([ 7,  8, 10, 12, 13, 14, 17, 20, 22, 24, 28, 31, 34, 35, 38, 39, 40,
         44, 45, 49, 51, 52, 54, 56, 58, 59, 61, 62, 64, 65, 66, 68, 69, 75,
         80, 82, 84, 87, 92]),
  array([ 0,  3,  5,  8, 10, 11, 13, 15, 16, 18, 21, 28, 29, 30, 33, 36, 37,
         40, 47, 48, 49, 50, 52, 53, 58, 68, 70, 72, 74, 76, 84, 87, 89, 92,
         93, 94, 95, 97, 98, 99]),
  array([ 2,  3,  6,  8, 13, 15, 18, 19, 20, 21, 29, 30, 37, 40, 41, 42, 43,
         47, 53, 56, 58, 62, 68, 75, 76, 79, 82, 84, 85, 86, 88, 90, 91, 92

In [22]:
import numpy as np

arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
])

# size  -> total number of elements
# shape -> extent along each axis
# len   -> length of the first axis only
arr.size, arr.shape, len(arr)     # expected: (6, (2, 3), 2)

(6, (2, 3), 2)