# Purpose
The purpose of this notebook is to empirically check that
$$\frac{1}{n}\sum_{i=1}^n\sum_{j=1}^k \mathbb{V}[\hat Y_{i,j}^1] = \sum_{j=1}^k \mathbb{V}[\hat Y_{1,j}^1]$$

if *both* the $X_i$ and the trees are sampled at random. Note that the assertion does *not* hold anymore if the $X_i$ are fixed beforehand, i.e. already determined. Here, we assume that they are yet to be sampled; we only know how many elements will be sampled.

In [1]:
from sklearn.datasets import fetch_openml, make_classification

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import itertools as it
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

In [2]:
def get_random_dataset(n, d, k, seed=None):
    """Draw a purely random classification dataset.

    Features and labels are sampled independently of each other, so the
    labels carry no information about the features.

    Parameters
    ----------
    n : int
        Number of instances to draw.
    d : int
        Number of features per instance.
    k : int
        Number of classes; labels are integers in ``[0, k)``.
    seed : int or None, default=None
        If ``None``, NumPy's global random state is used (the original
        behaviour). Otherwise a dedicated ``np.random.RandomState(seed)`` is
        used, which makes the returned dataset reproducible and leaves the
        global random state untouched.

    Returns
    -------
    X : ndarray of shape (n, d)
        Features drawn uniformly from ``[0, 1)``.
    y : ndarray of shape (n,)
        Integer labels drawn uniformly from ``{0, ..., k - 1}``.
    """
    # The np.random module and a RandomState instance expose the same
    # rand/randint API, so both cases can share the sampling code below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    X = rng.rand(n, d)
    y = rng.randint(0, k, n)

    return X, y

# Seed NumPy's global random state so that a fresh "Restart & Run All"
# reproduces the same numbers. get_random_dataset draws from this global
# state, and scikit-learn estimators created without an explicit
# random_state fall back to it as well.
np.random.seed(0)

k = 2  # number of classes
d = 2  # number of features
X, y = get_random_dataset(2000, d=d, k=k)

# Stratified split with a fixed random_state; only the training part is kept.
X_train, _, y_train, _ = train_test_split(X, y, stratify=y, random_state=0)

In [3]:
# Fit an ensemble of 1000 extremely randomized trees on the training split.
# The fitted ensemble keeps the name `rf`; its individual trees are read
# from `rf.estimators_` further down.
extra_trees = ExtraTreesClassifier(n_estimators=1000)
rf = extra_trees.fit(X_train, y_train)

In [6]:
# For every tree, draw a *fresh* set of 100 random instances and record the
# predicted class probabilities, so that both the tree and the instances
# are random in each prediction.
per_tree_probas = []
for tree in rf.estimators_:
    X_val = get_random_dataset(100, d=d, k=k)[0]  # the random labels are not needed
    per_tree_probas.append(tree.predict_proba(X_val))

# Stack into one array whose axes are (tree, instance, class).
yhat = np.stack(per_tree_probas)
yhat.shape

(1000, 100, 2)

In [7]:
# Left-hand side: variance across trees (axis 0) for each instance slot and
# class, summed over the classes and then averaged over the instance slots.
var_per_instance = np.var(yhat, axis=0)
lhs = var_per_instance.sum(axis=1).mean()

# Right-hand side: variance per class over all (tree, instance) pairs pooled
# together, summed over the classes.
rhs = np.var(yhat, axis=(0, 1)).sum()

print(lhs)
print(rhs)

0.4990993200000017
0.4996844927999663
