In [None]:
# Auto-reload imported modules before each cell executes, so edits to local
# source files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from copy import deepcopy

import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

from imodels import HSTreeRegressorCV

from tqdm import tqdm

In [None]:
def _noise_sampler(noise):
    """Return the numpy sampler for a noise family, or None for noise-free data.

    Raises:
        ValueError: if ``noise`` is not None, "gaussian" or "laplacian".
    """
    if noise is None:
        return None
    if noise == "gaussian":
        return np.random.normal
    if noise == "laplacian":
        return np.random.laplace
    raise ValueError(
        f"unknown noise {noise!r}; expected 'gaussian', 'laplacian' or None"
    )


def simulation1(n=500, noise="gaussian", noise_scale=0.01):
    """Draw a sample from the additive linear model.

    ``X`` has 50 features drawn i.i.d. from U(0, 1); the response is the sum
    of the first 10 features plus optional zero-mean noise.

    Args:
        n: number of rows to draw.
        noise: "gaussian", "laplacian", or None for a noise-free response.
        noise_scale: scale passed to the sampler (standard deviation for
            "gaussian", Laplace scale ``b`` for "laplacian"). The two families
            therefore do not share the same variance at equal ``noise_scale``.

    Returns:
        Tuple ``(X, y)`` with shapes ``(n, 50)`` and ``(n,)``.

    Raises:
        ValueError: if ``noise`` is not a recognised family. Previously an
            unrecognised name silently produced noise-free data.
    """
    # Resolve (and validate) first so a bad name fails before any random draw.
    sampler = _noise_sampler(noise)
    X = np.random.uniform(0, 1, (n, 50))
    y = X[:, :10].sum(axis=1)
    if sampler is not None:
        y += sampler(0, noise_scale, n)
    return X, y


def simulation2(n=500, noise="gaussian", noise_scale=0.01):
    """Draw a sample from the linear model with pairwise interactions.

    Same design as ``simulation1`` but the response additionally contains the
    products of features (0, 1), (4, 5) and (10, 11).

    Args:
        n: number of rows to draw.
        noise: "gaussian", "laplacian", or None for a noise-free response.
        noise_scale: scale passed to the sampler (standard deviation for
            "gaussian", Laplace scale ``b`` for "laplacian").

    Returns:
        Tuple ``(X, y)`` with shapes ``(n, 50)`` and ``(n,)``.

    Raises:
        ValueError: if ``noise`` is not a recognised family.
    """
    # Resolve (and validate) first so a bad name fails before any random draw.
    sampler = _noise_sampler(noise)
    X = np.random.uniform(0, 1, (n, 50))
    y = X[:, :10].sum(axis=1) + X[:, 0]*X[:, 1] + X[:, 4]*X[:, 5] + X[:, 10]*X[:, 11]
    if sampler is not None:
        y += sampler(0, noise_scale, n)
    return X, y

In [None]:
def experiment(max_leaf_nodes, X_test, y_test, simulation_f, noise="gaussian", N=100):
    """Estimate MSE, squared bias and variance of CART and hsCART by resampling.

    For each of ``N`` repetitions a fresh training set is drawn from
    ``simulation_f(noise=noise)``, a CART tree with ``max_leaf_nodes`` leaves is
    fitted, and an ``HSTreeRegressorCV`` is fitted on a deep copy of that tree.
    Both models are then scored on the fixed ``(X_test, y_test)``.

    Returns:
        A 7-tuple ``(CART MSE, CART bias^2, CART variance, hsCART MSE,
        hsCART bias^2, hsCART variance, mean selected lambda)``. MSE is the
        mean over repetitions; bias^2 compares the prediction averaged over
        repetitions with ``y_test``; variance is the per-test-point variance
        over repetitions, averaged over test points.
    """

    def summarise(run_mses, run_preds):
        # Rows are repetitions, columns are test points.
        stacked = np.array(run_preds)
        squared_bias = np.power(stacked.mean(axis=0) - y_test, 2).mean()
        spread = stacked.var(axis=0).mean()
        return np.mean(run_mses), squared_bias, spread

    tree_mses, tree_preds = [], []
    shrunk_mses, shrunk_preds = [], []
    chosen_lambdas = []

    for _ in range(N):
        X_train, y_train = simulation_f(noise=noise)

        # Fit order is kept (plain tree first, then the shrunk copy) so any
        # shared random state is consumed in the same sequence.
        tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes)
        tree.fit(X_train, y_train)
        shrunk = HSTreeRegressorCV(deepcopy(tree), [0.1, 1, 10, 25, 50, 100])
        shrunk.fit(X_train, y_train)
        chosen_lambdas.append(shrunk.reg_param)

        for model, mses, preds in (
            (tree, tree_mses, tree_preds),
            (shrunk, shrunk_mses, shrunk_preds),
        ):
            prediction = model.predict(X_test)
            mses.append(mean_squared_error(y_test, prediction))
            preds.append(prediction)

    return (
        *summarise(tree_mses, tree_preds),
        *summarise(shrunk_mses, shrunk_preds),
        np.mean(chosen_lambdas),
    )

In [None]:
def _plot_bias_variance(leaf_nodes, cart_curves, hscart_curves, lambdas):
    """Plot MSE / bias^2 / variance for both models and lambda on a twin axis.

    Args:
        leaf_nodes: x-axis values (number of leaves).
        cart_curves: ``(mse, bias2, variance)`` sequences for CART.
        hscart_curves: ``(mse, bias2, variance)`` sequences for hsCART.
        lambdas: mean selected lambda per entry of ``leaf_nodes``.

    Returns:
        The created matplotlib figure.
    """
    metric_styles = (("MSE", "-"), ("Bias2", "dotted"), ("Variance", "--"))
    fig, ax = plt.subplots()

    handles = []
    for model, colour, curves in (
        ("CART", "lightsalmon", cart_curves),
        ("hsCART", "firebrick", hscart_curves),
    ):
        for (metric, linestyle), values in zip(metric_styles, curves):
            handles += ax.plot(
                leaf_nodes, values,
                color=colour, linestyle=linestyle, label=f"{model} {metric}",
            )
    ax.set_xlabel("Number of Leaves")
    ax.set_ylabel("Error")

    # Lambda lives on a different scale, so it gets its own right-hand axis.
    lambda_ax = ax.twinx()
    handles += lambda_ax.plot(
        leaf_nodes, lambdas,
        color="skyblue", linestyle="dashdot", label="hsCART Lambda",
    )
    lambda_ax.set_ylabel("Lambda")

    # One combined legend covering the lines of both axes.
    ax.legend(handles, [handle.get_label() for handle in handles], prop={'size': 6})
    return fig


def bias_variance(simulation_function, noise="gaussian", save=False):
    """Run the bias-variance experiment over a grid of tree sizes and plot it.

    A single test set is drawn with ``noise=None`` and reused for every tree
    size; ``experiment`` is then called once per ``max_leaf_nodes`` value and
    the resulting curves are plotted.

    Args:
        simulation_function: "simulation_1" or "simulation_2".
        noise: noise family forwarded to the simulation for the training sets.
        save: when True, write the figure under
            ``../graphs/miscelenious/bias_variance/`` (the directory is created
            if it does not exist).

    Raises:
        ValueError: if ``simulation_function`` is not a known name. Previously
            any name other than "simulation_1" silently ran simulation 2.
    """
    if simulation_function == "simulation_1":
        simulation_f = simulation1
    elif simulation_function == "simulation_2":
        simulation_f = simulation2
    else:
        raise ValueError(
            f"unknown simulation_function {simulation_function!r}; "
            "expected 'simulation_1' or 'simulation_2'"
        )

    leaf_nodes = [2, 4, 8, 12, 16, 20, 24, 28, 30, 32] + list(range(40, 151, 10))

    # Noise-free test responses, shared by every tree size.
    X_test, y_test = simulation_f(noise=None)

    results = [
        experiment(max_leaf_nodes, X_test, y_test, simulation_f, noise)
        for max_leaf_nodes in tqdm(leaf_nodes)
    ]
    # Transpose the per-size 7-tuples into one sequence per metric.
    (
        cart_mse, cart_bias2, cart_variance,
        hscart_mse, hscart_bias2, hscart_variance, hscart_lambda,
    ) = zip(*results)

    fig = _plot_bias_variance(
        leaf_nodes,
        (cart_mse, cart_bias2, cart_variance),
        (hscart_mse, hscart_bias2, hscart_variance),
        hscart_lambda,
    )

    if save:
        import os

        target = f"../graphs/miscelenious/bias_variance/{simulation_function}_{noise}"
        # savefig does not create missing directories.
        os.makedirs(os.path.dirname(target), exist_ok=True)
        fig.savefig(target, bbox_inches="tight", facecolor="white", edgecolor="auto")

In [None]:
# Simulation 1 with Gaussian training noise.
bias_variance(simulation_function="simulation_1", noise="gaussian", save=False)

In [None]:
# Simulation 1 with Laplacian training noise.
bias_variance(simulation_function="simulation_1", noise="laplacian", save=False)

In [None]:
# Simulation 2 with Gaussian training noise.
bias_variance(simulation_function="simulation_2", noise="gaussian", save=False)

In [None]:
# Simulation 2 with Laplacian training noise.
bias_variance(simulation_function="simulation_2", noise="laplacian", save=False)