# XGBoost histogramming yay or nay

Instead of the $\epsilon$ parameter discussed in the XGBoost paper (eq. 9), we use `n_bins` here.

**Summary**

Testing with up to 10k samples and 10 features (columns), we did not find a significant speed difference between fitting with and without the histogramming technique at these sample sizes. The classification example may, however, already benefit slightly.

Note that the authors demonstrated their more sophisticated version of what is implemented here on 10 million samples.

In [None]:
# Load IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import itertools
import time
import typing as T

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.datasets as sk_datasets
from sklearn import metrics

import random_tree_models.models.xgboost as xgboost

In [None]:
# Seeded two-feature, two-class toy data set used to try out the binning helpers.
X, y = sk_datasets.make_classification(
    n_samples=1_000,
    n_features=2,
    n_classes=2,
    n_redundant=0,
    class_sep=2,
    random_state=42,
)

# Every sample gets the same weight of one (presumably the "h" the helper's
# name refers to -- TODO confirm); the helper returns the binned features
# together with the bin edges it used.
X_hist, all_x_bin_edges = xgboost.xgboost_histogrammify_with_h(
    X,
    np.ones(len(X), dtype=X.dtype),
    n_bins=10,
)

In [None]:
# Histogrammify X a second time, now passing in the precomputed bin edges
# (all_x_bin_edges) that xgboost_histogrammify_with_h returned.
X_hist2 = xgboost.xgboost_histogrammify_with_x_bin_edges(X, all_x_bin_edges)

In [None]:
# Sanity check: binning with the stored edges should reproduce X_hist exactly.
# np.array_equal compares shape as well as values, so a shape mismatch yields
# False instead of depending on how `==` broadcasts.
np.array_equal(X_hist2, X_hist)

## Classification

In [None]:
def get_class_stats(
    use_hist: bool,
    n_bins: int,
    n_samples_arr: T.List[int],
    n_features_arr: T.List[int],
) -> pd.DataFrame:
    """Benchmark ``xgboost.XGBoostClassifier`` over a grid of data set shapes.

    For every combination of ``n_samples`` and ``n_features`` a seeded
    synthetic two-class problem is generated, a ``max_depth=4`` classifier is
    fitted and then scored on the very same (training) data.

    Args:
        use_hist: Forwarded to ``XGBoostClassifier(use_hist=...)``.
        n_bins: Forwarded to ``XGBoostClassifier(n_bins=...)``.
        n_samples_arr: Sample counts to benchmark.
        n_features_arr: Feature counts to benchmark.

    Returns:
        One row per combination with the columns ``n_samples``,
        ``n_features``, ``t_fit`` and ``t_pred`` (both in seconds) and ``auc``
        (ROC AUC on the training data). The columns are present even when the
        grid is empty.
    """
    columns = ["n_samples", "n_features", "t_fit", "t_pred", "auc"]
    rows = []

    for n_samples, n_features in itertools.product(n_samples_arr, n_features_arr):
        X, y = sk_datasets.make_classification(
            n_samples=n_samples,
            n_features=n_features,
            n_classes=2,
            n_redundant=0,
            class_sep=2,
            random_state=42,
        )

        model = xgboost.XGBoostClassifier(use_hist=use_hist, n_bins=n_bins, max_depth=4)

        # perf_counter is monotonic and uses the highest resolution clock
        # available, which matters for the sub-second runs on small inputs;
        # time.time() can jump when the system clock is adjusted.
        t0 = time.perf_counter()
        model.fit(X, y)
        t1 = time.perf_counter()
        y_pred = model.predict_proba(X)
        t2 = time.perf_counter()

        rows.append(
            {
                "n_samples": n_samples,
                "n_features": n_features,
                "t_fit": t1 - t0,
                "t_pred": t2 - t1,
                # Column 1 of predict_proba is used as the positive-class score.
                "auc": metrics.roc_auc_score(y, y_pred[:, 1]),
            }
        )

    # Passing the columns explicitly keeps the schema stable for an empty grid.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Benchmark grid: every n_samples value is combined with every n_features value.
n_samples_arr = [100, 500, 1_000, 5_000, 10_000]
n_features_arr = [2, 5, 10]

In [None]:
# Classification baseline with histogramming switched off.
# NOTE: despite the "_reg_" in its name this frame holds the results of
# get_class_stats, i.e. the classifier benchmark.
execution_stats_reg_vanilla = get_class_stats(
    use_hist=False,
    n_bins=256,
    n_samples_arr=n_samples_arr,
    n_features_arr=n_features_arr,
)

In [None]:
# Classification run with histogramming switched on (256 bins).
# NOTE: despite the "_reg_" in its name this frame holds the results of
# get_class_stats, i.e. the classifier benchmark.
execution_stats_reg_with_hist = get_class_stats(
    use_hist=True,
    n_bins=256,
    n_samples_arr=n_samples_arr,
    n_features_arr=n_features_arr,
)

In [None]:
# Stack both classification runs into one long frame; the "group" column tells
# the plots which variant a row belongs to.
execution_stats_reg = pd.concat(
    [
        frame.assign(group=label)
        for label, frame in (
            ("vanilla", execution_stats_reg_vanilla),
            ("histogram", execution_stats_reg_with_hist),
        )
    ],
    ignore_index=True,
)
execution_stats_reg.head()

In [None]:
# Classification benchmark as a function of n_samples: one panel each for fit
# time, prediction time and ROC AUC; hue = n_features, line style = variant.
fig, axs = plt.subplots(figsize=(12, 9), nrows=3, sharex=True)

# (column to plot, panel title, y-axis label)
panels = [
    ("t_fit", ".fit", "time [s]"),
    ("t_pred", ".predict_proba", "time [s]"),
    ("auc", "score", "score [roc auc]"),
]
for ax, (column, title, ylabel) in zip(axs, panels):
    sns.lineplot(
        data=execution_stats_reg,
        x="n_samples",
        y=column,
        hue="n_features",
        style="group",
        ax=ax,
    )
    ax.set(title=title, ylabel=ylabel)

# The benchmarked model is XGBoostClassifier, not a DecisionTreeClassifier.
plt.suptitle("Time XGBoostClassifier.* took based on n_samples and n_features")
plt.tight_layout()

In [None]:
# Classification benchmark as a function of n_features: one panel each for fit
# time, prediction time and ROC AUC; hue = n_samples, line style = variant.
fig, axs = plt.subplots(figsize=(12, 9), nrows=3)

# (column to plot, panel title, y-axis label) -- labelled like the
# n_samples figure so both figures read the same way.
panels = [
    ("t_fit", ".fit", "time [s]"),
    ("t_pred", ".predict_proba", "time [s]"),
    ("auc", "score", "score [roc auc]"),
]
for ax, (column, title, ylabel) in zip(axs, panels):
    sns.lineplot(
        data=execution_stats_reg,
        x="n_features",
        y=column,
        hue="n_samples",
        style="group",
        ax=ax,
    )
    ax.set(title=title, ylabel=ylabel)

plt.suptitle("Time taken based on n_features")
plt.tight_layout()

## Regression

split score:

* variance

In [None]:
def get_reg_stats(
    use_hist: bool,
    n_bins: int,
    n_samples_arr: T.List[int],
    n_features_arr: T.List[int],
) -> pd.DataFrame:
    """Benchmark ``xgboost.XGBoostRegressor`` over a grid of data set shapes.

    For every combination of ``n_samples`` and ``n_features`` a seeded
    synthetic data set is generated, a ``max_depth=4`` regressor is fitted and
    then scored on the very same (training) data.

    Args:
        use_hist: Forwarded to ``XGBoostRegressor(use_hist=...)``.
        n_bins: Forwarded to ``XGBoostRegressor(n_bins=...)``.
        n_samples_arr: Sample counts to benchmark.
        n_features_arr: Feature counts to benchmark.

    Returns:
        One row per combination with the columns ``n_samples``,
        ``n_features``, ``t_fit`` and ``t_pred`` (both in seconds) and ``mse``
        (mean squared error on the training data). The columns are present
        even when the grid is empty.
    """
    columns = ["n_samples", "n_features", "t_fit", "t_pred", "mse"]
    rows = []

    for n_samples, n_features in itertools.product(n_samples_arr, n_features_arr):
        # NOTE(review): the regression target comes from make_classification,
        # i.e. it is a two-class label -- confirm this is intended rather than
        # a continuous target.
        X, y = sk_datasets.make_classification(
            n_samples=n_samples,
            n_features=n_features,
            n_classes=2,
            n_redundant=0,
            class_sep=2,
            random_state=42,
        )

        model = xgboost.XGBoostRegressor(use_hist=use_hist, n_bins=n_bins, max_depth=4)

        # perf_counter is monotonic and uses the highest resolution clock
        # available, which matters for the sub-second runs on small inputs;
        # time.time() can jump when the system clock is adjusted.
        t0 = time.perf_counter()
        model.fit(X, y)
        t1 = time.perf_counter()
        y_pred = model.predict(X)
        t2 = time.perf_counter()

        rows.append(
            {
                "n_samples": n_samples,
                "n_features": n_features,
                "t_fit": t1 - t0,
                "t_pred": t2 - t1,
                "mse": metrics.mean_squared_error(y, y_pred),
            }
        )

    # Passing the columns explicitly keeps the schema stable for an empty grid.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Benchmark grid for the regression runs (identical values to the grid defined
# for the classification section).
n_samples_arr = [100, 500, 1_000, 5_000, 10_000]
n_features_arr = [2, 5, 10]

In [None]:
# Regression baseline with histogramming switched off.
# NOTE: despite the "_class_" in its name this frame holds the results of
# get_reg_stats, i.e. the regressor benchmark.
execution_stats_class_vanilla = get_reg_stats(
    use_hist=False,
    n_bins=256,
    n_samples_arr=n_samples_arr,
    n_features_arr=n_features_arr,
)

In [None]:
# Regression run with histogramming switched on.
# NOTE: despite the "_class_" in its name this frame holds the results of
# get_reg_stats, i.e. the regressor benchmark.
# NOTE(review): 100 bins here versus 256 in the classification run -- confirm
# the difference is intended.
execution_stats_class_with_hist = get_reg_stats(
    use_hist=True,
    n_bins=100,
    n_samples_arr=n_samples_arr,
    n_features_arr=n_features_arr,
)

In [None]:
# Stack both regression runs into one long frame; the "group" column tells the
# plots which variant a row belongs to.
execution_stats_class = pd.concat(
    [
        frame.assign(group=label)
        for label, frame in (
            ("vanilla", execution_stats_class_vanilla),
            ("histogram", execution_stats_class_with_hist),
        )
    ],
    ignore_index=True,
)
execution_stats_class.head()

In [None]:
# Regression benchmark as a function of n_samples: one panel each for fit
# time, prediction time and MSE; hue = n_features, line style = variant.
fig, axs = plt.subplots(figsize=(12, 9), nrows=3, sharex=True)

# (column to plot, panel title, y-axis label)
panels = [
    ("t_fit", ".fit", "time [s]"),
    # t_pred times the regressor's .predict call; there is no predict_proba here.
    ("t_pred", ".predict", "time [s]"),
    ("mse", "score", "score [mse]"),
]
for ax, (column, title, ylabel) in zip(axs, panels):
    sns.lineplot(
        data=execution_stats_class,
        x="n_samples",
        y=column,
        hue="n_features",
        style="group",
        ax=ax,
    )
    ax.set(title=title, ylabel=ylabel)

# The benchmarked model is XGBoostRegressor, not a DecisionTreeRegressor.
plt.suptitle("Time XGBoostRegressor.* took based on n_samples and n_features")
plt.tight_layout()

In [None]:
# Regression benchmark as a function of n_features: one panel each for fit
# time, prediction time and MSE; hue = n_samples, line style = variant.
fig, axs = plt.subplots(figsize=(12, 9), nrows=3)

# (column to plot, panel title, y-axis label) -- labelled like the
# n_samples figure so both figures read the same way.
panels = [
    ("t_fit", ".fit", "time [s]"),
    ("t_pred", ".predict", "time [s]"),
    ("mse", "score", "score [mse]"),
]
for ax, (column, title, ylabel) in zip(axs, panels):
    sns.lineplot(
        data=execution_stats_class,
        x="n_features",
        y=column,
        hue="n_samples",
        style="group",
        ax=ax,
    )
    ax.set(title=title, ylabel=ylabel)

plt.suptitle("Time taken based on n_features")
plt.tight_layout()