# Random Forest Boosting
## Classification and regression

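This notebook demonstrates how to use `RFGBoost` to perform gradient boosting with random forests as base learners, and compares it with HistGradientBoosting, CatBoost, and XGBoost on regression and classification tasks.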

Author: https://www.github.com/deburky

In [1]:
from cmap import Colormap

# Colormap identifiers understood by `cmap.Colormap`, and their matplotlib
# equivalents: cm1 is used for the regression plots, cm2 for classification.
cmap1, cmap2 = "colorcet:cet_l19", "chrisluts:ygc_3c"
cm1, cm2 = Colormap(cmap1).to_mpl(), Colormap(cmap2).to_mpl()

In [2]:
from rfgboost import RFGBoost
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import (
    HistGradientBoostingClassifier,
    HistGradientBoostingRegressor,
)
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import train_test_split

# Synthetic data: one regression problem and one classification problem,
# each with 1000 samples and 10 features.
X_reg, y_reg = make_regression(
    n_samples=1000,
    n_features=10,
    noise=0.1,
    random_state=42,
)
X_class, y_class = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    random_state=42,
)

# Hold out 20% of each dataset for evaluation; both splits use the same seed.
split_kwargs = {"test_size": 0.2, "random_state": 42}
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, **split_kwargs
)
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class, **split_kwargs
)

# RFGBoost: 10 boosting rounds, each round using a random forest configured by
# `rf_params` (50 trees, depth 5). Every model receives its own copy of the
# forest settings so the two instances never share a dict.
rf_base_params = {"n_estimators": 50, "max_depth": 5}

# Regression
rf_boost_reg = RFGBoost(
    n_estimators=10,
    rf_params=dict(rf_base_params),
    task="regression",
    learning_rate=0.1,
)
rf_boost_reg.fit(X_train_reg, y_train_reg)
y_pred_reg = rf_boost_reg.predict(X_test_reg)

# Classification
rf_boost_class = RFGBoost(
    n_estimators=10,
    rf_params=dict(rf_base_params),
    task="classification",
    learning_rate=0.1,
)
rf_boost_class.fit(X_train_class, y_train_class)
y_pred_class = rf_boost_class.predict(X_test_class)

# HistGradientBoosting baselines, configured identically for both tasks
hgb_params = {
    "max_iter": 10,
    "learning_rate": 0.1,
    "max_depth": 5,
    "random_state": 42,
}

hgb_reg = HistGradientBoostingRegressor(**hgb_params)
hgb_reg.fit(X_train_reg, y_train_reg)
y_pred_hgb_reg = hgb_reg.predict(X_test_reg)

hgb_class = HistGradientBoostingClassifier(**hgb_params)
hgb_class.fit(X_train_class, y_train_class)
# Keep only the second probability column (index 1) for the log-loss comparison
hgb_class_proba = hgb_class.predict_proba(X_test_class)
y_pred_hgb_class = hgb_class_proba[:, 1]

# Test-set metrics: MSE for the regressors, log loss for the classifiers
mse_rf_boost_reg, mse_hgb_reg = (
    mean_squared_error(y_test_reg, y_pred_reg),
    mean_squared_error(y_test_reg, y_pred_hgb_reg),
)
logloss_rf_boost_class, logloss_hgb_class = (
    log_loss(y_test_class, y_pred_class),
    log_loss(y_test_class, y_pred_hgb_class),
)

# Report both models side by side for each task
print(
    f"Regression\nRFGBoost MSE: {mse_rf_boost_reg:.4f}\nHistGradientBoosting MSE: {mse_hgb_reg:.4f}"
)
print(
    f"\nClassification\nRFGBoost Log Loss: {logloss_rf_boost_class:.4f}\nHistGradientBoosting Log Loss: {logloss_hgb_class:.4f}"
)

Regression
RFGBoost MSE: 6009.7267
HistGradientBoosting MSE: 6667.1591

Classification
RFGBoost Log Loss: 0.2989
HistGradientBoosting Log Loss: 0.3254


In [3]:
import catboost as cb

# Settings shared by both CatBoost baselines (10 rounds, depth 5, silent,
# seeded, and not writing training artefacts to disk).
cb_params = {
    "iterations": 10,
    "learning_rate": 0.1,
    "depth": 5,
    "verbose": 0,
    "random_state": 42,
    "allow_writing_files": False,
}

# Regression: fit, predict on the held-out set, score with MSE
cb_reg = cb.CatBoostRegressor(**cb_params)
cb_reg.fit(X_train_reg, y_train_reg)
y_pred_cb_reg = cb_reg.predict(X_test_reg)
mse_cb_reg = mean_squared_error(y_test_reg, y_pred_cb_reg)

# Classification: fit, take probability column 1, score with log loss
cb_class = cb.CatBoostClassifier(**cb_params)
cb_class.fit(X_train_class, y_train_class)
y_pred_cb_class = cb_class.predict_proba(X_test_class)[:, 1]
logloss_cb_class = log_loss(y_test_class, y_pred_cb_class)

print(
    f"CatBoost\nRegression MSE: {mse_cb_reg:.4f}\nClassification Log Loss: {logloss_cb_class:.4f}"
)

import xgboost as xgb

# Settings shared by both XGBoost baselines
xgb_params = {
    "n_estimators": 10,
    "learning_rate": 0.1,
    "max_depth": 5,
    "random_state": 42,
}

# Regression: fit, predict on the held-out set, score with MSE
xgb_reg = xgb.XGBRegressor(**xgb_params)
xgb_reg.fit(X_train_reg, y_train_reg)
y_pred_xgb_reg = xgb_reg.predict(X_test_reg)
mse_xgb_reg = mean_squared_error(y_test_reg, y_pred_xgb_reg)

# Classification: fit, take probability column 1, score with log loss
xgb_class = xgb.XGBClassifier(**xgb_params)
xgb_class.fit(X_train_class, y_train_class)
y_pred_xgb_class = xgb_class.predict_proba(X_test_class)[:, 1]
logloss_xgb_class = log_loss(y_test_class, y_pred_xgb_class)

print(
    f"\nXGBoost\nRegression MSE: {mse_xgb_reg:.4f}\nClassification Log Loss: {logloss_xgb_class:.4f}"
)

CatBoost
Regression MSE: 7140.9811
Classification Log Loss: 0.4020

XGBoost
Regression MSE: 6229.6563
Classification Log Loss: 0.3143


## Regression and classification visualization

### Regression

In [4]:
import catboost as cb
import imageio.v3 as iio
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Global plot styling for the animation frames
plt.rcParams.update({"font.family": "Arial", "font.size": 13})

# Two-feature regression problem, so model predictions can be drawn as a
# filled contour over the feature plane.
X_reg, y_reg = make_regression(
    n_samples=500,
    n_features=2,
    noise=0.5,
    random_state=42,
)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# 200x200 evaluation grid spanning the data range padded by 1 on every side
feature_0, feature_1 = X_reg[:, 0], X_reg[:, 1]
x_min, x_max = feature_0.min() - 1, feature_0.max() + 1
y_min, y_max = feature_1.min() - 1, feature_1.max() + 1
grid_x = np.linspace(x_min, x_max, 200)
grid_y = np.linspace(y_min, y_max, 200)
xx, yy = np.meshgrid(grid_x, grid_y)

# Number of boosting rounds to animate (kept small for faster computation)
n_iterations = 50

# One estimator per library; CatBoost is created inside the animation loop,
# so it only has a placeholder here.
rf_boost_reg = RFGBoost(
    n_estimators=n_iterations,
    rf_params={"n_estimators": 20, "max_depth": 5},
    task="regression",
    learning_rate=0.1,
)
hgb_reg = HistGradientBoostingRegressor(
    max_iter=n_iterations,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
cb_reg = None
xgb_reg = xgb.XGBRegressor(
    n_estimators=n_iterations,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)

# (panel title, estimator) pairs in subplot order
models = [
    ("RFGBoost", rf_boost_reg),
    ("HistGradientBoosting", hgb_reg),
    ("CatBoost", None),
    ("XGBoost", xgb_reg),
]

frames = []

# The evaluation grid is the same for every model and every frame, so flatten
# it into an (n_points, 2) feature matrix once instead of rebuilding it each time.
grid_points = np.c_[xx.ravel(), yy.ravel()]

# Render one 2x2 frame per boosting iteration: each panel shows one model's
# prediction surface after `iteration` boosting rounds, plus its test MSE.
for iteration in range(1, n_iterations + 1):
    fig, axs = plt.subplots(2, 2, figsize=(10, 8), dpi=120)  # 2x2 grid
    fig.subplots_adjust(
        top=0.85,  # leave room for the suptitle
        bottom=0.08,  # move subplot grid down a little
        left=0.1,
        right=0.9,
        wspace=0.3,
        hspace=0.3,
    )
    # The title belongs to the figure, so set it once per frame rather than
    # once per panel.
    fig.suptitle(
        f"Regression (Boosting Iteration {iteration})", fontsize=25, y=0.96
    )
    axs = axs.ravel()  # Flatten axes for easier access

    for idx, (name, model) in enumerate(models):
        # Bring each model to exactly `iteration` boosting rounds. Every
        # branch fits the model itself, so no separate full-size fit is needed
        # on the first iteration.
        if name == "CatBoost":
            if iteration == 1:
                cb_reg = cb.CatBoostRegressor(
                    iterations=1,
                    learning_rate=0.1,
                    depth=5,
                    verbose=0,
                    random_state=42,
                    allow_writing_files=False,
                )
                cb_reg.fit(X_train_reg, y_train_reg)
            else:
                # Continue from the previously fitted model; with iterations=1
                # each call adds one more boosting round.
                cb_reg.fit(X_train_reg, y_train_reg, init_model=cb_reg)
            model = cb_reg
        elif name == "RFGBoost":
            # RFGBoost is rebuilt from scratch with `iteration` rounds. The
            # notebook-level name is rebound so that, after the loop, it refers
            # to the fitted model with `n_iterations` rounds.
            rf_boost_reg = RFGBoost(
                n_estimators=iteration,
                rf_params={"n_estimators": 20, "max_depth": 5},
                task="regression",
                learning_rate=0.1,
            )
            rf_boost_reg.fit(X_train_reg, y_train_reg)
            model = rf_boost_reg
        elif name == "HistGradientBoosting":
            model.set_params(max_iter=iteration)
            model.fit(X_train_reg, y_train_reg)
        elif name == "XGBoost":
            model.set_params(n_estimators=iteration)
            model.fit(X_train_reg, y_train_reg)
        elif iteration == 1:
            # Any other estimator has no per-iteration control here: fit it
            # once and reuse it for every frame.
            model.fit(X_train_reg, y_train_reg)

        ax = axs[idx]

        # Prediction surface over the grid, with the training points on top
        preds = model.predict(grid_points).reshape(xx.shape)
        ax.contourf(xx, yy, preds, alpha=1.0, levels=10, cmap=cm1)
        ax.scatter(
            X_train_reg[:, 0],
            X_train_reg[:, 1],
            c=y_train_reg,
            edgecolor="k",
            cmap=cm1,
            alpha=0.8,
        )
        ax.set_title(f"{name}")

        # Annotate the panel with the held-out MSE
        mse = mean_squared_error(y_test_reg, model.predict(X_test_reg))
        ax.text(0.02, 0.03, f"MSE: {mse:.4f}", transform=ax.transAxes, fontsize=12)
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)

    # Save frame for GIF. buffer_rgba() already exposes the rendered canvas as
    # a (height, width, 4) buffer, so copy it and drop the alpha channel; no
    # reshape from get_width_height() is needed (that size can disagree with
    # the buffer when the canvas device pixel ratio is not 1).
    fig.canvas.draw()
    image = np.array(fig.canvas.buffer_rgba())[:, :, :3]
    frames.append(image)
    plt.close(fig)  # release the figure so 50 frames do not pile up in memory

# Save the animation as a GIF
iio.imwrite("regression_animation_grid.gif", frames, fps=3)

### Classification

In [5]:
import catboost as cb
import imageio.v3 as iio
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

plt.rcParams["font.family"] = "Arial"
plt.rcParams.update({"font.size": 13})

# Synthetic 2D classification problem: points drawn uniformly from
# [-3, 3) x [-3, 3), labelled 1 when they lie above the curve y = sin(x).
# A seeded generator keeps the dataset identical across runs.
rng = np.random.default_rng(42)
X = rng.random((500, 2)) * 6 - 3
y = (X[:, 1] > np.sin(X[:, 0])).astype(int)
X_class, y_class = X, y

# Hold out 20% of the points for the per-frame log-loss annotation
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42
)

# Create meshgrid for decision boundary (data range padded by 1 on every side)
x_min, x_max = X_class[:, 0].min() - 1, X_class[:, 0].max() + 1
y_min, y_max = X_class[:, 1].min() - 1, X_class[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 500), np.linspace(y_min, y_max, 500))

# Models and parameters
n_iterations = 50  # Reduced iterations for efficiency
rf_boost_class = RFGBoost(
    n_estimators=n_iterations,
    rf_params={"n_estimators": 20, "max_depth": 5},
    task="classification",
    learning_rate=0.1,
)
hgb_class = HistGradientBoostingClassifier(
    max_iter=n_iterations, learning_rate=0.1, max_depth=5, random_state=42
)
cb_class = None  # CatBoost is created inside the animation loop
xgb_class = xgb.XGBClassifier(
    n_estimators=n_iterations, learning_rate=0.1, max_depth=5, random_state=42
)

# (panel title, estimator) pairs in subplot order
models = [
    ("RFGBoost", rf_boost_class),
    ("HistGradientBoosting", hgb_class),
    ("CatBoost", None),  # Placeholder for CatBoost
    ("XGBoost", xgb_class),
]

frames = []

# The evaluation grid is the same for every model and every frame, so flatten
# it into an (n_points, 2) feature matrix once instead of rebuilding it each time.
grid_points = np.c_[xx.ravel(), yy.ravel()]

# Render one 2x2 frame per boosting iteration: each panel shows one model's
# class-1 score surface after `iteration` boosting rounds, plus its test log loss.
for iteration in range(1, n_iterations + 1):
    fig, axs = plt.subplots(2, 2, figsize=(10, 8), dpi=120)  # 2x2 grid
    fig.subplots_adjust(
        top=0.85,  # leave room for the suptitle
        bottom=0.08,  # move subplot grid down a little
        left=0.1,
        right=0.9,
        wspace=0.3,
        hspace=0.3,
    )
    # The title belongs to the figure, so set it once per frame rather than
    # once per panel.
    fig.suptitle(
        f"Classification (Boosting Iteration {iteration})", fontsize=25, y=0.96
    )
    axs = axs.ravel()  # Flatten axes for easier access

    for idx, (name, model) in enumerate(models):
        # Bring each model to exactly `iteration` boosting rounds. Every
        # branch fits the model itself, so no separate full-size fit is needed
        # on the first iteration.
        if name == "CatBoost":
            if iteration == 1:
                cb_class = cb.CatBoostClassifier(
                    iterations=1,
                    learning_rate=0.1,
                    depth=5,
                    verbose=0,
                    random_state=42,
                    allow_writing_files=False,
                )
                cb_class.fit(X_train_class, y_train_class)
            else:
                # Continue from the previously fitted model; with iterations=1
                # each call adds one more boosting round.
                cb_class.fit(X_train_class, y_train_class, init_model=cb_class)
            model = cb_class
        elif name == "RFGBoost":
            # RFGBoost is rebuilt from scratch with `iteration` rounds. The
            # notebook-level name is rebound so that, after the loop, it refers
            # to the fitted model with `n_iterations` rounds.
            rf_boost_class = RFGBoost(
                n_estimators=iteration,
                rf_params={"n_estimators": 20, "max_depth": 5},
                task="classification",
                learning_rate=0.1,
            )
            rf_boost_class.fit(X_train_class, y_train_class)
            model = rf_boost_class
        elif name == "HistGradientBoosting":
            model.set_params(max_iter=iteration)
            model.fit(X_train_class, y_train_class)
        elif name == "XGBoost":
            model.set_params(n_estimators=iteration)
            model.fit(X_train_class, y_train_class)
        elif iteration == 1:
            # Any other estimator has no per-iteration control here: fit it
            # once and reuse it for every frame.
            model.fit(X_train_class, y_train_class)

        # Class-1 scores on the grid and on the held-out set. For RFGBoost the
        # output of predict() is used directly; the other models expose the
        # score as the second predict_proba column.
        if name == "RFGBoost":
            grid_scores = model.predict(grid_points)
            test_scores = model.predict(X_test_class)
        else:
            grid_scores = model.predict_proba(grid_points)[:, 1]
            test_scores = model.predict_proba(X_test_class)[:, 1]
        preds = grid_scores.reshape(xx.shape)

        ax = axs[idx]

        # Score surface over the grid, with the training points on top
        ax.contourf(xx, yy, preds, alpha=0.5, levels=10, cmap=cm2)
        ax.scatter(
            X_train_class[:, 0],
            X_train_class[:, 1],
            c=y_train_class,
            edgecolor="k",
            cmap=cm2,
            alpha=0.6,
        )
        ax.set_title(f"{name}")

        # Annotate the panel with the held-out log loss
        logloss = log_loss(y_test_class, test_scores)
        ax.text(
            0.02,
            0.03,
            f"Log Loss: {logloss:.4f}",
            transform=ax.transAxes,
            fontsize=12,
        )
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)

    # Save frame for GIF. buffer_rgba() already exposes the rendered canvas as
    # a (height, width, 4) buffer, so copy it and drop the alpha channel; no
    # reshape from get_width_height() is needed (that size can disagree with
    # the buffer when the canvas device pixel ratio is not 1).
    fig.canvas.draw()
    image = np.array(fig.canvas.buffer_rgba())[:, :, :3]
    frames.append(image)
    plt.close(fig)  # release the figure so 50 frames do not pile up in memory

# Save the animation as a GIF
iio.imwrite("classification_animation_grid.gif", frames, fps=3)