# catboost-incremental

https://www.github.com/xRiskLab/catboost-incremental

In [None]:
import time
import io
import pyarrow.dataset as ds
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
import imageio.v3 as iio
from matplotlib.ticker import FuncFormatter

from catboost_incremental import DataLoader, CatBoostTrainer

# Plot typography: Virgil GS typeface at 14 pt, applied in a single update
plt.rcParams.update(
    {
        "font.family": "Virgil GS",
        "font.size": 14,
    }
)

# Name of the column handed to the loader as label_col
label = "target"

# Read the whole dataset at dataset_path into a pandas DataFrame
dataset_path = "../data/"
dataset = ds.dataset(dataset_path)
full_df = dataset.to_table().to_pandas()

# Build the DataLoader over the same path, with its options gathered in one place
loader_options = {
    "chunk_size": 1000,
    "partition_id_col": "partition_id",
    "label_col": label,
}
data_loader = DataLoader(dataset_path, **loader_options)

# Set up the trainer; log_loss is supplied as its metric function
trainer = CatBoostTrainer(
    data_loader=data_loader,
    label_col=label,
    model_config=dict(
        iterations=10,
        learning_rate=0.01,
        verbose=0,
        allow_writing_files=False,
    ),
    metric_fn=log_loss,
)

# Run training and report the wall-clock time it took
start_time = time.perf_counter()
model = trainer.train()
end_time = time.perf_counter()
elapsed_seconds = end_time - start_time
print(f"Training time: {elapsed_seconds:.2f} seconds")

# Score the trained model against the full DataFrame
score = trainer.evaluate(full_df)
print(f"Final Log Loss on Full Data: {score:.4f}")

# Prepare the data for the expanding-window animation.
#   x: sample count plotted on the x-axis, derived as (chunk_index + 1) * chunk_size
#      (assumes chunk_index is 0-based and every chunk is full-sized — TODO confirm)
#   y: the score recorded in training_stats for each chunk
chunk_size = 1000  # Must match the chunk_size passed to DataLoader
x = [(i + 1) * chunk_size for i in trainer.training_stats["chunk_index"]]
y = trainer.training_stats["score"]

# Rendered frames are collected here, one image per chunk
frames = []

def _format_thousands(x_val, _pos):
    """Render an x-axis tick value as whole thousands, e.g. 3000 -> "3k"."""
    return f"{int(x_val/1000)}k"


x_start = 0

# The y-range is identical for every frame, so compute it once instead of
# rescanning y on each iteration. Skipped when there is nothing to draw.
y_limits = (min(y) * 0.95, max(y) * 1.25) if len(x) > 0 else None

# Render one frame per chunk; frame i shows the first i points, so the curve
# grows from left to right as the animation plays.
for i in range(1, len(x) + 1):
    fig, ax = plt.subplots(figsize=(8, 6), dpi=150)
    try:
        ax.plot(x[:i], y[:i], linewidth=2, color="#0b99ff")

        # Keep one chunk of empty space to the right of the newest point
        x_end = x[i - 1] + chunk_size
        ax.set_xlim(x_start, x_end)

        ax.set_ylim(*y_limits)
        ax.set_title("Incremental Training of CatBoost")
        ax.set_xlabel("Samples Seen")
        ax.set_ylabel("Log Loss")
        ax.grid(False)

        # A fresh formatter per axes; only the formatting function is shared
        formatter = FuncFormatter(_format_thousands)
        ax.xaxis.set_major_formatter(formatter)

        # Disable spines
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)

        ax.legend(["Log Loss"], loc="upper right", fontsize=14)

        # Render this specific figure to an in-memory PNG and decode it into
        # an image array for the GIF writer.
        buf = io.BytesIO()
        fig.savefig(buf, format="png", dpi=150)
        buf.seek(0)
        image = iio.imread(buf)
        frames.append(image)
    finally:
        # Always release the figure, even if rendering fails part-way
        plt.close(fig)

# Write the collected frames out as an animated GIF.
# NOTE(review): duration and loop are forwarded to the GIF writer. With the
# imageio v3 Pillow plugin, duration is presumably interpreted in milliseconds,
# which would make 0.01 effectively zero — confirm the intended frame delay.
gif_path = "log_loss_over_chunks.gif"
iio.imwrite(gif_path, frames, duration=0.01, loop=1)
print(f"🎞️ Saved animation to {gif_path}")