# Final Training

Final training on full padded data. Doing LightGBM because i couldn't get the catboost API to build in CPP.

In [1]:
!pip install lightgbm
!pip install sqlalchemy
!pip install catboost
!pip install optuna

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.6/3.6 MB[0m [31m115.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m84.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Collecting sqlalchemy
  Downloading sqlalchemy-2.0.41-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting greenlet>=1 (from sqlalchemy)
  Downloading greenlet-3.2.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading sqlalchemy-2.0.41-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m

Cat boost takes it again, with a larger margin this time. Think this is the way to go. Next thing to do is to finalise the whole pipeline and do one final train.

In [2]:
import gc
import sqlite3
import pandas as pd
import numpy as np
import lightgbm as lgb
from typing import Tuple

# Paths to files in Drive
_DB_PATH = "/content/drive/MyDrive/Colab Notebooks/database.db"
_STD_VAL_IDS = "val_std_ids.csv"
_STD_TEST_IDS = "test_std_ids.csv"
_COMP_VAL_IDS = "val_comp_ids.csv"
_COMP_TEST_IDS = "test_comp_ids.csv"

# Tuned hyperparameters
_BEST_PARAMS =  {
    "objective": "lambdarank",
    "metric": ["ndcg"],
    "eval_at": [1, 3],
    "boosting_type": "gbdt",
    "verbosity": -1,
    "force_row_wise": True,
    "learning_rate": 0.19426829578921662,
    "num_leaves": 64,
    "min_data_in_leaf": 75,
    "feature_fraction": 0.9047338201190456,
    "bagging_fraction": 0.9624860112209651,
    "lambda_l": 0.9753561860776749,
    "lambda_l2": 4.367377782782343,
    "bagging_freq": 3,
    "max_depth": 10
}


def _compute_ranking_metrics(df: pd.DataFrame, k: int = 3):
    """
    Compute Accuracy@1, Recall@k, and MRR for a ranking prediction dataframe.

    Args:
        df (pd.DataFrame): Must contain columns ['clean_row_id', 'score', 'label']
        k (int): The cutoff rank for recall@k

    Returns:
        Tuple[float, float, float]: (Accuracy@1, Recall@k, MRR)
    """
    # Accuracy@1
    top1 = df.loc[df.groupby("clean_row_id")["score"].idxmax()]
    acc1 = (top1["label"] == 1).mean()

    # Recall@k
    topk = df.groupby("clean_row_id", group_keys=False).apply(
        lambda g: g.nlargest(k, "score")
    )
    recall_k = topk.groupby("clean_row_id")["label"].max().mean()

    # MRR
    def reciprocal_rank(g: pd.DataFrame) -> float:
        labels_sorted = g.sort_values("score", ascending=False)["label"].to_numpy()
        for rank, label in enumerate(labels_sorted, start=1):
            if label == 1:
                return 1.0 / rank
        return 0.0

    mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()

    return acc1, recall_k, mrr


def _train_lightgbm_model(
    parameters: dict,
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    n_rounds: int = 500,
    model_output_path: str = "lgihtgbm_model.txt",
    lr_decay_gamma: float = 0.95,
) :
    """
    Trains a LightGbm ranking model using the provided training and validation data.

    Args:
        parameters (dict): Parameters for CatBoostRanker.
        train_df (pd.DataFrame): Training data with label, group info, and features.
        val_df (pd.DataFrame): Validation data with same structure.
        n_rounds (int): Maximum number of boosting rounds.
        model_output_path (str): File path to save the trained CatBoost model.
        CatBoostRanker: Trained CatBoost model.
    """
    drop_cols = ["label", "clean_row_id", "investor", "firm", "template_id"]

    # Learning rate schedule
    def lr_decay(current_round):
        return parameters["learning_rate"] * (lr_decay_gamma ** current_round)

    # Train
    train_group_sizes = train_df.groupby("clean_row_id", sort=False).size().tolist()

    X_train = train_df.drop(columns=drop_cols)
    print(len(X_train.columns))
    y_train = train_df["label"]
    lgb_train = lgb.Dataset(X_train, label=y_train, group=train_group_sizes, free_raw_data=False)
    del train_df  # Free memory early
    del X_train, y_train  # Free memory
    gc.collect()  # Call garbage collector to be extra sure

    # Validation
    val_group_sizes = val_df.groupby("clean_row_id", sort=False).size().tolist()
    print(val_group_sizes)

    X_val = val_df.drop(columns=drop_cols)
    y_val = val_df["label"]

    lgb_val = lgb.Dataset(X_val, label=y_val, group=val_group_sizes, free_raw_data=False)

    # Train model
    model = lgb.train(
        params=parameters,
        train_set=lgb_train,
        num_boost_round=n_rounds,
        valid_sets=[lgb_train, lgb_val],
        valid_names=["train", "val"],
        callbacks=[
            lgb.reset_parameter(learning_rate=lr_decay),
            lgb.early_stopping(stopping_rounds=500),
            lgb.log_evaluation(period=1)
        ]
    )

    # Save model
    model.save_model(model_output_path)
    print(f"\nModel saved to: {model_output_path}")

    # Score model
    preds = model.predict(X_val, num_iteration=model.best_iteration)
    val_df["score"] = preds

    acc1, recall3, mrr = _compute_ranking_metrics(val_df, k=3)

    del X_val, y_val  # Free memory
    gc.collect()  # Call garbage collector to be extra sure

    print("\nEvaluation Metrics (Validation Set):")
    print(f"Accuracy@1 : {acc1:.4f}")
    print(f"Recall@3   : {recall3:.4f}")
    print(f"MRR        : {mrr:.4f}")

    return model


def train_standard_and_complex_model(
    n_rounds: int = 1000,
):
    """
    Trains two LightGBM ranking models (standard and complex) on pre-split data and saves them to
    disk.

    The models are saved in `.cbm` format for compatibility with CatBoost's C++ inference engine.

    Args:
        n_rounds (int): Maximum number of boosting rounds for training (default: 1000).
    """
    # Mount drive
    import sys

    if "google.colab" in sys.modules:
        from google.colab import drive

        drive.mount("/content/drive")

    def train_model(
        data_table: str, val_ids_path: str, test_ids_path: str, model_path: str
    ):
        print(f"Training model: {model_path}")
        # Get ids
        val_ids = (
            pd.read_csv(val_ids_path)["val_ids"].dropna().astype(int).tolist()
        )
        test_ids = (
            pd.read_csv(test_ids_path)["test_ids"].dropna().astype(int).tolist()
        )

        # Get data
        with sqlite3.connect(_DB_PATH) as conn:
            full_df = pd.read_sql_query(f"SELECT * FROM {data_table}", conn)

        # Split the set
        val_ids_set = set(val_ids)
        test_ids_set = set(test_ids)
        excluded_ids = val_ids_set | test_ids_set
        val_df = full_df[full_df["clean_row_id"].isin(val_ids_set)]
        full_df = full_df[~full_df["clean_row_id"].isin(excluded_ids)]

        # Train model
        return _train_lightgbm_model(
            _BEST_PARAMS, full_df, val_df, n_rounds, model_path
        )

    # Start with standard
    std_model = train_model("feature_matrix", _STD_VAL_IDS, _STD_TEST_IDS, "std_lightgbm_model.txt")
    # Then complex
    comp_model = train_model("feature_matrix_complex", _COMP_VAL_IDS, _COMP_TEST_IDS, "comp_lightgbm_model.txt")

    # return std_model, comp_model
    return comp_model


train_standard_and_complex_model()


Mounted at /content/drive
Training model: std_lightgbm_model.txt
27
[84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,

  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.9254
Recall@3   : 0.9965
MRR        : 0.9604
Training model: comp_lightgbm_model.txt
27
[358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358, 358

  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.7762
Recall@3   : 0.9531
MRR        : 0.8666


<lightgbm.basic.Booster at 0x781d6b85fd90>