# Final Training - GP data added

Final training on the full padded data. CatBoost has been added, and the GP data has been added in as well. With full template and token coverage, there are a lot of candidates to rank here.

In [None]:
!pip install lightgbm
!pip install sqlalchemy
!pip install catboost
!pip install optuna

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Collecting sqlalchemy
  Downloading sqlalchemy-2.0.43-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting greenlet>=1 (from sqlalchemy)
  Downloading greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading sqlalchemy-2.0.43-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (587 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━

CatBoost takes it again, with a larger margin this time. I think this is the way to go. The next thing to do is to finalise the whole pipeline and do one final training run.

In [None]:
import gc
import sqlite3
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
from typing import Tuple

# Paths to files in Drive
# SQLite database holding the feature tables that the training code reads.
_DB_PATH = "/content/drive/MyDrive/Colab Notebooks/database.db"
# CSV files listing the clean_row_id values held out for validation / test.
# The training code reads their "val_ids" / "test_ids" columns. These are
# relative paths, so they resolve against the current working directory.
_STD_VAL_IDS = "val_std_ids.csv"
_STD_TEST_IDS = "test_std_ids.csv"
_COMP_VAL_IDS = "val_comp_ids.csv"
_COMP_TEST_IDS = "test_comp_ids.csv"

# Tuned hyperparameters
# Passed unchanged as keyword arguments to CatBoostRanker for both the
# standard and the complex model; the iteration count is supplied separately.
_BEST_PARAMS = {
    "loss_function": "YetiRank",
    "eval_metric": "NDCG:top=3",
    "random_seed": 42,
    "learning_rate": 0.13275757957731918,
    "depth": 6,
    "l2_leaf_reg": 7.142519331365267,
    "random_strength": 3.395785387976391,
    "min_data_in_leaf": 84,
    "subsample": 0.9048958560910838,
    "colsample_bylevel": 0.511123337191838,
    "grow_policy": "Lossguide",
}


def _compute_ranking_metrics(
    df: pd.DataFrame, k: int = 3
) -> Tuple[float, float, float]:
    """
    Compute Accuracy@1, Recall@k, and MRR for a ranking prediction dataframe.

    Each distinct ``clean_row_id`` is one query group. Candidates are ranked
    within their group by descending ``score``; tied scores keep their original
    row order, so the result is deterministic. A candidate counts as a hit when
    its ``label`` equals 1 (labels are expected to be binary).

    The computation is fully vectorised and does not use ``groupby.apply``, so
    it does not depend on whether pandas passes the grouping column to the
    applied function.

    Args:
        df (pd.DataFrame): Must contain columns ['clean_row_id', 'score', 'label']
        k (int): The cutoff rank for recall@k

    Returns:
        Tuple[float, float, float]: (Accuracy@1, Recall@k, MRR). All three are
        NaN when ``df`` contains no groups.
    """
    # Stable descending sort: ties keep their original relative order, which
    # matches the "first occurrence wins" behaviour of idxmax / nlargest.
    ranked = df[["clean_row_id", "score", "label"]].sort_values(
        "score", ascending=False, kind="stable"
    )

    n_groups = ranked["clean_row_id"].nunique()
    if n_groups == 0:
        nan = float("nan")
        return nan, nan, nan

    # 1-based position of every candidate inside its own group. Rows are already
    # in score order, so the running count per group is the rank.
    ranked = ranked.assign(
        rank=ranked.groupby("clean_row_id", sort=False).cumcount().to_numpy() + 1
    )

    # Best (smallest) rank of a positive candidate per group. Groups without any
    # positive are absent here and therefore contribute 0 to every metric.
    best_hit_rank = (
        ranked.loc[ranked["label"] == 1]
        .groupby("clean_row_id", sort=False)["rank"]
        .min()
    )

    acc1 = float((best_hit_rank == 1).sum() / n_groups)
    recall_k = float((best_hit_rank <= k).sum() / n_groups)
    mrr = float((1.0 / best_hit_rank).sum() / n_groups)

    return acc1, recall_k, mrr


def _build_ranking_pool(
    df: pd.DataFrame, drop_cols: list
) -> Tuple["Pool", pd.DataFrame]:
    """
    Build a CatBoost ``Pool`` for ranking from a feature dataframe.

    Group ids are derived per row from ``clean_row_id`` (numbered in order of
    first appearance), so they stay correct even when the rows of a group are
    not stored next to each other. In that case the rows are reordered with a
    stable sort so every group becomes contiguous; already-grouped input is
    left in its original order.

    Args:
        df (pd.DataFrame): Data with 'label', 'clean_row_id' and feature columns.
        drop_cols (list): Non-feature columns to remove before building the pool.

    Returns:
        Tuple[Pool, pd.DataFrame]: The pool and the dataframe in the same row
        order as the pool (needed to align predictions with rows).

    Raises:
        ValueError: If 'clean_row_id' contains missing values.
    """
    group_id, _ = pd.factorize(df["clean_row_id"], sort=False)
    if (group_id < 0).any():
        # factorize marks missing keys with -1; such rows belong to no group.
        raise ValueError("clean_row_id contains missing values")

    # Codes are assigned in order of first appearance, so they are
    # non-decreasing exactly when every group is already contiguous.
    if group_id.size > 1 and np.any(np.diff(group_id) < 0):
        order = np.argsort(group_id, kind="stable")
        df = df.iloc[order]
        group_id = group_id[order]

    pool = Pool(
        data=df.drop(columns=drop_cols),
        label=df["label"],
        group_id=group_id,
    )
    return pool, df


def _train_catboost_model(
    parameters: dict,
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    n_rounds: int = 500,
    model_output_path: str = "catboost_model.cbm",
) -> CatBoostRanker:
    """
    Trains a CatBoost ranking model using the provided training and validation data.

    The model is fitted with early stopping on the validation pool, saved to
    ``model_output_path``, and then scored on the validation data (Accuracy@1,
    Recall@3 and MRR are printed).

    Args:
        parameters (dict): Parameters for CatBoostRanker.
        train_df (pd.DataFrame): Training data with label, group info, and features.
        val_df (pd.DataFrame): Validation data with same structure.
        n_rounds (int): Maximum number of boosting rounds.
        model_output_path (str): File path to save the trained CatBoost model.

    Returns:
        CatBoostRanker: Trained CatBoost model.
    """
    drop_cols = ["label", "clean_row_id", "investor", "firm", "template_id"]

    # Train pool. Only the pool is kept; the local reference to the dataframe is
    # dropped right away to keep peak memory down.
    train_pool = _build_ranking_pool(train_df, drop_cols)[0]
    del train_df
    gc.collect()

    # Validation pool. Keep the (possibly reordered) frame so that predictions
    # can be matched to rows afterwards.
    val_pool, val_df = _build_ranking_pool(val_df, drop_cols)
    gc.collect()

    # Train model
    model = CatBoostRanker(iterations=n_rounds, **parameters)
    model.fit(
        train_pool,
        eval_set=val_pool,
        early_stopping_rounds=10,
        verbose=True,
    )

    # Save model
    model.save_model(model_output_path)
    print(f"\nModel saved to: {model_output_path}")

    # Score model. Copy only the columns the metrics need instead of the whole
    # feature matrix; the copy also leaves the caller's dataframe untouched.
    scored = val_df[["clean_row_id", "label"]].copy()
    scored["score"] = model.predict(val_pool)

    acc1, recall3, mrr = _compute_ranking_metrics(scored, k=3)

    print("\nEvaluation Metrics (Validation Set):")
    print(f"Accuracy@1 : {acc1:.4f}")
    print(f"Recall@3   : {recall3:.4f}")
    print(f"MRR        : {mrr:.4f}")

    return model


def train_standard_and_complex_model(
    n_rounds: int = 1000,
) -> None:
    """
    Trains two CatBoost ranking models (standard and complex) on pre-split data and saves them to
    disk.

    For each model the full feature table is read from the SQLite database,
    the validation and test ids are loaded from their CSV files, rows belonging
    to either id set are excluded from training, and the validation rows are
    used for early stopping and evaluation. Test rows are never used here.

    The models are saved in `.cbm` format for compatibility with CatBoost's C++ inference engine.

    Args:
        n_rounds (int): Maximum number of boosting rounds for training (default: 1000).
    """
    import sys
    from contextlib import closing

    # Mount drive (only when running inside Google Colab)
    if "google.colab" in sys.modules:
        from google.colab import drive

        drive.mount("/content/drive")

    def train_model(
        data_table: str, val_ids_path: str, test_ids_path: str, model_path: str
    ):
        """Load one feature table, split it by id lists, and train one model."""
        print(f"Training model: {model_path}")
        # Get ids
        val_ids_set = set(
            pd.read_csv(val_ids_path)["val_ids"].dropna().astype(int).tolist()
        )
        test_ids_set = set(
            pd.read_csv(test_ids_path)["test_ids"].dropna().astype(int).tolist()
        )

        # Get data. sqlite3's own context manager only commits / rolls back the
        # transaction and leaves the connection open, so wrap it in closing()
        # to release the database handle once the table has been read.
        # data_table is one of the fixed table names passed below, not user input.
        with closing(sqlite3.connect(_DB_PATH)) as conn:
            full_df = pd.read_sql_query(f"SELECT * FROM {data_table}", conn)

        # Split the set: validation rows are held out for early stopping, and
        # both validation and test rows are removed from the training data.
        in_val = full_df["clean_row_id"].isin(val_ids_set)
        in_excluded = full_df["clean_row_id"].isin(val_ids_set | test_ids_set)
        val_df = full_df[in_val]
        full_df = full_df[~in_excluded]

        # Train model
        return _train_catboost_model(
            _BEST_PARAMS, full_df, val_df, n_rounds, model_path
        )

    # Start with standard
    train_model("feature_matrix", _STD_VAL_IDS, _STD_TEST_IDS, "std_lightgbm_model.cbm")
    # Then complex
    train_model("feature_matrix_complex", _COMP_VAL_IDS, _COMP_TEST_IDS, "comp_lightgbm_model.cbm")

    # The trained models are persisted to disk by _train_catboost_model and are
    # deliberately not returned.
    return None


# Run the whole pipeline with the default round limit: trains and saves the
# standard model first, then the complex model.
train_standard_and_complex_model()


Mounted at /content/drive
Training model: std_lightgbm_model.cbm
0:	test: 0.9658214	best: 0.9658214 (0)	total: 49.2s	remaining: 13h 38m 40s
1:	test: 0.9658214	best: 0.9658214 (0)	total: 1m 30s	remaining: 12h 29m 22s
2:	test: 0.9714184	best: 0.9714184 (2)	total: 2m 16s	remaining: 12h 33m 41s
3:	test: 0.9720715	best: 0.9720715 (3)	total: 3m 1s	remaining: 12h 34m 39s
4:	test: 0.9721750	best: 0.9721750 (4)	total: 3m 53s	remaining: 12h 53m 13s
5:	test: 0.9724847	best: 0.9724847 (5)	total: 4m 45s	remaining: 13h 9m 38s
6:	test: 0.9727083	best: 0.9727083 (6)	total: 5m 40s	remaining: 13h 25m 8s
7:	test: 0.9729202	best: 0.9729202 (7)	total: 6m 32s	remaining: 13h 31m 20s
8:	test: 0.9729259	best: 0.9729259 (8)	total: 7m 23s	remaining: 13h 33m 48s
9:	test: 0.9728815	best: 0.9729259 (8)	total: 8m 11s	remaining: 13h 30m 22s
10:	test: 0.9729222	best: 0.9729259 (8)	total: 9m 2s	remaining: 13h 32m 42s
11:	test: 0.9729500	best: 0.9729500 (11)	total: 9m 53s	remaining: 13h 34m 30s
12:	test: 0.9730489	best:

  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.9336
Recall@3   : 0.9984
MRR        : 0.9652
Training model: comp_lightgbm_model.cbm
0:	test: 0.6158198	best: 0.6158198 (0)	total: 1m 27s	remaining: 1d 18m 39s
1:	test: 0.6376602	best: 0.6376602 (1)	total: 2m 54s	remaining: 1d 9m 27s
2:	test: 0.6603577	best: 0.6603577 (2)	total: 4m 20s	remaining: 1d 2m 41s
3:	test: 0.6780361	best: 0.6780361 (3)	total: 5m 49s	remaining: 1d 8m 32s
4:	test: 0.6963747	best: 0.6963747 (4)	total: 7m 11s	remaining: 23h 51m 4s
5:	test: 0.6959076	best: 0.6963747 (4)	total: 8m 37s	remaining: 23h 47m 35s
6:	test: 0.7388896	best: 0.7388896 (6)	total: 10m 2s	remaining: 23h 43m 42s
7:	test: 0.7450297	best: 0.7450297 (7)	total: 11m 26s	remaining: 23h 38m 18s
8:	test: 0.7474377	best: 0.7474377 (8)	total: 12m 50s	remaining: 23h 34m 7s
9:	test: 0.7575798	best: 0.7575798 (9)	total: 14m 12s	remaining: 23h 26m 59s
10:	test: 0.7776858	best: 0.7776858 (10)	total: 15m 34s	remaining: 23h 20m 51s
11:	test: 0.7792509	best: 0.7

  topk = df.groupby("clean_row_id", group_keys=False).apply(



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.7609
Recall@3   : 0.9729
MRR        : 0.8664


  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()
