# Final Training - GP data added, pruned templates - no ultra rares.

Real final training, for real this time.

In [None]:
!pip install lightgbm
!pip install sqlalchemy
!pip install catboost
!pip install optuna

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Collecting sqlalchemy
  Downloading sqlalchemy-2.0.43-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting greenlet>=1 (from sqlalchemy)
  Downloading greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading sqlalchemy-2.0.43-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (607 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━

In [None]:
import gc
import sqlite3
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
from typing import Tuple

# Paths to files in Drive
# NOTE(review): only _DB_PATH is an absolute Drive path. The four id CSVs below are bare
# filenames, so pd.read_csv resolves them against the current working directory —
# confirm they are present there before running.
# The validation CSVs must expose a "val_ids" column and the test CSVs a "test_ids"
# column; those are the columns read by train_standard_and_complex_model.
_DB_PATH = "/content/drive/MyDrive/Colab Notebooks/database.db"
_STD_VAL_IDS = "val_std_ids.csv"
_STD_TEST_IDS = "test_std_ids.csv"
_COMP_VAL_IDS = "val_comp_ids.csv"
_COMP_TEST_IDS = "test_comp_ids.csv"

# Tuned hyperparameters
# Forwarded as keyword arguments to CatBoostRanker for both the standard and the complex
# model; the number of iterations is supplied separately by the training function.
_BEST_PARAMS = {
    "loss_function": "YetiRank",
    "eval_metric": "NDCG:top=3",
    "random_seed": 42,
    "learning_rate": 0.13275757957731918,
    "depth": 6,
    "l2_leaf_reg": 7.142519331365267,
    "random_strength": 3.395785387976391,
    "min_data_in_leaf": 84,
    "subsample": 0.9048958560910838,
    "colsample_bylevel": 0.511123337191838,
    "grow_policy": "Lossguide",
}


def _compute_ranking_metrics(df: pd.DataFrame, k: int = 3):
    """
    Compute Accuracy@1, Recall@k, and MRR for a ranking prediction dataframe.

    Args:
        df (pd.DataFrame): Must contain columns ['clean_row_id', 'score', 'label']
        k (int): The cutoff rank for recall@k

    Returns:
        Tuple[float, float, float]: (Accuracy@1, Recall@k, MRR)
    """
    # Accuracy@1
    top1 = df.loc[df.groupby("clean_row_id")["score"].idxmax()]
    acc1 = (top1["label"] == 1).mean()

    # Recall@k
    topk = df.groupby("clean_row_id", group_keys=False).apply(
        lambda g: g.nlargest(k, "score")
    )
    recall_k = topk.groupby("clean_row_id")["label"].max().mean()

    # MRR
    def reciprocal_rank(g: pd.DataFrame) -> float:
        labels_sorted = g.sort_values("score", ascending=False)["label"].to_numpy()
        for rank, label in enumerate(labels_sorted, start=1):
            if label == 1:
                return 1.0 / rank
        return 0.0

    mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()

    return acc1, recall_k, mrr


def _train_catboost_model(
    parameters: dict,
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    n_rounds: int = 500,
    model_output_path: str = "catboost_model.cbm",
) -> CatBoostRanker:
    """
    Trains a CatBoost ranking model using the provided training and validation data.

    Both frames must contain the columns 'label', 'clean_row_id', 'investor', 'firm'
    and 'template_id'; every remaining column is used as a feature. Rows sharing a
    'clean_row_id' form one ranking group.

    After fitting (early stopping after 10 rounds without improvement on the
    validation pool), the model is written to ``model_output_path`` and
    Accuracy@1, Recall@3 and MRR on the validation set are printed.

    Args:
        parameters (dict): Parameters for CatBoostRanker.
        train_df (pd.DataFrame): Training data with label, group info, and features.
        val_df (pd.DataFrame): Validation data with same structure.
        n_rounds (int): Maximum number of boosting rounds.
        model_output_path (str): File path to save the trained CatBoost model.

    Returns:
        CatBoostRanker: Trained CatBoost model.
    """
    drop_cols = ["label", "clean_row_id", "investor", "firm", "template_id"]

    def build_pool(frame: pd.DataFrame) -> Pool:
        # Derive the group id from the key itself (codes follow first appearance).
        # For contiguous groups this equals repeating 0..n-1 by group size, but it
        # never attaches a row to the wrong group when a clean_row_id is split
        # across non-adjacent rows.
        # NOTE(review): CatBoost documents that rows must be grouped by group id;
        # a non-contiguous layout is expected to be rejected there — confirm.
        group_id, _ = pd.factorize(frame["clean_row_id"])
        return Pool(
            data=frame.drop(columns=drop_cols),
            label=frame["label"],
            group_id=group_id,
        )

    # Train
    train_pool = build_pool(train_df)
    del train_df  # Drop the local reference; the features now live in the pool
    gc.collect()  # Call garbage collector to be extra sure

    # Validation
    val_pool = build_pool(val_df)
    gc.collect()  # Call garbage collector to be extra sure

    # Train model
    model = CatBoostRanker(iterations=n_rounds, **parameters)
    model.fit(
        train_pool,
        eval_set=val_pool,
        early_stopping_rounds=10,
        verbose=True,
    )

    # Save model
    model.save_model(model_output_path)
    print(f"\nModel saved to: {model_output_path}")

    # Score model: only the group key and label are needed next to the predictions,
    # so avoid copying the whole validation feature matrix.
    scored = val_df[["clean_row_id", "label"]].assign(score=model.predict(val_pool))

    acc1, recall3, mrr = _compute_ranking_metrics(scored, k=3)

    print("\nEvaluation Metrics (Validation Set):")
    print(f"Accuracy@1 : {acc1:.4f}")
    print(f"Recall@3   : {recall3:.4f}")
    print(f"MRR        : {mrr:.4f}")

    return model


def train_standard_and_complex_model(
    n_rounds: int = 1000,
) -> None:
    """
    Trains two CatBoost ranking models (standard and complex) on pre-split data and saves them to
    disk.

    The models are saved in `.cbm` format for compatibility with CatBoost's C++ inference engine.

    For each model the full feature table is read from the SQLite database at
    ``_DB_PATH``. Rows whose ``clean_row_id`` is listed in the validation CSV form
    the validation set, rows listed in either the validation or the test CSV are
    removed from the training set, and the remainder is used for training with
    ``_BEST_PARAMS``.

    When running inside Google Colab, Drive is mounted at ``/content/drive`` first.

    Args:
        n_rounds (int): Maximum number of boosting rounds for training (default: 1000).
    """
    # Mount drive
    import sys
    from contextlib import closing

    if "google.colab" in sys.modules:
        from google.colab import drive

        drive.mount("/content/drive")

    def train_model(
        data_table: str, val_ids_path: str, test_ids_path: str, model_path: str
    ):
        print(f"Training model: {model_path}")
        # Get ids
        val_ids_set = set(
            pd.read_csv(val_ids_path)["val_ids"].dropna().astype(int).tolist()
        )
        test_ids_set = set(
            pd.read_csv(test_ids_path)["test_ids"].dropna().astype(int).tolist()
        )

        # Get data in chunks to bound the size of each individual read.
        # sqlite3's own context manager only commits or rolls back the transaction;
        # closing() is what actually releases the connection when the block exits.
        chunk_size = 100000
        with closing(sqlite3.connect(_DB_PATH)) as conn:
            chunks = list(
                pd.read_sql_query(
                    f"SELECT * FROM {data_table}", conn, chunksize=chunk_size
                )
            )
        full_df = pd.concat(chunks, ignore_index=True)
        del chunks  # The concatenated frame is the only copy needed from here on

        # Split the set: validation rows are kept aside, and both validation and
        # test rows are removed from what remains for training.
        excluded_ids = val_ids_set | test_ids_set
        val_df = full_df[full_df["clean_row_id"].isin(val_ids_set)]
        full_df = full_df[~full_df["clean_row_id"].isin(excluded_ids)]

        # Train model
        return _train_catboost_model(
            _BEST_PARAMS, full_df, val_df, n_rounds, model_path
        )

    # Start with standard
    train_model("feature_matrix", _STD_VAL_IDS, _STD_TEST_IDS, "std_lightgbm_model.cbm")
    # Then complex
    train_model("feature_matrix_complex", _COMP_VAL_IDS, _COMP_TEST_IDS, "comp_lightgbm_model.cbm")

    # The trained models are only persisted to disk; nothing is handed back.
    return None


# Run the whole pipeline (standard model first, then complex) with the default cap of
# 1000 boosting rounds per model.
train_standard_and_complex_model()


Mounted at /content/drive
Training model: std_lightgbm_model.cbm
0:	test: 0.9710703	best: 0.9710703 (0)	total: 45.3s	remaining: 12h 34m 25s
1:	test: 0.9728100	best: 0.9728100 (1)	total: 1m 34s	remaining: 13h 4m 11s
2:	test: 0.9733131	best: 0.9733131 (2)	total: 2m 22s	remaining: 13h 11m 2s
3:	test: 0.9737284	best: 0.9737284 (3)	total: 3m 9s	remaining: 13h 5m 39s
4:	test: 0.9737646	best: 0.9737646 (4)	total: 3m 58s	remaining: 13h 11m 26s
5:	test: 0.9741095	best: 0.9741095 (5)	total: 4m 48s	remaining: 13h 16m 7s
6:	test: 0.9744848	best: 0.9744848 (6)	total: 5m 38s	remaining: 13h 20m 49s
7:	test: 0.9746007	best: 0.9746007 (7)	total: 6m 28s	remaining: 13h 23m 12s
8:	test: 0.9744615	best: 0.9746007 (7)	total: 7m 13s	remaining: 13h 15m 36s
9:	test: 0.9744831	best: 0.9746007 (7)	total: 8m 8s	remaining: 13h 26m 8s
10:	test: 0.9744863	best: 0.9746007 (7)	total: 9m	remaining: 13h 30m 12s
11:	test: 0.9744894	best: 0.9746007 (7)	total: 9m 48s	remaining: 13h 27m 32s
12:	test: 0.9745110	best: 0.97460

  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.9374
Recall@3   : 0.9984
MRR        : 0.9672
Training model: comp_lightgbm_model.cbm
0:	test: 0.7576870	best: 0.7576870 (0)	total: 2m 23s	remaining: 1d 15h 42m 1s
1:	test: 0.8041258	best: 0.8041258 (1)	total: 5m 4s	remaining: 1d 18h 10m 48s
2:	test: 0.8182105	best: 0.8182105 (2)	total: 7m 31s	remaining: 1d 17h 39m 17s
3:	test: 0.8354011	best: 0.8354011 (3)	total: 10m 4s	remaining: 1d 17h 46m 48s
4:	test: 0.8374243	best: 0.8374243 (4)	total: 12m 32s	remaining: 1d 17h 35m 24s
5:	test: 0.8604993	best: 0.8604993 (5)	total: 14m 55s	remaining: 1d 17h 11m 57s
6:	test: 0.8618760	best: 0.8618760 (6)	total: 17m 23s	remaining: 1d 17h 7m 48s
7:	test: 0.8674218	best: 0.8674218 (7)	total: 19m 47s	remaining: 1d 16h 55m 3s
8:	test: 0.8690335	best: 0.8690335 (8)	total: 22m 18s	remaining: 1d 16h 57m 8s
9:	test: 0.8745844	best: 0.8745844 (9)	total: 24m 54s	remaining: 1d 17h 5m 27s
10:	test: 0.8804390	best: 0.8804390 (10)	total: 27m 16s	remaining: 1d 16

  topk = df.groupby("clean_row_id", group_keys=False).apply(



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.8407
Recall@3   : 0.9860
MRR        : 0.9119


  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()
