# Placebo Testing Synthetic Investors

This notebook compares two feature-matrix tables. The standard table (`feature_matrix`) contains synthetic investors generated with a statistically faithful approach that attempts to preserve global statistics. The complex table (`feature_matrix_complex`) is the placebo: its synthetic investors were created by randomly duplicating existing investors.

In [1]:
!pip install sqlalchemy
!pip install catboost

Collecting sqlalchemy
  Downloading sqlalchemy-2.0.43-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting greenlet>=1 (from sqlalchemy)
  Downloading greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading sqlalchemy-2.0.43-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (607 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m607.6/607.6 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: greenlet, sqlalchemy
Successfully installed greenlet-3.2.4 sqlalchemy-2.0.43
Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-p

In [3]:
import gc
import sqlite3
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
from typing import Tuple

# Paths to files in Drive
_DB_PATH = "/content/drive/MyDrive/Colab Notebooks/database.db"
TEST_IDS = "test_ids.csv"
VAL_IDS = "val_ids.csv"

# Tuned hyperparameters
_BEST_PARAMS = {
    "loss_function": "YetiRank",
    "eval_metric": "NDCG:top=3",
    "random_seed": 42,
    "learning_rate": 0.13275757957731918,
    "depth": 6,
    "l2_leaf_reg": 7.142519331365267,
    "random_strength": 3.395785387976391,
    "min_data_in_leaf": 84,
    "subsample": 0.9048958560910838,
    "colsample_bylevel": 0.511123337191838,
    "grow_policy": "Lossguide",
}


def _compute_ranking_metrics(df: pd.DataFrame, k: int = 3):
    """
    Compute Accuracy@1, Recall@k, and MRR for a ranking prediction dataframe.

    Args:
        df (pd.DataFrame): Must contain columns ['clean_row_id', 'score', 'label']
        k (int): The cutoff rank for recall@k

    Returns:
        Tuple[float, float, float]: (Accuracy@1, Recall@k, MRR)
    """
    # Accuracy@1
    top1 = df.loc[df.groupby("clean_row_id")["score"].idxmax()]
    acc1 = (top1["label"] == 1).mean()

    # Recall@k
    topk = df.groupby("clean_row_id", group_keys=False).apply(
        lambda g: g.nlargest(k, "score")
    )
    recall_k = topk.groupby("clean_row_id")["label"].max().mean()

    # MRR
    def reciprocal_rank(g: pd.DataFrame) -> float:
        labels_sorted = g.sort_values("score", ascending=False)["label"].to_numpy()
        for rank, label in enumerate(labels_sorted, start=1):
            if label == 1:
                return 1.0 / rank
        return 0.0

    mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()

    return acc1, recall_k, mrr


def _build_ranking_pool(
    frame: pd.DataFrame, drop_cols: list
) -> "Tuple[pd.DataFrame, Pool]":
    """Build a CatBoost Pool whose group ids really follow `clean_row_id`.

    Group ids are derived from the `clean_row_id` values themselves (numbered in
    order of first appearance). If the rows of some group are not adjacent, the
    frame is stably re-ordered so that they are; frames that are already grouped
    are used as-is and get the ids 0,0,..,1,1,.. in their existing order.

    Returns the (possibly re-ordered) frame together with the Pool so callers
    can align predictions with rows.
    """
    group_id, _ = pd.factorize(frame["clean_row_id"])
    # With first-appearance numbering, any decrease means a group re-appears
    # after another group has started, i.e. its rows are not adjacent.
    if group_id.size > 1 and (np.diff(group_id) < 0).any():
        order = np.argsort(group_id, kind="stable")  # keeps within-group row order
        frame = frame.iloc[order]
        group_id = group_id[order]

    pool = Pool(
        data=frame.drop(columns=drop_cols),
        label=frame["label"],
        group_id=group_id,
    )
    return frame, pool


def _train_catboost_model(
    parameters: dict,
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    n_rounds: int = 500,
    model_output_path: str = "catboost_model.cbm",
    early_stopping_rounds: int = 10,
) -> CatBoostRanker:
    """
    Trains a CatBoost ranking model using the provided training and validation data.

    Both frames must contain the columns 'label', 'clean_row_id', 'investor',
    'firm' and 'template_id'; every other column is used as a feature. Rows are
    grouped by 'clean_row_id'. The caller's frames are not modified.

    After fitting, the model is saved to `model_output_path` and Accuracy@1,
    Recall@3 and MRR on the validation set are printed.

    Args:
        parameters (dict): Parameters for CatBoostRanker.
        train_df (pd.DataFrame): Training data with label, group info, and features.
        val_df (pd.DataFrame): Validation data with same structure.
        n_rounds (int): Maximum number of boosting rounds.
        model_output_path (str): File path to save the trained CatBoost model.
        early_stopping_rounds (int): Passed to `fit` as `early_stopping_rounds`.

    Returns:
        CatBoostRanker: Trained CatBoost model.
    """
    drop_cols = ["label", "clean_row_id", "investor", "firm", "template_id"]

    # Train pool. Only the Pool is needed afterwards, so drop this function's
    # reference to the frame (the caller may still hold its own reference).
    _, train_pool = _build_ranking_pool(train_df, drop_cols)
    del train_df
    gc.collect()

    # Validation pool. Keep the frame that matches the Pool's row order so that
    # predictions can be attached to the right rows below.
    val_df, val_pool = _build_ranking_pool(val_df, drop_cols)
    gc.collect()

    # Train model
    model = CatBoostRanker(iterations=n_rounds, **parameters)
    model.fit(
        train_pool,
        eval_set=val_pool,
        early_stopping_rounds=early_stopping_rounds,
        verbose=True,
    )

    # Save model
    model.save_model(model_output_path)
    print(f"\nModel saved to: {model_output_path}")

    # Score model; `assign` returns a new frame, leaving the input untouched.
    scored_val = val_df.assign(score=model.predict(val_pool))

    acc1, recall3, mrr = _compute_ranking_metrics(scored_val, k=3)

    print("\nEvaluation Metrics (Validation Set):")
    print(f"Accuracy@1 : {acc1:.4f}")
    print(f"Recall@3   : {recall3:.4f}")
    print(f"MRR        : {mrr:.4f}")

    return model


def train_syn_and_ran_model(
    n_rounds: int = 1000,
) -> "Tuple[CatBoostRanker, CatBoostRanker]":
    """
    Trains two CatBoost ranking models on pre-split data and saves them to disk.

    The first model is trained on the `feature_matrix` table and saved as
    `syn_catboost_model.cbm`; the second on `feature_matrix_complex`, saved as
    `ran_catboost_model.cbm`. For each table, rows whose `clean_row_id` appears
    in the validation id file form the validation set, and rows whose id is in
    either the validation or the test id file are excluded from training.

    When running inside Google Colab, Drive is mounted at `/content/drive` first.

    The models are saved in `.cbm` format for compatibility with CatBoost's C++ inference engine.

    Args:
        n_rounds (int): Maximum number of boosting rounds for training (default: 1000).

    Returns:
        Tuple[CatBoostRanker, CatBoostRanker]: (standard-table model, complex-table model).
    """
    import sys
    from contextlib import closing

    # Mount drive
    if "google.colab" in sys.modules:
        from google.colab import drive

        drive.mount("/content/drive")

    def _read_ids(csv_path: str, column: str) -> set:
        """Load one id column from a CSV as a set of ints, ignoring blanks."""
        return set(pd.read_csv(csv_path)[column].dropna().astype(int).tolist())

    def train_model(
        data_table: str, val_ids_path: str, test_ids_path: str, model_path: str
    ):
        """Load `data_table`, split off validation/test ids and train one model."""
        print(f"Training model: {model_path}")
        # Get ids
        val_ids_set = _read_ids(val_ids_path, "val_ids")
        excluded_ids = val_ids_set | _read_ids(test_ids_path, "test_ids")

        # Get data. `sqlite3.connect` used directly as a context manager only
        # manages the transaction and leaves the connection open, so wrap it in
        # `closing` to release the database handle once the table is read.
        # The table name cannot be bound as a SQL parameter; it only ever comes
        # from the fixed names passed below.
        chunk_size = 100000
        with closing(sqlite3.connect(_DB_PATH)) as conn:
            chunks = list(
                pd.read_sql_query(
                    f"SELECT * FROM {data_table}", conn, chunksize=chunk_size
                )
            )
            full_df = pd.concat(chunks, ignore_index=True)
        del chunks  # the concatenated frame is all that is needed from here on

        # Split the set
        row_ids = full_df["clean_row_id"]
        val_df = full_df[row_ids.isin(val_ids_set)]
        train_df = full_df[~row_ids.isin(excluded_ids)]
        del full_df, row_ids

        # Train model
        return _train_catboost_model(
            _BEST_PARAMS, train_df, val_df, n_rounds, model_path
        )

    # Start with standard
    syn = train_model("feature_matrix", VAL_IDS, TEST_IDS, "syn_catboost_model.cbm")
    # Then complex
    ran = train_model("feature_matrix_complex", VAL_IDS, TEST_IDS, "ran_catboost_model.cbm")

    return syn, ran


# Train both rankers: the standard table first, then the complex table.
syn, ran = train_syn_and_ran_model(n_rounds=1000)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training model: syn_catboost_model.cbm
0:	test: 0.9558808	best: 0.9558808 (0)	total: 5.12s	remaining: 1h 25m 13s
1:	test: 0.9593632	best: 0.9593632 (1)	total: 10.9s	remaining: 1h 31m 2s
2:	test: 0.9617114	best: 0.9617114 (2)	total: 16.6s	remaining: 1h 32m 5s
3:	test: 0.9630054	best: 0.9630054 (3)	total: 21.9s	remaining: 1h 30m 45s
4:	test: 0.9633745	best: 0.9633745 (4)	total: 27.3s	remaining: 1h 30m 36s
5:	test: 0.9629292	best: 0.9633745 (4)	total: 33.2s	remaining: 1h 31m 40s
6:	test: 0.9628455	best: 0.9633745 (4)	total: 38.5s	remaining: 1h 30m 57s
7:	test: 0.9632908	best: 0.9633745 (4)	total: 44.3s	remaining: 1h 31m 29s
8:	test: 0.9633745	best: 0.9633745 (4)	total: 49.3s	remaining: 1h 30m 31s
9:	test: 0.9635686	best: 0.9635686 (9)	total: 54.6s	remaining: 1h 30m 3s
10:	test: 0.9638465	best: 0.9638465 (10)	total: 1m	remaining: 1h 30m 31s
11:	test: 0.9640482	be

  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.9332
Recall@3   : 0.9965
MRR        : 0.9644
Training model: ran_catboost_model.cbm
0:	test: 0.6306195	best: 0.6306195 (0)	total: 1.57s	remaining: 26m 8s
1:	test: 0.6395205	best: 0.6395205 (1)	total: 3.19s	remaining: 26m 31s
2:	test: 0.6422610	best: 0.6422610 (2)	total: 4.9s	remaining: 27m 9s
3:	test: 0.6364604	best: 0.6422610 (2)	total: 6.58s	remaining: 27m 17s
4:	test: 0.6344127	best: 0.6422610 (2)	total: 8.33s	remaining: 27m 37s
5:	test: 0.6308731	best: 0.6422610 (2)	total: 9.91s	remaining: 27m 22s
6:	test: 0.6441716	best: 0.6441716 (6)	total: 11.6s	remaining: 27m 18s
7:	test: 0.6455189	best: 0.6455189 (7)	total: 13.2s	remaining: 27m 11s
8:	test: 0.6426034	best: 0.6455189 (7)	total: 14.9s	remaining: 27m 17s
9:	test: 0.6450812	best: 0.6455189 (7)	total: 16.5s	remaining: 27m 18s
10:	test: 0.6450812	best: 0.6455189 (7)	total: 18.2s	remaining: 27m 12s
11:	test: 0.6464209	best: 0.6464209 (11)	total: 19.9s	remaining: 27m 19s
12:	test: 0

  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.4556
Recall@3   : 0.8667
MRR        : 0.6625


OK, the models are trained. The synthetic case performed significantly better on the validation set, with ~93% Accuracy@1, whereas the placebo set achieved only ~46% Accuracy@1.

Let's run the test set to be sure.

In [4]:
def evaluate_on_test(model, test_ids_path, data_table: str, label=""):
    """
    Score a trained ranker on the held-out test groups and print its metrics.

    Reads the test ids from the `test_ids` column of `test_ids_path`, loads the
    matching rows (by `clean_row_id`) from `data_table` in the SQLite database
    at `_DB_PATH`, predicts a score per row and prints Accuracy@1, Recall@3
    and MRR.

    Args:
        model: Trained model exposing `predict(Pool)`.
        test_ids_path: CSV file with a `test_ids` column.
        data_table (str): Name of the table to read the test rows from.
        label (str): Prefix used in the printed header, e.g. "Synthetic".

    Returns:
        Tuple[float, float, float]: (Accuracy@1, Recall@3, MRR).

    Raises:
        ValueError: If no rows in `data_table` match the test ids.
    """
    from contextlib import closing

    # Load test IDS; sorted only to make the generated query deterministic.
    test_ids = sorted(
        set(pd.read_csv(test_ids_path)["test_ids"].dropna().astype(int).tolist())
    )

    # Load test rows from DB. The ids are ints (see `astype(int)` above), so
    # joining them into the statement is safe. `closing` is needed because a
    # sqlite3 connection used as a context manager is not closed on exit.
    query = (
        f"SELECT * FROM {data_table} "
        f"WHERE clean_row_id IN ({','.join(map(str, test_ids))})"
    )
    with closing(sqlite3.connect(_DB_PATH)) as conn:
        test_df = pd.read_sql_query(query, conn)

    if test_df.empty:
        raise ValueError(
            f"No rows in table '{data_table}' match the ids in '{test_ids_path}'"
        )

    # Group ids come from the `clean_row_id` values (numbered by first
    # appearance). The query has no ORDER BY, so make sure rows of one group
    # are adjacent before handing them to CatBoost; a decrease in the codes
    # means some group re-appears after another one has started.
    group_id, _ = pd.factorize(test_df["clean_row_id"])
    if group_id.size > 1 and (np.diff(group_id) < 0).any():
        order = np.argsort(group_id, kind="stable")  # keeps within-group row order
        test_df = test_df.iloc[order].reset_index(drop=True)
        group_id = group_id[order]

    # Drop non-feature columns
    drop_cols = ["label", "clean_row_id", "investor", "firm", "template_id"]
    test_pool = Pool(
        data=test_df.drop(columns=drop_cols),
        label=test_df["label"],
        group_id=group_id,
    )
    scored = test_df.assign(score=model.predict(test_pool))

    acc1, recall3, mrr = _compute_ranking_metrics(scored, k=3)
    print(f"\nEvaluation Metrics ({label} Test Set):")
    print(f"Accuracy@1 : {acc1:.4f}")
    print(f"Recall@3   : {recall3:.4f}")
    print(f"MRR        : {mrr:.4f}")

    return acc1, recall3, mrr

# Evaluate both models on their test sets. The (Accuracy@1, Recall@3, MRR)
# tuples are kept in named variables so they can be reused later; the
# function already prints a formatted report, so no raw tuple is echoed.
syn_test_metrics = evaluate_on_test(syn, TEST_IDS, "feature_matrix", label="Synthetic")
ran_test_metrics = evaluate_on_test(ran, TEST_IDS, "feature_matrix_complex", label="Random")

  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Synthetic Test Set):
Accuracy@1 : 0.9277
Recall@3   : 0.9952
MRR        : 0.9617


  topk = df.groupby("clean_row_id", group_keys=False).apply(



Evaluation Metrics (Random Test Set):
Accuracy@1 : 0.5091
Recall@3   : 0.8916
MRR        : 0.6983


  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()


(np.float64(0.5091141669331628),
 np.float64(0.8915893827950112),
 np.float64(0.698321803162877))