# Feature Ablation

This test uses the same synthetic data as the placebo testing. We train seven models: one on the full feature set, three that each include only one of the three feature categories, and three that each leave one category out. We then compare their results on the test set.

In [6]:
!pip install sqlalchemy
!pip install catboost



In [7]:
import gc
import sqlite3
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool
from typing import Tuple

# Paths to files in Drive
_DB_PATH = "/content/drive/MyDrive/Colab Notebooks/database.db"
TEST_IDS = "test_ids.csv"
VAL_IDS = "val_ids.csv"

# Tuned hyperparameters
_BEST_PARAMS = {
    "loss_function": "YetiRank",
    "eval_metric": "NDCG:top=3",
    "random_seed": 42,
    "learning_rate": 0.13275757957731918,
    "depth": 6,
    "l2_leaf_reg": 7.142519331365267,
    "random_strength": 3.395785387976391,
    "min_data_in_leaf": 84,
    "subsample": 0.9048958560910838,
    "colsample_bylevel": 0.511123337191838,
    "grow_policy": "Lossguide",
}


def _compute_ranking_metrics(df: pd.DataFrame, k: int = 3):
    """
    Compute Accuracy@1, Recall@k, and MRR for a ranking prediction dataframe.

    Args:
        df (pd.DataFrame): Must contain columns ['clean_row_id', 'score', 'label']
        k (int): The cutoff rank for recall@k

    Returns:
        Tuple[float, float, float]: (Accuracy@1, Recall@k, MRR)
    """
    # Accuracy@1
    top1 = df.loc[df.groupby("clean_row_id")["score"].idxmax()]
    acc1 = (top1["label"] == 1).mean()

    # Recall@k
    topk = df.groupby("clean_row_id", group_keys=False).apply(
        lambda g: g.nlargest(k, "score")
    )
    recall_k = topk.groupby("clean_row_id")["label"].max().mean()

    # MRR
    def reciprocal_rank(g: pd.DataFrame) -> float:
        labels_sorted = g.sort_values("score", ascending=False)["label"].to_numpy()
        for rank, label in enumerate(labels_sorted, start=1):
            if label == 1:
                return 1.0 / rank
        return 0.0

    mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()

    return acc1, recall_k, mrr


def _train_catboost_model(
    parameters: dict,
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    inc_cols: list,
    n_rounds: int = 500,
    model_output_path: str = "catboost_model.cbm",
) -> CatBoostRanker:
    """
    Trains a CatBoost ranking model using the provided training and validation data.

    The model is fitted with early stopping on the validation pool, saved to
    ``model_output_path``, and its Accuracy@1 / Recall@3 / MRR on the
    validation data are printed.

    Args:
        parameters (dict): Parameters for CatBoostRanker.
        train_df (pd.DataFrame): Training data with label, group info, and features.
        val_df (pd.DataFrame): Validation data with same structure.
        inc_cols (list): Columns to keep from both frames. Must contain every
            non-feature column listed in ``drop_cols`` below plus the feature
            columns the model should see.
        n_rounds (int): Maximum number of boosting rounds.
        model_output_path (str): File path to save the trained CatBoost model.

    Returns:
        CatBoostRanker: Trained CatBoost model.
    """

    drop_cols: list = ["label", "clean_row_id", "investor", "firm", "template_id"]

    def build_pool(frame: pd.DataFrame) -> Pool:
        # factorize numbers groups in order of first appearance, taken from
        # each row's own clean_row_id. For contiguous groups this equals the
        # ids previously derived from group sizes, but it can no longer
        # attach a row to the wrong group when a group's rows are scattered.
        group_id, _ = pd.factorize(frame["clean_row_id"])
        # Only include current features
        selected = frame[inc_cols]
        return Pool(
            data=selected.drop(columns=drop_cols),
            label=selected["label"],
            group_id=group_id,
        )

    # Train
    train_pool = build_pool(train_df)
    del train_df  # Drop this function's reference early
    gc.collect()  # Call garbage collector to be extra sure

    # Validation
    val_pool = build_pool(val_df)
    gc.collect()  # Call garbage collector to be extra sure

    # Train model
    model = CatBoostRanker(iterations=n_rounds, **parameters)
    model.fit(
        train_pool,
        eval_set=val_pool,
        early_stopping_rounds=10,
        verbose=True,
    )

    # Save model
    model.save_model(model_output_path)
    print(f"\nModel saved to: {model_output_path}")

    # Score model on a fresh frame so the caller's val_df is left untouched
    scored_val = val_df[inc_cols].assign(score=model.predict(val_pool))

    acc1, recall3, mrr = _compute_ranking_metrics(scored_val, k=3)

    print("\nEvaluation Metrics (Validation Set):")
    print(f"Accuracy@1 : {acc1:.4f}")
    print(f"Recall@3   : {recall3:.4f}")
    print(f"MRR        : {mrr:.4f}")

    return model


def train_syn_and_ran_model(
    inc_cols: list,
    n_rounds: int = 1000,
    model_path: str = "syn_catboost_model.cbm",
) -> "CatBoostRanker":
    """
    Trains one CatBoost ranking model on the ``feature_matrix`` table and saves it to disk.

    Rows whose ``clean_row_id`` is listed in the validation or test id files
    are held out of training; the validation rows are used as the eval set.
    The model is saved in `.cbm` format.

    Args:
        inc_cols (list): Columns handed to the trainer (base columns plus the
            feature columns of the current ablation).
        n_rounds (int): Maximum number of boosting rounds for training (default: 1000).
        model_path (str): Where the trained model is written. Pass a distinct
            path per run to keep each model; with the default, successive
            calls overwrite the same file.

    Returns:
        CatBoostRanker: The trained model.
    """
    import sys
    from contextlib import closing

    # Mount drive (only when running inside Colab)
    if "google.colab" in sys.modules:
        from google.colab import drive

        drive.mount("/content/drive")

    def train_model(
        data_table: str, val_ids_path: str, test_ids_path: str, model_path: str, inc_cols,
    ):
        print(f"Training model: {model_path}")
        # Get ids
        val_ids = (
            pd.read_csv(val_ids_path)["val_ids"].dropna().astype(int).tolist()
        )
        test_ids = (
            pd.read_csv(test_ids_path)["test_ids"].dropna().astype(int).tolist()
        )

        # Get data. sqlite3's own context manager only commits or rolls back
        # the transaction, so closing() is what actually releases the
        # connection.
        chunk_size = 100000
        with closing(sqlite3.connect(_DB_PATH)) as conn:
            chunks = list(
                pd.read_sql_query(
                    f"SELECT * FROM {data_table}", conn, chunksize=chunk_size
                )
            )
        full_df = pd.concat(chunks, ignore_index=True)
        del chunks  # the concatenated frame is the only copy needed from here on

        # Split the set
        val_ids_set = set(val_ids)
        excluded_ids = val_ids_set | set(test_ids)
        val_df = full_df[full_df["clean_row_id"].isin(val_ids_set)]
        full_df = full_df[~full_df["clean_row_id"].isin(excluded_ids)]

        # Train model
        return _train_catboost_model(
            _BEST_PARAMS, full_df, val_df, inc_cols, n_rounds, model_path
        )

    return train_model("feature_matrix", VAL_IDS, TEST_IDS, model_path, inc_cols)

# Label, group key and identifier columns that every column list carries
base = [
    "label",
    "clean_row_id",
    "investor",
    "firm",
    "template_id",
]

# The three feature categories
context_features: list = [
    "investor_has_nickname",
    "investor_has_middle_name",
    "investor_has_multiple_middle_names",
    "investor_has_multiple_first_names",
    "investor_has_multiple_last_names",
    "firm_is_shared_infra",
    "firm_is_multi_domain",
    "investor_has_german_char",
    "investor_has_nfkd_normalized",
    "template_firm_support_count",
    "template_firm_coverage_pct",
    "template_name_characteristic_clash",
    "firm_num_templates",
    "firm_diversity_ratio",
    "firm_is_single_template",
    "template_in_firm_templates",
]
mined_rules: list = [
    "template_in_mined_rules",
    "template_max_rule_confidence",
    "template_avg_rule_confidence",
]
structural_rules: list = [
    "template_uses_middle_name",
    "template_uses_multiple_firsts",
    "template_uses_multiple_middles",
    "template_uses_multiple_lasts",
    "template_support_count",
    "template_coverage_pct",
]
full_features = context_features + mined_rules + structural_rules

# Leave-one-category-out sets
minues_context = structural_rules + mined_rules
minues_mined = context_features + structural_rules
minus_structural = mined_rules + context_features

# Train: everything, each category alone, then each category removed
full_modl = train_syn_and_ran_model(base + full_features)

context = train_syn_and_ran_model(base + context_features)
mined = train_syn_and_ran_model(base + mined_rules)
structural = train_syn_and_ran_model(base + structural_rules)

minues_context_modl = train_syn_and_ran_model(base + minues_context)
minues_mined_modl = train_syn_and_ran_model(base + minues_mined)
minus_structural_modl = train_syn_and_ran_model(base + minus_structural)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training model: syn_catboost_model.cbm
0:	test: 0.9515495	best: 0.9515495 (0)	total: 5.78s	remaining: 1h 36m 14s
1:	test: 0.9610187	best: 0.9610187 (1)	total: 11.4s	remaining: 1h 35m 13s
2:	test: 0.9625334	best: 0.9625334 (2)	total: 17s	remaining: 1h 34m 10s
3:	test: 0.9620196	best: 0.9625334 (2)	total: 22.7s	remaining: 1h 34m 19s
4:	test: 0.9628455	best: 0.9628455 (4)	total: 28.3s	remaining: 1h 33m 54s
5:	test: 0.9624497	best: 0.9628455 (4)	total: 34s	remaining: 1h 33m 49s
6:	test: 0.9623317	best: 0.9628455 (4)	total: 39.9s	remaining: 1h 34m 20s
7:	test: 0.9621719	best: 0.9628455 (4)	total: 45.8s	remaining: 1h 34m 43s
8:	test: 0.9621300	best: 0.9628455 (4)	total: 51.5s	remaining: 1h 34m 30s
9:	test: 0.9624573	best: 0.9628455 (4)	total: 57.2s	remaining: 1h 34m 21s
10:	test: 0.9625753	best: 0.9628455 (4)	total: 1m 2s	remaining: 1h 33m 51s
11:	test: 0.9634088	b

  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.9313
Recall@3   : 0.9962
MRR        : 0.9631
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training model: syn_catboost_model.cbm
0:	test: 0.9519987	best: 0.9519987 (0)	total: 5.54s	remaining: 1h 32m 10s
1:	test: 0.9532014	best: 0.9532014 (1)	total: 11s	remaining: 1h 31m 45s
2:	test: 0.9537913	best: 0.9537913 (2)	total: 16.7s	remaining: 1h 32m 38s
3:	test: 0.9545830	best: 0.9545830 (3)	total: 22.6s	remaining: 1h 33m 45s
4:	test: 0.9550206	best: 0.9550206 (4)	total: 28.5s	remaining: 1h 34m 26s
5:	test: 0.9554926	best: 0.9554926 (5)	total: 34.2s	remaining: 1h 34m 19s
6:	test: 0.9556524	best: 0.9556524 (6)	total: 39.6s	remaining: 1h 33m 33s
7:	test: 0.9553061	best: 0.9556524 (6)	total: 45.3s	remaining: 1h 33m 42s
8:	test: 0.9554240	best: 0.9556524 (6)	total: 50.9s	remaining: 1h 33m 22s
9:	test: 0.9554240	best: 0.9556524 (6)	total: 56s	remaining: 1h 32m 22

  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.9297
Recall@3   : 0.9930
MRR        : 0.9622
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training model: syn_catboost_model.cbm
0:	test: 0.0000000	best: 0.0000000 (0)	total: 5.32s	remaining: 1h 28m 36s
1:	test: 0.0000000	best: 0.0000000 (0)	total: 10.6s	remaining: 1h 27m 56s
2:	test: 0.0000000	best: 0.0000000 (0)	total: 15.8s	remaining: 1h 27m 30s
3:	test: 0.0000000	best: 0.0000000 (0)	total: 21s	remaining: 1h 27m 1s
4:	test: 0.0000000	best: 0.0000000 (0)	total: 25.8s	remaining: 1h 25m 34s
5:	test: 0.0000000	best: 0.0000000 (0)	total: 31s	remaining: 1h 25m 40s
6:	test: 0.0000000	best: 0.0000000 (0)	total: 35.9s	remaining: 1h 24m 48s
7:	test: 0.0000000	best: 0.0000000 (0)	total: 41.2s	remaining: 1h 25m 7s
8:	test: 0.0000000	best: 0.0000000 (0)	total: 46.7s	remaining: 1h 25m 41s
9:	test: 0.0000000	best: 0.0000000 (0)	total: 51.7s	remaining: 1h 25m 14s


  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.0003
Recall@3   : 0.3622
MRR        : 0.2080
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training model: syn_catboost_model.cbm
0:	test: 0.4295077	best: 0.4295077 (0)	total: 5.63s	remaining: 1h 33m 45s
1:	test: 0.5282491	best: 0.5282491 (1)	total: 11.1s	remaining: 1h 32m 14s
2:	test: 0.5282491	best: 0.5282491 (1)	total: 16s	remaining: 1h 28m 35s
3:	test: 0.5282491	best: 0.5282491 (1)	total: 21.8s	remaining: 1h 30m 23s
4:	test: 0.5282491	best: 0.5282491 (1)	total: 26.9s	remaining: 1h 29m 15s
5:	test: 0.5282491	best: 0.5282491 (1)	total: 32.4s	remaining: 1h 29m 27s
6:	test: 0.5282491	best: 0.5282491 (1)	total: 37s	remaining: 1h 27m 32s
7:	test: 0.5282491	best: 0.5282491 (1)	total: 42.3s	remaining: 1h 27m 31s
8:	test: 0.5282491	best: 0.5282491 (1)	total: 47.7s	remaining: 1h 27m 29s
9:	test: 0.5282491	best: 0.5282491 (1)	total: 53.2s	remaining: 1h 27m 44

  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.3619
Recall@3   : 0.8590
MRR        : 0.6113
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training model: syn_catboost_model.cbm
0:	test: 0.5282491	best: 0.5282491 (0)	total: 5.52s	remaining: 1h 31m 59s
1:	test: 0.6730219	best: 0.6730219 (1)	total: 10.9s	remaining: 1h 30m 48s
2:	test: 0.6730219	best: 0.6730219 (1)	total: 16.6s	remaining: 1h 32m 5s
3:	test: 0.6730219	best: 0.6730219 (1)	total: 22.2s	remaining: 1h 32m 1s
4:	test: 0.6730219	best: 0.6730219 (1)	total: 27.9s	remaining: 1h 32m 25s
5:	test: 0.6730219	best: 0.6730219 (1)	total: 33.3s	remaining: 1h 31m 55s
6:	test: 0.6730219	best: 0.6730219 (1)	total: 38.2s	remaining: 1h 30m 12s
7:	test: 0.6730219	best: 0.6730219 (1)	total: 43s	remaining: 1h 28m 54s
8:	test: 0.6730219	best: 0.6730219 (1)	total: 48.5s	remaining: 1h 29m 5s
9:	test: 0.6730219	best: 0.6730219 (1)	total: 53.8s	remaining: 1h 28m 45s

  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.3923
Recall@3   : 0.8590
MRR        : 0.6256
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training model: syn_catboost_model.cbm
0:	test: 0.9566801	best: 0.9566801 (0)	total: 5.69s	remaining: 1h 34m 42s
1:	test: 0.9584802	best: 0.9584802 (1)	total: 11.4s	remaining: 1h 34m 40s
2:	test: 0.9583013	best: 0.9584802 (1)	total: 16.3s	remaining: 1h 30m 25s
3:	test: 0.9598731	best: 0.9598731 (3)	total: 22.2s	remaining: 1h 32m 4s
4:	test: 0.9597132	best: 0.9598731 (3)	total: 28s	remaining: 1h 32m 53s
5:	test: 0.9599910	best: 0.9599910 (5)	total: 33.2s	remaining: 1h 31m 35s
6:	test: 0.9614639	best: 0.9614639 (6)	total: 38.8s	remaining: 1h 31m 44s
7:	test: 0.9614639	best: 0.9614639 (6)	total: 43.9s	remaining: 1h 30m 49s
8:	test: 0.9611518	best: 0.9614639 (6)	total: 49.3s	remaining: 1h 30m 27s
9:	test: 0.9632414	best: 0.9632414 (9)	total: 55.1s	remaining: 1h 30m 5

  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.9322
Recall@3   : 0.9958
MRR        : 0.9635
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training model: syn_catboost_model.cbm
0:	test: 0.9491749	best: 0.9491749 (0)	total: 5.95s	remaining: 1h 38m 59s
1:	test: 0.9551119	best: 0.9551119 (1)	total: 11.8s	remaining: 1h 38m 22s
2:	test: 0.9573003	best: 0.9573003 (2)	total: 17.2s	remaining: 1h 35m 3s
3:	test: 0.9579664	best: 0.9579664 (3)	total: 23s	remaining: 1h 35m 17s
4:	test: 0.9591120	best: 0.9591120 (4)	total: 28.6s	remaining: 1h 34m 49s
5:	test: 0.9592795	best: 0.9592795 (5)	total: 33.9s	remaining: 1h 33m 29s
6:	test: 0.9597514	best: 0.9597514 (6)	total: 39s	remaining: 1h 32m 6s
7:	test: 0.9597514	best: 0.9597514 (6)	total: 44.6s	remaining: 1h 32m 13s
8:	test: 0.9615516	best: 0.9615516 (8)	total: 49.8s	remaining: 1h 31m 20s
9:	test: 0.9616353	best: 0.9616353 (9)	total: 55.6s	remaining: 1h 31m 40s


  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Validation Set):
Accuracy@1 : 0.9325
Recall@3   : 0.9962
MRR        : 0.9641


In [8]:
def evaluate_on_test(model, test_ids_path, data_table: str, inc_cols, label=""):
    """
    Score a trained model on the held-out test rows and print ranking metrics.

    Args:
        model: Trained model exposing ``predict`` for a CatBoost ``Pool``.
        test_ids_path: CSV file with a ``test_ids`` column of clean_row_id values.
        data_table (str): Table in the SQLite database to read rows from.
        inc_cols: Columns to keep; must contain the non-feature columns in
            ``drop_cols`` below plus the feature columns the model was
            trained on.
        label: Text inserted into the printed metrics header.

    Returns:
        Tuple[float, float, float]: (Accuracy@1, Recall@3, MRR) on the test rows.
    """
    from contextlib import closing

    # Load test IDs
    test_ids = pd.read_csv(test_ids_path)["test_ids"].dropna().astype(int).tolist()
    test_ids_set = set(test_ids)

    # Load test rows from DB. sqlite3's own context manager only commits or
    # rolls back, so closing() is what actually releases the connection.
    with closing(sqlite3.connect(_DB_PATH)) as conn:
        test_df = pd.read_sql_query(
            f"SELECT * FROM {data_table} WHERE clean_row_id IN ({','.join(map(str, test_ids_set))})",
            conn,
        )

    # The query has no ORDER BY, so make each group contiguous explicitly.
    # The stable sort keeps the original order of rows inside a group.
    test_df = test_df.sort_values("clean_row_id", kind="stable")
    group_id, _ = pd.factorize(test_df["clean_row_id"])

    # Drop non-feature columns; copy so the score column is added to an
    # independent frame rather than to a selection of the query result
    drop_cols = ["label", "clean_row_id", "investor", "firm", "template_id"]
    test_df = test_df[inc_cols].copy()
    X_test = test_df.drop(columns=drop_cols)
    y_test = test_df["label"]

    test_pool = Pool(data=X_test, label=y_test, group_id=group_id)
    test_df["score"] = model.predict(test_pool)

    acc1, recall3, mrr = _compute_ranking_metrics(test_df, k=3)
    print(f"\nEvaluation Metrics ({label} Test Set):")
    print(f"Accuracy@1 : {acc1:.4f}")
    print(f"Recall@3   : {recall3:.4f}")
    print(f"MRR        : {mrr:.4f}")

    return acc1, recall3, mrr

# Evaluate every trained model on the shared test ids, each with the same
# column list it was trained on
evaluate_on_test(
    full_modl, TEST_IDS, "feature_matrix", base + full_features,
    label="Full Features",
)
evaluate_on_test(
    context, TEST_IDS, "feature_matrix", base + context_features,
    label="Context Features",
)
evaluate_on_test(
    mined, TEST_IDS, "feature_matrix", base + mined_rules,
    label="Mined Features",
)
evaluate_on_test(
    structural, TEST_IDS, "feature_matrix", base + structural_rules,
    label="Structural Features",
)
evaluate_on_test(
    minues_context_modl, TEST_IDS, "feature_matrix", base + minues_context,
    label="Context Features (Minus)",
)
evaluate_on_test(
    minues_mined_modl, TEST_IDS, "feature_matrix", base + minues_mined,
    label="Mined Features (Minus)",
)
evaluate_on_test(
    minus_structural_modl, TEST_IDS, "feature_matrix", base + minus_structural,
    label="Structural Features (Minus)",
)

  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Full Features Test Set):
Accuracy@1 : 0.9252
Recall@3   : 0.9962
MRR        : 0.9603


  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Context Features Test Set):
Accuracy@1 : 0.9223
Recall@3   : 0.9942
MRR        : 0.9582


  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Mined Features Test Set):
Accuracy@1 : 0.0003
Recall@3   : 0.3671
MRR        : 0.2105


  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Structural Features Test Set):
Accuracy@1 : 0.3668
Recall@3   : 0.8788
MRR        : 0.6198


  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Context Features (Minus) Test Set):
Accuracy@1 : 0.3985
Recall@3   : 0.8788
MRR        : 0.6351


  topk = df.groupby("clean_row_id", group_keys=False).apply(
  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()



Evaluation Metrics (Mined Features (Minus) Test Set):
Accuracy@1 : 0.9261
Recall@3   : 0.9958
MRR        : 0.9607


  topk = df.groupby("clean_row_id", group_keys=False).apply(



Evaluation Metrics (Structural Features (Minus) Test Set):
Accuracy@1 : 0.9242
Recall@3   : 0.9955
MRR        : 0.9599


  mrr = df.groupby("clean_row_id", group_keys=False).apply(reciprocal_rank).mean()


(np.float64(0.9242085065558043),
 np.float64(0.9955228653661656),
 np.float64(0.9598816757275345))