In [1]:
import json

import pandas as pd
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

In [2]:
# Read the preprocessed wikir_en78k training and validation splits from the
# current working directory; the test split is intentionally left unloaded.
train_data, val_data = (
    pd.read_csv(f"wikir_en78k_{split}_preprocessed_light.csv")
    for split in ("training", "validation")
)
# test_data = pd.read_csv("wikir_en78k_test_preprocessed_light.csv")

In [3]:
# Largest number of documents per query that CatBoostRanker can handle with
# the YetiRank loss on GPU.
MAX_DOCS_PER_QUERY = 1023


def keep_top_docs_per_query(
    frame: pd.DataFrame, max_docs: int = MAX_DOCS_PER_QUERY
) -> pd.DataFrame:
    """Keep at most ``max_docs`` rows per query, preferring the most relevant ones.

    Rows are ordered by ``query_id`` ascending and, within a query, by
    ``relevance`` descending. The sort is stable, so ties keep their original
    relative order. The first ``max_docs`` rows of every query are retained.

    Args:
        frame: Data with at least ``query_id`` and ``relevance`` columns.
        max_docs: Upper bound on the number of rows kept for each query.

    Returns:
        A new DataFrame in sorted order; ``frame`` itself is not modified.
    """
    ordered = frame.sort_values(
        by=["query_id", "relevance"], ascending=[True, False], kind="stable"
    )
    return ordered.groupby("query_id").head(max_docs)


# Apply the same truncation to both splits. Re-running this cell is harmless:
# truncating an already truncated frame returns the same rows.
train_data = keep_top_docs_per_query(train_data)
val_data = keep_top_docs_per_query(val_data)

In [4]:
# Split each frame into features, labels and query ids by column position.
# Only the needed columns are converted to NumPy, which avoids materialising
# the whole frame for every assignment and avoids upcasting the selected
# columns to a dtype shared with unrelated columns.
# NOTE(review): column 0 is presumably query_id, column 3 relevance and
# columns 4..20 the 17 ranking features — confirm against the CSV header.
X_train = train_data.iloc[:, 4:21].to_numpy()
y_train = train_data.iloc[:, 3].to_numpy()
queries_train = train_data.iloc[:, 0].to_numpy().astype(int)
X_val = val_data.iloc[:, 4:21].to_numpy()
y_val = val_data.iloc[:, 3].to_numpy()
queries_val = val_data.iloc[:, 0].to_numpy().astype(int)

In [5]:
# Bundle each split's features, labels and per-row query ids into a CatBoost
# Pool; group_id tells the ranker which rows belong to the same query.
train_pool, val_pool = (
    Pool(data=features, label=labels, group_id=query_ids)
    for features, labels, query_ids in (
        (X_train, y_train, queries_train),
        (X_val, y_val, queries_val),
    )
)

In [10]:
# Baseline CatBoostRanker settings shared by every training run.
# depth, l2_leaf_reg and random_strength are also swept by the grid search, so
# code that combines this dict with a grid point must merge the two dicts
# rather than pass both as separate **kwargs.
default_parameters = dict(
    iterations=2000,
    # Extra ranking metrics reported alongside the training loss.
    custom_metric=[
        *(f"PrecisionAt:top={k}" for k in (5, 10, 20)),
        *(f"NDCG:top={k}" for k in (5, 10, 20)),
        "NDCG",
        "MAP",
    ],
    verbose=False,
    random_seed=42,
    task_type="GPU",
    devices="0:1",
    gpu_ram_part=0.95,
    # loss_function="YetiRankPairwise",
    depth=8,
    l2_leaf_reg=5,
    random_strength=1,
)

In [None]:
# Train a single ranker with the baseline settings only.
# `params` is deliberately not used here: it is only defined by the
# grid-search loop further down, so referencing it fails with NameError on a
# fresh top-to-bottom run. After the loop it would pass depth, l2_leaf_reg and
# random_strength a second time and fail with TypeError.
model = CatBoostRanker(**default_parameters)
model.fit(train_pool, eval_set=val_pool, plot=False)

In [7]:
# Hyper-parameter values explored by the grid search; every combination of the
# lists below is trained once.
param_grid = dict(
    # iterations=[1000, 2000, 4000],
    # learning_rate=[10 ** (-i) for i in range(5)],
    depth=[4, 6, 8],
    l2_leaf_reg=[1, 3, 5],
    random_strength=[0.1, 0.5, 1.0],
    loss_function=["PairLogit:max_pairs=100000", "QueryRMSE", "YetiRank"],
)

In [8]:
# Expand the grid into a concrete list of parameter dicts, one per run.
param_combinations = [*ParameterGrid(param_grid)]

In [9]:
# Grid search: train one ranker per parameter combination and record its best
# scores next to the parameters that produced them.
results = []

for params in tqdm(param_combinations):
    # Merge the dicts instead of passing both as **kwargs. The grid and the
    # defaults share keys (depth, l2_leaf_reg, random_strength), and duplicate
    # keyword arguments raise TypeError. Grid values take precedence.
    model = CatBoostRanker(**(default_parameters | params))
    model.fit(train_pool, eval_set=val_pool, plot=False)
    results.append(params | model.get_best_score())
    # Rewrite the report after every run so an interrupted search still keeps
    # the results of all finished combinations.
    with open("./wikir_catboost_training_report.json", "w") as f:
        json.dump(results, f, indent=4)

  0%|                                                                                                                                                     | 0/81 [00:00<?, ?it/s]Default metric period is 5 because PrecisionAt, MAP, NDCG is/are not implemented for GPU
Metric PrecisionAt:top=5 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PrecisionAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PrecisionAt:top=20 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=20;type=Base is not implemented on

KeyboardInterrupt: 