In [1]:
from catboost import CatBoostRanker, Pool#MetricVisualizer
from copy import deepcopy
import numpy as np
import os
import pandas as pd

In [3]:
# Read the preprocessed training and validation CSVs (paths are relative to the
# notebook's working directory) into two DataFrames.
train_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "wikir_en1k_training_preprocessed_light.csv",
        "wikir_en1k_validation_preprocessed_light.csv",
    )
)

In [4]:
def _split_ranking_columns(frame):
    """Split a preprocessed frame into ``(features, labels, query_ids)`` by column position.

    Columns are addressed positionally: column 0 holds the query id, column 3 the
    label, and column 2 together with columns 4..20 form the feature matrix.
    Column 1 is not used here (presumably a document id -- TODO confirm).
    """
    matrix = frame.values
    feature_idx = [2, *range(4, 21)]
    # Query ids are cast to int because they are later passed as ``group_id``.
    return matrix[:, feature_idx], matrix[:, 3], matrix[:, 0].astype(int)


X_train, y_train, queries_train = _split_ranking_columns(train_data)
X_val, y_val, queries_val = _split_ranking_columns(val_data)

In [5]:
# Wrap each split in a CatBoost Pool. ``group_id`` receives the per-row query id so
# that rows belonging to the same query are treated as one ranking group.
train_pool = Pool(data=X_train, label=y_train, group_id=queries_train)
val_pool = Pool(data=X_val, label=y_val, group_id=queries_val)

In [6]:
# Baseline CatBoostRanker settings shared by every run; ``fit_model`` copies this
# dict and layers the loss function (and any overrides) on top of it.
default_parameters = dict(
    iterations=2000,
    # Metrics tracked during training in addition to the loss being optimised.
    custom_metric=[
        "PrecisionAt:top=5", "PrecisionAt:top=10", "PrecisionAt:top=20",
        "NDCG:top=5", "NDCG:top=10", "NDCG:top=20",
        "NDCG",
        "MAP",
    ],
    verbose=False,
    random_seed=42,  # fixed seed so repeated runs are comparable
)

# Module-level placeholder; not populated in this cell.
parameters = dict()

In [7]:
def fit_model(loss_function, additional_params=None, train_pool=train_pool, test_pool=val_pool):
    """Train a ``CatBoostRanker`` with the given loss and return the fitted model.

    Parameters
    ----------
    loss_function : str
        Loss to optimise. The same string is also used as ``train_dir``, so each
        loss writes its training artefacts to its own directory.
    additional_params : dict, optional
        Extra constructor parameters. Applied last, so they override both the
        defaults and the ``loss_function`` / ``train_dir`` entries.
    train_pool, test_pool : Pool
        Training and evaluation pools. The defaults are the module-level pools
        as they existed when this ``def`` was executed.

    Returns
    -------
    CatBoostRanker
        The fitted ranker.
    """
    # Work on a deep copy so the shared defaults are never mutated between runs.
    ranker_kwargs = deepcopy(default_parameters)
    ranker_kwargs.update(loss_function=loss_function, train_dir=loss_function)

    if additional_params is not None:
        ranker_kwargs.update(additional_params)

    ranker = CatBoostRanker(**ranker_kwargs)
    # plot=True renders the interactive training-curve widget in the notebook.
    ranker.fit(train_pool, eval_set=test_pool, plot=True)

    return ranker

In [8]:
# Baseline run: train with the QueryRMSE loss on the default train/validation pools.
model = fit_model(loss_function="QueryRMSE")

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [9]:
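# Best value reached per metric (format "metric: best_value"); presumably for the
# QueryRMSE run above on the validation set -- confirm.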
# Precision@5: 0.122
# Precision@10: 0.071
# Precision@20: 0.0405
# NDCG@5: 0.783
# NDCG@10: 0.789
# NDCG@20: 0.795
# NDCG: 0.865
# MAP: 0.428