In [1]:
import lightgbm as lgb
import catboost as ct

In [2]:
import numpy as np
import pandas as pd

In [3]:
from lightgbm import LGBMRanker

In [4]:
from catboost.datasets import msrank_10k
# Fetch the (train, test) frames returned by catboost.datasets.msrank_10k.
# Later cells read column 0 as the label and column 1 as the query id.
train_df, test_df = msrank_10k()

In [5]:
from copy import deepcopy

# eval

In [6]:
def ndcg(y_score, y_true, k):
    """Return the discounted cumulative gain of the top-``k`` ranked items.

    Despite the name, the value is *not* normalised: callers obtain NDCG by
    dividing by the result of ``ndcg(y_true, y_true, k)`` (the ideal DCG).

    Args:
        y_score: Scores used to rank the items; higher scores rank first.
        y_true: Relevance labels aligned element-wise with ``y_score``.
        k: Number of top-ranked items to keep (fewer are used if the input
            is shorter than ``k``).

    Returns:
        Sum over the kept items of ``(2 ** label - 1) / log2(position + 2)``,
        with ``position`` counted from 0.
    """
    # Indices sorted by descending score.
    ranking = np.argsort(y_score)[::-1]
    top_labels = np.take(y_true, ranking[:k])

    positions = np.arange(len(top_labels))
    return np.sum((2 ** top_labels - 1) / np.log2(positions + 2))

# lightgbm data

In [7]:
# Column 1 holds the query id and column 0 the relevance label; every other
# column is used as a feature.
qid_train = train_df[1].values
qid_test = test_df[1].values

X_train = train_df.drop(columns=[0, 1]).values
X_test = test_df.drop(columns=[0, 1]).values

# NOTE(review): labels are multiplied by 4 here but used unscaled in the
# CatBoost section, so the two reported scores are computed with different
# 2**label gain scales -- confirm the raw label range and whether this
# factor is intended.
y_train = train_df[0].values * 4
y_test = test_df[0].values * 4

In [8]:
# LightGBM interprets `group` as the sizes of consecutive row blocks, in the
# order the rows appear in X_train. np.unique(..., return_counts=True) returns
# counts in *sorted* query-id order, which silently misaligns the groups when
# the ids are not ascending. Instead, measure the length of each run of equal
# consecutive query ids: boundaries are the positions where the id changes.
# For sorted, contiguous ids this yields the same sizes as before.
group_train = np.diff(
    np.concatenate((
        [0],
        np.flatnonzero(qid_train[1:] != qid_train[:-1]) + 1,
        [len(qid_train)],
    ))
)

In [9]:
# Cast both label vectors to integers before they reach the ranker
# (astype(int) drops any fractional part).
y_train, y_test = (labels.astype(int) for labels in (y_train, y_test))

In [33]:
# Gradient-boosted ranker trained with the LambdaRank objective.
gbm = LGBMRanker(
    boosting_type='gbdt',
    n_estimators=2000,  # number of boosting iterations
    objective='lambdarank',
)

In [34]:
# Train the ranker; `group` passes the per-query row counts so rows are
# compared only within their own query.
gbm.fit(X_train, y_train, group=group_train)

In [35]:
# Score every test row with the LightGBM ranker; the scores are split per
# query in the next cell.
predictions = gbm.predict(X_test)
print(predictions)

[-5.71849661 -3.26932375 -7.69962519 ... -5.24860019 -8.02321719
 -1.62271295]


In [36]:
# Split the flat score / label arrays into one array per test query id,
# selecting rows with a boolean mask on qid_test.
grouped_predictions = {
    qid: predictions[qid_test == qid] for qid in np.unique(qid_test)
}
grouped_true_labels = {
    qid: y_test[qid_test == qid] for qid in np.unique(qid_test)
}

In [37]:
# Per-query NDCG@10: DCG of the predicted ordering divided by the DCG of the
# ideal ordering (labels ranked by themselves).
ndcg_scores = []

for qid, y_score in grouped_predictions.items():
    y_true = grouped_true_labels[qid]

    # Queries whose labels sum to zero have an ideal DCG of 0, so they are
    # left out rather than dividing by zero.
    if np.sum(y_true) != 0:
        idcg = ndcg(y_true, y_true, k=10)
        ndcg_score = ndcg(y_score, y_true, k=10) / idcg
        ndcg_scores.append(ndcg_score)

In [38]:
# Mean NDCG@10 of the LightGBM ranker over the test queries kept above.
np.mean(ndcg_scores)

0.4065723566594736

# catboost

In [16]:
from catboost import CatBoostRanker, Pool, MetricVisualizer

In [17]:
# Fetch the frames again; the cells below rebuild X / y / qid from them
# without the x4 label scaling applied in the LightGBM section.
train_df, test_df = msrank_10k()

In [18]:
# Column 1 holds the query id and column 0 the relevance label; every other
# column is used as a feature. Labels are used as stored (no rescaling).
qid_train = train_df[1].values
qid_test = test_df[1].values

X_train = train_df.drop(columns=[0, 1]).values
X_test = test_df.drop(columns=[0, 1]).values

y_train = train_df[0].values
y_test = test_df[0].values

In [19]:
# Wrap each split in a CatBoost Pool; group_id carries the query id of every
# row so CatBoost can group rows by query.
train_pool = Pool(data=X_train, label=y_train, group_id=qid_train)
test_pool = Pool(data=X_test, label=y_test, group_id=qid_test)

In [41]:
# CatBoost ranker with the LambdaMart loss, using the same number of
# boosting iterations (2000) as the LightGBM model above.
ct_ranker = CatBoostRanker(
    iterations=2000,
    loss_function='LambdaMart',
)
ct_ranker.fit(train_pool, verbose=False)

<catboost.core.CatBoostRanker at 0x29b891fd0>

In [42]:
# Score every test row with the CatBoost ranker. This rebinds `predictions`,
# replacing the LightGBM scores computed earlier.
predictions = ct_ranker.predict(test_pool)
print(predictions)

[-1.26227823 -0.89225794 -0.84846133 ... -0.43407749 -0.80450815
  0.2595761 ]


In [43]:
# Split the flat CatBoost score / label arrays into one array per test query
# id, selecting rows with a boolean mask on qid_test.
grouped_predictions = {
    qid: predictions[qid_test == qid] for qid in np.unique(qid_test)
}
grouped_true_labels = {
    qid: y_test[qid_test == qid] for qid in np.unique(qid_test)
}

In [44]:
# Sorted distinct test query ids.
# NOTE(review): not referenced by any other visible cell -- possibly leftover.
qids = np.unique(qid_test)

In [45]:
# Per-query NDCG@10 for the CatBoost scores: DCG of the predicted ordering
# divided by the DCG of the ideal ordering (labels ranked by themselves).
ndcg_scores = []

for qid, y_score in grouped_predictions.items():
    y_true = grouped_true_labels[qid]

    # Queries whose labels sum to zero have an ideal DCG of 0, so they are
    # left out rather than dividing by zero.
    if np.sum(y_true) != 0:
        idcg = ndcg(y_true, y_true, k=10)
        ndcg_score = ndcg(y_score, y_true, k=10) / idcg
        ndcg_scores.append(ndcg_score)

In [46]:
# Mean NDCG@10 of the CatBoost ranker over the test queries kept above.
np.mean(ndcg_scores)

0.4147356888355509