In [127]:
import numpy as np
from catboost import CatBoost, Pool
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [128]:
data = pd.read_csv('intern_task.csv')
feature_ids = [f'feature_{i}' for i in range(144)]  

In [129]:
scaler = MinMaxScaler()
ranks = np.array(data['rank']).reshape(-1, 1)
data['normalized_rank'] = scaler.fit_transform(ranks).flatten()

In [130]:
data_sorted = data.sort_values(by='query_id')
session_ids = data_sorted['query_id']
document_features = data_sorted[feature_ids]
relevance_labels = data_sorted['normalized_rank']

In [131]:
relevance_labels

0         0.00
67        0.50
66        0.00
65        0.50
64        0.25
          ... 
235199    0.25
235198    0.25
235197    0.50
235195    0.25
235257    0.25
Name: normalized_rank, Length: 235258, dtype: float64

In [132]:
data_sorted

Unnamed: 0,rank,query_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_135,feature_136,feature_137,feature_138,feature_139,feature_140,feature_141,feature_142,feature_143,normalized_rank
0,0,10,1.0,0.0,1.0,3.0,3.0,0.333333,0.0,0.333333,...,0.0,0.454545,0.890238,8.655534,1.000000,0.077778,0.002222,1.0,0.333333,0.00
67,2,10,3.0,3.0,3.0,1.0,3.0,1.000000,1.0,1.000000,...,0.0,0.142857,0.998020,22.936731,0.333333,0.022648,0.000009,31.0,10.333333,0.50
66,0,10,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00
65,2,10,3.0,0.0,0.0,0.0,3.0,1.000000,0.0,0.000000,...,0.0,0.000000,0.997559,0.000000,0.000000,0.010774,0.000001,16.0,5.333333,0.50
64,1,10,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235199,1,29995,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.25
235198,1,29995,1.0,1.0,1.0,1.0,1.0,0.500000,0.5,0.500000,...,0.0,0.111111,0.471409,13.302685,0.500000,0.019608,0.000037,1.0,0.500000,0.25
235197,2,29995,1.0,0.0,0.0,0.0,1.0,0.500000,0.0,0.000000,...,0.0,0.000000,0.471409,0.000000,0.000000,0.003916,0.000017,3.0,1.500000,0.50
235195,1,29995,1.0,0.0,0.0,0.0,1.0,0.500000,0.0,0.000000,...,0.0,0.000000,0.471409,0.000000,0.000000,0.003077,0.000010,4.0,2.000000,0.25


In [133]:
train_data, test_data, train_labels, test_labels, train_groups, test_groups = train_test_split(
    document_features, relevance_labels, session_ids, test_size=0.2, random_state=42
)

In [134]:
groups = data_sorted.groupby('query_id').size().cumsum().shift(1).fillna(0).astype(int).tolist()

In [135]:
def grouped_train_test_split(data, groups, test_size=0.2, random_state=42):
    unique_groups = list(set(groups))
    np.random.seed(random_state)
    np.random.shuffle(unique_groups)
    split_point = int(len(unique_groups) * (1 - test_size))
    
    train_groups = set(unique_groups[:split_point])
    train_indices = [i for i, group in enumerate(groups) if group in train_groups]
    test_indices = [i for i, group in enumerate(groups) if group not in train_groups]

    return data.iloc[train_indices], data.iloc[test_indices]

In [136]:
train_data, test_data = grouped_train_test_split(data_sorted, groups)

In [137]:
train_features, train_labels, train_groups = train_data[feature_ids], train_data['normalized_rank'], train_data['query_id']
test_features, test_labels, test_groups = test_data[feature_ids], test_data['normalized_rank'], test_data['query_id']

In [138]:
train_pool = Pool(data=train_features, label=train_labels, group_id=train_groups)
test_pool = Pool(data=test_features, label=test_labels, group_id=test_groups)

In [139]:
print("Training labels distribution:", np.unique(train_labels, return_counts=True))
print("Testing labels distribution:", np.unique(test_labels, return_counts=True))


Training labels distribution: (array([0.  , 0.25, 0.5 , 0.75, 1.  ]), array([927, 437, 211,  18,   7]))
Testing labels distribution: (array([0.  , 0.25, 0.5 , 0.75, 1.  ]), array([235, 117,  45,   1,   2]))


In [140]:
model = CatBoost({
    'iterations': 100,
    'learning_rate': 0.000001,
    'loss_function': 'YetiRankPairwise',
    'task_type': 'CPU',  # Use 'GPU' if you have compatible hardware
    'verbose': 0
})

In [141]:
model.fit(train_pool)

<catboost.core.CatBoost at 0x1624b5fa0>

In [142]:
predictions = model.predict(test_pool)
evaluated_metrics = model.eval_metrics(test_pool, metrics=['NDCG:top=5', 'PFound:top=5', 'PrecisionAt:top=5'], ntree_end=model.tree_count_)
for metric_name, metric_values in evaluated_metrics.items():
    print(f"{metric_name}: {metric_values[-1]}")


NDCG:top=5;type=Base: 0.6220000075340804
PFound:top=5: 0.5420050549958881
PrecisionAt:top=5: 0.03157894736842106


##Cross-validation strategy

In [54]:
from catboost import cv, Pool
from sklearn.model_selection import GroupKFold
cv_data = Pool(
    data=document_features,
    label=relevance_labels,
    group_id=session_ids
)

In [123]:
params = {
    'iterations': 10000,  
    'learning_rate': 0.01,  
    'depth': 5,
    'l2_leaf_reg': 3,  
    'loss_function': 'YetiRank',
    'eval_metric': 'NDCG:top=5',
    'leaf_estimation_method': 'Newton',
    'verbose': False,
    'random_seed': 42
}

In [124]:
cv_config = {
    'params': params,
    'fold_count': 10,
    'type': 'TimeSeries',
    'partition_random_seed': 42,
    'shuffle': True,
    'stratified': False,
    'as_pandas': True
}

In [125]:
cv_results = cv(pool=cv_data, **cv_config)


Training on fold [0/10]

bestTest = 0.5505808767
bestIteration = 6456

Training on fold [1/10]

bestTest = 0.559775164
bestIteration = 9972

Training on fold [2/10]

bestTest = 0.4992661914
bestIteration = 8311

Training on fold [3/10]

bestTest = 0.5419122722
bestIteration = 8628

Training on fold [4/10]

bestTest = 0.5708016949
bestIteration = 9424

Training on fold [5/10]

bestTest = 0.6016697714
bestIteration = 9962

Training on fold [6/10]

bestTest = 0.5927335813
bestIteration = 8678

Training on fold [7/10]

bestTest = 0.5938205752
bestIteration = 7881

Training on fold [8/10]

bestTest = 0.5702204075
bestIteration = 5682

Training on fold [9/10]

bestTest = 0.5561951392
bestIteration = 9997



In [126]:
print(cv_results['test-NDCG:top=5;type=Base-mean'].mean())
print(cv_results['test-NDCG:top=5;type=Base-std'].mean())
print(cv_results['test-PFound-mean'].mean())
print(cv_results['test-PFound-std'].mean())

0.5477924638230123
0.02822954853924732
0.7148337234383239
0.022416220033210296
