# LambdaMART using DMLC XGBoost

## Imports

In [1]:
import json
import numpy as np
import os
import xgboost as xgb

from sklearn.datasets import dump_svmlight_file
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import MinMaxScaler

## Mount Drive

In [2]:
from google.colab import drive

# Mount Google Drive in the Colab runtime; the data paths used in the cells
# below all live under this mount point.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
# List the LTR data directory to confirm the input file is visible through the mount.
!ls /content/drive/MyDrive/data/ltr-data/

candidate-list-5features.jsonl


## Constants

In [4]:
# Drive folder holding the raw candidate list and every artifact derived from it.
LTR_DATA_DIR = "/content/drive/MyDrive/data/ltr-data"

# Raw input, read line by line as JSON records by the data-preparation cells.
INPUT_FILE = os.path.join(LTR_DATA_DIR, "candidate-list-5features.jsonl")

# SVMLight dumps of the train / test partitions.
TRAIN_FILE, TEST_FILE = (
    os.path.join(LTR_DATA_DIR, fname) for fname in ("train.svml", "test.svml"))

# Destinations for the two trained boosters (rank:pairwise and rank:ndcg).
LR_MODEL_FILE, LM_MODEL_FILE = (
    os.path.join(LTR_DATA_DIR, fname)
    for fname in ("lambdarank-model.json", "lambdamart-model.json"))

## Data Preparation

In [5]:
# Assign each distinct query string an integer id, numbered from 1 in order of
# first appearance in the input file.
query2qid = {}
with open(INPUT_FILE, "r", encoding="utf-8") as fjson:
  for raw_line in fjson:
    query_text = json.loads(raw_line.strip())["query"]
    # setdefault only inserts unseen queries, so existing ids are never reassigned.
    query2qid.setdefault(query_text, len(query2qid) + 1)

# Id that the next unseen query would receive.
next_qid = len(query2qid) + 1

print(len(query2qid))

250


In [6]:
# Identifier and target fields; every other key of a record is treated as a feature.
non_feature_cols = {"query", "doc_id", "label"}

# Feature column order is pinned from the first record so every row of X uses
# the same column layout, regardless of the key order inside later records.
feature_cols = None
features, labels, qids = [], [], []
with open(INPUT_FILE, "r", encoding="utf-8") as fjson:
  for line_no, line in enumerate(fjson, start=1):
    rec = json.loads(line.strip())
    if feature_cols is None:
      feature_cols = [col for col in rec if col not in non_feature_cols]
    elif rec.keys() - non_feature_cols != set(feature_cols):
      # Fail loudly rather than build a ragged or misaligned feature matrix.
      raise ValueError(
          f"{INPUT_FILE}, line {line_no}: feature columns differ from the first record")
    features.append(np.array([float(rec[col]) for col in feature_cols]))
    labels.append(rec["label"])
    # Map the query text to the integer id assigned in the previous cell.
    qids.append(query2qid[rec["query"]])

# One row per record: feature matrix, relevance labels, and per-row query ids.
X = np.array(features)
y = np.array(labels)
query_id = np.array(qids)

X.shape, y.shape, query_id.shape

((5004, 61), (5004,), (5004,))

### Scale Features

In [7]:
# Rescale every feature column with a MinMaxScaler.
# NOTE(review): the scaler is fit on all rows of X, before the train/test split
# performed in the following cells -- confirm this is intended.
scaler = MinMaxScaler()
Xscaled = scaler.fit(X).transform(X)

Xscaled.shape

(5004, 61)

### Train Test Split

In [8]:
# Single shuffled split that holds out 10% of the groups; passing the query ids
# as groups keeps all rows of one query in the same partition.
gss = GroupShuffleSplit(test_size=0.1, n_splits=1, random_state=42)
split_idxs = gss.split(X=Xscaled, y=y, groups=query_id)
train_idxs, test_idxs = next(split_idxs)

In [9]:
# Slice features, labels and query ids with the same index arrays so the three
# stay row-aligned within each partition.
Xtrain, Xtest = Xscaled[train_idxs], Xscaled[test_idxs]
ytrain, ytest = y[train_idxs], y[test_idxs]
groups_train, groups_test = query_id[train_idxs], query_id[test_idxs]

Xtrain.shape, ytrain.shape, groups_train.shape, Xtest.shape, ytest.shape, groups_test.shape

((4504, 61), (4504,), (4504,), (500, 61), (500,), (500,))

### Convert to SVM Light format for XGBoost

In [10]:
# Write each partition in SVMLight format with 1-based feature indices and the
# query id attached to every row.
dump_svmlight_file(
    X=Xtrain, y=ytrain, f=TRAIN_FILE,
    zero_based=False, query_id=groups_train)
dump_svmlight_file(
    X=Xtest, y=ytest, f=TEST_FILE,
    zero_based=False, query_id=groups_test)

In [11]:
# Count the lines of each SVMLight dump as a check on the partition sizes.
!wc -l /content/drive/MyDrive/data/ltr-data/*.svml

    500 /content/drive/MyDrive/data/ltr-data/test.svml
   4504 /content/drive/MyDrive/data/ltr-data/train.svml
   5004 total


## Model Training

In [12]:
# Load the SVMLight dumps into DMatrix objects; the URI suffix tells XGBoost to
# parse the files as libsvm text.
dtrain = xgb.DMatrix(f"{TRAIN_FILE}?format=libsvm")
dtest = xgb.DMatrix(f"{TEST_FILE}?format=libsvm")

### LambdaRank Model (rank:pairwise)

In [13]:
# Number of boosting rounds.
num_rounds = 10

# Booster settings for the pairwise ranking objective, evaluated with NDCG@10.
params = {
  'objective': 'rank:pairwise',
  'eta': 0.1,
  'gamma': 1.0,
  'min_child_weight': 0.1,
  'max_depth': 6,
  'eval_metric': ['ndcg@10'],
}

# Both partitions are passed as evaluation sets.
evallist = [(dtrain, 'train'), (dtest, 'test')]

model_lr = xgb.train(params, dtrain, num_boost_round=num_rounds, evals=evallist)

[0]	train-ndcg@10:0.86274	test-ndcg@10:0.79469
[1]	train-ndcg@10:0.89202	test-ndcg@10:0.80127
[2]	train-ndcg@10:0.90400	test-ndcg@10:0.80605
[3]	train-ndcg@10:0.91098	test-ndcg@10:0.82094
[4]	train-ndcg@10:0.91857	test-ndcg@10:0.82225
[5]	train-ndcg@10:0.92669	test-ndcg@10:0.83030
[6]	train-ndcg@10:0.93682	test-ndcg@10:0.83075
[7]	train-ndcg@10:0.93999	test-ndcg@10:0.80711
[8]	train-ndcg@10:0.94905	test-ndcg@10:0.82735
[9]	train-ndcg@10:0.95226	test-ndcg@10:0.82344


In [14]:
# Persist the rank:pairwise booster to Drive as JSON.
model_lr.save_model(fname=LR_MODEL_FILE)

### LambdaMART Model (rank:ndcg)

In [15]:
# Number of boosting rounds.
num_rounds = 10

# Booster settings for the NDCG ranking objective, evaluated with NDCG@10.
params = {
  'objective': 'rank:ndcg',
  'eta': 0.1,
  'gamma': 1.0,
  'min_child_weight': 0.1,
  'max_depth': 6,
  'eval_metric': ['ndcg@10'],
}

# Both partitions are passed as evaluation sets.
evallist = [(dtrain, 'train'), (dtest, 'test')]

model_lm = xgb.train(params, dtrain, num_boost_round=num_rounds, evals=evallist)

[0]	train-ndcg@10:0.83917	test-ndcg@10:0.79532
[1]	train-ndcg@10:0.87522	test-ndcg@10:0.81703
[2]	train-ndcg@10:0.89649	test-ndcg@10:0.82095
[3]	train-ndcg@10:0.90580	test-ndcg@10:0.81629
[4]	train-ndcg@10:0.91194	test-ndcg@10:0.82056
[5]	train-ndcg@10:0.91315	test-ndcg@10:0.81586
[6]	train-ndcg@10:0.91641	test-ndcg@10:0.81577
[7]	train-ndcg@10:0.91879	test-ndcg@10:0.81788
[8]	train-ndcg@10:0.91820	test-ndcg@10:0.81995
[9]	train-ndcg@10:0.91909	test-ndcg@10:0.81711


In [16]:
# Persist the rank:ndcg booster to Drive as JSON.
model_lm.save_model(fname=LM_MODEL_FILE)

In [17]:
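# NOTE: disabled evaluation sketch, kept for reference. It will not run as written:
# `model` is never defined in this notebook (the trained boosters are `model_lr`
# and `model_lm`), and the name `eval` would shadow the Python builtin.
# TODO(review): confirm that `xgb.rank.eval.ndcg` is a real XGBoost API before re-enabling.
# pred = model.predict(dtest)

# eval = xgb.rank.eval.ndcg(pred, dtest, 10)
# # print('NDCG@10:', eval)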