# 5) Learning-to-Rank Feature Factory with LambdaRank (PyTorch)

In [1]:
# Colab-only: mount Google Drive so files under /content/drive (the review CSV
# read later in this notebook lives there) are visible to the kernel.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

%%capture
# Upgrade pip, then install the third-party packages for this notebook.
# The %%capture cell magic above hides the installer output.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
!pip -q install --upgrade pip
!pip -q install datasets transformers sentence-transformers faiss-cpu rank-bm25 torchmetrics scikit-learn lightgbm langdetect unidecode pandas matplotlib tqdm nltk

In [None]:

import numpy as np, pandas as pd, torch, torch.nn as nn, torch.optim as optim, faiss
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Seed the NumPy and PyTorch global RNGs; SEED is also passed to the pandas
# shuffles and the train/dev split further down.
SEED=42; np.random.seed(SEED); torch.manual_seed(SEED)

In [None]:

def load_amz(lang="en", n_docs=30000, n_queries=3000, csv_path=None, seed=None):
    """Build a product-level document table and a query table from review rows.

    The reviews are read from a local CSV export (the notebook originally pulled
    the ``amazon_reviews_multi`` dataset through ``datasets.load_dataset``).

    Parameters
    ----------
    lang : str
        Only rows whose ``language`` column equals this value are kept.
    n_docs : int
        Maximum number of product documents to return.
    n_queries : int
        Maximum number of (query, relevant product) pairs to return.
    csv_path : str or None
        CSV file to read. ``None`` keeps the notebook's original Drive path.
    seed : int or None
        Random state for the two shuffles. ``None`` uses the notebook-level ``SEED``.

    Returns
    -------
    (docs, queries) : tuple of DataFrame
        ``docs`` has columns ``product_id``, ``doc_text``, ``stars`` (mean rating).
        ``queries`` has columns ``query`` (a review title) and ``relevant_pid``.

    Notes
    -----
    A product's ``doc_text`` is built from its first 10 review titles and first
    5 review bodies, and queries are review titles of the same products, so a
    query string can appear verbatim inside its relevant document.
    """
    if csv_path is None:
        csv_path = "/content/drive/MyDrive/Colab Notebooks/Datasets/amazon_reviews_multi/en/train.csv"
    if seed is None:
        seed = SEED

    df = pd.read_csv(csv_path)
    df = df[df["language"] == lang]
    df = df[["review_id", "product_id", "review_title", "review_body", "stars"]].dropna()

    # One row per product: concatenated titles / bodies and the mean star rating.
    prod = df.groupby("product_id").agg({
        "review_title": lambda s: " | ".join(s.head(10).astype(str)),
        "review_body": lambda s: " ".join(s.head(5).astype(str)),
        "stars": "mean",
    }).reset_index()
    prod["doc_text"] = (prod["review_title"].fillna("") + " " + prod["review_body"].fillna("")).str.strip()

    # Drop near-empty documents, shuffle deterministically, then cap the corpus size.
    prod = (
        prod[prod["doc_text"].str.len() > 16]
        .sample(frac=1, random_state=seed)
        .head(n_docs)
        .reset_index(drop=True)
    )

    # Queries are review titles of products that survived the sampling above.
    pids = set(prod["product_id"])
    q = df[df["product_id"].isin(pids)][["review_title", "product_id"]].dropna()
    q = (
        q.rename(columns={"review_title": "query", "product_id": "relevant_pid"})
        .drop_duplicates()
        .sample(frac=1, random_state=seed)
        .head(n_queries)
        .reset_index(drop=True)
    )
    return prod[["product_id", "doc_text", "stars"]], q
# Materialise the English corpus: at most 30,000 product documents and 3,000 title queries.
docs_df, queries_df = load_amz("en", 30000, 3000)

In [None]:

def tok(txt):
    """Coerce `txt` to str, lower-case it and split it on runs of whitespace."""
    # Argument-less split() already treats newlines as separators and never
    # yields empty strings, so no extra cleanup is required.
    return str(txt).lower().split()
# Sparse retriever: BM25 over the whitespace-tokenised product documents.
bm25 = BM25Okapi([tok(t) for t in docs_df["doc_text"].tolist()])
# Dense retriever: sentence-transformer encoder placed on `device`.
# NOTE(review): the E5 model card asks for "passage: " / "query: " prefixes on
# inputs; none are added in this notebook — confirm whether that is intentional.
dense = SentenceTransformer("intfloat/multilingual-e5-base", device=device)
# Document embeddings are L2-normalised, so inner product equals cosine similarity.
doc_vec = dense.encode(docs_df["doc_text"].tolist(), batch_size=128, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True).astype("float32")
# Flat inner-product FAISS index sized to the embedding width, filled with every document vector.
index = faiss.IndexFlatIP(doc_vec.shape[1]); index.add(doc_vec)

In [None]:

def _fuse_by_rank(first, second, topk):
    """Interleave two ranked id lists rank-by-rank, skipping duplicates and negative ids."""
    if topk <= 0:
        return np.empty(0, dtype=np.int64)
    seen, merged = set(), []
    for rank in range(max(len(first), len(second))):
        for ranked in (first, second):
            if rank >= len(ranked):
                continue
            doc = int(ranked[rank])
            # FAISS pads short result lists with -1; those are not real documents.
            if doc < 0 or doc in seen:
                continue
            seen.add(doc)
            merged.append(doc)
            if len(merged) == topk:
                return np.asarray(merged, dtype=np.int64)
    return np.asarray(merged, dtype=np.int64)


def candidates_for_queries(qs, topk=300, per_source=200):
    """Retrieve BM25 and dense candidates for each query and fuse them.

    Parameters
    ----------
    qs : list of str
        Query strings.
    topk : int
        Maximum number of fused candidates kept per query.
    per_source : int
        Number of candidates requested from each retriever (BM25 and FAISS).

    Returns
    -------
    bm_idx : list of ndarray
        Per query, document positions ordered by descending BM25 score.
    ann_idx : ndarray
        FAISS result ids, one row per query, ordered by descending similarity.
    fused : list of ndarray
        Per query, the de-duplicated union of both lists, interleaved by rank
        (BM25 first at each rank) and cut to `topk`.

    Notes
    -----
    The union is truncated in rank order. Truncating a union sorted by document
    id would drop whichever candidates happen to have the largest ids, however
    well they were ranked.

    NOTE(review): queries are encoded without the "query: " prefix that the E5
    model card recommends — confirm whether that is intentional.
    """
    if len(qs) == 0:
        return [], np.empty((0, per_source), dtype=np.int64), []

    bm_idx = []
    for q in qs:
        s = bm25.get_scores(tok(q))
        # Clamp so corpora smaller than `per_source` do not break argpartition.
        k = min(per_source, len(s))
        if k == 0:
            bm_idx.append(np.empty(0, dtype=np.int64))
            continue
        top = np.argpartition(s, -k)[-k:]
        bm_idx.append(top[np.argsort(-s[top])])

    qv = dense.encode(qs, batch_size=128, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False).astype("float32")
    _, ann_idx = index.search(qv, per_source)

    fused = [_fuse_by_rank(bm_idx[i], ann_idx[i], topk) for i in range(len(qs))]
    return bm_idx, ann_idx, fused
# Hold out 20% of the queries as a dev set (seeded split), then retrieve up to
# 300 fused candidates per query for each split.
train_q, dev_q = train_test_split(queries_df, test_size=0.2, random_state=SEED)
bm_tr, ann_tr, fused_tr = candidates_for_queries(train_q["query"].tolist(), 300)
bm_dv, ann_dv, fused_dv = candidates_for_queries(dev_q["query"].tolist(), 300)

In [None]:

def _first_positions(ids):
    """Map each id to the position of its first occurrence in `ids`."""
    positions = {}
    for rank, doc in enumerate(ids):
        positions.setdefault(int(doc), rank)
    return positions


def build_features(qdf, fused, bm_idx, ann_idx, docs=None):
    """Build per-query feature matrices and binary relevance labels.

    Parameters
    ----------
    qdf : DataFrame
        Queries; the ``relevant_pid`` column names the relevant product.
    fused : sequence of array-like
        Per query, the candidate document positions to featurise.
    bm_idx, ann_idx : sequence of array-like
        Per query, the BM25 and dense result lists in rank order.
    docs : DataFrame or None
        Document table with ``product_id``, ``doc_text`` and ``stars``.
        ``None`` uses the notebook-level ``docs_df``.

    Returns
    -------
    (X, Y) : tuple of list of ndarray
        ``X[j]`` is a float32 array of shape (n_candidates, 4) with columns
        [BM25 rank, dense rank, mean stars, document length in tokens].
        ``Y[j]`` is the matching float32 0/1 label vector. Queries with no
        relevant candidate are skipped, so the lists can be shorter than `qdf`.
    """
    if docs is None:
        docs = docs_df
    missing_rank = 9999  # sentinel rank for candidates a retriever did not return

    # Pull the columns out once; per-candidate DataFrame row access is very slow.
    stars = docs["stars"].to_numpy()
    texts = docs["doc_text"].to_numpy()
    pids = docs["product_id"].to_numpy()

    X, Y = [], []
    for i, q in enumerate(qdf.itertuples()):
        bm_rank = _first_positions(bm_idx[i])
        ann_rank = _first_positions(ann_idx[i])
        feats, labs = [], []
        for d in fused[i]:
            d = int(d)
            if d < 0:
                # Negative ids are padding, not documents; indexing with them
                # would silently read from the end of the table.
                continue
            feats.append([
                bm_rank.get(d, missing_rank),
                ann_rank.get(d, missing_rank),
                float(stars[d]),
                len(str(texts[d]).split()),
            ])
            labs.append(1 if pids[d] == q.relevant_pid else 0)
        if sum(labs) == 0:
            continue
        X.append(np.array(feats, dtype=np.float32))
        Y.append(np.array(labs, dtype=np.float32))
    return X, Y
# Featurise both splits; each list holds one (features, labels) entry per query
# that has at least one relevant candidate.
Xtr_list, ytr_list = build_features(train_q, fused_tr, bm_tr, ann_tr)
Xdv_list, ydv_list = build_features(dev_q, fused_dv, bm_dv, ann_dv)

In [None]:

class LambdaRank(nn.Module):
    """Small MLP scorer: feature vector in, one relevance score out."""

    def __init__(self, in_dim):
        """Create a Linear(in_dim, 64) -> ReLU -> Linear(64, 1) stack."""
        super().__init__()
        hidden_dim = 64
        layers = [
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Score every row of `x`; the output keeps a trailing dimension of size 1."""
        scores = self.mlp(x)
        return scores

def pairwise_loss(scores, labels):
    """Mean pairwise logistic loss over every (relevant, non-relevant) pair.

    For each pair the loss is ``log(1 + exp(-(s_pos - s_neg)))``. This is the
    plain pairwise logistic objective: pairs are not weighted by any change in
    a ranking metric.

    Parameters
    ----------
    scores : Tensor
        Model scores for one query's candidates (any shape; flattened).
    labels : Tensor
        Relevance labels aligned with `scores`; > 0.5 is relevant, < 0.5 is not.

    Returns
    -------
    Tensor or None
        Scalar loss, or ``None`` when the query lacks a relevant or a
        non-relevant candidate (no pair can be formed).
    """
    s = scores.reshape(-1)
    y = labels.reshape(-1)
    pos_scores = s[y > 0.5]
    neg_scores = s[y < 0.5]
    if pos_scores.numel() == 0 or neg_scores.numel() == 0:
        return None
    # Broadcast to a (n_pos, n_neg) matrix of score margins.
    diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
    # softplus(-d) == log(1 + exp(-d)) but does not overflow to inf when a
    # negative outscores a positive by a large margin.
    return nn.functional.softplus(-diff).mean()

# Four input features per candidate; one optimiser step per training query.
model = LambdaRank(4).to(device)
opt = optim.AdamW(model.parameters(), lr=3e-3)

for ep in range(3):
    model.train()
    tot = 0
    cnt = 0
    for Xq, yq in zip(Xtr_list, ytr_list):
        xb = torch.tensor(Xq, dtype=torch.float32, device=device)
        yb = torch.tensor(yq, dtype=torch.float32, device=device)

        opt.zero_grad()
        sc = model(xb)
        l = pairwise_loss(sc, yb)
        if l is None:
            # No (relevant, non-relevant) pair for this query: nothing to learn from.
            continue

        l.backward()
        opt.step()
        tot += l.item()
        cnt += 1
    # Mean loss over the queries that actually produced an update.
    print("ep", ep, "loss", tot/max(cnt,1))

In [None]:

import numpy as np
def ndcg_at_k(labels, scores, k=10):
    """Normalised discounted cumulative gain of the top-`k` ranking.

    Parameters
    ----------
    labels : array-like
        Graded (or binary) relevance of each candidate.
    scores : array-like
        Model scores aligned with `labels`; higher means ranked earlier.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        DCG@k divided by the ideal DCG@k, using gain ``2**rel - 1`` and
        discount ``log2(rank + 1)``. Returns 0.0 when there are no candidates,
        ``k`` is not positive, or no candidate is relevant.
    """
    labels = np.asarray(labels, dtype=np.float64).ravel()
    scores = np.asarray(scores, dtype=np.float64).ravel()
    if labels.size == 0 or k <= 0:
        return 0.0

    order = np.argsort(-scores)[:k]
    rel = labels[order]
    discounts = np.log2(np.arange(2, len(rel) + 2))
    dcg = ((2.0 ** rel - 1) / discounts).sum()

    # The ideal ranking uses the same cut-off, so both sums share the discounts.
    ideal_rel = np.sort(labels)[::-1][:len(rel)]
    idcg = ((2.0 ** ideal_rel - 1) / discounts).sum()
    return float(dcg / idcg) if idcg > 0 else 0.0

# Score every dev query's candidates with the trained model and average NDCG@10.
# Only queries that kept at least one relevant candidate reach this loop, so the
# figure is conditional on the retrievers having found the relevant product.
model.eval()
ndcgs = []
with torch.no_grad():
    for Xq, yq in zip(Xdv_list, ydv_list):
        xb = torch.tensor(Xq, dtype=torch.float32, device=device)
        sc = model(xb).cpu().numpy().reshape(-1)
        ndcgs.append(ndcg_at_k(yq, sc, k=10))
print("Dev NDCG@10:", float(np.mean(ndcgs)))