# Search Relevance at Scale — Vespa vs FAISS, LightGBM vs PyTorch *(+ MS MARCO + FastAPI)*

This notebook provides a **dataset toggle** (Amazon Reviews Multi or MS MARCO), hybrid retrieval (BM25 + FAISS over e5 embeddings), Reciprocal Rank Fusion (RRF), LightGBM vs PyTorch ranking, an optional cross-encoder (CE) rerank, a Vespa scaffold, and **FastAPI** online scoring.

In [None]:

#@title Install dependencies
%%capture
# Upgrade pip first, then install every third-party package the notebook uses.
# No versions are pinned, so each run picks up whatever is latest at install time.
!pip -q install --upgrade pip
!pip -q install datasets transformers sentence-transformers faiss-cpu rank-bm25                  lightgbm scikit-learn torchmetrics pyvespa pandas numpy matplotlib tqdm                  langdetect unidecode scipy fastapi uvicorn pydantic

## 1) Imports & Config

In [None]:

import os, time, json, math, re, random, itertools
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from scipy.stats import entropy
import torch, torch.nn as nn, torch.optim as optim
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import faiss, lightgbm as lgb
from tqdm.auto import tqdm

# Prefer the GPU when torch can see one; otherwise everything runs on CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# One seed shared by the Python, NumPy and torch random number generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Notebook-wide settings. "DATASET" picks the loader ("amazon" selects Amazon
# Reviews Multi, anything else selects MS MARCO); "language", "N_DOCS" and
# "N_QUERIES" are forwarded to that loader. The whole dict is also written out
# with the other artifacts.
CONFIG = {
    "DATASET": "amazon",
    "language": "en",
    "N_DOCS": 60000,
    "N_QUERIES": 8000,
    "TOPK_BM25": 200,
    "TOPK_ANN": 200,
    "FUSION_K": 300,
    "RERANK_TOPN": 40,
    "USE_CE_RERANK": False,
}
CONFIG

## 2) Dataset Loaders — Amazon Reviews Multi and MS MARCO

In [None]:

def load_amazon_reviews_multi(lang="en", n_docs=60000, n_queries=8000, seed=SEED):
    """Build a product corpus and a query set from Amazon Reviews Multi.

    One document is produced per product: up to 10 review titles joined by
    " | ", followed by up to 5 review bodies joined by spaces, plus the mean
    star rating. Queries are individual review titles, each labelled with the
    product it was written for; only products that made it into the corpus
    contribute queries.

    Args:
        lang: dataset configuration name passed to ``load_dataset``.
        n_docs: maximum number of products to keep.
        n_queries: maximum number of (title, product) query rows to keep.
        seed: ``random_state`` used for both shuffles.

    Returns:
        ``(docs, queries)`` DataFrames. ``docs`` has columns ``product_id``,
        ``doc_text``, ``stars``; ``queries`` has ``query``, ``relevant_pid``.
    """
    wanted_cols = ["product_id", "review_title", "review_body", "stars"]
    raw = load_dataset("amazon_reviews_multi", lang, split="train")
    reviews = raw.to_pandas()[wanted_cols].dropna()

    def _first_titles(col):
        return " | ".join(col.head(10).astype(str))

    def _first_bodies(col):
        return " ".join(col.head(5).astype(str))

    # Collapse all reviews of a product into a single row.
    products = (
        reviews.groupby("product_id")
        .agg({"review_title": _first_titles, "review_body": _first_bodies, "stars": "mean"})
        .reset_index()
    )
    titles = products["review_title"].fillna("")
    bodies = products["review_body"].fillna("")
    products["doc_text"] = (titles + " " + bodies).str.strip()

    # Drop near-empty documents, then shuffle and cap the corpus size.
    long_enough = products["doc_text"].str.len() > 32
    products = products[long_enough].sample(frac=1, random_state=seed)
    products = products.head(n_docs).reset_index(drop=True)

    # Queries: review titles of the surviving products only.
    kept_ids = set(products["product_id"])
    in_corpus = reviews["product_id"].isin(kept_ids)
    queries = reviews[in_corpus][["review_title", "product_id"]].dropna()
    queries = queries.rename(columns={"review_title": "query", "product_id": "relevant_pid"})
    queries = queries.drop_duplicates()
    queries = queries.sample(frac=1, random_state=seed).head(n_queries).reset_index(drop=True)

    return products[["product_id", "doc_text", "stars"]], queries

def load_msmarco(n_docs=120000, n_queries=12000, seed=SEED):
    """Build a passage corpus and a query set from MS MARCO v2.1.

    Reads the first 10% of the train split. For every row that has at least
    one passage flagged ``is_selected == 1``, the first such passage becomes
    the relevant document for that row's query. Rows without a selected
    passage are skipped.

    Document ids are derived from a SHA-1 digest of the passage text, so the
    same passage gets the same id in every run. (The builtin ``hash()`` of a
    string is salted per interpreter process and would not be reproducible.)

    Args:
        n_docs: maximum number of passages to keep.
        n_queries: maximum number of queries to keep.
        seed: ``random_state`` used for both shuffles.

    Returns:
        ``(docs_df, queries_df)``. ``docs_df`` has columns ``product_id``,
        ``doc_text``, ``stars`` (``stars`` is a constant 0.0 placeholder);
        ``queries_df`` has ``query``, ``relevant_pid``. Every
        ``relevant_pid`` refers to a passage that is present in ``docs_df``.
    """
    import hashlib  # local import so the block does not depend on the file header

    ds = load_dataset("ms_marco", "v2.1", split="train[:10%]")
    docs, qrows, seen = [], [], {}
    for r in ds:
        passages = r.get("passages") or {}
        texts = passages.get("passage_text", [])
        first_pos = next(
            (i for i, sel in enumerate(passages.get("is_selected", [])) if sel == 1),
            None,
        )
        if first_pos is None or first_pos >= len(texts):
            continue
        pid_text = texts[first_pos]
        # Stable 12-digit id: same decimal-string format as before, but
        # deterministic across processes.
        digest = hashlib.sha1(str(pid_text).encode("utf-8")).hexdigest()
        did = str(int(digest, 16) % (10**12))
        if did not in seen:
            seen[did] = {"product_id": did, "doc_text": pid_text, "stars": 0.0}
            docs.append(seen[did])
        qrows.append({"query": r["query"], "relevant_pid": did})
        if len(docs) >= n_docs and len(qrows) >= n_queries:
            break

    docs_df = pd.DataFrame(docs, columns=["product_id", "doc_text", "stars"])
    docs_df = docs_df.sample(frac=1, random_state=seed).head(n_docs).reset_index(drop=True)

    queries_df = pd.DataFrame(qrows, columns=["query", "relevant_pid"])
    # Keep only queries whose relevant passage survived the n_docs cut, the
    # same way the Amazon loader restricts queries to kept products.
    kept_ids = set(docs_df["product_id"])
    queries_df = queries_df[queries_df["relevant_pid"].isin(kept_ids)]
    queries_df = queries_df.sample(frac=1, random_state=seed).head(n_queries).reset_index(drop=True)
    return docs_df, queries_df

# Pick the loader from CONFIG["DATASET"]: "amazon" uses Amazon Reviews Multi,
# any other value falls through to MS MARCO.
if CONFIG["DATASET"] == "amazon":
    docs_df, queries_df = load_amazon_reviews_multi(
        CONFIG["language"], CONFIG["N_DOCS"], CONFIG["N_QUERIES"]
    )
else:
    docs_df, queries_df = load_msmarco(CONFIG["N_DOCS"], CONFIG["N_QUERIES"])
len(docs_df), len(queries_df)

## 3) BM25 + FAISS (e5) + RRF

In [None]:

import re
def simple_tokenize(txt):
    """Lower-case ``txt`` (coerced with ``str``) and return its word tokens.

    A token is a maximal run of regex word characters (``\\w``); everything
    else acts as a separator.
    """
    lowered = str(txt).lower()
    return re.findall(r"\w+", lowered)
# Lexical index over the tokenized text of every document in docs_df.
bm25 = BM25Okapi(list(map(simple_tokenize, docs_df["doc_text"].tolist())))

def bm25_search(qs, k):
    """Return the top-``k`` BM25 hits for each query string.

    Args:
        qs: iterable of query strings.
        k: number of hits wanted per query. It is clamped to the range
            ``[0, corpus size]``, so asking for more hits than there are
            documents returns all of them instead of raising.

    Returns:
        ``(out_idx, out_scores)``: two lists with one NumPy array per query.
        ``out_idx[i]`` holds document positions ordered by descending BM25
        score and ``out_scores[i]`` the matching scores.
    """
    out_idx, out_scores = [], []
    for q in qs:
        s = np.asarray(bm25.get_scores(simple_tokenize(q)))
        top_n = max(0, min(int(k), len(s)))
        if top_n == 0:
            # Slicing with [-0:] would return the whole array, so handle it here.
            top = np.empty(0, dtype=np.intp)
        else:
            # Partial selection first, then sort only the selected entries.
            top = np.argpartition(s, -top_n)[-top_n:]
            top = top[np.argsort(-s[top])]
        out_idx.append(top)
        out_scores.append(s[top])
    return out_idx, out_scores

# Dense encoder shared by document indexing and query encoding.
# NOTE(review): the e5 model family is usually fed "query: " / "passage: "
# prefixed text; no prefix is added anywhere in this notebook — confirm intent.
dense = SentenceTransformer(
    model_name_or_path="intfloat/multilingual-e5-base",
    device=DEVICE,
)
def encode_texts(texts, bs=128, normal=True):
    """Encode ``texts`` with the module-level ``dense`` model.

    Args:
        texts: sequence of strings (must support ``len`` and slicing).
        bs: number of texts handed to the encoder per call.
        normal: forwarded as ``normalize_embeddings``.

    Returns:
        A float32 array with one row per input text. For empty input an
        array of shape ``(0, dim)`` is returned, where ``dim`` is the model's
        embedding size, rather than failing inside ``np.vstack``.
    """
    if len(texts) == 0:
        dim = dense.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype="float32")

    vecs = []
    for start in range(0, len(texts), bs):
        emb = dense.encode(
            texts[start:start + bs],
            batch_size=bs,
            convert_to_numpy=True,
            normalize_embeddings=normal,
            show_progress_bar=False,
        )
        vecs.append(emb.astype("float32"))
    return np.vstack(vecs)

import faiss
# Embed every document once (batch size 128, normalized vectors).
doc_vec = encode_texts(docs_df["doc_text"].tolist(), bs=128, normal=True)

# HNSW index over the document vectors. No metric is passed to the
# constructor, so the library default applies.
index = faiss.IndexHNSWFlat(doc_vec.shape[1], 32)
index.hnsw.efConstruction = 200
index.hnsw.efSearch = 128
index.add(doc_vec)
def ann_search(qs, k):
    """Encode the queries and look up ``k`` neighbours each in the FAISS index.

    Returns:
        ``(idx, sco)``: the second and first outputs of ``index.search``, in
        that order, so the neighbour ids come first.
        NOTE(review): the index is built without an explicit metric, so
        ``sco`` is presumably a distance (smaller = closer) rather than a
        similarity — confirm before using it as a feature.
    """
    query_vectors = encode_texts(qs, 128, True)
    sco, idx = index.search(query_vectors, k)
    return idx, sco

def rrf_fusion(bm_idx, ann_idx, k=300, K=60):
    """Fuse two ranked lists per query with Reciprocal Rank Fusion.

    Each document receives ``1 / (K + rank + 1)`` from every list it appears
    in (``rank`` is 0-based) and the contributions are summed.

    Args:
        bm_idx: per-query sequences of document ids, best first.
        ann_idx: per-query sequences of document ids, best first; indexed in
            step with ``bm_idx``.
        k: maximum number of fused ids to return per query.
        K: RRF smoothing constant.

    Returns:
        A list with one integer array per query holding the fused ids in
        descending score order. Ties keep first-seen order.

    Negative ids are ignored. FAISS pads result rows with -1 when it finds
    fewer than the requested number of neighbours, and a -1 passed downstream
    would silently address the last document via positional indexing.
    """
    fused = []
    for i, bm_ranked in enumerate(bm_idx):
        scores = {}
        for ranked in (bm_ranked, ann_idx[i]):
            for rank, doc_id in enumerate(ranked):
                doc_id = int(doc_id)
                if doc_id < 0:
                    continue
                scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (K + rank + 1)
        # sorted() is stable, so equal scores keep their insertion order.
        top = sorted(scores.items(), key=lambda item: -item[1])[:k]
        fused.append(np.array([doc_id for doc_id, _ in top], dtype=int))
    return fused

## 4) Candidates, Features, Labels

In [None]:

# Hold out 20% of the queries as a dev set; the split is shuffled and seeded.
train_q, dev_q = train_test_split(
    queries_df,
    test_size=0.2,
    shuffle=True,
    random_state=SEED,
)
def build_candidates_and_labels(qdf, topk_bm25=None, topk_ann=None, fusion_k=None):
    """Retrieve candidates for every query and turn them into ranking data.

    For each query the BM25 and ANN result lists are fused with
    ``rrf_fusion``. Every fused candidate gets four features, in this order:
    its BM25 rank, its ANN rank (9999 when absent from that list), the
    document's ``stars`` value (0.0 if ``docs_df`` has no such column) and
    the whitespace-token length of the document text. The label is 1.0 when
    the candidate's ``product_id`` equals the query's ``relevant_pid`` and
    0.0 otherwise. Queries whose candidates contain no positive are dropped.

    Args:
        qdf: DataFrame with ``query`` and ``relevant_pid`` columns.
        topk_bm25: BM25 depth; defaults to ``CONFIG["TOPK_BM25"]`` (200).
        topk_ann: ANN depth; defaults to ``CONFIG["TOPK_ANN"]`` (200).
        fusion_k: fused list length; defaults to ``CONFIG["FUSION_K"]`` (300).

    Returns:
        ``(feats, labels, groups)``: per-query float32 feature matrices,
        per-query float32 label vectors, and the candidate count per query.
    """
    if topk_bm25 is None:
        topk_bm25 = CONFIG.get("TOPK_BM25", 200)
    if topk_ann is None:
        topk_ann = CONFIG.get("TOPK_ANN", 200)
    if fusion_k is None:
        fusion_k = CONFIG.get("FUSION_K", 300)

    qs = qdf["query"].tolist()
    bm_idx, _ = bm25_search(qs, topk_bm25)
    ann_idx, _ = ann_search(qs, topk_ann)
    fused = rrf_fusion(bm_idx, ann_idx, fusion_k)

    # Read the per-document columns once; a DataFrame row lookup per
    # candidate is far slower than indexing plain sequences.
    n_docs = len(docs_df)
    doc_pids = [str(p) for p in docs_df["product_id"]]
    if "stars" in docs_df.columns:
        doc_stars = [float(v) for v in docs_df["stars"]]
    else:
        doc_stars = [0.0] * n_docs
    doc_lens = [len(str(t).split()) for t in docs_df["doc_text"]]

    feats, labels, groups = [], [], []
    for i, rel in enumerate(qdf["relevant_pid"]):
        rel_pid = str(rel)
        bm_pos = {int(d): r for r, d in enumerate(bm_idx[i])}
        an_pos = {int(d): r for r, d in enumerate(ann_idx[i])}
        q_feats, q_labels = [], []
        for d in fused[i]:
            d = int(d)
            if d < 0:
                # FAISS uses -1 for "no neighbour"; it must not be treated
                # as a position (it would address the last document).
                continue
            q_feats.append([bm_pos.get(d, 9999), an_pos.get(d, 9999), doc_stars[d], doc_lens[d]])
            q_labels.append(1.0 if doc_pids[d] == rel_pid else 0.0)
        if sum(q_labels) == 0:
            continue
        feats.append(np.array(q_feats, np.float32))
        labels.append(np.array(q_labels, np.float32))
        groups.append(len(q_feats))
    return feats, labels, groups

# Build features, labels and group sizes for the train split, then the dev split.
(Xtr_list, ytr_list, gtr), (Xdv_list, ydv_list, gdv) = (
    build_candidates_and_labels(split) for split in (train_q, dev_q)
)
len(Xtr_list), len(Xdv_list)

## 5) Rankers — LightGBM LambdaMART & PyTorch LambdaRank

In [None]:

def flatten_for_lgb(X_list, y_list):
    """Concatenate per-query arrays along axis 0.

    Returns ``(X, y)`` where ``X`` stacks the arrays of ``X_list`` and ``y``
    stacks those of ``y_list``, both in their original query order.
    """
    stacked_features = np.concatenate(X_list, axis=0)
    stacked_labels = np.concatenate(y_list, axis=0)
    return stacked_features, stacked_labels
# Flatten the per-query arrays; LightGBM takes one matrix plus group sizes.
Xtr, ytr = flatten_for_lgb(Xtr_list, ytr_list)
Xdv, ydv = flatten_for_lgb(Xdv_list, ydv_list)
lgb_train = lgb.Dataset(Xtr, label=ytr, group=gtr)
lgb_valid = lgb.Dataset(Xdv, label=ydv, group=gdv, reference=lgb_train)

params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_at": [10],
    "learning_rate": 0.05,
    "num_leaves": 63,
    "min_data_in_leaf": 50,
    "feature_pre_filter": False,
    "verbose": -1,
}

# Early stopping and periodic logging are passed as callbacks. The former
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments of lgb.train
# were removed in LightGBM 4.0, and the install cell does not pin a version.
lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=300,
    valid_sets=[lgb_valid],
    callbacks=[
        lgb.early_stopping(stopping_rounds=30),
        lgb.log_evaluation(period=50),
    ],
)

In [None]:

class LambdaRankTorch(nn.Module):
    """Two-layer MLP scorer: Linear(in_dim, 128) -> ReLU -> Linear(128, 1).

    Maps each feature row to a single relevance score.
    """

    def __init__(self, in_dim):
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(in_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
        ]
        # Attribute name and layer order define the state_dict keys that are
        # saved later, so they are kept exactly as "mlp.0" / "mlp.2".
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Return one score per input row, shape ``(..., 1)``."""
        return self.mlp(x)
def pairwise_loss(scores, labels):
    """Mean pairwise logistic loss over all (positive, negative) pairs.

    For every positive item ``p`` (label > 0.5) and negative item ``n``
    (label < 0.5) the loss term is ``log(1 + exp(-(s_p - s_n)))``.

    Args:
        scores: tensor of model scores; flattened with ``view(-1)``.
        labels: tensor of relevance labels, same number of elements.

    Returns:
        A scalar tensor, or ``None`` when there is no positive or no
        negative item (no pair can be formed).

    The term is computed with ``softplus`` rather than
    ``log1p(exp(.))``: the latter overflows to ``inf`` (and yields NaN
    gradients) once a negative outscores a positive by a large margin.
    """
    s = scores.view(-1)
    y = labels.view(-1)
    pos = torch.where(y > 0.5)[0]
    neg = torch.where(y < 0.5)[0]
    if pos.numel() == 0 or neg.numel() == 0:
        return None
    # Broadcast to a (num_pos, num_neg) matrix of score differences.
    diff = s[pos].unsqueeze(1) - s[neg].unsqueeze(0)
    return nn.functional.softplus(-diff).mean()
def train_lambdarank_torch(X_list, y_list, epochs=3, lr=3e-3, device=DEVICE):
    """Train a ``LambdaRankTorch`` scorer with ``pairwise_loss``.

    One optimizer step is taken per query, in list order. Queries for which
    ``pairwise_loss`` returns ``None`` are skipped. The mean loss over the
    queries that produced a step is printed after every epoch.

    Args:
        X_list: per-query feature matrices; the first one fixes the input size.
        y_list: per-query label vectors, aligned with ``X_list``.
        epochs: number of passes over the queries.
        lr: AdamW learning rate.
        device: torch device for the model and the batches.

    Returns:
        The trained model.
    """
    n_features = X_list[0].shape[1]
    model = LambdaRankTorch(in_dim=n_features).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        loss_sum = 0
        n_steps = 0
        for query_feats, query_labels in zip(X_list, y_list):
            batch_x = torch.tensor(query_feats, dtype=torch.float32).to(device)
            batch_y = torch.tensor(query_labels, dtype=torch.float32).to(device)
            optimizer.zero_grad()
            loss = pairwise_loss(model(batch_x), batch_y)
            if loss is not None:
                loss.backward()
                optimizer.step()
                loss_sum += loss.item()
                n_steps += 1
        print(f"[Torch] epoch {epoch} loss {loss_sum/max(n_steps,1):.4f}")
    return model
# Train the PyTorch ranker on the training queries (default device).
torch_model = train_lambdarank_torch(
    X_list=Xtr_list,
    y_list=ytr_list,
    epochs=3,
    lr=3e-3,
)

## 6) Evaluation + Artifact Save (for FastAPI)

In [None]:

def ndcg_at_k(labels, scores, k=10):
    """Compute NDCG@k for one query with gain ``2**rel - 1``.

    Args:
        labels: relevance labels, one per candidate.
        scores: model scores, aligned with ``labels``; higher ranks first.
        k: cut-off depth.

    Returns:
        DCG@k of the score ordering divided by DCG@k of the ideal ordering,
        as a Python float. Returns 0.0 when the ideal DCG is zero (no
        relevant candidate, or empty input).
    """
    labels = np.asarray(labels, dtype=float).ravel()
    scores = np.asarray(scores, dtype=float).ravel()

    def _dcg(rels):
        # Positions are 1-based, so the discount is log2(position + 1).
        discounts = np.log2(np.arange(2, len(rels) + 2))
        return float(((2.0 ** rels - 1.0) / discounts).sum())

    ranked_rel = labels[np.argsort(-scores)[:k]]
    # The ideal ranking is truncated to k as well; otherwise its length would
    # not match the discount vector whenever there are more than k candidates.
    ideal_rel = np.sort(labels)[::-1][:k]

    ideal_dcg = _dcg(ideal_rel)
    if ideal_dcg <= 0:
        return 0.0
    return _dcg(ranked_rel) / ideal_dcg

def eval_ranker_lightgbm(X_list, y_list, model):
    """Score every query with the LightGBM model and average two metrics.

    Predictions use ``model.best_iteration``.

    Returns:
        ``(mean NDCG@10, mean hit@100)`` as Python floats, where hit@100 is 1
        for a query when any of its 100 highest-scored candidates has a
        positive label.
    """
    ndcg_per_query = []
    hit_per_query = []
    for query_feats, query_labels in zip(X_list, y_list):
        preds = model.predict(query_feats, num_iteration=model.best_iteration)
        ndcg_per_query.append(ndcg_at_k(query_labels, preds, 10))
        top_100 = np.argsort(-preds)[:100]
        found = np.sum(np.array(query_labels)[top_100]) > 0
        hit_per_query.append(int(found))
    mean_ndcg = float(np.mean(ndcg_per_query))
    mean_hit = float(np.mean(hit_per_query))
    return mean_ndcg, mean_hit

def eval_ranker_torch(X_list, y_list, model):
    """Score every query with the PyTorch model and average two metrics.

    Inputs are placed on the device of the model's own parameters, so
    evaluation also works when the model was trained with a ``device``
    other than the global ``DEVICE``. A model without parameters falls back
    to ``DEVICE``.

    Returns:
        ``(mean NDCG@10, mean hit@100)`` as Python floats, where hit@100 is 1
        for a query when any of its 100 highest-scored candidates has a
        positive label.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE

    model.eval()
    ndcgs, rec = [], []
    with torch.no_grad():
        for Xq, yq in zip(X_list, y_list):
            xb = torch.tensor(Xq, dtype=torch.float32).to(device)
            sc = model(xb).cpu().numpy().ravel()
            ndcgs.append(ndcg_at_k(yq, sc, 10))
            top_100 = np.argsort(-sc)[:100]
            rec.append(int(np.sum(np.array(yq)[top_100]) > 0))
    return float(np.mean(ndcgs)), float(np.mean(rec))

# Dev-set (NDCG@10, hit@100) for each ranker.
lgb_dev_metrics = eval_ranker_lightgbm(Xdv_list, ydv_list, lgb_model)
print("LightGBM:", lgb_dev_metrics)
torch_dev_metrics = eval_ranker_torch(Xdv_list, ydv_list, torch_model)
print("PyTorch :", torch_dev_metrics)

# Directory that receives every artifact written below.
ART_DIR = "/content/artifacts"
os.makedirs(ART_DIR, exist_ok=True)


def _write_json(filename, payload):
    """Serialize ``payload`` as JSON into ART_DIR, closing the file afterwards."""
    with open(os.path.join(ART_DIR, filename), "w", encoding="utf-8") as fh:
        fh.write(json.dumps(payload))


# Column-oriented document metadata; list positions match the rows of docs_df.
docs_meta = {
    "product_id": docs_df["product_id"].astype(str).tolist(),
    "doc_text": docs_df["doc_text"].tolist(),
    "stars": docs_df["stars"].tolist() if "stars" in docs_df.columns else [0.0] * len(docs_df),
}
_write_json("docs_meta.json", docs_meta)

faiss.write_index(index, os.path.join(ART_DIR, "faiss_hnsw.index"))
lgb_model.save_model(
    os.path.join(ART_DIR, "lgb_lambdamart.txt"),
    num_iteration=lgb_model.best_iteration,
)
# in_dim is stored next to the weights so the MLP can be rebuilt before loading.
torch.save(
    {"state_dict": torch_model.state_dict(), "in_dim": Xtr_list[0].shape[1]},
    os.path.join(ART_DIR, "torch_lambdarank.pt"),
)
_write_json("dense_model.json", {"name": "intfloat/multilingual-e5-base"})
_write_json("config.json", CONFIG)
os.listdir(ART_DIR)