# Search Relevance at Scale — Vespa vs FAISS, LightGBM vs PyTorch

This notebook is **Colab-executable** and walks through a scalable search relevance pipeline:
- **Data**: Amazon Reviews Multi (medium/large, multilingual).
- **Retrieval**: BM25 (lexical) + **FAISS HNSW** (dense, multilingual e5).
- **Fusion**: RRF.
- **Ranking**: **LightGBM LambdaMART** vs **PyTorch LambdaRank** (pairwise).
- **(Optional)** Rerank: Cross-Encoder (MiniLM) on top-*N*.
- **(Optional)** Vespa: build a minimal **application package**; deploy if you have Docker/endpoint. Colab cannot run Vespa locally, but this notebook generates the package and shows the feed/query code you can reuse with a running Vespa instance.

**Metrics**: NDCG@10, Recall@100, zero-result rate, simple latency sampling.  
**Monitoring**: feature drift (PSI), query mix entropy, quick interleaving simulation.

In [None]:

#@title Install dependencies
%%capture
!pip -q install --upgrade pip
!pip -q install datasets transformers sentence-transformers faiss-cpu rank-bm25                  lightgbm scikit-learn torchmetrics pyvespa pandas numpy matplotlib tqdm                  langdetect unidecode scipy

## 1) Imports & Config

In [None]:

import os, time, json, math, re, random, itertools
import numpy as np, pandas as pd
import matplotlib.pyplot as plt

import torch, torch.nn as nn, torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from scipy.stats import entropy

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import faiss
import lightgbm as lgb

from tqdm.auto import tqdm
from langdetect import detect as lang_detect
from unidecode import unidecode

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# One seed shared by NumPy, PyTorch and the stdlib RNG so reruns are comparable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)

# Pipeline knobs read by the cells below.
CONFIG = dict(
    language="en",        # any of: 'en','de','fr','ja','zh','es','it'...
    N_DOCS=60000,         # scale up if Colab allows (memory/ runtime)
    N_QUERIES=8000,
    TOPK_BM25=200,        # candidates requested from BM25 per query
    TOPK_ANN=200,         # candidates requested from the ANN index per query
    FUSION_K=300,         # candidates kept after rank fusion
    RERANK_TOPN=40,       # used only if you enable cross-encoder rerank
    USE_CE_RERANK=False,  # set True to enable (slower)
)

## 2) Load & Build a Product Corpus + Queries

In [None]:

def load_amazon_reviews_multi(lang="en", n_docs=60000, n_queries=8000, seed=SEED):
    """Build a product corpus and a weakly labelled query set from Amazon Reviews Multi.

    Reviews from the ``train`` split are grouped by ``product_id``. Each product
    becomes one document made of its first 10 review titles (joined with " | ")
    followed by its first 5 review bodies, plus the mean star rating.

    Queries are individual review titles; the product the review belongs to is
    the single relevant document. Note that a query title can therefore appear
    verbatim inside its own product's ``doc_text``.

    Args:
        lang: Dataset configuration name passed to ``load_dataset``.
        n_docs: Maximum number of products kept (after shuffling).
        n_queries: Maximum number of (query, relevant_pid) pairs kept.
        seed: Random state used for both shuffles.

    Returns:
        ``(prod, q_df)`` where ``prod`` has columns ``product_id``, ``doc_text``,
        ``stars`` and ``q_df`` has columns ``query``, ``relevant_pid``.
    """
    wanted = ["product_id", "review_title", "review_body", "stars"]
    reviews = load_dataset("amazon_reviews_multi", lang, split="train").to_pandas()[wanted].dropna()

    def _first_titles(col):
        return " | ".join(col.head(10).astype(str))

    def _first_bodies(col):
        return " ".join(col.head(5).astype(str))

    # One row per product: concatenated titles, concatenated bodies, mean rating.
    products = (
        reviews.groupby("product_id")
        .agg({"review_title": _first_titles, "review_body": _first_bodies, "stars": "mean"})
        .reset_index()
    )
    titles = products["review_title"].fillna("")
    bodies = products["review_body"].fillna("")
    products["doc_text"] = (titles + " " + bodies).str.strip()

    # Drop near-empty documents, then shuffle and cap the corpus size.
    long_enough = products["doc_text"].str.len() > 32
    products = (
        products[long_enough]
        .sample(frac=1, random_state=seed)
        .head(n_docs)
        .reset_index(drop=True)
    )

    # Weak labels: every review title of a kept product is a query for that product.
    kept_ids = set(products["product_id"])
    in_corpus = reviews["product_id"].isin(kept_ids)
    pairs = reviews.loc[in_corpus, ["review_title", "product_id"]].dropna()
    pairs = pairs.rename(columns={"review_title":"query","product_id":"relevant_pid"}).drop_duplicates()
    pairs = pairs.sample(frac=1, random_state=seed).head(n_queries).reset_index(drop=True)

    return products[["product_id","doc_text","stars"]], pairs

# Materialise the corpus and the weakly labelled queries using the sizes in CONFIG.
docs_df, queries_df = load_amazon_reviews_multi(CONFIG["language"], CONFIG["N_DOCS"], CONFIG["N_QUERIES"])
# Notebook display: a peek at both frames plus their row counts.
docs_df.head(), queries_df.head(), len(docs_df), len(queries_df)

## 3) Lexical Retrieval (BM25/OpenSearch-like)

In [None]:

def simple_tokenize(txt):
    """Lower-case ``txt`` and split it into word tokens.

    The input is coerced with ``str()``, so non-string values (including
    ``None``) are tokenised from their string form. Every run of non-word
    characters acts as a separator.
    """
    flattened = str(txt).lower().replace("\n", " ")
    spaced = re.sub(r"\W+", " ", flattened)
    return list(filter(None, spaced.split()))

# Lexical index over the tokenised corpus; row i corresponds to docs_df row i.
bm25 = BM25Okapi([simple_tokenize(t) for t in docs_df["doc_text"].tolist()])

def bm25_search(qs, k):
    """Return the top-``k`` BM25 documents for each query.

    Args:
        qs: Iterable of query strings.
        k: Number of documents wanted per query. Values larger than the corpus
            are clamped to the corpus size (``np.argpartition`` would otherwise
            raise); values <= 0 yield empty results.

    Returns:
        ``(out_idx, out_scores)``: two lists with one array per query, holding
        corpus row positions and their BM25 scores, best first.
    """
    out_idx, out_scores = [], []
    for q in qs:
        s = bm25.get_scores(simple_tokenize(q))
        n_top = max(0, min(int(k), len(s)))
        if n_top == 0:
            out_idx.append(np.empty(0, dtype=np.intp))
            out_scores.append(np.empty(0, dtype=float))
            continue
        # Partial selection first, then sort only the selected slice by score.
        top = np.argpartition(s, -n_top)[-n_top:]
        top = top[np.argsort(-s[top])]
        out_idx.append(top)
        out_scores.append(s[top])
    return out_idx, out_scores

## 4) Dense Retrieval (SentenceTransformers e5 + FAISS HNSW)

In [None]:

# Shared dense encoder used for both documents and queries.
dense = SentenceTransformer("intfloat/multilingual-e5-base", device=DEVICE)

def encode_texts(texts, bs=128, normal=True):
    """Embed ``texts`` with the shared encoder and return one float32 matrix.

    Args:
        texts: Sliceable sequence of strings.
        bs: Chunk size; also forwarded as the encoder's ``batch_size``.
        normal: Forwarded as ``normalize_embeddings``.

    Returns:
        The per-chunk embeddings stacked row-wise, in input order.

    NOTE(review): texts are encoded as given, with no task prefix. Confirm
    against the e5 model card whether "query: " / "passage: " prefixes are
    expected for this checkpoint.
    """
    chunks = [texts[start:start + bs] for start in range(0, len(texts), bs)]
    embedded = [
        dense.encode(
            chunk,
            batch_size=bs,
            convert_to_numpy=True,
            normalize_embeddings=normal,
            show_progress_bar=False,
        ).astype("float32")
        for chunk in chunks
    ]
    return np.vstack(embedded)

# Embed the whole corpus once; row i of doc_vec corresponds to docs_df row i.
doc_vec = encode_texts(docs_df["doc_text"].tolist(), bs=128, normal=True)
# HNSW graph index over the document vectors (second argument 32 is the graph
# connectivity parameter — presumably FAISS's "M"; confirm against the FAISS docs).
# NOTE(review): no metric is passed, so the index uses FAISS's default metric —
# confirm it is the intended one for these normalised embeddings.
index = faiss.IndexHNSWFlat(doc_vec.shape[1], 32)
# Build-time search breadth; set before add() so it applies while the graph is built.
index.hnsw.efConstruction = 200
# Query-time search breadth used by index.search().
index.hnsw.efSearch = 128
index.add(doc_vec)

def ann_search(qs, k):
    """Embed the queries and fetch ``k`` nearest documents from the FAISS index.

    Returns:
        ``(idx, sco)``: the index's neighbour-id matrix followed by the matching
        value matrix, i.e. the two outputs of ``index.search`` in swapped order.
        NOTE(review): the values come straight from the index; whether larger
        or smaller is better depends on the index metric — confirm before
        treating them as similarity scores.
    """
    query_vectors = encode_texts(qs, bs=128, normal=True)
    values, neighbours = index.search(query_vectors, k)
    return neighbours, values

## 5) Candidate Fusion (RRF)

In [None]:

def rrf_fusion(bm_idx, ann_idx, k=300, K=60):
    """Fuse two ranked candidate lists per query with Reciprocal Rank Fusion.

    Each document receives ``1 / (K + rank)`` (rank starting at 1) from every
    list it appears in; the contributions are summed and the ``k`` best
    documents are kept.

    Args:
        bm_idx: Per-query sequences of document ids, best first.
        ann_idx: Per-query sequences of document ids, best first, aligned
            with ``bm_idx``.
        k: Number of fused candidates to keep per query.
        K: RRF smoothing constant.

    Returns:
        A list with one integer array of document ids per query, best first.
        Ties keep first-seen order (``bm_idx`` entries before ``ann_idx``).
    """
    fused = []
    for bm_ranked, ann_ranked in zip(bm_idx, ann_idx):
        scores = {}
        for ranked in (bm_ranked, ann_ranked):
            for r, d in enumerate(ranked):
                # Negative ids are "no result" placeholders (ANN libraries pad
                # short result lists with -1); they must not be fused as docs.
                if d < 0:
                    continue
                scores[d] = scores.get(d, 0.0) + 1.0 / (K + r + 1)
        top = sorted(scores.items(), key=lambda x: -x[1])[:k]
        fused.append(np.array([d for d, _ in top], dtype=int))
    return fused

## 6) Build Candidates & Labels

In [None]:

# Hold out 20% of the weakly labelled queries for evaluation; seeded so the split is reproducible.
train_q, dev_q = train_test_split(queries_df, test_size=0.2, random_state=SEED, shuffle=True)

def build_candidates_and_labels(qdf):
    """Retrieve, fuse and featurise candidates for every query in ``qdf``.

    For each query the BM25 and ANN result lists are fused with RRF, and each
    fused candidate gets the feature vector
    ``[bm25_rank, ann_rank, mean_stars, doc_length_in_words]``. A candidate
    missing from one retriever's list gets rank 9999 for that retriever.

    Queries whose relevant product is not among the fused candidates are
    dropped, so the returned lists can be shorter than ``qdf`` and are not
    positionally aligned with it.

    Args:
        qdf: DataFrame with ``query`` and ``relevant_pid`` columns.

    Returns:
        ``(feats, labels, groups)``: per-query float32 feature matrices,
        per-query float32 label vectors (1.0 for the relevant product), and
        the number of candidates per kept query.
    """
    qs = qdf["query"].tolist()
    bm_idx, _ = bm25_search(qs, CONFIG["TOPK_BM25"])
    ann_idx, _ = ann_search(qs, CONFIG["TOPK_ANN"])
    fused = rrf_fusion(bm_idx, ann_idx, CONFIG["FUSION_K"])

    missing_rank = 9999

    # Pull the document columns out of pandas once; per-candidate
    # `docs_df.iloc[d][...]` lookups dominate runtime at this scale.
    doc_pids = docs_df["product_id"].to_numpy()
    doc_stars = docs_df["stars"].to_numpy(dtype=float)
    doc_lens = np.array([len(str(t).split()) for t in docs_df["doc_text"]], dtype=np.int64)

    feats, labels, groups = [], [], []
    for i, rel_pid in enumerate(qdf["relevant_pid"].tolist()):
        cand = np.asarray(fused[i], dtype=int)
        if cand.size == 0:
            continue
        q_labels = (doc_pids[cand] == rel_pid).astype(np.float32)
        # Without a positive the query carries no ranking signal; skip it.
        if not q_labels.any():
            continue
        bm_positions = {d: r for r, d in enumerate(bm_idx[i])}
        ann_positions = {d: r for r, d in enumerate(ann_idx[i])}
        bm_rank = [bm_positions.get(d, missing_rank) for d in cand]
        ann_rank = [ann_positions.get(d, missing_rank) for d in cand]
        q_feats = np.column_stack([bm_rank, ann_rank, doc_stars[cand], doc_lens[cand]])
        feats.append(q_feats.astype(np.float32))
        labels.append(q_labels)
        groups.append(len(cand))
    return feats, labels, groups

# Per-query feature matrices, label vectors and group sizes for both splits.
Xtr_list, ytr_list, gtr = build_candidates_and_labels(train_q)
Xdv_list, ydv_list, gdv = build_candidates_and_labels(dev_q)
# Notebook display: number of kept queries per split and total candidate rows per split.
len(Xtr_list), len(Xdv_list), sum(gtr), sum(gdv)

## 7) LightGBM LambdaMART Ranking

In [None]:

def flatten_for_lgb(X_list, y_list):
    """Stack per-query feature matrices and label vectors into flat arrays.

    Row order follows the order of the input lists, so a list of per-query
    sizes in the same order describes the query boundaries in the result.
    """
    stacked_features = np.concatenate(X_list, 0)
    stacked_labels = np.concatenate(y_list, 0)
    return stacked_features, stacked_labels

# Flat row-wise arrays; gtr/gdv give the number of consecutive rows per query.
Xtr, ytr = flatten_for_lgb(Xtr_list, ytr_list)
Xdv, ydv = flatten_for_lgb(Xdv_list, ydv_list)

# Ranking datasets: `group` tells LightGBM where each query's rows start and end.
lgb_train = lgb.Dataset(Xtr, label=ytr, group=gtr)
# The validation set references the training set so both are constructed consistently.
lgb_valid = lgb.Dataset(Xdv, label=ydv, group=gdv, reference=lgb_train)

# LambdaMART configuration; validation is scored with NDCG@10.
params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_at": [10],
    "learning_rate": 0.05,
    "num_leaves": 63,
    "min_data_in_leaf": 50,
    "feature_pre_filter": False,
    "verbose": -1
}

# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments of lgb.train
# are rejected by LightGBM 4.x, which an unpinned `pip install` pulls in.
lgb_model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_valid],
    num_boost_round=300,
    callbacks=[
        lgb.early_stopping(stopping_rounds=30),
        lgb.log_evaluation(period=50),
    ],
)

## 8) PyTorch LambdaRank (pairwise logistic loss, RankNet-style — no NDCG-based pair weighting)

In [None]:

class LambdaRankTorch(nn.Module):
    """Small MLP scorer: one hidden layer of 128 ReLU units, one score per row."""

    def __init__(self, in_dim):
        super().__init__()
        hidden_width = 128
        layers = [
            nn.Linear(in_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, 1),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Score each input row; the output keeps a trailing dimension of size 1."""
        scores = self.mlp(x)
        return scores

def pairwise_loss(scores, labels):
    """Mean pairwise logistic loss over all (positive, negative) pairs of one query.

    For every positive/negative pair the loss is ``log(1 + exp(-(s_pos - s_neg)))``.
    It is computed with ``softplus`` so that strongly mis-ordered pairs do not
    overflow ``exp`` and turn the loss (and its gradients) into inf/NaN.

    Args:
        scores: Tensor of model scores; flattened internally.
        labels: Tensor of labels; values > 0.5 are positives, < 0.5 negatives.

    Returns:
        A scalar tensor, or ``None`` when the query has no positive or no
        negative (no pair can be formed).
    """
    s = scores.view(-1)
    y = labels.view(-1)
    pos = torch.where(y > 0.5)[0]
    neg = torch.where(y < 0.5)[0]
    if len(pos) == 0 or len(neg) == 0:
        return None
    # Broadcast to a (num_pos, num_neg) matrix of score margins.
    diff = s[pos].unsqueeze(1) - s[neg].unsqueeze(0)
    return torch.mean(nn.functional.softplus(-diff))

def train_lambdarank_torch(X_list, y_list, epochs=3, lr=3e-3, device=DEVICE):
    """Train the MLP scorer with one optimiser step per query.

    Args:
        X_list: Per-query feature matrices.
        y_list: Per-query label vectors aligned with ``X_list``.
        epochs: Number of passes over the queries.
        lr: AdamW learning rate.
        device: Torch device for the model and the per-query tensors.

    Returns:
        The trained model. Queries for which the pairwise loss is undefined
        (returned as ``None``) contribute no update and are excluded from the
        printed average loss.
    """
    n_features = X_list[0].shape[1]
    model = LambdaRankTorch(in_dim=n_features).to(device)
    opt = optim.AdamW(model.parameters(), lr=lr)

    for ep in range(epochs):
        model.train()
        running_loss, n_steps = 0.0, 0
        for feats_q, labels_q in zip(X_list, y_list):
            xb = torch.tensor(feats_q, dtype=torch.float32).to(device)
            yb = torch.tensor(labels_q, dtype=torch.float32).to(device)
            opt.zero_grad()
            loss = pairwise_loss(model(xb), yb)
            if loss is not None:
                loss.backward()
                opt.step()
                running_loss += loss.item()
                n_steps += 1
        avg = running_loss / max(n_steps, 1)
        print(f"[Torch LambdaRank] epoch {ep} avg loss {avg:.4f}")
    return model

# Fit the PyTorch ranker on the training queries.
torch_model = train_lambdarank_torch(Xtr_list, ytr_list, epochs=3, lr=3e-3)

## 9) Evaluation: NDCG@10 / Recall@100 / Zero-Result Rate

In [None]:

def ndcg_at_k(labels, scores, k=10):
    """Compute NDCG@k for a single query.

    Gains are ``2**label - 1`` and discounts are ``1 / log2(rank + 1)``. The
    ideal DCG uses the ``k`` largest labels, so the result lies in [0, 1].

    Args:
        labels: Relevance labels, one per candidate.
        scores: Predicted scores aligned with ``labels`` (higher is better).
        k: Cut-off rank.

    Returns:
        NDCG@k as a float; 0.0 when there are no candidates, ``k`` is not
        positive, or no candidate has positive gain.
    """
    labels = np.asarray(labels, dtype=float)
    scores = np.asarray(scores, dtype=float)
    if labels.size == 0 or k <= 0:
        return 0.0
    order = np.argsort(-scores)[:k]
    discounts = 1.0 / np.log2(np.arange(2, len(order) + 2))
    dcg = float(((2.0 ** labels[order] - 1.0) * discounts).sum())
    # Ideal ordering: labels sorted descending, truncated to the same cut-off.
    ideal_labels = np.sort(labels)[::-1][:k]
    idcg = float(((2.0 ** ideal_labels - 1.0) * discounts).sum())
    return dcg / idcg if idcg > 0 else 0.0

def eval_ranker_lightgbm(X_list, y_list, model):
    """Score every query with a LightGBM booster and average the metrics.

    Returns:
        ``(mean NDCG@10, mean hit@100)`` where the second value is the share
        of queries with at least one positive label among the 100
        highest-scored candidates.
    """
    per_query_ndcg = []
    per_query_hit = []
    for feats_q, labels_q in zip(X_list, y_list):
        preds = model.predict(feats_q, num_iteration=model.best_iteration)
        per_query_ndcg.append(ndcg_at_k(labels_q, preds, 10))
        head = np.argsort(-preds)[:100]
        found = np.asarray(labels_q)[head].sum() > 0
        per_query_hit.append(int(found))
    mean_ndcg = float(np.mean(per_query_ndcg))
    mean_hit = float(np.mean(per_query_hit))
    return mean_ndcg, mean_hit

def eval_ranker_torch(X_list, y_list, model):
    """Score every query with the PyTorch model and average the metrics.

    The model is switched to eval mode and run without gradient tracking.

    Returns:
        ``(mean NDCG@10, mean hit@100)`` where the second value is the share
        of queries with at least one positive label among the 100
        highest-scored candidates.
    """
    model.eval()
    per_query_ndcg = []
    per_query_hit = []
    with torch.no_grad():
        for feats_q, labels_q in zip(X_list, y_list):
            batch = torch.tensor(feats_q, dtype=torch.float32).to(DEVICE)
            preds = model(batch).cpu().numpy().ravel()
            per_query_ndcg.append(ndcg_at_k(labels_q, preds, 10))
            head = np.argsort(-preds)[:100]
            found = np.asarray(labels_q)[head].sum() > 0
            per_query_hit.append(int(found))
    mean_ndcg = float(np.mean(per_query_ndcg))
    mean_hit = float(np.mean(per_query_hit))
    return mean_ndcg, mean_hit

# Evaluate both rankers on the same dev candidates. The dev lists only contain
# queries whose relevant product was retrieved, so these numbers measure
# re-ranking quality given a successful retrieval, not end-to-end recall.
lgb_ndcg10, lgb_recall100 = eval_ranker_lightgbm(Xdv_list, ydv_list, lgb_model)
torch_ndcg10, torch_recall100 = eval_ranker_torch(Xdv_list, ydv_list, torch_model)

print("LightGBM   — NDCG@10:", lgb_ndcg10,  "Recall@100:", lgb_recall100)
print("PyTorch    — NDCG@10:", torch_ndcg10, "Recall@100:", torch_recall100)

## 10) Optional: Cross-Encoder Rerank (Budgeted Precision)

In [None]:

# Optional cross-encoder pass over the top-N LightGBM candidates of each dev query.
USE_CE = bool(CONFIG.get("USE_CE_RERANK", False))
if USE_CE:
    from sentence_transformers import CrossEncoder
    teacher_pairs = []
    # NOTE(review): Xdv_list/ydv_list only contain the queries kept by
    # build_candidates_and_labels (queries without a positive are skipped),
    # while dev_q.itertuples() yields every dev query — so this zip can pair a
    # feature matrix with the wrong query text once any query was skipped.
    for Xq, yq, qrow in zip(Xdv_list, ydv_list, dev_q.itertuples(index=False)):
        sc = lgb_model.predict(Xq, num_iteration=lgb_model.best_iteration)
        # Row positions inside this query's candidate matrix, best first.
        order = np.argsort(-sc)[:CONFIG["RERANK_TOPN"]]
        for d in order:
            # NOTE(review): `d` is a position within Xq, not a docs_df row; the
            # feature matrix does not carry document ids, so this lookup does
            # not fetch the candidate's text. Fixing it requires the candidate
            # ids from build_candidates_and_labels.
            doc_text = docs_df.iloc[int(d)]["doc_text"]
            teacher_pairs.append([qrow.query, doc_text])
    ce = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", device=DEVICE)
    ce_scores = ce.predict(teacher_pairs, batch_size=64, show_progress_bar=True)
    # Only the number of scored pairs is reported; the scores are not used to reorder results here.
    print("CE rerank scored pairs:", len(ce_scores))
else:
    print("Skipping CE rerank. Set CONFIG['USE_CE_RERANK']=True to enable.")

## 11) Latency Sampling (rough)

In [None]:

def time_block(fn, *args, repeats=5, **kwargs):
    """Call ``fn(*args, **kwargs)`` ``repeats`` times and summarise its latency.

    Timing uses ``time.perf_counter()``, a monotonic high-resolution clock, so
    system clock adjustments cannot produce negative or skewed durations.

    Args:
        fn: Callable to time.
        *args: Positional arguments forwarded to ``fn``.
        repeats: Number of timed calls.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        ``(mean_ms, p95_ms)`` over the timed calls, in milliseconds.
    """
    times = []
    for _ in range(repeats):
        t0 = time.perf_counter()
        fn(*args, **kwargs)
        times.append((time.perf_counter() - t0) * 1000.0)
    return np.mean(times), np.percentile(times, 95)

# Time both retrievers on the same batch of dev queries (top-50 each).
# The ANN figure includes query embedding, since ann_search encodes its input.
sample_qs = dev_q["query"].head(64).tolist()
bm_t_mean, bm_t_p95 = time_block(bm25_search, sample_qs, 50, repeats=3)
ann_t_mean, ann_t_p95 = time_block(ann_search, sample_qs, 50, repeats=3)
print(f"BM25 latency ~ mean {bm_t_mean:.1f}ms / p95 {bm_t_p95:.1f}ms (batch of {len(sample_qs)})")
print(f"ANN  latency ~ mean {ann_t_mean:.1f}ms / p95 {ann_t_p95:.1f}ms (batch of {len(sample_qs)})")

## 12) Drift & Health Checks (PSI, Query Mix Entropy)

In [None]:

def psi(expected, actual, buckets=10, eps=1e-8):
    """Population Stability Index between two 1-D samples.

    Bin edges are the union of both samples' percentile cut points
    (``buckets + 1`` evenly spaced percentiles each). Both samples are
    histogrammed on those edges and compared with
    ``sum((a - e) * ln(a / e))``; ``eps`` keeps empty bins finite.

    Returns:
        The PSI value as a Python float.
    """
    percentiles = np.linspace(0, 100, buckets + 1)
    expected_cuts = np.percentile(expected, percentiles)
    actual_cuts = np.percentile(actual, percentiles)
    edges = np.unique(np.concatenate([expected_cuts, actual_cuts]))

    expected_counts = np.histogram(expected, bins=edges)[0]
    actual_counts = np.histogram(actual, bins=edges)[0]

    e = expected_counts / (expected_counts.sum() + eps)
    a = actual_counts / (actual_counts.sum() + eps)
    return float(np.sum((a - e) * np.log((a + eps) / (e + eps))))

# Compare train vs dev feature distributions column by column.
# Column order is [bm25 rank, ann rank, mean stars, doc length in words].
Xtr_all = np.concatenate(Xtr_list, axis=0)
Xdv_all = np.concatenate(Xdv_list, axis=0)
psis = [psi(Xtr_all[:,i], Xdv_all[:,i]) for i in range(Xtr_all.shape[1])]
for i,pv in enumerate(psis):
    print(f"Feature {i} PSI: {pv:.4f}")

def q_entropy(qs):
    """Entropy (natural log) of the distribution of lower-cased first tokens.

    Each query contributes its first whitespace-separated token; queries with
    no tokens are counted under the placeholder ``"<empty>"``.
    """
    first_tokens = []
    for q in qs:
        words = str(q).split()
        first_tokens.append(words[0].lower() if words else "<empty>")
    _, counts = np.unique(first_tokens, return_counts=True)
    return float(entropy(counts / counts.sum()))
# First-token entropy of each split; a large gap would indicate a shift in query mix.
ent_train = q_entropy(train_q["query"].tolist())
ent_dev = q_entropy(dev_q["query"].tolist())
print("Query-mix entropy train/dev:", ent_train, ent_dev)

## 13) Interleaving Simulation (A vs B Rankers)

In [None]:

def interleave(rankA, rankB, k=10):
    """Alternate picks from two rankings into one list of at most ``k`` items.

    Ranker A picks at even output positions and ranker B at odd ones; each
    pick is that ranker's best item not already shown. When the ranker whose
    turn it is has nothing left, the other ranker fills the slot. The original
    loop spun forever in that situation because nothing was appended and the
    turn never changed.

    Args:
        rankA: Ranked item ids from ranker A, best first.
        rankB: Ranked item ids from ranker B, best first.
        k: Maximum length of the interleaved list.

    Returns:
        A list of ``(item, "A" | "B")`` tuples naming which ranker contributed
        each item.
    """
    seen = set()
    out = []
    a = b = 0
    while len(out) < k:
        # Advance both cursors past items that were already shown.
        while a < len(rankA) and rankA[a] in seen:
            a += 1
        while b < len(rankB) and rankB[b] in seen:
            b += 1
        a_left = a < len(rankA)
        b_left = b < len(rankB)
        if not (a_left or b_left):
            break
        take_a = len(out) % 2 == 0
        if take_a and not a_left:
            take_a = False
        elif not take_a and not b_left:
            take_a = True
        if take_a:
            out.append((rankA[a], "A"))
            seen.add(rankA[a])
            a += 1
        else:
            out.append((rankB[b], "B"))
            seen.add(rankB[b])
            b += 1
    return out

# Monte-Carlo sanity check of the interleaving credit assignment with two random rankers.
winsA=winsB=0; trials=500
for t in range(trials):
    # Each ranker returns 20 distinct ids drawn from a pool of 200.
    A = list(np.random.permutation(200)[:20])
    B = list(np.random.permutation(200)[:20])
    il = interleave(A,B,10)
    # Items treated as relevant: A's first 10 and B's first 12.
    # NOTE(review): the 10 vs 12 asymmetry favours B — confirm it is intentional.
    relA = set(A[:10]); relB = set(B[:12])
    # A shown relevant item is clicked with probability 0.2; the random draw
    # only happens for relevant items because of short-circuit evaluation.
    clicks = [1 if (d in relA or d in relB) and np.random.rand()<0.2 else 0 for d,_ in il]
    # Credit each click to the ranker that contributed the clicked item.
    creditA = sum(c for (d,s),c in zip(il,clicks) if s=="A")
    creditB = sum(c for (d,s),c in zip(il,clicks) if s=="B")
    # Ties count for neither side.
    winsA += (creditA>creditB); winsB += (creditB>creditA)
print("Interleaving wins A:", winsA, "B:", winsB)

## 14) Vespa: Application Package (schema + feed + query) — *optional*

> **Note:** Running Vespa locally on Colab is not supported (requires Docker / Java services).  
> This section *generates* a minimal Vespa **application package** (`schemas/`, `services.xml`, `hosts.xml`)
> and includes Python snippets to **deploy**, **feed documents**, and **query** when you run it on your own machine or Vespa Cloud.

In [None]:

import os, json, pathlib, textwrap

# Root of the generated application package, with the schemas/ sub-directory.
VESPA_DIR = "/content/vespa_app"
os.makedirs(VESPA_DIR, exist_ok=True)
os.makedirs(os.path.join(VESPA_DIR, "schemas"), exist_ok=True)

# Document schema: product id and rating as attributes, review text indexed
# for BM25. The fieldset uses the `fieldset <name> { fields: ... }` form; the
# previous `fieldsets { default: doc_text }` block is not valid schema syntax
# and makes deployment fail.
schema = r"""
schema product {
  document product {
    field product_id type string {
      indexing: attribute | summary
    }
    field doc_text type string {
      indexing: index | summary
      index: enable-bm25
    }
    field stars type double {
      indexing: attribute | summary
    }
  }
  fieldset default {
    fields: doc_text
  }
  rank-profile bm25 {
    first-phase {
      expression: bm25(doc_text)
    }
  }
}
"""

with open(os.path.join(VESPA_DIR, "schemas", "product.sd"), "w", encoding="utf-8") as f:
    f.write(schema)

# services.xml: one container cluster (search + document API on port 8080)
# and one content cluster holding the `product` document type.
# NOTE(review): `<nodes count="1"/>` looks like the hosted/Vespa Cloud form;
# a self-hosted Docker deployment presumably needs explicit
# `<node hostalias="node1" distribution-key="0"/>` entries and a redundancy
# setting — confirm against the Vespa services.xml reference before deploying.
services_xml = r"""
<services>
  <container id="default" version="1.0">
    <search/>
    <document-api/>
    <http>
      <server id="default" port="8080"/>
    </http>
    <component id="com.yahoo.language.simple.SimpleLinguistics" bundle="language-tools"/>
  </container>
  <content id="content" version="1.0">
    <documents>
      <document type="product" mode="index"/>
    </documents>
    <nodes count="1"/>
  </content>
</services>
"""

with open(os.path.join(VESPA_DIR, "services.xml"), "w") as f:
    f.write(services_xml)

# hosts.xml maps the host alias `node1` to a host name.
# NOTE(review): the docker command printed below starts the container with
# hostname `vespa-container`, while this file names `localhost` — verify that
# the two agree for your setup.
hosts_xml = "<hosts><host name='localhost'><alias>node1</alias></host></hosts>"
with open(os.path.join(VESPA_DIR, "hosts.xml"), "w") as f:
    f.write(hosts_xml)

# Print the manual follow-up steps; nothing is deployed from this notebook.
print("Vespa app package written to:", VESPA_DIR)
print("Next steps (run locally with Docker):")
print("1) docker run --detach --name vespa --hostname vespa-container --privileged -p 8080:8080 -p 19071:19071 vespaengine/vespa")
print("2) vespa deploy --wait 300", VESPA_DIR)
print("3) Feed via /document/v1/, query via /search/ yql='select * from sources product where userInput(@q)'")

### Vespa feed/query helpers (execute when Vespa endpoint is reachable)

In [None]:

import requests

VESPA_ENDPOINT = None  # e.g., "http://localhost:8080"

def vespa_feed_docs(endpoint, docs, timeout=30):
    """POST each document to the Vespa ``/document/v1`` API, stopping at the first failure.

    Args:
        endpoint: Base URL of the Vespa container, e.g. "http://localhost:8080".
        docs: Iterable of dicts; each must contain ``product_id`` and is sent
            as the document's ``fields``.
        timeout: Per-request timeout in seconds, so an unreachable endpoint
            cannot block the notebook indefinitely.

    Non-2xx responses are printed and end the feed; nothing is raised for them.
    """
    from urllib.parse import quote

    for d in docs:
        # Escape the id so characters such as "/" or spaces cannot alter the URL path.
        doc_id = quote(str(d['product_id']), safe="")
        url = f"{endpoint}/document/v1/product/product/docid/{doc_id}"
        r = requests.post(url, json={"fields": d}, timeout=timeout)
        if r.status_code//100 != 2:
            print("Feed error:", r.status_code, r.text)
            break

def vespa_query(endpoint, q, hits=10, timeout=30):
    """Run a ``userInput`` text query against Vespa and return the decoded JSON.

    Args:
        endpoint: Base URL of the Vespa container, e.g. "http://localhost:8080".
        q: Query text, bound to the ``@q`` reference in the YQL statement.
        hits: Number of hits requested.
        timeout: Request timeout in seconds, so an unreachable endpoint cannot
            block the notebook indefinitely.

    Returns:
        The response body parsed as JSON. The HTTP status is not checked, so
        an error response is returned as-is when its body is JSON.
    """
    params = {"yql": "select * from sources product where userInput(@q)", "q": q, "hits": hits}
    r = requests.get(f"{endpoint}/search/", params=params, timeout=timeout)
    return r.json()

# Example usage (uncomment when endpoint is set):
# if VESPA_ENDPOINT:
#     feed = docs_df.head(1000).rename(columns={"doc_text":"doc_text","stars":"stars"}).to_dict(orient="records")
#     vespa_feed_docs(VESPA_ENDPOINT, feed)
#     print(vespa_query(VESPA_ENDPOINT, "iphone case"))

## 15) Wrap-up & Next Steps

You now have a working **retrieval + ranking** stack with:
- **FAISS HNSW** dense retrieval + BM25 lexical, fused by RRF
- **LightGBM LambdaMART** vs **PyTorch LambdaRank** comparison
- Optional **cross-encoder** rerank
- Basic **latency**, **drift**, and **interleaving** utilities
- A ready-to-deploy **Vespa** application package scaffold (run locally or in Vespa Cloud)

**Extensions**
- Add category/brand/entity KG features, seller quality, price/discount, ETA.
- Swap to **BEIR**/MS MARCO for broader retrieval eval.
- Add a simple **FastAPI** microservice for online scoring.