# 6) ANN Index Engineering: HNSW vs IVF-PQ vs Flat with Real Embeddings

In [None]:

%%capture
!pip -q install --upgrade pip
!pip -q install datasets transformers sentence-transformers faiss-cpu rank-bm25 torchmetrics scikit-learn lightgbm langdetect unidecode pandas matplotlib tqdm nltk

In [None]:

import numpy as np, faiss, time
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

In [None]:

# Load review titles, embed them, and split the embeddings into queries / database.
ds = load_dataset("amazon_reviews_multi","en", split="train[:20%]")

# Keep non-empty titles and drop exact duplicates (order-preserving).
# Identical titles yield identical vectors, so their scores tie; the exact and the
# approximate index may then return different-but-equally-correct ids, which
# deflates any id-based recall figure.
texts = list(dict.fromkeys(
    title.strip() for title in ds["review_title"][:50000] if title and title.strip()
))

enc = SentenceTransformer("intfloat/multilingual-e5-base")
# E5 checkpoints are trained with a task prefix on every input; "query: " is the
# prefix recommended for symmetric similarity tasks. `texts` itself stays unprefixed.
emb = enc.encode(
    ["query: " + title for title in texts],
    batch_size=128,
    convert_to_numpy=True,
    normalize_embeddings=True,  # unit-norm vectors: inner product == cosine similarity
    show_progress_bar=True,
).astype("float32")  # FAISS expects float32 input

# The first 1000 vectors are held out as queries; the rest form the searchable database.
assert emb.shape[0] > 1000, f"need more than 1000 embeddings to split, got {emb.shape[0]}"
q = emb[:1000]; xb = emb[1000:]
d = xb.shape[1]  # embedding dimensionality

In [None]:

# Exact (brute-force) inner-product index: its top-100 ids (I0) are the ground truth
# against which the approximate indexes are scored.
# time.perf_counter() is used for timing because it is monotonic and high-resolution,
# whereas time.time() follows the (adjustable) wall clock.
t0 = time.perf_counter()
flat = faiss.IndexFlatIP(d)
flat.add(xb)
add_flat = time.perf_counter() - t0  # build time in seconds (construction + add)

t0 = time.perf_counter()
D0, I0 = flat.search(q, 100)
q_flat = time.perf_counter() - t0  # total time in seconds for the whole query batch

In [None]:

# --- HNSW (graph-based index) -------------------------------------------------
# The metric is set explicitly to inner product: the constructor defaults to L2,
# which would make the scores in D1 incomparable with the IndexFlatIP baseline.
hnsw = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_INNER_PRODUCT)  # 32 = graph links per node (M)
hnsw.hnsw.efConstruction = 200  # candidate-list size while building the graph
hnsw.hnsw.efSearch = 128        # candidate-list size at query time
t0 = time.perf_counter(); hnsw.add(xb); add_h = time.perf_counter() - t0
t0 = time.perf_counter(); D1, I1 = hnsw.search(q, 100); q_h = time.perf_counter() - t0

# --- IVF-PQ (inverted file + product quantization) ----------------------------
# nlist is capped so k-means keeps at least 39 training points per centroid
# (FAISS's default minimum, below which it warns); 1024 is used when the corpus allows.
nlist = min(1024, max(1, xb.shape[0] // 39)); m = 32; bits = 8
assert d % m == 0, f"PQ needs the dimension ({d}) to be divisible by m ({m})"
# The coarse quantizer and the IVF-PQ index must agree on the metric; the index
# metric defaults to L2, so inner product is passed explicitly to match IndexFlatIP.
quantizer = faiss.IndexFlatIP(d)
ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, m, bits, faiss.METRIC_INNER_PRODUCT)
ivfpq.nprobe = 16  # number of inverted lists visited per query
t0 = time.perf_counter(); ivfpq.train(xb); train_t = time.perf_counter() - t0
t0 = time.perf_counter(); ivfpq.add(xb); add_pq = time.perf_counter() - t0
t0 = time.perf_counter(); D2, I2 = ivfpq.search(q, 100); q_pq = time.perf_counter() - t0

In [None]:

def recall_at_k(I_ref, I_ann, k=100):
    """Return the fraction of reference neighbours that the ANN result recovered.

    Args:
        I_ref: 2-D integer array of ground-truth neighbour ids, one row per query.
        I_ann: 2-D integer array of approximate neighbour ids, same number of rows.
        k: number of leading columns of each row to compare.

    Returns:
        Total hits divided by the total number of valid reference ids considered,
        as a float in [0, 1]. Returns 0.0 when there is nothing to compare.

    Raises:
        ValueError: if the inputs are not 2-D or disagree on the number of queries.

    Negative ids are ignored on both sides: FAISS pads result rows with -1 when it
    finds fewer than k neighbours, and such padding must not count as a match.
    """
    I_ref = np.asarray(I_ref)
    I_ann = np.asarray(I_ann)
    if I_ref.ndim != 2 or I_ann.ndim != 2 or I_ref.shape[0] != I_ann.shape[0]:
        raise ValueError(
            f"expected 2-D id arrays with matching row counts, got {I_ref.shape} and {I_ann.shape}"
        )

    hits = 0
    total = 0
    for ref_row, ann_row in zip(I_ref[:, :k], I_ann[:, :k]):
        truth = set(ref_row[ref_row >= 0].tolist())
        found = set(ann_row[ann_row >= 0].tolist())
        hits += len(truth & found)
        # Count the reference ids actually available, so rows shorter than k
        # (or padded with -1) do not artificially lower the score.
        total += len(truth)
    return hits / total if total else 0.0
# Score each approximate index against the exact Flat neighbours (I0) at k=100.
rec_h = recall_at_k(I0, I1, 100)
rec_pq = recall_at_k(I0, I2, 100)

# Query times are totals for the whole query batch, in seconds.
print(f"Flat q_s={q_flat:.3f} HNSW q_s={q_h:.3f} IVF-PQ q_s={q_pq:.3f} | PQ train_s={train_t:.3f}")
# Build (add) times are measured in the indexing cells; report them so the
# speed/recall trade-off also shows the indexing cost.
print(f"Flat add_s={add_flat:.3f} HNSW add_s={add_h:.3f} IVF-PQ add_s={add_pq:.3f}")
print(f"HNSW recall@100={rec_h:.3f} | IVF-PQ recall@100={rec_pq:.3f}")