In [1]:
import faiss
import numpy as np

In [6]:
# emb: (N, d) float32 array of document vectors.
# For cosine similarity, L2-normalize emb and use the IP metric; for L2 use as-is.
# Dummy embeddings stand in for real data here: 9984 docs, each 768-d.
SEED = 0  # fixed so that Restart & Run All regenerates exactly the same vectors
N, d = 9984, 768
emb = np.random.default_rng(SEED).random((N, d), dtype=np.float32)
# Re-derive the sizes from the array so they stay correct when `emb` is
# swapped for real embeddings.
N, d = emb.shape
m = 16  # number of sub-quantizers; d must be divisible by m
nbits = 8  # bits per sub-quantizer (K = 2**nbits = 256 centroids per subspace)
assert d % m == 0, "d must be divisible by m"

In [7]:
# Build the product-quantization index with the metric spelled out:
# METRIC_L2 is what IndexPQ uses when no metric argument is given.
index_pq = faiss.IndexPQ(d, m, nbits, faiss.METRIC_L2)
# Cosine-similarity variant: inner product on unit-length vectors.
# Note that normalize_L2 rewrites emb in place.
# faiss.normalize_L2(emb)
# index_pq = faiss.IndexPQ(d, m, nbits, faiss.METRIC_INNER_PRODUCT)

In [8]:
# Train the PQ codebooks on a random sample (or the whole set if it is small).
# The sample is drawn from a locally seeded generator so the same rows are
# selected, in the same order, on every Restart & Run All.
train_size = min(50000, N)
faiss_idx = np.random.default_rng(0).choice(N, size=train_size, replace=False)
index_pq.train(emb[faiss_idx])
assert index_pq.is_trained, "PQ training did not complete"

In [9]:
# Add all vectors (stored as compressed PQ codes).
# reset() empties the stored codes first, so re-running this cell does not
# append a second copy of every vector; the trained codebooks are kept.
index_pq.reset()
index_pq.add(emb)
assert index_pq.ntotal == emb.shape[0], "index should hold one code per document"

In [10]:
# Look up the 5 nearest neighbours of a single query vector.
# Slicing (rather than indexing) keeps the 2-D (1, d) shape; substitute any
# other query embedding shaped (1, d) here.
q = emb[:1]
# D holds the distances and I the matching row ids, one row per query.
# The distances are approximate because they are computed against PQ codes.
D, I = index_pq.search(q, 5)
print("Top-5 IDs:", I[0], "Approx distances:", D[0])


Top-5 IDs: [   0 2627   91 8998 6527] Approx distances: [49.26925  67.707794 68.59401  68.9275   69.077255]
