In [1]:
import numpy as np
import faiss


from faiss.contrib.datasets import SyntheticDataset
from faiss.contrib.inspect_tools import get_invlist

Make a synthetic dataset and construct an IVFPQ index on it.

In [2]:
# small synthetic dataset: 32-dim vectors, 10000 for training,
# 1000 in the database and 100 queries
ds = SyntheticDataset(d=32, nt=10000, nb=1000, nq=100)

In [3]:
# "IVF100,PQ4x8np": IVF index whose vectors are PQ-encoded with 4 sub-quantizers
# of 8 bits each (pq.M = 4 and pq.ksub = 256 in the tables computed below).
# NOTE(review): IVF100 should mean 100 inverted lists and the "np" suffix should
# disable polysemous training -- confirm against the index_factory documentation
index = faiss.index_factory(ds.d, "IVF100,PQ4x8np")
index.train(ds.get_train())     # train the index on the training vectors
index.add(ds.get_database())    # then add the database vectors to it

Reference search results

In [4]:
# visit 4 inverted lists per query and keep the 10 best results per query;
# Dref holds the result distances and Iref the result ids. These are the
# values the pure-Python search at the end of the notebook is compared with.
index.nprobe = 4
Dref, Iref = index.search(ds.get_queries(), 10)

In [6]:
# one row per query, one column per requested result (k = 10)
Dref.shape

(100, 10)

## Reproduce tables 

IVFPQ search is based on precomputed look-up tables.
This section demonstrates how to compute them from Python. Note that the C++ version optionally uses a slightly faster way of precomputing them, see https://github.com/facebookresearch/faiss/blob/main/faiss/IndexIVFPQ.cpp#L334

In [7]:
# set some variables
xq = ds.get_queries()    # query vectors, one per row
nq, d = xq.shape         # number of queries, vector dimension
nprobe = index.nprobe    # number of inverted lists visited per query (set to 4 above)

In [8]:
# coarse quantization: search the index's coarse quantizer for the nprobe
# best entries of each query. Icoarse[i, j] is the number of the j-th
# inverted list to visit for query i; Dcoarse holds the matching distances
# (Dcoarse is not used in the rest of this notebook).
Dcoarse, Icoarse = index.quantizer.search(xq, nprobe)

In [10]:
# sanity check: number of inverted lists probed per query.
# A bare last expression is displayed by the notebook, no print() needed.
nprobe

4


In [9]:
# Residual of every query with respect to each of the nprobe coarse
# centroids selected for it: residuals[i, j] = xq[i] - centroid Icoarse[i, j]
probed_centroids = index.quantizer.reconstruct_batch(Icoarse.ravel())
residuals = xq[:, None, :] - probed_centroids.reshape(nq, nprobe, d)
residuals.shape

(100, 4, 32)

In [11]:
# Call compute_distance_tables on the residual vectors.
# dis_tab[i, j, m, c] is the contribution to the distance of code value c in
# PQ sub-quantizer m, for query i and its j-th probed inverted list.

pq = index.pq
# pre-fill with NaN so that any entry the call does not write is detectable
dis_tab = np.full((nq, nprobe, pq.M, pq.ksub), np.nan, dtype='float32')
pq.compute_distance_tables(
    nq * nprobe,                  # the (nq, nprobe) residuals are passed as one flat batch
    faiss.swig_ptr(residuals),    # input vectors
    faiss.swig_ptr(dis_tab)       # output tables, written in place
)
# the NaN sentinel must be gone: every table entry was written by the call
assert not np.isnan(dis_tab).any(), "compute_distance_tables left entries unset"

In [13]:
# one table of ksub entries per (query, probed list, PQ sub-quantizer)
dis_tab.shape  # (number of queries, nprobe, M, ksub)

(100, 4, 4, 256)

## Search with precomputed table 

Pure Python implementation of search from look-up tables. 

In [14]:
# This is a schematic implementation of the IVFPQ search (what index.search
# computed for the reference results), driven by the look-up tables in dis_tab.

K = 10   # number of results
# Outputs are preallocated so that a query with fewer than K candidates still
# yields a full row instead of a ragged one (which np.vstack would reject).
# Unfilled slots keep id -1, the Faiss convention for missing results.
# NOTE(review): the distance sentinel is the largest float32; confirm it
# matches the padding of index.search before comparing padded rows.
Dnew = np.full((nq, K), np.finfo('float32').max, dtype='float32')
Inew = np.full((nq, K), -1, dtype='int64')
for i in range(nq):
    all_dis = []  # all distances for this query vector
    all_ids = []  # all ids
    for j in range(nprobe):
        ids, codes = get_invlist(index.invlists, int(Icoarse[i, j]))
        # codes is an array of size l by pq.M. If pq.nbits != 8 the encoding
        # is a bit more complex, see
        # https://github.com/facebookresearch/faiss/wiki/Python-C---code-snippets#how-can-i-get-access-to-non-8-bit-quantization-code-entries-in-pq--ivfpq--aq-
        tab = dis_tab[i, j]
        # distances for this inverted list: for every stored vector, add up
        # the table entries selected by its pq.M code values
        distances = np.sum([
            tab[m, codes[:, m]]
            for m in range(pq.M)
        ], axis=0)
        # collect results. In the C++ implementation the top-K results
        # are maintained with a heap rather than stored completely
        all_dis.append(distances)
        all_ids.append(ids)
    # get the top-K
    all_dis = np.hstack(all_dis)
    all_ids = np.hstack(all_ids)
    order = np.argsort(all_dis)[:K]
    # len(order) < K when fewer than K vectors were scanned for this query
    Dnew[i, :len(order)] = all_dis[order]
    Inew[i, :len(order)] = all_ids[order]
        

In [17]:
# all_dis is left over from the last loop iteration: its length is the number
# of database vectors scanned for the last query (i = nq - 1)
all_dis.shape

(75,)

In [15]:
# number of query vectors
nq

100

In [56]:
# The pure-Python search must return exactly the same ids as index.search.
# np.testing reports which entries differ and is not stripped under `python -O`.
np.testing.assert_array_equal(Inew, Iref)
# ... and the same distances up to float32 rounding; the reference goes second
# ("desired") so that rtol is relative to the reference values
np.testing.assert_allclose(Dnew, Dref, rtol=1e-5)