In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell executes and edits to library code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Detailed Tutorial

This is a small tutorial on how to use `rpc.codecs.ROCOneShotArrayCodec` to compress a matrix of shape `(n, d)` where the order between rows is irrelevant.
Equivalently, this can be thought of as the set of 1-D arrays corresponding to the rows of the matrix.

There are 3 restrictions that must be satisfied in practice:
1. Rows must be unique.
2. The rows of the matrix must be sorted in lexicographic order.
3. Auxiliary data must be available at the encoder (e.g., user ids) to mitigate the initial-bits problem.

In [2]:
from faiss.contrib.datasets import DatasetSIFT1M
from faiss.contrib.inspect_tools import get_invlist
import faiss
import numpy as np

# Seeded generator so that the synthetic user ids drawn below are reproducible.
rng = np.random.default_rng(seed=0)

# Restriction 1: drop duplicate database vectors, then build and populate
# the index.
ds = DatasetSIFT1M()
db = np.unique(ds.get_database(), axis=0)
queries = ds.get_queries()
index = faiss.index_factory(ds.d, 'IVF1000,PQ16x8')
index.train(ds.get_train())
index.add(db)

# Restriction 2: for every inverted list, order its entries lexicographically
# by their codes. `np.lexsort` uses its LAST key as the primary one, which is
# why the columns are reversed before transposing.
sorted_ids_per_list = []
for list_no in range(index.nlist):
    list_ids, list_codes = get_invlist(index.invlists, list_no)
    code_order = np.lexsort(list_codes[:, ::-1].T)
    sorted_ids_per_list.append(list_ids[code_order])
perm = np.concatenate(sorted_ids_per_list)

# Re-populate the index following the sorted order, this time attaching
# external user-generated ids that serve as auxiliary information in later
# steps.
index.reset()
ids = rng.integers(0, 1 << 16, size=db.shape[0])
index.add_with_ids(db[perm], ids[perm])

We'll go through the inner workings of `rpc.codecs.ROCOneShotArrayCodec`. First, we sample a permutation with `rpc.codecs.VectorizedPermutationCodec`. Then, we encode the rows of the matrix in the order defined by the sampled permutation.

`rpc.codecs.VectorizedPermutationCodec` implements a codec that samples a permutation uniformly at random from the ANS state using a vectorized Lehmer Code.

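See the Overleaf document for details on what a Lehmer code is. In short, a Lehmer code represents a permutation of `n` elements as a sequence of `n` integers, where the `i`-th integer selects one of the elements that have not been chosen yet.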

In [3]:
from rpc.codecs import VectorizedPermutationCodec, UniformCodec
from rpc.rans import initialize_ans_state
from faiss.contrib.inspect_tools import get_invlist
import numpy as np

# Select a cluster to encode.
I, X = get_invlist(index.invlists, 23)
n, d = X.shape

# To facilitate the exposition, truncate the cluster so that `n` is an
# integer multiple of `d`. Note that in the actual codec
# `rpc.codecs.ROCOneShotArrayCodec` this is not necessary.
n = d * (n // d)
I = I[:n]
X = X[:n]

ans_state = initialize_ans_state(shape=(d,))

# (Restriction 3) In this example, the user ids serve as the auxiliary data
# mentioned in the paper. However, any data can be used.
# Encode the auxiliary information in blocks of `d`. The codec is built once
# and reused for every block instead of being re-created per iteration.
id_codec = UniformCodec(precs=1 << 16)
for i in range(n // d):
    ans_state = id_codec.encode(I[d*i: d*(i+1)], ans_state)

# Decode (i.e., sample) a permutation from the ANS state.
codec_perm = VectorizedPermutationCodec(n, d)
ans_state, perm = codec_perm.decode(ans_state)

# Sanity check: `perm` must contain every index in [0, n) exactly once.
assert np.all(np.sort(perm) == np.arange(n))

After decoding the permutation, all that remains is to reorder the rows of the matrix according to it and encode them onto the ANS stack. This is the last step of the encoding procedure.

In [4]:
from rpc.codecs import UniformCodec

# Push the rows onto the ANS stack, visiting them in the order given by the
# sampled permutation. A single codec instance is shared by all rows.
row_codec = UniformCodec(precs=256)
for row in X[perm]:
    ans_state = row_codec.encode(row, ans_state)

Decoding is shown next. First, the permuted version of `X` is decoded from the ANS stack.

In [5]:
from rpc.codecs import UniformCodec
from rpc.permutations import compute_applied_permutation

# Buffer that will receive the rows as they come off the ANS stack.
X_decoded = np.zeros(shape=(n, d), dtype=np.uint8)

# The stack returns rows in the reverse of the order they were pushed, so
# fill the buffer from the last position down to the first.
for row_idx in range(n - 1, -1, -1):
    ans_state, decoded_row = row_codec.decode(ans_state)
    X_decoded[row_idx] = decoded_row



From this point onward, all the rows of the matrix (i.e., the PQ codes) are ready to be used.
This algorithm therefore makes the codes available in linear time.

Next, the permutation that was applied to `X` is deduced from the decoded rows.

In [None]:
# Sort the decoded rows and recover the sampled permutation.
perm_decoded = compute_applied_permutation(X_decoded)

# Sanity checks: the recovered permutation matches the sampled one, and
# applying it to `X` reproduces the decoded matrix. Each comparison already
# covers the whole array, so a single assertion per property is enough.
assert np.all(perm_decoded == perm)
assert np.all(X[perm_decoded] == X_decoded)

To recover the auxiliary data, we need to encode the permutation back onto the stack.

In [6]:
# Encode permutation back onto the stack.
ans_state = codec_perm.encode(perm_decoded, ans_state)

# Last step: recover the auxiliary information. The blocks are popped in the
# reverse of the order in which they were pushed, so iterate backwards. The
# codec is built once and reused for every block instead of being re-created
# per iteration.
id_codec = UniformCodec(precs=1 << 16)
I_decoded = np.zeros(shape=(n,), dtype=np.int64)
for i in reversed(range(n // d)):
    ans_state, I_decoded[d*i: d*(i+1)] = id_codec.decode(ans_state)

# Sanity check: the decoded ids match the ones that were encoded.
assert np.all(I == I_decoded)

# Putting everything together: `rpc.codecs.ROCOneShotArrayCodec`

Most of the functionality, except encoding the auxiliary information, is handled by a single codec. Next we show how.

In [7]:
from rpc.codecs import ROCOneShotArrayCodec, UniformCodec
from rpc.rans import initialize_ans_state
from rpc.permutations import compute_applied_permutation

# Select a cluster to encode.
I, X = get_invlist(index.invlists, 23)
n, d = X.shape

# Encode the auxiliary information in blocks of `d` ids. Only the `n // d`
# complete blocks are pushed onto the stack; when `n` is not a multiple of
# `d`, the trailing `n % d` ids are not encoded here.
# The codec is built once and reused for every block.
ans_state = initialize_ans_state(shape=(d,))
id_codec = UniformCodec(precs=1 << 16)
for i in range(n // d):
    ans_state = id_codec.encode(I[d*i: d*(i+1)], ans_state)

# Encode the matrix.
row_codec = UniformCodec(precs=256)
codec = ROCOneShotArrayCodec((n, d), row_codec)
ans_state = codec.encode(X, ans_state)

# Decode the matrix. The ANS state returned by the decoder is discarded
# because the auxiliary information is not recovered in this cell.
_, X_decoded = codec.decode(ans_state)

# Sanity check: the decoded matrix is a row permutation of `X`.
perm = compute_applied_permutation(X_decoded)
assert np.all(X[perm] == X_decoded)
