In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Add with ids

In [2]:
import faiss
from faiss.contrib.datasets import DatasetSIFT1M, SyntheticDataset
import numpy as np

# Work with the most general case: user inserts external ids
ds = DatasetSIFT1M()
# nb = 10_000_000
# nc = int(np.ceil(np.sqrt(nb)))
# ds = SyntheticDataset(d=128, nt=10*nc, nb=nb, nq=100)
db = ds.get_database()
queries = ds.get_queries()
index = faiss.index_factory(ds.d, f'IVF1000,PQ8x8')
# index.verbose = True
index.train(ds.get_train())
np.random.seed(0)
ids = np.random.randint(0, 2**64 - 1, size=len(db), dtype=np.uint64)

# sort database based on ids
perm = np.argsort(ids)
db = db[perm]
ids = ids[perm]

index.add_with_ids(db, ids) # works, the vectors are stored in the underlying index

In [3]:
# searching will return D, I, and C.
# We can merge I and C, row-wise, and apply ROC.
from faiss.contrib.inspect_tools import get_invlist

byte_savings = 0
IC_bytes = 0
I_bytes = 0
C_bytes = 0
for c in range(index.nlist):
    I, C = get_invlist(index.invlists, c)

    # check if vectors are sorted according to ids
    assert np.all(np.diff(I) > 0)

    # check savings
    byte_savings += np.sum(np.log2(np.arange(2, I.shape[0] + 1)))/8
    IC_bytes += I.nbytes + C.nbytes
    I_bytes += I.nbytes
    C_bytes += C.nbytes

print('Total IC bytes:', IC_bytes)
print('Total I bytes:', I_bytes, 'pct', 100*I_bytes/IC_bytes)
print('Total C bytes:', C_bytes, 'pct', 100*C_bytes/IC_bytes)
print('Total bytes saved:', byte_savings, 'pct IC', 100*byte_savings/IC_bytes)
print('Total bytes saved:', byte_savings, 'pct I', 100*byte_savings/I_bytes)
print('Total bytes saved:', byte_savings, 'pct C', 100*byte_savings/C_bytes)

Total IC bytes: 16000000
Total I bytes: 8000000 pct 50.0
Total C bytes: 8000000 pct 50.0
Total bytes saved: 1079392.2928356656 pct IC 6.74620183022291
Total bytes saved: 1079392.2928356656 pct I 13.49240366044582
Total bytes saved: 1079392.2928356656 pct C 13.49240366044582


In [4]:
# merge external ids and codes
from ipynb_helper_functions import compute_empirical_entropy_from_freqs
from sortedcontainers import SortedList

def merge_ids_and_codes(I, C):
    assert C.dtype == np.uint8
    return np.concatenate([I.view(dtype=np.uint8).reshape(I.shape[0], -1),C], axis=1)

# for a single cluster
I, C = get_invlist(index.invlists, 23)
IC = merge_ids_and_codes(I, C)
sorted_list = SortedList([ic.tobytes() for ic in IC])
sorted_list

SortedList([b'\x00\x0e\x9fz\x03\xd4\x14\xbfZ\nO\xe9H\xe4\xe2\xa0', b'\x006>a\x03\x94\xc1l\x8e\xcfO\xd4H\x87d\xe1', b'\x00P\xb1\xfb\x8e\xddC\xcfQx)\xafd:\x92c', b'\x00R\xba\xd8\x84\xc6\xfds\x90\xf6\xfdou\x12yd', b'\x00\xd2\xe6G\xbe\r\xe3\x05Q\x9b8\xcb2/\x05v', b'\x00\xe8\xcdo\xde\xba\x1f\x8b:b\xe8gDuQ\x1b', b'\x01W\xb8\xde\xdf\xd0\xa9F\xbe\n\xb8YH\\\xa3\x87', b'\x01\xa2\xb4\x1c\xc7\xc0\xfc\x07\x96\xa5\x0f\xd3<\xc7\x11\xa0', b'\x01\xaeyb\xf9`\x9e\x85\xee \x1a\xa8:@)\xb3', b'\x02\x1e\xaat\x01\x9bq|\x8e\xdeG\xe5H\xceG\xa4', b'\x02L\xcf\x90&\x15\xea\x16(\x89lgD\xce\x14\x81', b'\x02\\\xb1u\x8a\xd8\xcf\xa1X\xf6O\x91{?\xe4\xa2', b'\x02c\x02\xd6n|\xc0\x13\xaeE\xdaz\x97\xe5\x1aX', b'\x02e@\xd0\xd4a\xf3$Z \x886!\x0e\xfco', b'\x02~\x04%a\x0e%\xc2\xd5R|O5\xbdR\xa0', b'\x02\x89c\xac56p\xaf\xb0\xdei\xd9lL\x02\xcb', b'\x02\x89\xa9\xd2\xb9\x9eC&\x8d\xa8\x03\xd9\xa9\xee\xf5\x9f', b'\x03\x91\xdf\x87\xe5\xb7\x84\xc9\x0c\xc6\xfc\xd9T\xef\xd0\x81', b'\x03\x9e\x1f\xee\x0f\xba+\x87\x07\x813\xc1H\x12\xf5\xfe',

# Distribution of ids+codes?

In [5]:
# ids are uniformely distributed
for c in range(index.nlist):
    I, C = get_invlist(index.invlists, c)
    print(c, np.diff(I).mean()/(ids.max()/(I.shape[0] - 1)))

0 0.9995856552065747
1 0.9954581004782932
2 0.9972266522889327
3 0.9998551082784807
4 0.9991571343510857
5 0.996145930526675
6 0.9982782088438371
7 0.9947434157424858
8 0.9977252207056426
9 0.9993663691335491
10 0.999584308834924
11 0.9996995703956455
12 0.9918125096256524
13 0.994780475296742
14 0.9977434506673541
15 0.9982699710676107
16 0.9908324107331739
17 0.9992640242131867
18 0.9980690249209537
19 0.9916941963110376
20 0.9961823670162233
21 0.9984100597988549
22 0.9976308499935738
23 0.9989930515340121
24 0.9993557450258777
25 0.9986346828988482
26 0.9965600294816956
27 0.9969917471484004
28 0.9985608421786563
29 0.9988839706559625
30 0.9965892123932111
31 0.9950228620122472
32 0.9972160777436486
33 0.9987687993180034
34 0.9986733289079319
35 0.9994618885657133
36 0.9987855782863189
37 0.9990649286219188
38 0.9986995372266991
39 0.9977466984897521
40 0.9984664559453604
41 0.9995666803097946
42 0.9986180028277871
43 0.9985872330523897
44 0.9987841253532912
45 0.9991645856704812
4

In [6]:
from ipynb_helper_functions import compute_empirical_entropy_from_freqs

# anything to gain entropy coding the codes? unlikely
IC_all = np.concatenate([merge_ids_and_codes(*get_invlist(index.invlists, c)) for c in range(index.nlist)], axis=0)
for col in range(IC_all.shape[1]):
    x = IC_all[:, col]
    print('col', col, compute_empirical_entropy_from_freqs(x))

col 0 7.999817261997368
col 1 7.9997987258051095
col 2 7.999834987639778
col 3 7.999817910636274
col 4 7.99981792704093
col 5 7.999788495176301
col 6 7.999803393891999
col 7 7.999841077288339
col 8 7.834437745055748
col 9 7.852393143294568
col 10 7.855576223093281
col 11 7.878637296368633
col 12 7.851141442733805
col 13 7.871556847076469
col 14 7.838555855301651
col 15 7.850192316702203


# Compress codes only

In [12]:
import faiss
from faiss.contrib.datasets import DatasetSIFT1M, SyntheticDataset, DatasetGIST1M
import numpy as np

# Work with the most general case: user inserts external ids
ds = DatasetGIST1M()
db = ds.get_database()
queries = ds.get_queries()
index = faiss.index_factory(ds.d, f'IVF1000,PQ8x8')
index.verbose = True
index.train(ds.get_train())
index.add(db)

# sort database based on codes
perm = []
for c in range(index.nlist):
    I, C = get_invlist(index.invlists, c)
    # perm.append(I[np.argsort(C[:, 0])])
    perm.append(I[np.lexsort(C[:, ::-1].T)])
perm = np.concatenate(perm)
index.reset()
index.add(db[perm]) # works, the vectors are stored in the underlying index

Training level-1 quantizer
Training level-1 quantizer on 100000 vectors in 960D
Training IVF residual
  Input training set too big (max size is 65536), sampling 65536 / 100000 vectors
doing polysemous training for PQ
precomputing IVFPQ tables type 1
IndexIVFPQ::add_core_o: adding 0:32768 / 1000000
 add_core times: 0.000 210.631 3.884 
IndexIVFPQ::add_core_o: adding 32768:65536 / 1000000
 add_core times: 0.000 221.514 2.458 
IndexIVFPQ::add_core_o: adding 65536:98304 / 1000000
 add_core times: 0.000 206.478 2.393 
IndexIVFPQ::add_core_o: adding 98304:131072 / 1000000
 add_core times: 0.000 182.924 2.269 
IndexIVFPQ::add_core_o: adding 131072:163840 / 1000000
 add_core times: 0.000 183.862 2.272 
IndexIVFPQ::add_core_o: adding 163840:196608 / 1000000
 add_core times: 0.001 200.333 2.234 
IndexIVFPQ::add_core_o: adding 196608:229376 / 1000000
 add_core times: 0.000 183.920 2.153 
IndexIVFPQ::add_core_o: adding 229376:262144 / 1000000
 add_core times: 0.000 182.511 2.205 
IndexIVFPQ::add_c

In [13]:
print(np.all(np.diff(I.astype(np.int32)) > 0))
C_all = [get_invlist(index.invlists, c)[1] for c in range(index.nlist)]
assert all(np.all(np.diff(C_all[c][:, 0]) >= 0) for c in range(index.nlist))

True


In [14]:
# estimate savings
byte_savings = 0
C_bytes = 0
for c in range(index.nlist):
    I, C = get_invlist(index.invlists, c)

    # check if vectors are sorted according to ids
    assert np.all(np.diff(I) > 0)

    # check savings
    byte_savings += np.sum(np.log2(np.arange(2, I.shape[0] + 1)))/8
    C_bytes += C.nbytes

print('Total C bytes', C_bytes)
print('Total C bytes per element:', C_bytes/index.ntotal)
print('Total bytes saved:', byte_savings, 'pct C', 100*byte_savings/C_bytes)
print('Total bytes saved per element:', byte_savings/index.ntotal, 'pct C', 100*byte_savings/C_bytes)

Total C bytes 8000000
Total C bytes per element: 8.0
Total bytes saved: 1123774.1140941458 pct C 14.047176426176824
Total bytes saved per element: 1.1237741140941457 pct C 14.047176426176824


# One-shot permutation decoding

In [9]:
from numpy.typing import NDArray
import numpy as np

rng = np.random.default_rng()



perm = np.array([1,5,0,6,3,4,2], dtype=np.uint32)

lehmer_encode_inplace(perm)
assert np.all(perm == np.array([1, 4, 0, 3, 1, 1, 0], dtype=np.uint32))

lehmer_decode_inplace(perm)
assert np.all(perm == np.array([1,5,0,6,3,4,2], dtype=np.uint32))

In [77]:
%load_ext autoreload
%autoreload 2

from rpc.codecs import VectorizedPermutationCodec
from craystack.rans import base_message
import numpy as np

rng = np.random.default_rng()

n = 10_000
d = 130

ans_state = base_message(shape=d)
perm = rng.permutation(n)
codec = VectorizedPermutationCodec(n, d)
ans_state = codec.encode(perm, ans_state)
%time _, perm_dec = codec.decode(ans_state)
assert np.all(perm == perm_dec)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
CPU times: user 48.7 ms, sys: 17 μs, total: 48.7 ms
Wall time: 48.7 ms
