In [2]:
import h5py

with h5py.File("data/sift/sift-128-euclidean.hdf5", "r") as f:
    # Show the top-level keys stored in the file.
    print("Keys: %s" % f.keys())

    # Look each dataset up by name rather than by its position in f.keys(),
    # so the code does not silently depend on the order the keys are listed in.
    # `[:]` reads the whole dataset in one call; list(...) then splits it into
    # one array per row, the same list-of-arrays shape the rest of the notebook uses.
    train = list(f["train"][:])
    distances = list(f["distances"][:])
    neighbors = list(f["neighbors"][:])
    test = list(f["test"][:])

Keys: <KeysViewHDF5 ['distances', 'neighbors', 'test', 'train']>


In [5]:
# There are 10,000 test (query) vectors.
# For each of them we have its 100 nearest neighbors, ordered from closest to farthest,
# together with the distance from the test vector to each of those 100 neighbors.
# Display the first entry of `train` as a sanity check.
train[0]

array([  0.,  16.,  35.,   5.,  32.,  31.,  14.,  10.,  11.,  78.,  55.,
        10.,  45.,  83.,  11.,   6.,  14.,  57., 102.,  75.,  20.,   8.,
         3.,   5.,  67.,  17.,  19.,  26.,   5.,   0.,   1.,  22.,  60.,
        26.,   7.,   1.,  18.,  22.,  84.,  53.,  85., 119., 119.,   4.,
        24.,  18.,   7.,   7.,   1.,  81., 106., 102.,  72.,  30.,   6.,
         0.,   9.,   1.,   9., 119.,  72.,   1.,   4.,  33., 119.,  29.,
         6.,   1.,   0.,   1.,  14.,  52., 119.,  30.,   3.,   0.,   0.,
        55.,  92., 111.,   2.,   5.,   4.,   9.,  22.,  89.,  96.,  14.,
         1.,   0.,   1.,  82.,  59.,  16.,  20.,   5.,  25.,  14.,  11.,
         4.,   0.,   0.,   1.,  26.,  47.,  23.,   4.,   0.,   0.,   4.,
        38.,  83.,  30.,  14.,   9.,   4.,   9.,  17.,  23.,  41.,   0.,
         0.,   2.,   8.,  19.,  25.,  23.,   1.], dtype=float32)

In [12]:
# read in all the quantized vectors and codebooks
# create the approximate vectors in place (retaining indices)
# take that X and do np.argmin(np.linalg.norm(X - queries))
from collections import defaultdict
from heapq import heappush, heappop
import numpy as np

def readQuantAndCodes(rootDirectory, M, numCentroids=256):
    """Read quantized vectors and their codebooks from two text files.

    Two whitespace-separated text files are read:

    * ``rootDirectory + "quantized.txt"`` -- one vector of integer codes per line.
    * ``rootDirectory + "codebooks.txt"`` -- one codebook per line, laid out as
      ``numCentroids`` centroids of ``M`` consecutive floats each.

    Tokens may be separated by any run of whitespace, and blank lines are
    skipped (they do not consume a vector or codebook index).

    Args:
        rootDirectory: Prefix the two file names are appended to verbatim, so
            it must already end with a path separator.
        M: Number of float values that make up one centroid.
        numCentroids: Number of centroids read from each codebook line.
            Values on a line beyond ``numCentroids * M`` are ignored.

    Returns:
        A ``(quantizedVectors, codebooks)`` tuple. ``quantizedVectors`` is a
        list with one list of ints per non-blank line. ``codebooks`` is a
        ``defaultdict(list)`` mapping the 0-based codebook index to its list
        of centroids, each centroid being a list of ``M`` floats.

    Raises:
        ValueError: If a token cannot be parsed as an int / float.
        IndexError: If a codebook line holds fewer than ``numCentroids * M``
            values.
    """
    quantFile = rootDirectory + "quantized.txt"
    codesFile = rootDirectory + "codebooks.txt"

    quantizedVectors = []
    with open(quantFile, "r") as f:
        for line in f:
            # split() with no argument tolerates trailing/duplicate whitespace,
            # which split(" ") would turn into unparsable empty tokens.
            tokens = line.split()
            if not tokens:
                continue
            quantizedVectors.append([int(token) for token in tokens])

    codebooks = defaultdict(list)
    with open(codesFile, "r") as f:
        codebook_index = 0
        for line in f:
            tokens = line.split()
            if not tokens:
                continue
            values = [float(token) for token in tokens]
            for centroid_index in range(numCentroids):
                start = centroid_index * M
                # Explicit indexing (not slicing) so a short line still raises
                # IndexError instead of yielding a truncated centroid.
                codebooks[codebook_index].append(
                    [values[start + offset] for offset in range(M)]
                )
            codebook_index += 1

    return quantizedVectors, codebooks


# Number of nearest neighbors kept for every query vector.
NUM_NEIGHBORS = 100

for M in [64]:
    quantizedVectors, codebooks = readQuantAndCodes(f"data/sift/testing/testing_M{M}/", M)

    # Rebuild each approximate vector: the code at position i selects a
    # centroid from codebook i, and the selected centroids are concatenated
    # in order. List order is preserved, so indices still match the input.
    full_quantized = []
    for vector in quantizedVectors:
        actual_vector = []
        for i, code in enumerate(vector):
            actual_vector.extend(codebooks[i][code])
        full_quantized.append(actual_vector)

    # Convert every reconstructed vector to an array once, instead of letting
    # the subtraction below re-convert the same Python list for every query.
    candidate_arrays = [np.asarray(candidate) for candidate in full_quantized]

    actual_neighbors_pq = []
    for i1, vector1 in enumerate(test):
        # Brute-force search: compare the query against every reconstructed
        # vector, keeping the NUM_NEIGHBORS closest in a bounded heap.
        # Distances are stored negated so heap[0] is the farthest kept neighbor.
        heap = []
        for i2, vector2 in enumerate(candidate_arrays):
            dist = np.linalg.norm(vector1 - vector2)
            if len(heap) < NUM_NEIGHBORS:
                heappush(heap, (-dist, i1, i2))
            elif -dist > heap[0][0]:
                # Strictly closer than the current farthest neighbor: swap it in.
                heappop(heap)
                heappush(heap, (-dist, i1, i2))

        # A heap's backing list is only partially ordered, so sort explicitly
        # to report neighbors closest-first (ties broken by lower index).
        ranked = sorted(heap, key=lambda entry: (-entry[0], entry[2]))
        closest_neighbors = [entry[2] for entry in ranked]
        actual_neighbors_pq.append(closest_neighbors)


In [11]:
# Inspect `neighbors`.
# NOTE(review): the output recorded for this cell (128) is not what the list
# assigned to `neighbors` in the loading cell would display, and the execution
# counts are out of order -- this looks like stale kernel state; re-run to confirm.
neighbors

128