In [1]:
import wrapper as wp
import os, sys
import numpy as np
import pandas as pd
from time import time
from time import perf_counter

In [2]:
# Directory that holds every dataset file referenced in this notebook.
DATA_DIR = "../data/diversity/"

# Inputs passed to index.fit_from_filename: base vectors and the matching metadata file.
BASE_FILENAME = os.path.join(DATA_DIR, "base_uk.ivec")
METADATA_FILENAME = os.path.join(DATA_DIR, "base_seller_csr.spmat")

In [3]:
# Passed positionally to index.fit_from_filename (together with WEIGHT_CLASSES).
CUTOFF = 5_000
CLUSTER_SIZE = 5_000
NQ = 200_000
WEIGHT_CLASSES = (100_000, 400_000)

# One entry per parameter slot (slots 0, 1, 2); used by both BuildParams and QueryParams.
MAX_DEGREES = (16, 32, 64)

TINY_CUTOFF = 0
TARGET_POINTS = 15_000

# Per-slot query settings fed to QueryParams. The trailing comments keep the
# alternative values that were noted next to the originals.
BEAM_WIDTHS = (1_100,) * 3  # (85, 85, 85)
SEARCH_LIMITS = (25_000_000,) * 3  # (int(WEIGHT_CLASSES[0] * 0.2), int(WEIGHT_CLASSES[1] * 0.5), int(3_000_000 * 0.5))

# Third positional argument of BuildParams.
ALPHA = 1.175

# Handed to index.set_bitvector_cutoff.
BITVECTOR_CUTOFF = 1_000_000_000

In [4]:
# Create the index object through the project wrapper. Both strings are passed verbatim,
# including the 'Euclidian' spelling — NOTE(review): confirm the values the wrapper accepts
# before "correcting" it. 'uint8' matches the dtype of the query matrix built later.
index = wp.init_squared_ivf_index('Euclidian', 'uint8')

===Running IVF_Squared


In [5]:
# Configure parameter slots 0, 1 and 2: each slot takes its own MAX_DEGREES,
# BEAM_WIDTHS and SEARCH_LIMITS entry, while the literal arguments are shared.
for i in (0, 1, 2):
    build_params = wp.BuildParams(MAX_DEGREES[i], 500, ALPHA)
    index.set_build_params(build_params, i)

    query_params = wp.QueryParams(10, BEAM_WIDTHS[i], 1.35, SEARCH_LIMITS[i], MAX_DEGREES[i])
    index.set_query_params(query_params, i)

# Index-wide switches, applied once after the per-slot parameters.
index.set_bitvector_cutoff(BITVECTOR_CUTOFF)
index.set_materialized_joins(False)

In [6]:
# Fit the index from the base-vector and metadata files using the constants defined above.
# NOTE(review): "index_cache/" and the trailing True are positional arguments whose meaning
# is defined by the wrapper (presumably a cache directory and a boolean option) — confirm.
index.fit_from_filename(BASE_FILENAME, METADATA_FILENAME, CUTOFF, CLUSTER_SIZE, "index_cache/", WEIGHT_CLASSES, True)

Detected 20000000 points with dimension 64
IVF^2: points loaded
IVF^2: filters loaded
Num above cutoff = 78 filters.n_points = 2530
IVF^2: fit completed


In [7]:
# Paths only (nothing is loaded in this cell): the query table, read with pd.read_parquet,
# and the ground-truth file, parsed later by read_gt.
QUERY_FILENAME = os.path.join(DATA_DIR, 'query_uk_random_metadata.parquet')
GT_FILENAME = os.path.join(DATA_DIR, 'uk_random_GT.bin')

In [8]:

# Load the query table, then pack the per-row 'embedding' values into a single uint8 matrix.
queries = pd.read_parquet(QUERY_FILENAME)

embedding_rows = queries['embedding'].tolist()
query_embedding_matrix = np.array(embedding_rows, dtype=np.uint8)

def single_label_query_filters(i, nq):
    """Return a list of length ``nq`` whose entries all reference one ``wp.QueryFilter(i)``.

    The filter is constructed once; every list slot points at that same object.
    """
    shared_filter = wp.QueryFilter(i)
    return [shared_filter for _ in range(nq)]

# Number of queries = number of rows in the embedding matrix.
nq = len(query_embedding_matrix)

In [9]:
# run query

# Time one batch search over all queries, each using the filter built from label 0.
# perf_counter() replaces time(): it is monotonic and uses the highest-resolution clock
# available, so the measured interval cannot be skewed by wall-clock adjustments.
start = perf_counter()
neighbors, distances = index.unsorted_batch_filter_search(query_embedding_matrix, single_label_query_filters(0, nq), nq, 1000)
end = perf_counter()

# Compute the interval once so both figures in the report come from the same value.
elapsed = end - start
print(f"qps: {nq / elapsed} ({nq} queries in {elapsed} seconds)")

qps: 32071.254210488187 (199700 queries in 6.226759910583496 seconds)


In [10]:
# Peek at the raw id matrix returned by the search (dtype uint32 in the recorded output).
# Most printed rows end in zeros — presumably padding for unfilled result slots; TODO confirm.
neighbors

array([[ 5509449,  6847861,  7919950, ...,        0,        0,        0],
       [ 5509449,  6847861,  7919950, ...,        0,        0,        0],
       [10350430,  4254545, 10311253, ...,        0,        0,        0],
       ...,
       [  813673,   707021,  5454082, ...,        0,        0,        0],
       [ 2810051,  4683690,  4039861, ...,        0,        0,        0],
       [ 8735606,  1157519,  1488224, ...,  4377917,  7309636,  9394876]],
      dtype=uint32)

In [11]:
def _trimmed_length(id_row):
    """Return the index of the last non-zero entry of ``id_row`` plus one (0 if all zero)."""
    nonzero_positions = np.flatnonzero(id_row)
    # An all-zero row has no non-zero position; report length 0 instead of letting
    # np.max fail on an empty array (the previous behaviour was a ValueError).
    return int(nonzero_positions[-1]) + 1 if nonzero_positions.size else 0


# Cut each result row after its last non-zero id and cut the matching distance row at the
# same position. Zeros that occur before the last non-zero id are kept.
# NOTE(review): this treats trailing zeros as padding, so a genuine trailing id 0 would be
# dropped — confirm that 0 is never a valid result id.
result_lengths = [_trimmed_length(n) for n in neighbors]
queries['vamana_neighbors'] = [n[:m] for n, m in zip(neighbors, result_lengths)]
queries['vamana_distances'] = [d[:m] for d, m in zip(distances, result_lengths)]
queries

Unnamed: 0,id,embedding,query,vamana_neighbors,vamana_distances
0,1,"[130, 117, 110, 120, 97, 116, 114, 128, 128, 1...",,"[5509449, 6847861, 7919950, 9968828, 14630462,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 420.0, 502.0, 5..."
1,2,"[130, 117, 110, 120, 97, 116, 114, 128, 128, 1...",...,"[5509449, 6847861, 7919950, 9968828, 14630462,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 420.0, 502.0, 5..."
2,3,"[131, 119, 118, 117, 108, 118, 120, 127, 137, ...",12up1y1,"[10350430, 4254545, 10311253, 10324895, 103338...","[2316.0, 2364.0, 2386.0, 2393.0, 2475.0, 2477...."
3,4,"[129, 121, 121, 120, 109, 120, 124, 131, 126, ...", grundfos,"[14138781, 1665908, 2905, 307572, 5344232, 138...","[1508.0, 2228.0, 2381.0, 2508.0, 2575.0, 2583...."
4,5,"[130, 117, 110, 120, 97, 116, 114, 128, 128, 1...",,"[5509449, 6847861, 7919950, 9968828, 14630462,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 420.0, 502.0, 5..."
...,...,...,...,...,...
199695,199996,"[128, 105, 138, 122, 104, 143, 126, 114, 120, ...",379 watch battery,"[6768531, 7503993, 8766520, 7681457, 8436637, ...","[1576.0, 1602.0, 1602.0, 1605.0, 1652.0, 1652...."
199696,199997,"[130, 104, 140, 123, 104, 148, 127, 116, 120, ...",379 watch battery equivalent,"[547352, 6768531, 13531723, 7681457, 7503993, ...","[1676.0, 1695.0, 1705.0, 1758.0, 1761.0, 1761...."
199697,199998,"[134, 101, 139, 123, 101, 150, 130, 125, 114, ...",379 watch battery equivalent chart,"[813673, 707021, 5454082, 9737401, 14224558, 1...","[3005.0, 3054.0, 3055.0, 3112.0, 3138.0, 3149...."
199698,199999,"[129, 114, 129, 114, 109, 130, 125, 119, 121, ...",379008,"[2810051, 4683690, 4039861, 4379997, 19709619,...","[1664.0, 1724.0, 1759.0, 1882.0, 1899.0, 1929...."


In [12]:
# Smallest number of non-zero ids found in any result row (a minimum, not an average).
# Counted in one vectorized pass over the 2-D id matrix instead of a Python loop per row.
np.count_nonzero(neighbors, axis=1).min()

47

In [13]:
# Destination for the query table augmented with the search results.
QUERY_OUTPUT_FILENAME = os.path.join(DATA_DIR, 'query_uk_random_metadata_vamana.parquet')

# Checkpoint: the table now carries the vamana_neighbors / vamana_distances columns.
# The same path is written again at the end of the notebook, once ground truth is attached.
queries.to_parquet(QUERY_OUTPUT_FILENAME)

In [14]:
def read_gt(gt_file):
    """Read a binary ground-truth file into two numpy arrays.

    Layout as parsed here (every value is 32 bits wide): a header of two uint32
    values ``n`` and ``k``, followed by ``n * k`` uint32 ids, followed by
    ``n * k`` float32 distances.

    Args:
        gt_file: path or open file accepted by ``np.fromfile``.

    Returns:
        ``(gt, distances)``: ``gt`` is an ``(n, k)`` uint32 array of ids and
        ``distances`` is an ``(n, k)`` float32 array. Both are views onto the
        buffer read from the file.

    Raises:
        ValueError: if the number of values in the file does not match the
            ``n`` and ``k`` announced in the header.
    """
    raw = np.fromfile(gt_file, dtype=np.uint32)

    # Convert the header to Python ints: as np.uint32 scalars, n * k would wrap
    # around at 2**32 and silently produce wrong slice bounds for large files.
    n = int(raw[0])
    k = int(raw[1])
    n_entries = n * k

    expected_size = 2 + 2 * n_entries
    if raw.size != expected_size:
        raise ValueError(
            f"ground-truth file holds {raw.size} 32-bit values, but the header "
            f"(n={n}, k={k}) requires {expected_size}"
        )

    gt = raw[2:2 + n_entries].reshape(n, k)
    # The distance section is stored as raw 32-bit words; reinterpret them as float32.
    distances = raw[2 + n_entries:].reshape(n, k).view(np.float32)

    return gt, distances

In [15]:
# Parse the ground-truth file into an id matrix and a matching float32 distance matrix.
gt, gt_distances = read_gt(GT_FILENAME)

In [16]:
# Attach the ground truth row by row as Python lists; the assignment only succeeds when
# gt has exactly one row per query in the table.
queries['gt'] = gt.tolist()
queries['gt_distances'] = gt_distances.tolist()
queries

Unnamed: 0,id,embedding,query,vamana_neighbors,vamana_distances,gt,gt_distances
0,1,"[130, 117, 110, 120, 97, 116, 114, 128, 128, 1...",,"[5509449, 6847861, 7919950, 9968828, 14630462,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 420.0, 502.0, 5...","[17837415, 17365371, 7919950, 5509449, 1463046...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 420.0..."
1,2,"[130, 117, 110, 120, 97, 116, 114, 128, 128, 1...",...,"[5509449, 6847861, 7919950, 9968828, 14630462,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 420.0, 502.0, 5...","[17837415, 17365371, 7919950, 5509449, 1463046...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 420.0..."
2,3,"[131, 119, 118, 117, 108, 118, 120, 127, 137, ...",12up1y1,"[10350430, 4254545, 10311253, 10324895, 103338...","[2316.0, 2364.0, 2386.0, 2393.0, 2475.0, 2477....","[10350430, 4254545, 10311253, 10324895, 103338...","[2316.0, 2364.0, 2386.0, 2393.0, 2475.0, 2477...."
3,4,"[129, 121, 121, 120, 109, 120, 124, 131, 126, ...", grundfos,"[14138781, 1665908, 2905, 307572, 5344232, 138...","[1508.0, 2228.0, 2381.0, 2508.0, 2575.0, 2583....","[14138781, 1665908, 2905, 307572, 5344232, 138...","[1508.0, 2228.0, 2381.0, 2508.0, 2575.0, 2583...."
4,5,"[130, 117, 110, 120, 97, 116, 114, 128, 128, 1...",,"[5509449, 6847861, 7919950, 9968828, 14630462,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 420.0, 502.0, 5...","[17837415, 17365371, 7919950, 5509449, 1463046...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 420.0..."
...,...,...,...,...,...,...,...
199695,199996,"[128, 105, 138, 122, 104, 143, 126, 114, 120, ...",379 watch battery,"[6768531, 7503993, 8766520, 7681457, 8436637, ...","[1576.0, 1602.0, 1602.0, 1605.0, 1652.0, 1652....","[6768531, 7503993, 8766520, 7681457, 8488634, ...","[1576.0, 1602.0, 1602.0, 1605.0, 1652.0, 1652...."
199696,199997,"[130, 104, 140, 123, 104, 148, 127, 116, 120, ...",379 watch battery equivalent,"[547352, 6768531, 13531723, 7681457, 7503993, ...","[1676.0, 1695.0, 1705.0, 1758.0, 1761.0, 1761....","[547352, 6768531, 13531723, 7681457, 7503993, ...","[1676.0, 1695.0, 1705.0, 1758.0, 1761.0, 1761...."
199697,199998,"[134, 101, 139, 123, 101, 150, 130, 125, 114, ...",379 watch battery equivalent chart,"[813673, 707021, 5454082, 9737401, 14224558, 1...","[3005.0, 3054.0, 3055.0, 3112.0, 3138.0, 3149....","[813673, 707021, 5454082, 9737401, 14224558, 1...","[3005.0, 3054.0, 3055.0, 3112.0, 3138.0, 3149...."
199698,199999,"[129, 114, 129, 114, 109, 130, 125, 119, 121, ...",379008,"[2810051, 4683690, 4039861, 4379997, 19709619,...","[1664.0, 1724.0, 1759.0, 1882.0, 1899.0, 1929....","[2810051, 4683690, 4039861, 4379997, 19709619,...","[1664.0, 1724.0, 1759.0, 1882.0, 1899.0, 1929...."


In [17]:
# Overwrite the earlier checkpoint so the saved table also carries gt / gt_distances.
queries.to_parquet(QUERY_OUTPUT_FILENAME)