In [1]:
# Imports
import os
import numpy as np
import time 
import h5py

import cyborg_vector_search_py as cvs

In [2]:
# Benchmark parameters
dataset = "sift-128-euclidean"  # base name of the HDF5 file to load
dimension = 128
metric = "euclidean"
index_type = "IVFFlat"
n_lists = 4096

# Training / evaluation parameters
batch_size = 2048   # forwarded to index.train()
max_iters = 100     # forwarded to index.train()
max_dataset_size = 1_000_000
training_ratio = 0.1  # Fraction of the vectors upserted before training (the rest is indexed post-training)
top_k = 100

# Storage locations (Redis): the three tables share a single connection string
redis_connection_string = "host:127.0.0.1,port:6379,db:0"
index_location = cvs.DBConfig(
    location="redis",
    table_name="index_table",
    connection_string=redis_connection_string,
)
config_location = cvs.DBConfig(
    location="redis",
    table_name="config_table",
    connection_string=redis_connection_string,
)
items_location = cvs.DBConfig(
    location="redis",
    table_name="items_table",
    connection_string=redis_connection_string,
)

# Index configuration
index_config = cvs.IndexIVFFlat(dimension, n_lists, metric)

In [3]:
# Set where your dataset is located
dataset_location = f'../../../../datasets/{dataset}.hdf5'

# Load the dataset: base vectors, query vectors, and the neighbor table that
# is used later in the notebook as ground truth for the recall computation.
with h5py.File(dataset_location, 'r') as file:
    train = np.array(file['train'], dtype=np.float32)
    test = np.array(file['test'], dtype=np.float32)
    neighbors = np.array(file['neighbors'], dtype=np.int32)

# Setup the training data
vectors = train[:max_dataset_size]

# Derive every size from the rows actually loaded rather than from the
# requested cap: if the file holds fewer than max_dataset_size vectors, the
# slice above is shorter than the cap and ids built from the cap would no
# longer line up one-to-one with the vectors.
n_vectors = vectors.shape[0]
ids = np.arange(n_vectors)
training_size = int(n_vectors * training_ratio)

# First `training_size` rows are upserted before training; the rest afterwards.
training_vectors = vectors[:training_size]
training_ids = ids[:training_size]
remaining_vectors = vectors[training_size:]
remaining_ids = ids[training_size:]

In [4]:
# Placeholder index name and 32-byte key used for this example
index_name = "memory_example_index"
index_key = b"\x01" * 32

# Build the CVS client on top of the configured storage locations
client = cvs.Client(
    index_location=index_location,
    config_location=config_location,
    cpu_threads=16,
    gpu_accelerate=False,
)

# Create the index with the configuration defined in the parameters cell
index = client.create_index(index_name, index_key, index_config)

In [None]:
# Upsert untrained: add the training subset to the index before train() is called.

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
index.upsert(training_ids, training_vectors)
elapsed = time.perf_counter() - start
print(f"Upserted {training_size} vectors in {elapsed:.2f} seconds")

In [None]:
# Untrained query: search the index before training and measure recall against
# a brute-force ground truth computed over the vectors upserted so far.

n_probes = 10
num_queries = 1000
initial_queries = test[:num_queries]
# The slice is shorter than requested when the test set has fewer rows, so
# report throughput against the number of queries actually issued.
num_queries = initial_queries.shape[0]

# perf_counter() is monotonic, so the interval is immune to wall-clock changes.
start = time.perf_counter()
results = index.query(initial_queries, top_k, n_probes)
end = time.perf_counter()

# Compute the neighbors for the training_size vectors using
# ||t - q||^2 = ||t||^2 - 2 * q.t + ||q||^2  (rows: queries, columns: training vectors)
dists = (
    np.sum(training_vectors**2, axis=1)
    - 2 * np.dot(initial_queries, training_vectors.T)
    + np.sum(initial_queries**2, axis=1)[:, np.newaxis]
)

# argpartition needs kth < number of columns; clamp so that an indexed set with
# top_k or fewer vectors does not raise. Partitioning at gt_k - 1 guarantees the
# first gt_k columns hold the gt_k smallest distances (in arbitrary order).
gt_k = min(top_k, dists.shape[1])
initial_neighbors = np.argpartition(dists, gt_k - 1, axis=1)[:, :gt_k]

# Order each row's candidates by increasing distance.
order = np.argsort(np.take_along_axis(dists, initial_neighbors, axis=1), axis=1)
initial_neighbors = np.take_along_axis(initial_neighbors, order, axis=1)

# Translate row positions into the ids that were upserted, so the comparison
# below stays valid even if training_ids is not simply 0..n-1.
initial_neighbors = training_ids[initial_neighbors]

result_ids = [
    [res["id"] for res in query_results] for query_results in results
]

# Compute the recall using the neighbors
recall = np.zeros(initial_queries.shape[0])
for i in range(initial_queries.shape[0]):
    recall[i] = len(np.intersect1d(initial_neighbors[i], result_ids[i])) / len(initial_neighbors[i])

print(f"Queried {initial_queries.shape[0]} vectors in {end - start:.2f} seconds")
print(f"QPS: {num_queries / (end - start):.2f}")
print(f"Mean recall: {recall.mean() * 100:.2f}%")

In [None]:
# Train index for faster queries

# Use the monotonic, high-resolution perf_counter() clock for the interval;
# time.time() follows the wall clock and can jump while training is running.
start = time.perf_counter()
index.train(batch_size, max_iters)
elapsed = time.perf_counter() - start
print(f"Trained index with {training_size} vectors in {elapsed:.2f} seconds")

In [None]:
# Trained query: repeat the earlier subset query after training and compare
# against the same brute-force ground truth (initial_neighbors).

n_probes = 32

# Query the same queries that were used before training
# (perf_counter() is monotonic, unlike time.time()).
start = time.perf_counter()
results = index.query(initial_queries, top_k, n_probes)
end = time.perf_counter()

# Keep the ids as a list of lists: a query may return fewer than top_k hits,
# and converting ragged rows with np.array() raises on NumPy >= 1.24.
result_ids = [
    [res["id"] for res in query_results] for query_results in results
]

# Compute the recall using the neighbors
n_queried = initial_queries.shape[0]
recall = np.zeros(n_queried)
for i in range(n_queried):
    recall[i] = len(np.intersect1d(initial_neighbors[i], result_ids[i])) / len(initial_neighbors[i])

# Throughput is based on the number of queries actually issued, matching the
# count printed on the line above.
print(f"Queried {n_queried} vectors in {end - start:.2f} seconds")
print(f"QPS: {n_queried / (end - start):.2f}")
print(f"Mean recall: {recall.mean() * 100:.2f}%")

In [None]:
# Upsert remaining (skipped when nothing is left, e.g. training ratio is 1)
if remaining_vectors.shape[0] > 0:

    start = time.perf_counter()
    index.upsert(remaining_ids, remaining_vectors)
    # Read the clock exactly once so both figures describe the same interval
    # and the throughput does not include the time spent printing.
    elapsed = time.perf_counter() - start

    print(f"Upserted {remaining_vectors.shape[0]} vectors in {elapsed:.2f} seconds")
    print(f"VPS: {remaining_vectors.shape[0] / elapsed:.2f}")

In [None]:
# Trained query over the whole test set, scored against the neighbor table
# loaded from the dataset file.
# NOTE(review): this presumably assumes the table was computed over the same
# vectors that are now indexed; if max_dataset_size truncated the train set,
# the reported recall would be pessimistic. Confirm for the dataset in use.

n_probes = 32

# Query the test set (perf_counter() is monotonic, unlike time.time()).
start = time.perf_counter()
results = index.query(test, top_k, n_probes)
end = time.perf_counter()

# Keep the ids as a list of lists: a query may return fewer than top_k hits,
# and converting ragged rows with np.array() raises on NumPy >= 1.24.
result_ids = [
    [res["id"] for res in query_results] for query_results in results
]

# Compute the recall using the neighbors. Only the first top_k ground-truth
# entries are compared, so the score remains recall@top_k when top_k is smaller
# than the number of neighbors stored per query.
recall = np.zeros(test.shape[0])
for i in range(test.shape[0]):
    truth = neighbors[i][:top_k]
    recall[i] = len(np.intersect1d(truth, result_ids[i])) / len(truth)

print(f"Queried {test.shape[0]} vectors in {end - start:.2f} seconds")
print(f"QPS: {test.shape[0] / (end - start):.2f}")
print(f"Mean recall: {recall.mean() * 100:.2f}%")

In [None]:
# Cleanup: delete the index that was created earlier in this notebook.
# NOTE(review): presumably this also removes the index data from the configured
# Redis tables -- confirm against the CVS documentation.

index.delete_index()