# hnswlib Demo with GloVe Dataset

In [31]:
!pip install hnswlib optuna

In [1]:
import numpy as np
import h5py
import os
import requests
import tempfile
import time
import hnswlib
import optuna

  from .autonotebook import tqdm as notebook_tqdm


### Download dataset

In [2]:
# Keep the dataset in the system temp directory so that it outlives this cell
# and is reused on later runs. A `TemporaryDirectory` context would delete the
# file as soon as the block exits, while `glove_h5py` still has it open, and
# would make the "already downloaded" check below impossible to satisfy.
loc = os.path.join(tempfile.gettempdir(), "glove.hdf5")

if not os.path.isfile(loc):
    response = requests.get("http://ann-benchmarks.com/glove-100-angular.hdf5")
    # Stop on an HTTP error instead of saving an error page as the dataset.
    response.raise_for_status()
    with open(loc, 'wb') as f:
        f.write(response.content)

# Opened read-only and left open: the following cells index into this handle.
glove_h5py = h5py.File(loc, "r")

In [3]:
# Iterating over the open file yields the names of its top-level members.
list(glove_h5py)

['distances', 'neighbors', 'test', 'train']

In [4]:
# Bind the 'train' and 'test' members of the HDF5 file; the cells below index
# with `dataset` and search with `queries`.
dataset, queries = glove_h5py['train'], glove_h5py['test']

# Print both shapes, one per line.
print(dataset.shape, queries.shape, sep="\n")

(1183514, 100)
(10000, 100)


### Create hnswlib index

In [13]:
# --- Declare the index -------------------------------------------------------
# `space` may be 'l2', 'cosine' or 'ip'.
dim = dataset.shape[-1]
p = hnswlib.Index(space='cosine', dim=dim)

# --- Initialise the index ----------------------------------------------------
# max_elements    : capacity of the index. Inserting more elements than this
#                   throws an exception; the capacity can be raised later by
#                   saving and re-loading the index (see below).
# ef_construction : controls the index search speed / build speed trade-off.
# M               : tightly connected with the internal dimensionality of the
#                   data. Strongly affects memory consumption (~M); a higher M
#                   gives higher accuracy/run_time at fixed ef/efConstruction.
p.init_index(M=200, ef_construction=100, max_elements=len(dataset))

# --- Runtime settings --------------------------------------------------------
# Number of threads used for batch search/construction
# (all available cores are used by default).
p.set_num_threads(8)
# ef controls the recall: a higher ef is more accurate but searches slower.
p.set_ef(100)

# --- Insert the vectors ------------------------------------------------------
p.add_items(dataset)

In [6]:
def compute_recall(neighbors, true_neighbors):
    """Return the fraction of ground-truth neighbour ids that were retrieved.

    Rows are compared pairwise: for each query, the number of distinct ids
    present in both the retrieved row and the ground-truth row is counted
    (via ``np.intersect1d``). The total is divided by the number of entries
    in ``true_neighbors``.

    Parameters
    ----------
    neighbors : array-like
        Retrieved neighbour ids, one row per query.
    true_neighbors : array-like
        Ground-truth neighbour ids, one row per query.

    Returns
    -------
    float
        Recall in the range [0, 1].

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of rows. Without this
        check ``zip`` would silently drop the extra rows while the denominator
        still counted them, producing a misleading recall.
    ZeroDivisionError
        If ``true_neighbors`` contains no entries.
    """
    # Accept plain Python sequences as well as NumPy arrays.
    neighbors = np.asarray(neighbors)
    true_neighbors = np.asarray(true_neighbors)

    if len(neighbors) != len(true_neighbors):
        raise ValueError(
            f"row count mismatch: {len(neighbors)} result rows vs "
            f"{len(true_neighbors)} ground-truth rows"
        )

    hits = sum(
        np.intersect1d(gt_row, row).shape[0]
        for gt_row, row in zip(true_neighbors, neighbors)
    )
    return hits / true_neighbors.size

In [15]:
# Search the index with the test queries (`queries`), time the batch search,
# and score the result against the first 10 ground-truth neighbours per query.
# perf_counter() is used because it is a monotonic, high-resolution clock
# intended for measuring intervals.
start = time.perf_counter()
labels, distances = p.knn_query(queries, k=10)
end = time.perf_counter()

print("Recall:", compute_recall(labels, glove_h5py['neighbors'][:, :10]))
print("Time:", end - start)

Recall: 0.91259
Time: 1.7801198959350586


### Serializing and deleting the index:

In [16]:
# Write the index to "index.bin" (a path relative to the working directory).
p.save_index("index.bin")
# Remove the name `p`; the next section creates a new index object and loads the file into it.
del p

### Re-initializing, loading the index

In [None]:
# Re-create the index object with the same space the index was built with
# ('cosine'), so the reloaded index is consistent with the saved one and with
# the angular ground truth used for the recall below.
p = hnswlib.Index(space='cosine', dim=dim)

# max_elements sets the capacity of the loaded index. It is kept at
# len(dataset) here; pass a larger value if new items will be added afterwards
# (e.g. with p.add_items(...)).
p.load_index("index.bin", max_elements=len(dataset))

p.set_ef(100)
# perf_counter() is a monotonic, high-resolution clock intended for intervals.
start = time.perf_counter()
labels, distances = p.knn_query(queries, k=10)
end = time.perf_counter()

print("Recall:", compute_recall(labels, glove_h5py['neighbors'][:, :10]))
print("Time:", end - start)

Recall: 0.9126
Time: 2.3662092685699463


In [7]:
def objective(trial):
    """Optuna objective: build an index with sampled parameters and return recall@10.

    Samples ``ef_search``, ``M`` and ``ef_construction`` from the trial, builds
    a cosine-space hnswlib index over ``dataset``, searches it with
    ``queries`` and scores the result with ``compute_recall`` against the
    first 10 ground-truth neighbours per query.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Recall of the 10-nearest-neighbour search (higher is better).
    """
    ef_search = trial.suggest_int("ef_search", 50, 300, step=50)
    # The upper bound is 192 (= 12 + 6 * 30). A bound of 200 is not reachable
    # with step=30, so Optuna would adjust it to 192 anyway; stating it
    # explicitly keeps the declared search space equal to the one sampled.
    M = trial.suggest_int("M", 12, 192, step=30)
    ef_construction = trial.suggest_int("ef_construction", 50, 300, step=50)
    # The distance space is fixed; only the three parameters above are tuned.
    space = "cosine"

    dim = dataset.shape[-1]
    p = hnswlib.Index(space=space, dim=dim)  # possible options are l2, cosine or ip
    p.init_index(max_elements=len(dataset), ef_construction=ef_construction, M=M)
    p.set_ef(ef_search)
    p.add_items(dataset)

    # Only the returned labels are needed to compute recall.
    labels, _ = p.knn_query(queries, k=10)
    return compute_recall(labels, glove_h5py['neighbors'][:, :10])


# Search for the parameter set that maximises the value returned by
# `objective`, using 10 trials.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

# Report how many trials ran, then the best trial's value and parameters.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print(f"Value: {trial.value}", "Params:", sep="\n")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2022-07-01 20:02:31,522][0m A new study created in memory with name: no-name-8841d3fe-818f-4350-89e0-a4732de7fbd2[0m
[32m[I 2022-07-01 20:07:03,046][0m Trial 0 finished with value: 0.92649 and parameters: {'ef_search': 150, 'M': 162, 'ef_construction': 50}. Best is trial 0 with value: 0.92649.[0m
[32m[I 2022-07-01 20:12:46,300][0m Trial 1 finished with value: 0.96664 and parameters: {'ef_search': 300, 'M': 192, 'ef_construction': 150}. Best is trial 1 with value: 0.96664.[0m
[32m[I 2022-07-01 20:16:12,731][0m Trial 2 finished with value: 0.95276 and parameters: {'ef_search': 300, 'M': 132, 'ef_construction': 100}. Best is trial 1 with value: 0.96664.[0m
[32m[I 2022-07-01 20:26:52,759][0m Trial 3 finished with value: 0.87181 and parameters: {'ef_search': 50, 'M': 132, 'ef_construction': 300}. Best is trial 1 with value: 0.96664.[0m
[32m[I 2022-07-01 20:30:13,784][0m Trial 4 finished with value: 0.85582 and parameters: {'ef_search': 100, 'M': 42, 'ef_construction'

Number of finished trials:  10
Best trial:
Value: 0.96664
Params:
    ef_search: 300
    M: 192
    ef_construction: 150


In [10]:
# Rebuild the index with the best parameters reported by the study above
# (ef_search=300, M=192, ef_construction=150).
# The dimension is taken from the data instead of being hard-coded.
p = hnswlib.Index(space='cosine', dim=dataset.shape[-1])
p.init_index(max_elements=len(dataset), ef_construction=150, M=192)
p.set_ef(300)
p.add_items(dataset)

# Time the batch search only (index construction above is excluded).
# perf_counter() is a monotonic, high-resolution clock intended for intervals.
start = time.perf_counter()
labels, distances = p.knn_query(queries, k=10)
end = time.perf_counter()

print("Recall:", compute_recall(labels, glove_h5py['neighbors'][:, :10]))
print("Time:", end - start)

Recall: 0.96669
Time: 4.551661968231201


In [11]:
# Time a search for a single query vector (the first test query).
# perf_counter() is used because this interval is only a few milliseconds and
# needs a high-resolution, monotonic clock.
start = time.perf_counter()
labels, distances = p.knn_query([queries[0]], k=10)
end = time.perf_counter()

print("Time:", end - start)

Time: 0.004340648651123047
