# Example usage of a Faiss index

In [1]:
import faiss
import numpy as np

from autofaiss.datasets.readers.local_iterators import read_embeddings_local
from autofaiss.datasets.readers.remote_iterators import read_embeddings_remote, read_mapping_remote
from autofaiss.external.optimize import get_optimal_hyperparameters
from autofaiss.external.scores import compute_fast_metrics
from autofaiss.indices.index_utils import load_index_from_hdfs, set_search_hyperparameters
from autofaiss.utils.decorators import Timeit
from autofaiss.utils.hdfs_utils import HDFSFileSystem


## Loading index, inverse-mapping and some product embeddings from HDFS

In [2]:
# HDFS locations for the "0 views" embeddings: the serialized index, the directory of
# raw vectors, and the inverse-mapping array.
# NOTE: the next cell reassigns these same three names, so when the notebook is run
# top to bottom these values are overwritten before anything reads them.
INDEX_PATH_ON_HDFS = "/user/v.paltz/tests_autofaiss/image_embeddings_0_views_US/OPQ64_256,IVF131072_HNSW32,PQ64x8-nprobe=16,efSearch=32,ht=2048.index"
VECTORS_PATH_ON_HDFS = "/user/deepr/dev/r.beaumont/image_embeddings_0_views_US"
INVERSE_MAPPING_PATH_ON_HDFS = "/user/v.paltz/tests_autofaiss/image_embeddings_0_views_US/inverse_mapping.npy"

In [3]:
# HDFS locations for the "100 views" embeddings: the serialized index, the directory of
# raw vectors, and the inverse-mapping array.
# These assignments replace the values set in the previous cell, so they are the ones
# every later cell reads when the notebook is executed in order.
INDEX_PATH_ON_HDFS = "/user/v.paltz/tests_autofaiss/image_embeddings_100_views_US/OPQ64_256,IVF65536_HNSW32,PQ64x8-nprobe=256,efSearch=512,ht=2048.index"
VECTORS_PATH_ON_HDFS = "/user/deepr/dev/r.beaumont/image_embeddings_100_views_US"
INVERSE_MAPPING_PATH_ON_HDFS = "/user/v.paltz/tests_autofaiss/image_embeddings_100_views_US/inverse_mapping.npy"

In [4]:
# Load the index stored at INDEX_PATH_ON_HDFS into `my_index`, which every later cell
# queries. Timeit prints its label and the elapsed time (see the cell output).
with Timeit("Loading index from hdfs"):
    my_index = load_index_from_hdfs(INDEX_PATH_ON_HDFS)

Loading index from hdfs
>>> Finished in 17.9041 secs


In [5]:
# Read the inverse-mapping .npy file straight from HDFS into a NumPy array.
# The timer and the HDFS connection share one `with` statement; both are still
# released in reverse order once the file handle has been closed.
with Timeit("Loading mapping from hdfs"), HDFSFileSystem() as hdfs:
    # Binary mode is required because np.load parses the raw .npy bytes.
    with hdfs.open(INVERSE_MAPPING_PATH_ON_HDFS, mode="rb") as inverse_mapping_file:
        inverse_mapping = np.load(inverse_mapping_file)

Loading mapping from hdfs
>>> Finished in 5.7593 secs


In [6]:
# Pull only the first item yielded by the remote embeddings reader; the resulting
# `example_vectors` array is the pool from which the KNN cell picks its query vector.
with Timeit("Loading a few vectors from hdfs"):
    example_vectors = next(read_embeddings_remote(VECTORS_PATH_ON_HDFS, verbose=False))

Loading a few vectors from hdfs
>>> Finished in 2.8073 secs


## Sanity checks

In [7]:
# Sanity checks tying together the index, the inverse mapping and the sampled embeddings.
# Each assert carries a message so a failure explains itself in the notebook output.

# The mapping must hold exactly one 2-column row per vector stored in the index.
expected_mapping_shape = (my_index.ntotal, 2)
assert inverse_mapping.shape == expected_mapping_shape, (
    f"inverse mapping has shape {inverse_mapping.shape}, expected {expected_mapping_shape}"
)

# The first rows of the mapping must match the first batch read from the vectors directory.
example_mapping_remote = next(read_mapping_remote(VECTORS_PATH_ON_HDFS, verbose=False, stack_input=2))
# np.array_equal checks the shapes before the content, so a misaligned batch fails the
# assertion cleanly instead of being broadcast (or raising) inside an elementwise `==`.
assert np.array_equal(
    inverse_mapping[:example_mapping_remote.shape[0]], example_mapping_remote
), "inverse mapping does not match the mapping read from the vectors directory"

# Optional check against a local copy of the embeddings. It needs the files to be
# present on this machine, so it is off by default; flip the flag to run it.
CHECK_LOCAL_EMBEDDINGS = False
if CHECK_LOCAL_EMBEDDINGS:
    # Verify the local embeddings arrays are in the same order
    LOCAL_EMBEDDING_PATH = "/home/v.paltz/downloaded_vectors/image_embeddings_100_views_US"
    example_vectors_local = next(read_embeddings_local(LOCAL_EMBEDDING_PATH, verbose=False, stack_input=1))
    assert np.array_equal(example_vectors, example_vectors_local), (
        "local embeddings differ from the embeddings read from HDFS"
    )

## Printing information about the current index

In [8]:
from pprint import pprint

# Compute the quick metrics for `my_index` against the vectors directory, then
# pretty-print them. Both steps stay inside the timer so the reported duration
# covers the same work as before.
with Timeit("Get basic infos about index"):
    index_metrics = compute_fast_metrics(VECTORS_PATH_ON_HDFS, my_index)
    pprint(index_metrics)

Get basic infos about index
{'average speed': '14.86 ms/query',
 'compression ratio': 69,
 'nb vectors': 32431891,
 'reconstruction error': '45.2803%',
 'size': '2.3G',
 'vectors dimension': 1280}
>>> Finished in 17.4455 secs


## KNN Search on random products

In [9]:
from autofaiss.catalog.display import display_results, display_product
from autofaiss.catalog.external_requests import product_partner_to_str
from autofaiss.indices.index_utils import quantize_vec_without_modifying_index
from autofaiss.indices.search import knn_query

In [10]:
# Display settings for the nearest-neighbour demo.
MAX_PER_LINE = 10            # results shown per row of widgets
NB_PRODUCTS_TO_SHOW = 30     # neighbours displayed after skipping START_FROM
START_FROM = 0               # number of top-ranked neighbours to skip
QUANTISE_BEFORE_QUERY = True
# Set to a row number of `example_vectors` to replay a specific query (the last one
# drawn is available as `num_product`); None keeps the original random draw.
QUERY_PRODUCT_INDEX = None


if QUERY_PRODUCT_INDEX is None:
    num_product = np.random.randint(example_vectors.shape[0])
else:
    num_product = QUERY_PRODUCT_INDEX
product_embedding = example_vectors[num_product]

if QUANTISE_BEFORE_QUERY:
    # The helper is handed a one-row batch; [0] takes the single row back out.
    product_embedding = quantize_vec_without_modifying_index(my_index, np.expand_dims(product_embedding, 0))[0]


# Show the query product itself.
# NOTE(review): this assumes row i of `example_vectors` and row i of `inverse_mapping`
# describe the same product -- confirm against the readers' batching.
display_product(tuple(inverse_mapping[num_product]))

# Ask for START_FROM extra neighbours, then drop them from the front.
product_nums = knn_query(my_index, product_embedding, START_FROM + NB_PRODUCTS_TO_SHOW, inverse_mapping)[START_FROM:]

# Step over what was actually returned, so a result list shorter than
# NB_PRODUCTS_TO_SHOW does not produce empty rows.
for row_start in range(0, len(product_nums), MAX_PER_LINE):
    display_results(product_nums[row_start:row_start + MAX_PER_LINE])

HBox(children=(VBox(children=(Label(value='134.6444'), HTML(value='<img src="https://cdn.homes.com/x2/@v=-1053…

HBox(children=(VBox(children=(Label(value='131.4819'), HTML(value='<img src="https://cdn.homes.com/x2/@v=-2011…

HBox(children=(VBox(children=(Label(value='130.6400'), HTML(value='<img src="https://cdn.homes.com/x2/@v=41933…

## Bonus: tuning the hyperparameters

In [11]:
# The index file is named "<index key>-<search parameters>.index": keep the last
# path component and cut it at the first "-" to recover the index key.
index_key = INDEX_PATH_ON_HDFS.rsplit("/", 1)[-1].partition("-")[0]

# Search for new query-time parameters under the given max_speed budget.
# NOTE(review): the unit of max_speed is not visible here -- presumably ms/query; confirm.
with Timeit("Compute best hyperparameters"):
    index_param = get_optimal_hyperparameters(my_index, index_key, max_speed=1.0)

print(f"New index params: {index_param}")

# Hand the parameters found above to `my_index` (CPU only); no return value is used.
with Timeit("Set search hyperparameters for the index"):
    set_search_hyperparameters(my_index, index_param, use_gpu=False)

Compute best hyperparameters
>>> Finished in 33.8860 secs
New index params: nprobe=8,efSearch=16,ht=2048
Set search hyperparameters for the index
>>> Finished in 0.0001 secs
