<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#PySparNN" data-toc-modified-id="PySparNN-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>PySparNN</a></span><ul class="toc-item"><li><span><a href="#Using-Pysparnn-in-Jina" data-toc-modified-id="Using-Pysparnn-in-Jina-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Using Pysparnn in Jina</a></span></li><li><span><a href="#Storing-index-to-disk" data-toc-modified-id="Storing-index-to-disk-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Storing index to disk</a></span></li></ul></li><li><span><a href="#Implementation-details" data-toc-modified-id="Implementation-details-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Implementation details</a></span><ul class="toc-item"><li><span><a href="#Understanding-_build_advanced_index" data-toc-modified-id="Understanding-_build_advanced_index-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Understanding <code>_build_advanced_index</code></a></span></li></ul></li></ul></div>

# PySparNN


- Git: https://github.com/facebookresearch/pysparnn

- Summary: Approximate Nearest Neighbor Search for Sparse Data in Python




In [1]:
import pysparnn.cluster_index as ci

import numpy as np
import scipy
from scipy.sparse import csr_matrix

In [2]:
import pysparnn.cluster_index as ci

from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: every sentence is one document and also the record that the
# search index hands back for a match.
data = ['hello world', 'oh hello there', 'Play it', 'Play it again Sam']

# One integer key per document, in corpus order.
keys = range(len(data))

# Learn the TF-IDF vocabulary on the corpus, then project the corpus into
# that sparse feature space.
tv = TfidfVectorizer().fit(data)
features_vec = tv.transform(data)

# Build the search index over the TF-IDF rows, with the raw sentences as records.
cp = ci.MultiClusterIndex(features_vec, data)


In [3]:
# search the index with a sparse matrix
search_data = [
    'oh there',
    'Play it again Frank'
]

search_features_vec = tv.transform(search_data)
cp.search(search_features_vec, k=5, k_clusters=2, return_distance=False)

[['oh hello there', 'hello world', 'Play it', 'Play it again Sam'],
 ['Play it again Sam', 'Play it', 'hello world', 'oh hello there']]

## Using Pysparnn in Jina

In [247]:
from jina.executors.indexers.vector import BaseVectorIndexer


class PysparnnIndexer(BaseVectorIndexer):
    """
    :class:`PysparnnIndexer` Approximate Nearest Neighbor Search for Sparse Data in Python using PySparNN.

    Vectors are first buffered in ``self.index`` (a dict mapping key -> vector)
    through :meth:`add`, :meth:`update` and :meth:`delete`. The PySparNN
    ``MultiClusterIndex`` (``self.mci``) is built from that buffer by
    :meth:`_build_advanced_index`, which :meth:`query` calls on first use.
    Once the search index exists the buffer can no longer be modified.

    :param k_clusters: forwarded as ``k_clusters`` to ``MultiClusterIndex.search`` on every query
    :param num_indexes: forwarded as ``num_indexes`` to ``MultiClusterIndex.search`` on every query
    """

    def __init__(self, k_clusters=2, num_indexes=None, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.k_clusters = k_clusters
        self.num_indexes = num_indexes

    def post_init(self):
        """Reset the state: an empty key -> vector buffer and no search index built."""
        self.index = {}
        self.mci = None

    def _raise_if_built(self):
        """Refuse buffer modifications once the search index exists.

        The search index is built once from the buffer and is not refreshed
        afterwards, so later modifications would silently be ignored by queries.

        :raises Exception: if the search index has already been built
        """
        if self.mci is not None:
            raise Exception('Not possible to modify the index after the search index has been built')

    def _build_advanced_index(self):
        """Build the PySparNN ``MultiClusterIndex`` from the buffered vectors.

        The buffered vectors are stacked row-wise with ``scipy.sparse.vstack``,
        so they are expected to be sparse matrices with the same number of columns.

        :raises ValueError: if no vectors have been added yet
        """
        import pysparnn.cluster_index as ci

        if not self.index:
            raise ValueError('Cannot build the search index: no vectors have been added')

        # keys() and values() of the same, unmodified dict iterate in matching order.
        keys = list(self.index.keys())
        # vstack needs a real sequence; a dict view is not reliably accepted.
        indexed_vectors = list(self.index.values())

        self.mci = ci.MultiClusterIndex(scipy.sparse.vstack(indexed_vectors), keys)

    def query(self, vectors, top_k, *args, **kwargs):
        """Find the ``top_k`` nearest indexed vectors for every query vector.

        The search index is built from the buffer on the first call if it does
        not exist yet. Results keep the order in which
        ``MultiClusterIndex.search`` returns them.

        :param vectors: the vectors with which to search
        :param top_k: number of results to return
        :param args: not used
        :param kwargs: not used
        :return: a tuple of two ndarrays ``(indices, distances)``.
            The first array contains the keys of the matches, the second array
            the corresponding distances. If `n_vectors = vectors.shape[0]` and
            every query yields `top_k` matches, both arrays have shape
            `n_vectors x top_k`.
        """
        if self.mci is None:
            self._build_advanced_index()

        # Each record is a list of (distance, key) pairs for one query vector.
        index_distance_pairs = self.mci.search(vectors,
                                               k=top_k,
                                               k_clusters=self.k_clusters,
                                               num_indexes=self.num_indexes,
                                               return_distance=True)
        indices = [[key for _, key in record] for record in index_distance_pairs]
        distances = [[distance for distance, _ in record] for record in index_distance_pairs]

        return np.array(indices), np.array(distances)

    def add(self, keys, vectors, *args, **kwargs):
        """Buffer ``vectors`` under ``keys`` (paired position by position).

        :param keys: one key per vector
        :param vectors: the vectors to buffer
        :param args: not used
        :param kwargs: not used
        :raises Exception: if the search index has already been built
        """
        self._raise_if_built()
        for key, vector in zip(keys, vectors):
            self.index[key] = vector

    def update(
            self, keys, vectors, *args, **kwargs
    ) -> None:
        """Replace (or insert) the buffered vectors stored under ``keys``.

        :param keys: one key per vector
        :param vectors: the new vectors
        :param args: not used
        :param kwargs: not used
        :raises Exception: if the search index has already been built
        """
        self._raise_if_built()
        for key, vector in zip(keys, vectors):
            self.index[key] = vector

    def delete(self, keys, *args, **kwargs) -> None:
        """Remove the buffered vectors stored under ``keys``.

        :param keys: the keys to remove
        :param args: not used
        :param kwargs: not used
        :raises Exception: if the search index has already been built
        :raises KeyError: if a key is not in the buffer
        """
        self._raise_if_built()
        for key in keys:
            del self.index[key]

    def store_index_to_disk(self, vectors_path='./vectors.npz', indices_path='./indices.npy'):
        """Store self.index to disk

        The vectors are stacked row-wise into one sparse matrix and the keys are
        saved in the same order, so row ``i`` of the matrix belongs to key ``i``.

        :param vectors_path: destination of the stacked vectors; should end in
            ``.npz`` so that the same path can be used for loading
        :param indices_path: destination of the keys
        :raises ValueError: if there are no vectors to store
        """
        if not self.index:
            raise ValueError('Cannot store the index: no vectors have been added')

        # vstack needs a real sequence; a dict view is not reliably accepted.
        scipy.sparse.save_npz(vectors_path, scipy.sparse.vstack(list(self.index.values())))

        with open(indices_path, 'wb') as f:
            np.save(f, list(self.index.keys()))

    def load_index_from_disk(self, vectors_path='./vectors.npz', indices_path='./indices.npy'):
        """Load self.index from disk

        Replaces the buffer with the stored key -> vector pairs and drops any
        search index built from the previous buffer, so the next query rebuilds
        it from the loaded data.

        :param vectors_path: file written by :meth:`store_index_to_disk` for the vectors
        :param indices_path: file written by :meth:`store_index_to_disk` for the keys
        """
        # CSR guarantees that single rows can be extracted by integer indexing.
        vectors = scipy.sparse.load_npz(vectors_path).tocsr()

        with open(indices_path, 'rb') as f:
            indices = np.load(f)

        self.index = {key: vectors[row] for row, key in zip(range(vectors.shape[0]), indices)}
        # A search index built earlier would describe the old buffer.
        self.mci = None

In [248]:
# Create the indexer with its default settings (k_clusters=2, num_indexes=None).
indexer = PysparnnIndexer()

PysparnnIndexer@4248[I]:post_init may take some time...
PysparnnIndexer@4248[I]:post_init may take some time takes 0 seconds (0.00s)


In [249]:
# Explicitly (re)initialise the indexer: empty vector buffer, no search index built.
# NOTE(review): the constructor's log output suggests post_init already ran
# at construction time - confirm whether this explicit call is needed.
indexer.post_init()

In [250]:
# Buffer every TF-IDF row in the indexer, keyed by its position in `data`.
for index in range(len(data)):
    indexer.add(keys=[index], vectors=[features_vec[index]])

In [251]:
# Build the PySparNN search index from the buffered vectors.
# From here on add/update/delete raise, because `mci` is no longer None.
indexer._build_advanced_index()

In [252]:
# Query the indexer with the vectorized search sentences;
# `query` returns the matched keys and their distances as two arrays.
vectors = search_features_vec
indices, distances = indexer.query(vectors, top_k=4)

In [253]:
# Keys of the matches, one row per query sentence.
indices

array([[1., 0., 2., 3.],
       [3., 2., 0., 1.]])

In [254]:
# Distances paired position by position with the keys in `indices`.
distances

array([[0.12656138, 1.        , 1.        , 1.        ],
       [0.16833831, 0.2555503 , 1.        , 1.        ]])

## Storing index to disk

In [255]:
# Persist the buffered vectors and their keys to ./vectors.npz and ./indices.npy.
indexer.store_index_to_disk()

In [256]:
ls

01_introduction_pysparnn.ipynb       [34m__main__.PysparnnIndexer-6d5683ae-0[m[m/ pysparnn_integration.ipynb
02_pysparnn_in_jina.ipynb            indices.npy                          vectors.npz


In [260]:
# Start from a fresh indexer to show that the buffer can be restored from disk.
indexer = PysparnnIndexer()

PysparnnIndexer@4248[I]:post_init may take some time...
PysparnnIndexer@4248[I]:post_init may take some time takes 0 seconds (0.00s)


In [261]:
# The buffer of the fresh indexer is empty.
indexer.index

{}

In [262]:
# Rebuild the key -> vector buffer from ./vectors.npz and ./indices.npy.
indexer.load_index_from_disk()

In [263]:
# The buffer now holds one sparse row per stored key again.
indexer.index

{0: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 1: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 3 stored elements in Compressed Sparse Row format>,
 2: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 3: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 4 stored elements in Compressed Sparse Row format>}

# Implementation details

## Understanding `_build_advanced_index`

In [32]:
# Fresh indexer with an empty buffer and no search index built yet.
indexer = PysparnnIndexer()
indexer.post_init()

PysparnnIndexer@4248[I]:post_init may take some time...
PysparnnIndexer@4248[I]:post_init may take some time takes 0 seconds (0.00s)


In [33]:
# Buffer every TF-IDF row in the indexer, keyed by its position in `data`.
for index in range(len(data)):
    indexer.add(keys=[index], vectors=[features_vec[index]])

In [34]:
# Inspect the buffer: a dict mapping each key to the vector stored under it.
indexer.index

{0: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 1: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 3 stored elements in Compressed Sparse Row format>,
 2: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 3: <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 4 stored elements in Compressed Sparse Row format>}

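You can `fit` the indexer using `_build_advanced_index`. The cells below reproduce, step by step, what that method does: collect the keys and vectors from `indexer.index`, stack the vectors into one sparse matrix, and build a `MultiClusterIndex` from them.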

In [35]:
# Reproduce by hand the first step of `PysparnnIndexer._build_advanced_index`:
# split the buffer into a list of keys and a list of vectors in matching order.
keys = []
indexed_vectors = []
import pysparnn.cluster_index as ci
for key, vector in indexer.index.items():
    keys.append(key)
    indexed_vectors.append(vector)

In [36]:
# One sparse row per buffered document, in the same order as `keys`.
indexed_vectors

[<1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 3 stored elements in Compressed Sparse Row format>,
 <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 <1x8 sparse matrix of type '<class 'numpy.float64'>'
 	with 4 stored elements in Compressed Sparse Row format>]

In [37]:
# Second step: stack the rows into one sparse matrix and build the PySparNN
# index over it, with `keys` as the records returned by searches.
aux = ci.MultiClusterIndex(scipy.sparse.vstack(indexed_vectors), keys)