## Maximum Inner Product

- https://github.com/soundcloud/cosine-lsh-join-spark
- https://cmry.github.io/notes/euclidean-v-cosine
- https://stats.stackexchange.com/questions/146221/is-cosine-similarity-identical-to-l2-normalized-euclidean-distance
- https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf

## ANN Library

In [12]:
from n2 import HnswIndex

class N2:
    """Small fit/query wrapper around n2's ``HnswIndex``.

    The HNSW build and search settings are constructor arguments; their
    defaults are the values this class previously hard-coded, so ``N2()``
    behaves exactly as before.

    Parameters
    ----------
    m : int, default 16
        Forwarded to ``HnswIndex.build`` as ``m``.
    ef_construction : int, default 200
        Forwarded to ``HnswIndex.build`` as ``ef_construction``.
    n_threads : int, default 8
        Forwarded to ``HnswIndex.build`` as ``n_threads``.
    ef_search : int, default 50
        Forwarded to ``HnswIndex.search_by_vector`` as ``ef_search``.

    Attributes
    ----------
    index_ : HnswIndex
        The built index; only set once ``fit`` has been called.
    """

    def __init__(self, m=16, ef_construction=200, n_threads=8, ef_search=50):
        self.m = m
        self.ef_construction = ef_construction
        self.n_threads = n_threads
        self.ef_search = ef_search

    def fit(self, X):
        """Build an index over the rows of ``X`` and return ``self``.

        ``X`` must expose ``shape[1]`` (used as the index dimension) and be
        iterable row by row; each row is added as one data point.
        """
        index = HnswIndex(dimension=X.shape[1])
        for vector in X:
            index.add_data(vector)

        index.build(
            m=self.m,
            ef_construction=self.ef_construction,
            n_threads=self.n_threads,
        )
        self.index_ = index
        return self

    def query(self, vector, topn):
        """Return the result of searching the fitted index for ``vector``.

        The value is whatever ``HnswIndex.search_by_vector`` returns for
        ``k=topn`` (presumably the ids of the nearest items - confirm against
        the n2 documentation). ``fit`` must have been called first.
        """
        return self.index_.search_by_vector(
            vector, k=topn, ef_search=self.ef_search
        )

In [2]:
import nmslib
import numpy as np

# Round-trip demo: build an HNSW index over random vectors, save it to disk,
# load it into a second index, and print the same query against both.

# Named once so the build/save side and the load side cannot drift apart.
hnsw_space = 'cosinesimil'
index_path = "foo.hnsw"

# Seeded, local generator: re-running the cell after a kernel restart yields
# the same vectors, and NumPy's global random state is left untouched.
rng = np.random.RandomState(42)
data = rng.randn(1000, 10).astype(np.float32)

indexA = nmslib.init(method='hnsw', space=hnsw_space)
indexA.addDataPointBatch(data)
indexA.createIndex({'post': 2}, print_progress=True)

print("Original index:")
print(indexA.knnQuery(data[42], k=3))

indexA.saveIndex(index_path)

# The second index is initialised with the same method/space as the first
# before the saved file is loaded into it.
indexB = nmslib.init(method='hnsw', space=hnsw_space)
indexB.loadIndex(index_path)

print("\n\nDeserialized index:")
print(indexB.knnQuery(data[42], k=3))

Original index:
(array([ 42, 285, 547], dtype=int32), array([0.        , 0.10073507, 0.11398488], dtype=float32))


Deserialized index:
(array([ 42, 285, 547], dtype=int32), array([0.        , 0.10073507, 0.11398488], dtype=float32))


In [12]:
import numpy as np
from n2 import HnswIndex


# NOTE(review): index_params and query_params are defined here but never
# referenced in this cell; build() below repeats 16 / 400 as literals and
# neither 'post' nor 'ef' is passed anywhere. Confirm whether they are used
# by another cell or are leftovers.
index_params = {'M': 16, 'post': 0, 'efConstruction': 400}
query_params = {'ef': 90}

# N rows of dim consecutive integers: row i holds i*dim .. i*dim + dim - 1.
N, dim = 10240, 20
samples = np.arange(N * dim).reshape(N, dim)

# Add every row to the index one at a time, then build it.
index = HnswIndex(dim)
for sample in samples:
    index.add_data(sample)
index.build(m=16, ef_construction=400, n_threads=8)
# Presumably the 10 nearest items to item 0 - confirm against the n2 docs.
# The output recorded for this cell is the list shown below.
print(index.search_by_id(0, 10))
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


## Docker Utility Function

In [None]:
import docker

# Create the Docker client with docker.from_env(); API reference:
# https://docker-py.readthedocs.io/en/stable/client.html
docker_client = docker.from_env()
# Bare expression so the notebook displays the client object.
docker_client

In [None]:
# Ask the Docker client for its image list.
images = docker_client.images.list()
# Display the first entry only.
# NOTE(review): indexing [0] will fail if the list comes back empty -
# confirm at least one image is present before running this cell.
images[0]

In [None]:
def bytes2human(input_bytes):
    """
    Format a byte count using binary (1024-based) unit suffixes.

    The largest unit from K, M, G, T, P, E, Z, Y whose size does not exceed
    ``input_bytes`` is chosen and the scaled value is shown with one decimal
    place. Values below 1024 are returned unscaled with a ``B`` suffix.

    Examples
    --------
    bytes2human(10000) # '9.8K'
    bytes2human(512)   # '512B'

    References
    ----------
    https://github.com/giampaolo/psutil/blob/master/scripts/meminfo.py
    """
    units = ('K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')

    # Try the biggest unit first; the unit at position p (1-based) is 2 ** (10 * p).
    for position in range(len(units), 0, -1):
        unit_size = 1 << position * 10
        if input_bytes >= unit_size:
            scaled = input_bytes / unit_size
            return '%.1f%s' % (scaled, units[position - 1])

    # Smaller than one K: report the raw number.
    return '%sB' % input_bytes

In [None]:
import psutil

# Requires the third-party psutil package (pip install psutil).
# Read the `available` field of psutil's virtual-memory snapshot;
# presumably a byte count, since it is handed to bytes2human - TODO confirm.
available_memory = psutil.virtual_memory().available
# Bare expression so the notebook displays the human-readable string.
bytes2human(available_memory)

## ANN Benchmarks

In [None]:
# python install.py --algo annoy
# python install.py --algo nmslib
# python install.py --algo faiss

In [None]:
# python run.py --dataset glove-100-angular --algo annoy