https://github.com/soundcloud/cosine-lsh-join-spark

In [1]:
import os
import requests


def download(url, filename):
    with open(filename, "wb") as file:
        response = requests.get(url)
        file.write(response.content)


DATA_DIR = './datasets/'
URL = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip'
filename = os.path.join(DATA_DIR, 'glove.twitter.27B.zip')
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

if not os.path.exists(filename):
    download(URL, filename)

In [2]:
import zipfile
import numpy as np
from sklearn.model_selection import train_test_split


def get_train_test_data(filename):
    dimension = 25  # valid values 25, 50, 100, 200
    test_size = 0.2
    random_state = 1234

    with zipfile.ZipFile(filename) as f:
        X = []
        zip_filename = 'glove.twitter.27B.{}d.txt'.format(dimension)
        for line in f.open(zip_filename):
            # remove the first index, id field and only get the vectors
            vector = np.array([float(x) for x in line.strip().split()[1:]])
            X.append(vector)

        X_train, X_test = train_test_split(
            np.array(X), test_size=test_size, random_state=random_state)

    # downsample for experimentation purpose
    # X_train = X_train[:50000]
    # X_test = X_test[:10000]
    return X_train, X_test


X_train, X_test = get_train_test_data(filename)
print('training data shape: ', X_train.shape)
print('testing data shape: ', X_test.shape)

training data shape:  (954811, 25)
testing data shape:  (238703, 25)


In [3]:
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors


class BruteForce:

    def __init__(self):
        pass

    def fit(self, X):
        lens = (X ** 2).sum(axis=-1)
        index = X / np.sqrt(lens)[:, np.newaxis]
        self.index_ = np.ascontiguousarray(index, dtype=np.float32)
        return self

    def query(self, vector, topn):
        """Find indices of `n` most similar vectors from the index to query vector `v`."""

        # argmax_a dot(a, b) / |a||b| = argmin_a -dot(a, b)
        dists = -np.dot(self.index_, vector)
        indices = np.argpartition(dists, topn)[:topn]
        return sorted(indices, key=lambda index: dists[index])


class KDTree:

    def __init__(self, topn=10, n_jobs=-1):
        self.topn = topn
        self.n_jobs = n_jobs

    def fit(self, X):
        X_normed = normalize(X)
        index = NearestNeighbors(
            n_neighbors=self.topn, metric='euclidean', n_jobs=self.n_jobs)
        index.fit(X_normed)
        self.index_ = index
        return self

    def query_batch(self, X):
        X_normed = normalize(X)
        _, indices = self.index_.kneighbors(X_normed)
        return indices

    def query(self, vector):
        """Find indices of `n` most similar vectors from the index to query vector `v`."""
        vector_normed = normalize(vector.reshape(1, -1))
        _, indices = self.index_.kneighbors(vector_normed)
        return indices.ravel()

In [4]:
import time


def get_ground_truth(X_train, X_test, kdtree_params):
    start = time.time()
    kdtree = KDTree()
    kdtree.fit(X_train)
    build_time = time.time() - start

    start = time.time()
    indices = kdtree.query_batch(X_test)
    search_time = time.time() - start

    ground_truth = [(vector, index) for vector, index in zip(X_test, indices)]
    return build_time, search_time, ground_truth

In [5]:
from joblib import dump, load


MODEL_DIR = 'model'
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR)

ground_truth_filename = 'ground_truth.pkl'
ground_truth_filepath = os.path.join(MODEL_DIR, ground_truth_filename)
print('ground truth filepath: ', ground_truth_filepath)

if os.path.exists(ground_truth_filepath):
    ground_truth = load(ground_truth_filepath)
else:
    # using a setting of kdtree_params = {'topn': 10, 'n_jobs': -1},
    # it took at least 1 hour to finish on a 8 core laptop
    kdtree_params = {'topn': 10, 'n_jobs': -1}
    build_time, search_time, ground_truth = get_ground_truth(X_train, X_test, kdtree_params)
    dump(ground_truth, ground_truth_filepath)

ground_truth[0]

ground truth filepath:  model/ground_truth.pkl


(array([ 0.84227,  0.19005,  1.5346 ,  0.88995, -1.6548 , -0.60046,
        -1.3206 , -1.5521 , -0.30763, -0.56361,  1.5054 ,  3.2881 ,
         1.7582 , -0.63313, -0.48781,  2.0016 , -2.5334 ,  1.0601 ,
        -0.19666, -0.38252,  0.65653,  0.89475,  2.7882 ,  2.4109 ,
        -0.72981]),
 array([213945, 566700, 232533, 673941,  79801, 932371,  59183, 318977,
        649659, 871934]))

In [6]:
import nmslib


class Hnsw:

    def __init__(self, index_params=None, query_params=None, print_progress=True):
        self.index_params = index_params
        self.query_params = query_params
        self.print_progress = print_progress

    def fit(self, X):
        index_params = self.index_params
        if index_params is None:
            index_params = {'M': 16, 'post': 0, 'efConstruction': 400}

        query_params = self.query_params
        if query_params is None:
            query_params = {'ef': 90}

        index = nmslib.init(space='cosinesimil', method='hnsw')
        index.addDataPointBatch(X)
        index.createIndex(index_params, print_progress=self.print_progress)
        index.setQueryTimeParams(query_params)

        self.index_ = index
        self.index_params_ = index_params
        self.query_params_ = query_params
        return self

    def query(self, vector, topn):
        indices, distances = self.index_.knnQuery(vector, k=topn)
        return indices

In [6]:
start = time.time()
hnsw = Hnsw()
hnsw.fit(X_train)
build_time = time.time() - start
build_time

369.5593509674072

In [7]:
topn = 10

query_vector, correct_indices = ground_truth[0]
start = time.time()
found_indices = hnsw.query(query_vector, topn)
search_time = time.time() - start
print(search_time)
found_indices

0.002225160598754883


array([213945, 566700, 232533, 673941, 932371,  59183, 318977, 649659,
       871934, 221617], dtype=int32)

In [8]:
precision = len(set(found_indices).intersection(correct_indices)) / topn
precision

0.9

In [14]:
from tqdm import trange


def run_algo(X_train, X_test, topn, ground_truth, algo_type, algo_params):
    
    if algo_type == 'hsnw':
        algo = Hnsw()
    elif algo_type == 'n2':
        algo = N2()

    start = time.time()
    algo.fit(X_train)
    build_time = time.time() - start

    total_correct = 0
    total_search_time = 0.0
    n_queries = len(ground_truth)
    for i in trange(n_queries):
        query_vector, correct_indices = ground_truth[i]

        start = time.time()
        found_indices = algo.query(query_vector, topn)
        search_time = time.time() - start
        total_search_time += search_time

        n_correct = len(set(found_indices).intersection(correct_indices))
        total_correct += n_correct

    avg_search_time = total_search_time / n_queries
    avg_precision = total_correct / (n_queries * topn)
    return build_time, avg_search_time, avg_precision

In [15]:
algo_type = 'hsnw'
algo_params = {
    'index_params': {'M': 16, 'post': 0, 'efConstruction': 100}
}

build_time, avg_search_time, avg_precision = run_algo(
    X_train, X_test, topn, ground_truth, algo_type, algo_params)
print('build time: ', build_time)
print('average search time: ', avg_search_time)
print('average precision: ', avg_precision)

100%|██████████| 238703/238703 [00:46<00:00, 5136.39it/s]

build time:  292.62172412872314
average search time:  0.00016463863871264096
average precision:  0.9770141137731826





In [9]:


build_time, avg_search_time, avg_precision = run_algo(
    X_train, X_test, topn, ground_truth)
print('build time: ', build_time)
print('average search time: ', avg_search_time)
print('average precision: ', avg_precision)

100%|██████████| 238703/238703 [00:45<00:00, 5289.05it/s]


build time:  308.47004795074463
average search time:  0.00015883217352200492
average precision:  0.9770610340045999


In [12]:
from n2 import HnswIndex

class N2:

    def __init__(self):
        pass

    def fit(self, X):
        index = HnswIndex(dimension=X.shape[1])
        for vector in X:
            index.add_data(vector)

        index.build(m=16, ef_construction=200, n_threads=8)
        self.index_ = index
        return self

    def query(self, vector, topn):
        indices = self.index_.search_by_vector(vector, k=topn, ef_search=50)
        return indices

In [17]:
start = time.time()
n2 = N2()
n2.fit(X_train)
build_time = time.time() - start
build_time

149.39423203468323

In [21]:
topn = 10

query_vector, correct_indices = ground_truth[0]
start = time.time()
# found_indices = n2.query(query_vector, topn)
found_indices = n2.index_.search_by_vector(query_vector, k=topn, ef_search=100)
search_time = time.time() - start
print(search_time)
found_indices

0.0003478527069091797


[232533, 213945, 932371, 673941, 566700, 59183, 318977, 221617, 431056, 908013]

In [22]:
correct_indices

array([213945, 566700, 232533, 673941,  79801, 932371,  59183, 318977,
       649659, 871934])

In [19]:
precision = len(set(found_indices).intersection(correct_indices)) / topn
precision

0.7

In [13]:
algo_type = 'n2'
topn = 10
build_time, avg_search_time, avg_precision = run_algo(X_train, X_test, topn, ground_truth, algo_type)
print('build time: ', build_time)
print('average search time: ', avg_search_time)
print('average precision: ', avg_precision)

100%|██████████| 238703/238703 [00:22<00:00, 10387.34it/s]

build time:  170.42729496955872
average search time:  8.815961418996128e-05
average precision:  0.5988177777405395





In [10]:
hi

NameError: name 'hi' is not defined

In [None]:
import nmslib

index = nmslib.init(space='cosinesimil', method='hnsw')
# for i, x in enumerate(X):
#     nmslib.addDataPoint(self._index, i, x.tolist())

#index.addDataPointBatch(X_train)

#index_params = {'post': 2}
#index.createIndex(index_params, print_progress=True)

In [12]:
import numpy as np
from n2 import HnswIndex


index_params = {'M': 16, 'post': 0, 'efConstruction': 400}
query_params = {'ef': 90}

N, dim = 10240, 20
samples = np.arange(N * dim).reshape(N, dim)

index = HnswIndex(dim)
for sample in samples:
    index.add_data(sample)
index.build(m=16, ef_construction=400, n_threads=8)
print(index.search_by_id(0, 10))
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [13]:
index.search_by_vector(samples[0], k=5, ef_search=90)

[36, 37, 38, 39, 40]

In [None]:
# neighbours = index.knnQueryBatch(data, k=10, num_threads=4)
ids, distances = index.knnQuery(X_test[0], k=10)
ids

In [None]:
# if os.path.exists(self._index_name):
#     logging.debug("Loading index from file")
#     nmslib.loadIndex(self._index, self._index_name)
# else:
#     logging.debug("Create Index")
#     nmslib.createIndex(self._index, self._index_param)
#     if self._save_index:
#         nmslib.saveIndex(self._index, self._index_name)

# nmslib.setQueryTimeParams(self._index, self._query_param)

# def query(self, v, n):
# import nmslib
# return nmslib.knnQuery(self._index, n, v.tolist())

# def freeIndex(self):
# import nmslib
# nmslib.freeIndex(self._index)

In [None]:
import nmslib


# initialize a new index, using a HNSW index on Cosine Similarity
index = nmslib.init(method='hnsw', space='cosinesimil')
# index.addDataPointBatch(data)
# index.createIndex({'post': 2}, print_progress=True)

# query for the nearest neighbours of the first datapoint
# ids, distances = index.knnQuery(data[0], k=10)

In [None]:
hi

## Docker Utility Function

In [None]:
import docker

# https://docker-py.readthedocs.io/en/stable/client.html
docker_client = docker.from_env()
docker_client

In [None]:
images = docker_client.images.list()
images[0]

In [None]:
def bytes2human(input_bytes):
    """
    
    Examples
    --------
    bytes2human(10000) # '9.8K'
    
    References
    ----------
    https://github.com/giampaolo/psutil/blob/master/scripts/meminfo.py
    """
    symbols = ('K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
    prefix = {}
    for i, s in enumerate(symbols):
        prefix[s] = 1 << (i + 1) * 10

    for s in reversed(symbols):
        if input_bytes >= prefix[s]:
            value = input_bytes / prefix[s]
            return '%.1f%s' % (value, s)

    return '%sB' % input_bytes

In [None]:
import psutil

# pip install psutil
available_memory = psutil.virtual_memory().available
bytes2human(available_memory)

In [None]:
import numpy as np
from scipy.spatial.distance import cosine

vec1 = np.array([1.0, 2.0, 3.0])
vec2 = np.array([2.0, 5.0, 1.0])
vec3 = np.array([3.0, 4.0, 0.0])
vec4 = np.array([4.0, 3.0, 2.0])

cosine(vec1, vec2)

In [None]:
1 - vec1.dot(vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

## Maximum Inner Product

In [None]:
# python install.py --algo annoy
# python install.py --algo nmslib
# python install.py --algo faiss

In [None]:
# python run.py --dataset glove-100-angular --algo annoy

- https://cmry.github.io/notes/euclidean-v-cosine
- https://stats.stackexchange.com/questions/146221/is-cosine-similarity-identical-to-l2-normalized-euclidean-distance