In [26]:
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.


import numpy as np
import time

from sklearn.datasets import fetch_20newsgroups
from sklearn.neighbors import LSHForest
from sklearn.feature_extraction import DictVectorizer

In [27]:
import pysparnn

In [28]:
# Load the complete 20 Newsgroups corpus (subset='all'); shuffling with a fixed
# random_state keeps the document order the same from run to run.
dataset = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)

In [4]:
# Tokenize every post on whitespace. Posts have different token counts, so the
# array has to be created with dtype=object: current NumPy releases refuse to
# infer a ragged array and raise ValueError without it.
docs = np.array([x.split() for x in dataset.data], dtype=object)
# One integer id per document, equal to the document's position in `docs`.
datas = np.arange(len(docs))

In [5]:
class SNNSearch:
    """Nearest-neighbour search over tokenized documents using a
    ``pysparnn.ClusterIndex``.

    Every document is reduced to a ``{token: 1}`` dict (token presence only,
    repeated tokens collapse) before it is indexed or used as a query.
    """

    def __init__(self, docs, datas):
        """Build the cluster index.

        docs  -- iterable of documents, each an iterable of hashable tokens.
        datas -- passed unchanged as the second argument of
                 ``pysparnn.ClusterIndex``, next to the document features.
        """
        self.cp = pysparnn.ClusterIndex(self._to_features(docs), datas)

    @staticmethod
    def _to_features(docs):
        """Turn each tokenized document into a ``{token: 1}`` dict."""
        return [dict.fromkeys(d, 1) for d in docs]

    def search(self, docs):
        """Query the index with a batch of tokenized documents.

        Returns whatever ``ClusterIndex.search`` returns when asked for a
        single neighbour (k=1) from a single cluster (k_clusters=1) with
        ``return_similarity=False``.
        """
        return self.cp.search(self._to_features(docs),
                              return_similarity=False, k=1, k_clusters=1)
        

# Build the PySparNN index over the whole corpus and print the wall-clock
# build time in seconds.
t0 = time.time()
snn_search = SNNSearch(docs, datas)
snn_build_seconds = time.time() - t0
print(snn_build_seconds)

3.6509988308


In [6]:
class LSHSearch:
    """Nearest-neighbour search over tokenized documents using scikit-learn's
    ``LSHForest`` on top of a ``DictVectorizer``.

    Every document is reduced to a ``{token: 1}`` dict (token presence only)
    and vectorized before it is indexed or used as a query.

    NOTE(review): ``LSHForest`` was deprecated in scikit-learn 0.19 and removed
    in 0.21, so this class needs an older scikit-learn to run.
    """

    def __init__(self, docs):
        """Fit the vectorizer and the LSH forest on ``docs``.

        docs -- iterable of documents, each an iterable of hashable tokens.
        """
        self.lshf = LSHForest(n_estimators=1, n_candidates=1,
                              n_neighbors=1)
        self.dv = DictVectorizer()
        # fit_transform learns the vocabulary and vectorizes in a single pass,
        # equivalent to the separate fit() + transform() on the same data.
        self.lshf.fit(self.dv.fit_transform(self._to_dicts(docs)))

    @staticmethod
    def _to_dicts(docs):
        """Turn each tokenized document into a ``{token: 1}`` dict."""
        return [dict.fromkeys(d, 1) for d in docs]

    def search(self, docs):
        """Query the forest with a batch of tokenized documents.

        Returns the result of ``LSHForest.kneighbors`` called with
        ``return_distance=False`` on the vectorized queries.
        """
        return self.lshf.kneighbors(self.dv.transform(self._to_dicts(docs)),
                                    return_distance=False)
    
# Build the LSHForest index over the whole corpus and print the wall-clock
# build time in seconds.
t0 = time.time()
lsh = LSHSearch(docs)
lsh_build_seconds = time.time() - t0
print(lsh_build_seconds)

4.54637718201


### Compare query speed and accuracy

In [7]:
import time
import random
def accuracy(result, truth):
    """Mark, for each query, whether the expected item was returned.

    result -- one container of returned items per query.
    truth  -- the expected item for each query, in the same order.

    Returns a NumPy array holding 1 where the expected item is contained in
    the matching entry of ``result`` and 0 otherwise. Pairs are formed with
    ``zip``, so the output is as long as the shorter input.
    """
    hits = [int(expected in found) for found, expected in zip(result, truth)]
    return np.array(hits)



def time_it(search_index, docs, query_index):
    """Run one batch of queries and measure it.

    search_index -- object exposing ``search(docs)``.
    docs         -- indexable collection of documents; ``docs[query_index]``
                    selects the query batch.
    query_index  -- positions of the query documents. They are also used as
                    the expected answers when scoring the results.

    Returns ``(elapsed_seconds, mean_accuracy)`` for the whole batch.
    """
    # Selecting the batch happens inside the timed region, as part of the call.
    started = time.time()
    found = search_index.search(docs[query_index])
    elapsed = time.time() - started

    hit_rate = accuracy(found, query_index).mean()
    return elapsed, hit_rate

def time_it_n(search_index, docs, n=100, k_docs=100, seed=None):
    """Average search time and accuracy over several random query batches.

    search_index -- object exposing ``search(docs)``.
    docs         -- indexable collection of documents to sample queries from.
    n            -- number of batches to run.
    k_docs       -- number of distinct documents sampled for each batch.
    seed         -- optional seed for the batch sampling. When given, the same
                    batches are drawn on every call; when None (default) the
                    module-level ``random`` state is used, as before.

    Returns ``(mean_seconds_per_batch, mean_accuracy)``. The time is the
    average duration of one ``search`` call over ``k_docs`` documents.
    """
    # A private generator makes seeded runs repeatable without disturbing the
    # global `random` state.
    sampler = random if seed is None else random.Random(seed)

    batch_seconds = []
    batch_accuracy = []
    for _ in range(n):
        query_index = sampler.sample(range(len(docs)), k_docs)
        # Local names deliberately avoid `time` and `accuracy`, which would
        # shadow the module and the scoring function inside this function.
        elapsed, hit_rate = time_it(search_index, docs, query_index)
        batch_seconds.append(elapsed)
        batch_accuracy.append(hit_rate)
    return np.mean(batch_seconds), np.mean(batch_accuracy)

In [10]:
# Benchmark the LSHForest index with time_it_n's default settings.
# NOTE(review): time_it_n averages the wall time of whole search() calls, each
# over k_docs sampled documents, so the "per query" figure covers one batch.
lsh_time, lsh_accuracy = time_it_n(search_index=lsh, docs=docs)
print('LSH time per query: {0}'.format(lsh_time))
print('LSH average accuracy: {0}'.format(lsh_accuracy))

LSH time per query: 0.290881130695
LSH average accuracy: 0.9998


In [11]:
# Benchmark the PySparNN index with time_it_n's default settings.
# NOTE(review): time_it_n averages the wall time of whole search() calls, each
# over k_docs sampled documents, so the "per query" figure covers one batch.
snn_time, snn_accuracy = time_it_n(search_index=snn_search, docs=docs)
print('PySparNN time per query: {0}'.format(snn_time))
print('PySparNN average accuracy: {0}'.format(snn_accuracy))

PySparNN time per query: 0.214360470772
PySparNN average accuracy: 0.9997
