In [1]:
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.


import numpy as np
import time
from scipy.sparse import csr_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.neighbors import LSHForest
from sklearn.feature_extraction import DictVectorizer

In [2]:
# make sure you run 'python setup.py install' first!
import pysparnn

# Fetch data

In [3]:
# Download (or load from the local scikit-learn cache) the full 20 newsgroups corpus.
dataset = fetch_20newsgroups(subset='all', shuffle=True)

# Tokenise every post on whitespace. The token lists have different lengths, so
# ask for an object array explicitly: NumPy deprecated implicit ragged arrays in
# 1.19 and raises ValueError for them from 1.24 unless dtype=object is given.
docs = np.array([x.split() for x in dataset.data], dtype=object)


In [4]:
# Corpus summary: number of documents, mean tokens per document and vocabulary size.
# print() is called with a single argument so the cell runs on Python 2 and 3 alike.
print('Num docs: {}'.format(len(docs)))
print('Avg doc length: {}'.format(np.mean([len(x) for x in docs])))
words = set()
for doc in docs:
    words.update(doc)
print('Num unique words: {}'.format(len(words)))

Num docs: 18846
Avg doc length: 283.656001273
Num unique words: 386410


# Build LSH & PySparNN indexes

In [5]:
import pysparnn_utils
import inspect

In [6]:
# Show the helper class source; single-argument print() works on Python 2 and 3.
print(inspect.getsource(pysparnn_utils.PySparNNTextSearch))

class PySparNNTextSearch:
    """Text search built on a ``pysparnn.ClusterIndex``.

    Every document (an iterable of tokens) is encoded as a sparse integer row
    holding a 1 for each distinct token it contains.
    """

    def __init__(self, docs, k, matrix_size=None):
        """Fit the vocabulary on ``docs`` and build the cluster index.

        Args:
            docs: Sequence of documents, each an iterable of tokens.
            k: Number of neighbours requested by every ``search`` call.
            matrix_size: Passed through unchanged to ``pysparnn.ClusterIndex``.
        """
        self.dv = DictVectorizer()
        self.k = k
        # Each document is indexed under its position in ``docs``.
        doc_ids = np.array(range(len(docs)))
        token_dicts = [{token: 1 for token in doc} for doc in docs]
        self.dv.fit(token_dicts)
        doc_matrix = csr_matrix(self.dv.transform(token_dicts), dtype=int)
        self.cp = pysparnn.ClusterIndex(doc_matrix, doc_ids, matrix_size=matrix_size)

    def search(self, docs):
        """Return the index's results for each query document in ``docs``.

        Queries are encoded with the vectorizer fitted in ``__init__`` and
        looked up with ``k=self.k``, ``k_clusters=1`` and no distances.
        """
        token_dicts = [{token: 1 for token in doc} for doc in docs]
        query_matrix = csr_matrix(self.dv.transform(token_dicts), dtype=int)
        return self.cp.search(query_matrix, k=self.k, k_clusters=1, return_distance=False)



In [7]:
# Show the helper class source; single-argument print() works on Python 2 and 3.
print(inspect.getsource(pysparnn_utils.LSHForestTextSearch))

class LSHForestTextSearch:
    """Text search built on scikit-learn's ``LSHForest``.

    Every document (an iterable of tokens) is encoded as a row holding a 1 for
    each distinct token it contains.
    """

    def __init__(self, docs, k):
        """Fit the vocabulary on ``docs`` and fit the LSH forest on the result.

        Args:
            docs: Sequence of documents, each an iterable of tokens.
            k: Number of neighbours; used as the forest's ``n_neighbors`` and
                again on every ``search`` call.
        """
        self.lshf = LSHForest(n_estimators=1, n_candidates=1, n_neighbors=k)
        self.dv = DictVectorizer()
        self.k = k

        token_dicts = [{token: 1 for token in doc} for doc in docs]
        self.dv.fit(token_dicts)
        # The vectorizer output is deliberately not cast to int: the original
        # author's note says floats are faster here.
        doc_matrix = self.dv.transform(token_dicts)
        self.lshf.fit(doc_matrix)

    def search(self, docs):
        """Return the forest's ``self.k`` neighbours for each query document.

        Queries are encoded with the vectorizer fitted in ``__init__``; no
        distances are requested.
        """
        token_dicts = [{token: 1 for token in doc} for doc in docs]
        # Left uncast for the same reason as in __init__ (floats are faster).
        query_matrix = self.dv.transform(token_dicts)
        return self.lshf.kneighbors(query_matrix, return_distance=False, n_neighbors=self.k)



In [8]:
# Number of neighbours requested from the KNN and PySparNN searchers built below.
k = 1
# Index the whole corpus and reuse its first 1000 documents as the queries.
index_docs = docs
search_docs = index_docs[:1000]

In [9]:
# Build the KNN searcher over the indexed corpus and print the build time in seconds.
build_started = time.time()
knn_search = pysparnn_utils.KNNTextSearch(index_docs, k)
print(time.time() - build_started)

14.829020977


In [10]:
# KNN results for the query documents; passed to knn_benchmark below as `answers`.
answers = knn_search.search(search_docs)

In [11]:
# Build the PySparNN searcher over the indexed corpus and print the build time in seconds.
build_started = time.time()
snn_search = pysparnn_utils.PySparNNTextSearch(index_docs, k)
print(time.time() - build_started)

20.4018728733


In [12]:
# Build the LSHForest searcher and print the build time in seconds.
# It is built from index_docs (the same object as docs) so that all three
# searchers visibly index the same corpus.
# NOTE(review): k=3 here, while the KNN and PySparNN searchers use k (1) --
# confirm the asymmetry is intended before comparing accuracies.
t0 = time.time()
lsh = pysparnn_utils.LSHForestTextSearch(index_docs, k=3)
print(time.time() - t0)

15.7356410027


In [13]:
# Benchmark the PySparNN and LSH searchers against the KNN answers, using the
# same trial settings for both.
benchmark_settings = dict(n_trials=2000, docs_per_query=1)
snn_time, snn_accuracy = pysparnn_utils.knn_benchmark(
    snn_search, search_docs, answers, **benchmark_settings)
lsh_time, lsh_accuracy = pysparnn_utils.knn_benchmark(
    lsh, search_docs, answers, **benchmark_settings)

In [14]:
# Report the PySparNN benchmark results, one line per metric.
snn_report = [
    'PySparNN median time per query: {0}'.format(snn_time),
    'PySparNN median accuracy: {0}'.format(snn_accuracy),
]
print('\n'.join(snn_report))

PySparNN median time per query: 0.0070708990097
PySparNN median accuracy: 1.0


In [15]:
# Report the LSH benchmark results, one line per metric.
lsh_report = [
    'LSH median time per query: {0}'.format(lsh_time),
    'LSH median accuracy: {0}'.format(lsh_accuracy),
]
print('\n'.join(lsh_report))

LSH median time per query: 0.0999974012375
LSH median accuracy: 1.0


In [16]:
# Ratio of the LSH to the PySparNN median time per query.
lsh_time / snn_time

14.142105706819523