In [1]:
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.


import numpy as np
import time
import inspect

from scipy.sparse import csr_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.neighbors import LSHForest
from sklearn.feature_extraction import DictVectorizer

In [2]:
import pysparnn
import pysparnn_utils

In [3]:
import os
import sys

# Load the corpus: every non-empty file under 'maildir' becomes one document,
# stored as the list of its whitespace-separated tokens.
docs = []
for dirpath, _dirnames, filenames in os.walk('maildir'):
    for name in filenames:
        path = os.path.join(dirpath, name)
        with open(path, 'r') as handle:
            text = ' '.join(handle.readlines())
        # Files with no content at all are skipped.
        if not text:
            continue
        docs.append(text.split())

In [4]:
# Corpus summary: number of documents, mean tokens per document and the
# number of distinct tokens.
# print() is called with a single argument so the cell emits the same text on
# Python 2 (parenthesised expression) and Python 3 (function call), matching
# the print(...) form used by the later cells of this notebook.
print('Num docs: {}'.format(len(docs)))
print('Avg doc length: {}'.format(np.mean([len(x) for x in docs])))
# Accumulate the vocabulary one document at a time.
words = set()
for doc in docs:
    words.update(doc)
print('Num unique words: {}'.format(len(words)))

Num docs: 517401
Avg doc length: 329.550878332
Num unique words: 2584811


In [5]:
# Show the source of the PySparNN search wrapper used below.
# Single-argument print() behaves identically on Python 2 and Python 3.
print(inspect.getsource(pysparnn_utils.PySparNNTextSearch))

class PySparNNTextSearch:
    """Text search wrapper around ``pysparnn.ClusterIndex``.

    A document is an iterable of tokens. Each document is encoded with a
    ``DictVectorizer`` as one sparse integer row holding 1 for every token
    that occurs in it (repeated tokens collapse to a single 1).
    """

    def __init__(self, docs, k, matrix_size=None):
        """Vectorise ``docs`` and build the cluster index over them.

        Args:
            docs: sequence of documents (``len`` is taken), each an iterable
                of tokens.
            k: stored and later passed as ``k`` to the index's ``search``.
            matrix_size: forwarded unchanged to ``pysparnn.ClusterIndex``.
        """
        self.dv = DictVectorizer()
        self.k = k
        # The record stored alongside row i is simply i, the document's
        # position in ``docs``.
        datas = np.arange(len(docs))
        # fit_transform learns the vocabulary and encodes the documents in one
        # call instead of a separate fit() and transform() over the same list.
        encoded = self.dv.fit_transform(self._presence_dicts(docs))
        features = csr_matrix(encoded, dtype=int)
        self.cp = pysparnn.ClusterIndex(features, datas, matrix_size=matrix_size)

    @staticmethod
    def _presence_dicts(docs):
        """Return one ``{token: 1}`` dict per document in ``docs``."""
        return [dict.fromkeys(d, 1) for d in docs]

    def search(self, docs):
        """Encode ``docs`` with the fitted vectoriser and query the index.

        Args:
            docs: iterable of documents, each an iterable of tokens.

        Returns:
            Whatever ``self.cp.search`` returns when called with
            ``k=self.k``, ``k_clusters=1`` and ``return_distance=False``.
        """
        encoded = self.dv.transform(self._presence_dicts(docs))
        features = csr_matrix(encoded, dtype=int)
        return self.cp.search(features, k=self.k, k_clusters=1, return_distance=False)



In [None]:
# Build the PySparNN index over the whole corpus (k=1) and print how many
# wall-clock seconds construction took.
build_started = time.time()
text_search = pysparnn_utils.PySparNNTextSearch(docs, 1)
build_elapsed = time.time() - build_started
print(build_elapsed)

In [30]:
# One answer per document: the document's own position in `docs`.
num_docs = len(docs)
answers = list(range(num_docs))

In [None]:
# Benchmark the PySparNN index by querying it with the corpus documents;
# the helper hands back a (time, accuracy) pair.
snn_results = pysparnn_utils.knn_benchmark(text_search, docs, answers)
snn_time, snn_accuracy = snn_results

In [None]:
# Report the PySparNN benchmark figures, one per line.
snn_report = [
    'PySparNN median time per query: {0}'.format(snn_time),
    'PySparNN median accuracy: {0}'.format(snn_accuracy),
]
for report_line in snn_report:
    print(report_line)

In [None]:
# Show the source of the LSHForest search wrapper used below.
# Single-argument print() behaves identically on Python 2 and Python 3.
print(inspect.getsource(pysparnn_utils.LSHForestTextSearch))

In [None]:
# Build the LSHForest-based searcher over the same corpus and print how many
# wall-clock seconds construction took.
# NOTE(review): the second argument (2) is presumably k; confirm against
# pysparnn_utils.LSHForestTextSearch.
build_started = time.time()
lsh_search = pysparnn_utils.LSHForestTextSearch(docs, 2)
build_elapsed = time.time() - build_started
print(build_elapsed)

In [None]:
# Run the same benchmark against the LSHForest searcher; the helper hands
# back a (time, accuracy) pair.
lsh_results = pysparnn_utils.knn_benchmark(lsh_search, docs, answers)
lsh_time, lsh_accuracy = lsh_results

In [None]:
# Report the LSHForest benchmark figures, one per line.
lsh_report = [
    'LSH median time per query: {0}'.format(lsh_time),
    'LSH median accuracy: {0}'.format(lsh_accuracy),
]
for report_line in lsh_report:
    print(report_line)

In [None]:
# Ratio of the two per-query times: a value above 1 means the LSH searcher
# took longer per query than PySparNN.
relative_query_time = lsh_time / snn_time
relative_query_time