In [1]:
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.


# Evaluate pysparnn on 20 Newsgroups data

In [1]:
import numpy as np
import time
from scipy.sparse import csr_matrix
from sklearn.datasets import fetch_20newsgroups

In [2]:
# make sure you run 'python setup.py install' first!
import pysparnn

# Get data

In [3]:
dataset = fetch_20newsgroups(subset='all', shuffle=True)

docs = np.array([x.split() for x in dataset.data])

In [4]:
print 'Num docs: {}'.format(len(docs))
print 'Avg doc length: {}'.format(np.mean([len(x) for x in docs]))
words = set()
for doc in docs:
    words.update(doc)
print 'Num unique words: {}'.format(len(words))    

Num docs: 18846
Avg doc length: 283.656001273
Num unique words: 386410


## Turn documents into vectors

In [22]:
from sklearn.neighbors import LSHForest, NearestNeighbors 
from sklearn.feature_extraction import DictVectorizer

from sklearn.model_selection import train_test_split

dv = DictVectorizer()

dicts = []
for d in docs:
    dicts.append(dict([(w, 1) for w in d]))

features = csr_matrix(dv.fit_transform(dicts), dtype=int)

doc_index = np.array(range(len(docs)))

In [23]:
test_features = features[:100]
train_features = features[100:]

## Create an answer key

In [24]:
knn = NearestNeighbors()
        
knn.fit(train_features)

answers = knn.kneighbors(test_features, 3, return_distance=False)

## Build models to compare

In [25]:
snn = pysparnn.MultiClusterIndex(train_features, doc_index, num_indexes=2)

In [26]:
lshf = LSHForest()
        
lshf.fit(train_features)

LSHForest(min_hash_match=4, n_candidates=50, n_estimators=10, n_neighbors=5,
     radius=1.0, radius_cutoff_ratio=0.9, random_state=None)

## Compare results

In [27]:
import pysparnn_utils

In [28]:
t0 = time.time()

results = snn.search(test_features, return_distance=False, num_indexes=1)

print 'Percent of time snn returns a top 3 result:', pysparnn_utils.recall(answers, results).mean()

snn_time = time.time() - t0

Percent of time snn returns a top 3 result: 0.23


In [29]:
t0 = time.time()

results = lshf.kneighbors(test_features, return_distance=False)

print 'Percent of time lsh returns a top 3 result:', pysparnn_utils.recall(answers, results).mean()

lsh_time = time.time() - t0

Percent of time lsh returns a top 3 result: 0.054


In [30]:
# LSH is ~3x slower than snn
lsh_time / snn_time

4.006431332366357