In [4]:
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.

# Evaluate pysparnn on Enron data

In [1]:
import numpy as np
import time
import inspect

from scipy.sparse import csr_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.neighbors import LSHForest
from sklearn.feature_extraction import DictVectorizer

In [2]:
# make sure you run 'python setup.py install' first!
import pysparnn

## Get data

In [3]:
# Load the Enron corpus: each file found under ./maildir becomes one
# document, stored as the list of its whitespace-separated tokens.
import os
import sys

docs = []
max_docs = 100000

# Lazily enumerate every file path in os.walk order, so a single `break`
# is enough to stop the whole traversal.
mail_paths = (
    os.path.join(folder, filename)
    for folder, subs, files in os.walk('maildir')
    for filename in files
)
for mail_path in mail_paths:
    with open(mail_path, 'r') as src:
        txt = ' '.join(src.readlines())
    # Zero-length files contribute no document.
    if txt:
        docs.append(txt.split())
    # Strict '>': loading stops once docs holds max_docs + 1 entries.
    if len(docs) > max_docs:
        break

In [4]:
# Corpus summary: document count, mean tokens per document, vocabulary size.
# Each print receives one pre-formatted string, so the line is valid both as
# a Python 2 print statement and as a Python 3 print() call and emits the
# same text either way.
print('Num docs: {}'.format(len(docs)))
print('Avg doc length: {}'.format(np.mean([len(x) for x in docs])))

# Vocabulary = union of the tokens of every document.
words = set()
for doc in docs:
    words.update(doc)
print('Num unique words: {}'.format(len(words)))

Num docs: 100001
Avg doc length: 413.757442426
Num unique words: 942676


## Turn documents into vectors

In [5]:
from sklearn.neighbors import LSHForest, NearestNeighbors 
from sklearn.feature_extraction import DictVectorizer

from sklearn.model_selection import train_test_split

dv = DictVectorizer()

# One {token: 1} mapping per document: only the presence of a token is
# recorded, repeated tokens collapse to a single key.
dicts = [dict.fromkeys(d, 1) for d in docs]

# Vectorize the mappings and store the result as an integer CSR matrix
# (one row per document).
features = csr_matrix(dv.fit_transform(dicts), dtype=int)

# Positional ids 0 .. len(docs) - 1, one per document.
doc_index = np.array(list(range(len(docs))))

In [6]:
# Hold out the first rows as search queries; every remaining row is indexed.
num_test_docs = 500
test_features = features[:num_test_docs]
train_features = features[num_test_docs:]

## Create an answer key

In [7]:
# Brute-force cosine nearest-neighbour model fitted on the training rows;
# it is used below to produce the answer key.
knn = NearestNeighbors(
    algorithm='brute',
    metric='cosine',
)

# Last expression of the cell, so the fitted estimator is displayed.
knn.fit(train_features)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [8]:
# Build the answer key with the brute-force model and time the query.
t0 = time.time()
# get the single nearest neighbour (n_neighbors=1) of each test document;
# return_distance=False keeps only the neighbour indices
answers = knn.kneighbors(test_features, n_neighbors=1, return_distance=False)
# elapsed seconds; as the cell's last expression this value is displayed
time.time() - t0

9.62377405166626

## Build models to compare

In [9]:
# Label each indexed row with its own position in train_features. Those
# positions are what knn.kneighbors() reports in `answers`, so search results
# and the answer key stay directly comparable.
# doc_index is not passed here: it has one entry per document in `docs`,
# i.e. 500 more entries than train_features has rows. For every row that is
# actually indexed the label value is unchanged (row i -> i).
snn = pysparnn.MultiClusterIndex(
    train_features,
    np.arange(train_features.shape[0]),
    num_indexes=2,
)

In [10]:
# LSH forest baseline, configured to return one neighbour per query and
# fitted on the same training rows as the other models.
lshf = LSHForest(
    n_neighbors=1,
)

# Last expression of the cell, so the fitted estimator is displayed.
lshf.fit(train_features)

LSHForest(min_hash_match=4, n_candidates=50, n_estimators=10, n_neighbors=1,
     radius=1.0, radius_cutoff_ratio=0.9, random_state=None)

## Compare results

In [11]:
import pysparnn_utils
import time 

In [13]:
# Search the pysparnn index with its default settings and score the results
# against the brute-force answer key.
t0 = time.time()

results = snn.search(test_features, return_distance=False)

# One pre-formatted argument: valid as a Python 2 print statement and as a
# Python 3 print() call, with identical output text.
print('Recall: {}'.format(pysparnn_utils.recall(answers, results).mean()))

# NOTE: the clock stops after the recall computation and the print, so
# snn_time covers search + scoring rather than the search alone.
snn_time = time.time() - t0
snn_time

Recall: 0.872


5.836416959762573

In [16]:
# only search one index instead of 2
t0 = time.time()

results = snn.search(test_features, return_distance=False, num_indexes=1)

# One pre-formatted argument: valid as a Python 2 print statement and as a
# Python 3 print() call, with identical output text.
print('Recall: {}'.format(pysparnn_utils.recall(answers, results).mean()))

# Elapsed seconds (search + recall scoring + print); displayed as the cell's
# output and not stored in a variable.
time.time() - t0

Recall: 0.668


2.6423888206481934

In [14]:
# Query the LSH forest baseline and score it against the same answer key.
t0 = time.time()

results = lshf.kneighbors(test_features, return_distance=False)

# One pre-formatted argument: valid as a Python 2 print statement and as a
# Python 3 print() call, with identical output text.
print('Recall: {}'.format(pysparnn_utils.recall(answers, results).mean()))

# NOTE: as in the snn cell, the clock stops after recall scoring and the
# print, so lsh_time covers search + scoring.
lsh_time = time.time() - t0
lsh_time

Recall: 0.934


23.57533288002014

In [15]:
# Speed ratio: how many times slower LSH was than snn (about 4x in the recorded run)
lsh_time / snn_time

4.0393503484335005