diff --git a/CONTRIBUTING b/CONTRIBUTING new file mode 100644 index 0000000..c417720 --- /dev/null +++ b/CONTRIBUTING @@ -0,0 +1,36 @@ +# Contributing to PySparNN +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `master`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## Coding Style +* 2 spaces for indentation rather than tabs +* 80 character line length +* TODO: Finish THIS + +## License +By contributing to PySparNN, you agree that your contributions will be licensed +under its BSD license. diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..3315e03 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,30 @@ +BSD License + +For PySparNN software + +Copyright (c) 2016-present, Facebook, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name Facebook nor the names of its contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/PATENTS b/PATENTS new file mode 100644 index 0000000..53c2a9f --- /dev/null +++ b/PATENTS @@ -0,0 +1,33 @@ +Additional Grant of Patent Rights Version 2 + +"Software" means the PySparNN software distributed by Facebook, Inc. + +Facebook, Inc. 
("Facebook") hereby grants to each recipient of the Software +("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable +(subject to the termination provision below) license under any Necessary +Claims, to make, have made, use, sell, offer to sell, import, and otherwise +transfer the Software. For avoidance of doubt, no license is granted under +Facebook’s rights in any patent claims that are infringed by (i) modifications +to the Software made by you or any third party or (ii) the Software in +combination with any software or other technology. + +The license granted hereunder will terminate, automatically and without notice, +if you (or any of your subsidiaries, corporate affiliates or agents) initiate +directly or indirectly, or take a direct financial interest in, any Patent +Assertion: (i) against Facebook or any of its subsidiaries or corporate +affiliates, (ii) against any party if such Patent Assertion arises in whole or +in part from any software, technology, product or service of Facebook or any of +its subsidiaries or corporate affiliates, or (iii) against any party relating +to the Software. Notwithstanding the foregoing, if Facebook or any of its +subsidiaries or corporate affiliates files a lawsuit alleging patent +infringement against you in the first instance, and you respond by filing a +patent infringement counterclaim in that lawsuit against that party that is +unrelated to the Software, the license granted hereunder will not terminate +under section (i) of this paragraph due to such counterclaim. + +A "Necessary Claim" is a claim of a patent owned by Facebook that is +necessarily infringed by the Software standing alone. + +A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, +or contributory infringement or inducement to infringe any patent, including a +cross-claim or counterclaim. diff --git a/README.md b/README.md new file mode 100644 index 0000000..ea4623d --- /dev/null +++ b/README.md @@ -0,0 +1,66 @@ +Blockers: +* Finsih contributing file +* pylint +* matrix vector mulitply discussion + +# PySparNN +Sparse (approximate) nearest neighbor search for python! This library is well suited to finding nearest neighbors in sparse, high dimensional spaces (like a text doccuments). + +Out of the box, PySparNN supports Cosine Similarity. + +PySparNN can be easily extended with abritrary similarity metrics (Manhattan, Eculidian, Jaccard, etc). + +If your data is NOT SPARSE & you don't require a custom distance function - please consider [annoy](https://github.com/spotify/annoy). +It uses a similar-ish method and I am a big fan of it. As of this writing, annoy performs ~8x faster on their introductory example. + +The most comparable library to PySparNN is scikit-learn's LSHForrest module. As of this writing, PySparNN is ~30% faster on the 20newsgroups dataset. [Here is the comparison.](https://github.com/facebook/PySparNN/blob/master/sparse_search_comparison.ipynb) + +Notes: +* A future update may allow incremental insertions. 
+
+## Example Usage
+```
+import pysparnn as snn
+
+data = [
+    'hello world',
+    'oh hello there',
+    'Play it',
+    'Play it again Sam',
+]
+
+features = [dict([(x, 1) for x in f.split()]) for f in data]
+
+cp = snn.ClusterIndex(features, data)
+
+cp.search(features, threshold=0.50, k=1, return_similarity=False)
+>> [[u'hello world'], [u'oh hello there'], [u'Play it'], [u'Play it again Sam']]
+
+cp.search(features, threshold=0.50, k_clusters=2, k=2, return_similarity=False)
+>> [[u'hello world'],
+>> [u'oh hello there'],
+>> [u'Play it', u'Play it again Sam'],
+>> [u'Play it again Sam', u'Play it']]
+
+```
+
+## Requirements
+PySparNN requires numpy and scipy (the index is built on scipy.sparse matrices). Tested with numpy 1.10.4.
+
+## How PySparNN works
+Searching for a document in a collection of K documents is naively O(K) (assuming documents are of constant size).
+
+However, we can create a tree structure where the first level has O(sqrt(K)) items and each of the leaves also holds O(sqrt(K)) items.
+
+We randomly pick sqrt(K) items to form the top level. Then each of the K documents is assigned to its closest neighbor in the top level.
+
+This breaks one O(K) search into two O(sqrt(K)) searches, which is much faster when K is big! For example, with K = 1,000,000 documents a naive scan compares against 1,000,000 items, while the pruned search compares against roughly 2 * sqrt(K) = 2,000.
+
+## Further Information
+http://nlp.stanford.edu/IR-book/html/htmledition/cluster-pruning-1.html
+
+See the CONTRIBUTING file for how to help out.
+
+## License
+PySparNN is BSD-licensed. We also provide an additional patent grant.
diff --git a/pysparnn/ClusterPruning.py b/pysparnn/ClusterPruning.py
new file mode 100644
index 0000000..82f2b88
--- /dev/null
+++ b/pysparnn/ClusterPruning.py
@@ -0,0 +1,283 @@
+# Copyright (c) 2016-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+import collections
+import math
+import numpy as np
+import random
+import scipy.sparse
+
+def k_best(l, k, similarity):
+    """Get the k-best tuples by similarity.
+    Args:
+        l: List of (similarity, value) tuples.
+        k: Number of tuples to return.
+        similarity: Boolean value indicating if similarity values should be
+            returned.
+    Returns:
+        The k-best tuples (similarity, value) by similarity score.
+    """
+    l = sorted(l, key=lambda x: x[0], reverse=True)[:k]
+    if similarity:
+        return l
+    else:
+        return [x[1] for x in l]
+
+class MatrixCluster(object):
+    """A sparse matrix representation of a set of feature records."""
+
+    def __init__(self, records_features, records_data):
+        """
+        Args:
+            records_features: List of features in the format of
+                {feature_name1 -> value1, feature_name2 -> value2, ...}.
+            records_data: Data to return when a doc is matched. Indices
+                correspond to records_features.
+        """
+        self.dimension = {}
+        self.inverse_dimension = {}
+        self.matrix = self._create_matrix(records_features,
+                                          expand_dimension=True)
+        self.records_features = np.array(records_features)
+        self.records_data = np.array(records_data)
+
+
+    def _create_matrix(self, records_features, expand_dimension=False):
+        """Create a sparse matrix out of a set of features.
+        Args:
+            records_features: List of features in the format of
+                {feature_name1 -> value1, feature_name2 -> value2, ...}.
+            expand_dimension: Should the dimension of the space be expanded?
+                True on initialization. False on search.
+        """
+        indptr = [0]
+        indices = []
+        data = []
+        # could force records_features to be a list of (int, float) instead of
+        # a generic dict (that can take strings)
+        for features in records_features:
+            for feature, value in features.iteritems():
+                if expand_dimension or feature in self.dimension:
+                    index = self.dimension.setdefault(feature,
+                                                      len(self.dimension))
+                    self.inverse_dimension[index] = feature
+                    indices.append(index)
+                    data.append(self._transform_value(value))
+            indptr.append(len(indices))
+
+        shape = (len(records_features), len(self.dimension))
+        return scipy.sparse.csr_matrix((data, indices, indptr), dtype=float,
+                                       shape=shape)
+
+
+    def nearest_search(self, features, k=1, threshold=0.0):
+        """Find the closest item(s) for each entry in features.
+
+        Args:
+            features: A list where each element is a dict of features
+                to query.
+            k: Return the k closest results.
+            threshold: Return items only above the threshold.
+
+        Returns:
+            For each element in features, return the k-nearest items
+            and their similarity scores
+            [[(score1_1, item1_1), ..., (score1_k, item1_k)],
+             [(score2_1, item2_1), ..., (score2_k, item2_k)], ...]
+        """
+        a = self._create_matrix(features)
+        sim_matrix = self._similarity(a).toarray()
+        sim_filter = sim_matrix >= threshold
+
+        ret = []
+        for i in range(sim_matrix.shape[0]):
+            # these arrays are roughly sqrt(num records) long
+            # replacing the for loop by matrix ops could speed things up
+
+            index = sim_filter[i]
+            scores = sim_matrix[i][index]
+            records = self.records_data[index]
+            arg_index = np.argsort(scores)[-k:]
+
+            curr_ret = zip(scores[arg_index], records[arg_index])
+
+            ret.append(curr_ret)
+
+        return ret
+
+#class UnitVecCosineMatrixCluster(MatrixCluster):
+#    def __init__(self, records_features, records_data):
+#        super(UnitVecCosineMatrixCluster, self).__init__(records_features,
+#                                                         records_data)
+#
+#        # we enforce one-hot encoding: all of our values are 0 or 1
+#        # since 1^2 == 1, we can do a sum shortcut instead of sum of squares
+#        # this is much faster and more memory efficient
+#        self.matrix_root_sum_square = \
+#            np.sqrt(np.asarray(self.matrix.sum(axis=1)).reshape(-1))
+#
+#    def _transform_value(self, v):
+#        return 1
+#
+#    def _similarity(self, a):
+#        """Vectorised cosine similarity"""
+#        dprod = a.dot(self.matrix.transpose()) * 1.0
+#
+#        a_root_sum_square = np.asarray(a.sum(axis=1)).reshape(-1)
+#        a_root_sum_square = a_root_sum_square.reshape(len(a_root_sum_square), 1)
+#        a_root_sum_square = np.sqrt(a_root_sum_square)
+#
+#        magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)
+#
+#        return dprod.multiply(magnitude)
+
+class CosineSimilarity(MatrixCluster):
+    def __init__(self, records_features, records_data):
+        super(CosineSimilarity, self).__init__(records_features, records_data)
+
+        m_c = self.matrix.copy()
+        m_c.data **= 2
+        self.matrix_root_sum_square = \
+            np.sqrt(np.asarray(m_c.sum(axis=1)).reshape(-1))
+
+    def _transform_value(self, v):
+        return v
+
+    def _similarity(self, a):
+        """Vectorised cosine similarity"""
+        dprod = a.dot(self.matrix.transpose()) * 1.0
+
+        a_c = a.copy()
+        a_c.data **= 2
+        a_root_sum_square = np.asarray(a_c.sum(axis=1)).reshape(-1)
+        a_root_sum_square = \
+            a_root_sum_square.reshape(len(a_root_sum_square), 1)
+        a_root_sum_square = np.sqrt(a_root_sum_square)
+
+        magnitude = 1.0 / (a_root_sum_square * self.matrix_root_sum_square)
+
+        return dprod.multiply(magnitude)
+
+class ClusterIndex(object):
+    """Search structure which gives a speedup at a slight loss of recall.
+
+    Uses the cluster pruning structure defined in:
+    http://nlp.stanford.edu/IR-book/html/htmledition/cluster-pruning-1.html
+
+    tldr - searching for a document in an index of K documents is naively
+    O(K). However you can create a tree structure where the first level
+    is O(sqrt(K)) and each of the leaves are also O(sqrt(K)).
+
+    You randomly pick sqrt(K) items to be in the top level. Then each of
+    the K documents is assigned to its closest neighbor in the top
+    level.
+
+    This breaks up one O(K) search into two O(sqrt(K)) searches, which
+    is much faster when K is big.
+    """
+    def __init__(self, records_features, records_data,
+                 similarity_type=CosineSimilarity):
+        """Create a search index composed of recursively defined sparse
+        matrices.
+
+        Args:
+            records_features: List of features in the format of
+                {feature_name1 -> value1, feature_name2 -> value2, ...}.
+            records_data: Data to return when a doc is matched. Indices
+                correspond to records_features.
+            similarity_type: Class that defines the similarity measure to use.
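+
+        Example (usage sketch, mirroring the README example):
+
+            >>> import pysparnn as snn
+            >>> cp = snn.ClusterIndex(features, data)
+            >>> cp.search(features, k=1, threshold=0.50,
+            ...           return_similarity=False)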
+ """ + + self.records_features = np.array(records_features) + self.records_data = np.array(records_data) + + # could make this recursive at the cost of recall accuracy + # keeping to a single layer for simplicity/accuracy + num_clusters = int(math.sqrt(len(self.records_features))) + clusters_selection = random.sample(self.records_features, num_clusters) + + item_to_clusters = collections.defaultdict(list) + + root = similarity_type(clusters_selection, + list(range(len(clusters_selection)))) + + rng_step = 10000 + for rng in range(0, len(records_features), rng_step): + records_rng = records_features[rng:rng + rng_step] + for i, clstrs in enumerate(root.nearest_search(records_rng, k=1)): + for _, cluster in clstrs: + item_to_clusters[cluster].append(i + rng) + + self.clusters = [] + cluster_keeps = [] + for k in range(len(clusters_selection)): + v = item_to_clusters[k] + if len(v) > 0: + mtx = similarity_type(self.records_features[v], + self.records_data[v]) + self.clusters.append(mtx) + cluster_keeps.append(clusters_selection[k]) + + self.root = similarity_type(cluster_keeps, + list(range(len(cluster_keeps)))) + + + # TODO: I think i can save a little time by batching the searches together + # or creating one huge matrix + # TODO: Cut down index construction time + # TODO: Speed comparison tests + def search(self, records_features, k=1, threshold=0.95, k_clusters=1, + return_similarity=True): + """Find the closest item(s) for each feature_list in. + + Args: + features_list: A list where each element is a list of features + to query. + k: Return the k closest results. + threshold: Return items only above the threshold. + k_clusters: number of clusters to search. This increases recall at + the cost of some speed. + return_similarity: Return similarity values? + + Returns: + For each element in features_list, return the k-nearest items + and their similarity clores + [[(score1_1, item1_1), ..., (score1_k, item1_k)], + [(score2_1, item2_1), ..., (score2_k, item2_k)], ...] + + Note: if return_similarity is False then only items are returned + and not as a tuple. + """ + # could make this recursive at the cost of recall accuracy + # should batch requests to clusters to make this more efficent + ret = [] + nearest = self.root.nearest_search(records_features, k=k_clusters) + + # TODO: np.array-ify - this loop can be replaced by array concats + for i, nearest_clusters in enumerate(nearest): + curr_ret = [] + + for score, cluster in nearest_clusters: + + cluster_items = self.clusters[cluster].nearest_search( + [records_features[i]], k=k, threshold=threshold) + + for elements in cluster_items: + if len(elements) > 0: + if return_similarity: + curr_ret.extend(elements) + else: + curr_ret.extend(elements) + ret.append(k_best(curr_ret, k, return_similarity)) + return ret diff --git a/pysparnn/ClusterPruning.pyc b/pysparnn/ClusterPruning.pyc new file mode 100644 index 0000000..bc665fc Binary files /dev/null and b/pysparnn/ClusterPruning.pyc differ diff --git a/pysparnn/__init__.py b/pysparnn/__init__.py new file mode 100644 index 0000000..e5b2a98 --- /dev/null +++ b/pysparnn/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2016-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. An additional grant +# of patent rights can be found in the PATENTS file in the same directory. 
+
+from ClusterPruning import *
diff --git a/pysparnn/__init__.pyc b/pysparnn/__init__.pyc
new file mode 100644
index 0000000..0ac3304
Binary files /dev/null and b/pysparnn/__init__.pyc differ
diff --git a/run_tests.sh b/run_tests.sh
new file mode 100755
index 0000000..50ef6f0
--- /dev/null
+++ b/run_tests.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+python -m unittest tests.test_pysparnn
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..563e499
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,30 @@
+from distutils.core import setup
+
+setup(name="pysparnn",
+      version="0.1",
+      description="Sparse (approximate) nearest neighbor search for python!",
+      author="Spencer Beecher",
+      author_email="spencebeecher@gmail.com",
+      # packages (directories) to install; modules inside are found recursively
+      packages=['pysparnn'],
+      long_description="""Sparse (approximate) nearest neighbor search for python!""")
diff --git a/sparse_search_comparison.ipynb b/sparse_search_comparison.ipynb
new file mode 100644
index 0000000..0d9e22b
--- /dev/null
+++ b/sparse_search_comparison.ipynb
@@ -0,0 +1,246 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Copyright (c) 2016-present, Facebook, Inc.\n",
+    "# All rights reserved.\n",
+    "#\n",
+    "# This source code is licensed under the BSD-style license found in the\n",
+    "# LICENSE file in the root directory of this source tree. An additional grant\n",
+    "# of patent rights can be found in the PATENTS file in the same directory.\n",
+    "\n",
+    "\n",
+    "import numpy as np\n",
+    "import time\n",
+    "\n",
+    "from sklearn.datasets import fetch_20newsgroups\n",
+    "from sklearn.neighbors import LSHForest\n",
+    "from sklearn.feature_extraction import DictVectorizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import pysparnn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "dataset = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "docs = np.array([x.split() for x in dataset.data])\n",
+    "datas = np.array(range(len(docs)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "3.6509988308\n"
+     ]
+    }
+   ],
+   "source": [
+    "class SNNSearch:\n",
+    "    def __init__(self, docs, datas):\n",
+    "        \n",
+    "        features = []\n",
+    "        for d in docs:\n",
+    "            features.append(dict([(w, 1) for w in d]))\n",
+    "        self.cp = pysparnn.ClusterIndex(features, datas)\n",
+    "        \n",
+    "    def search(self, docs):\n",
+    "        dicts = []\n",
+    "        for d in docs:\n",
+    "            dicts.append(dict([(w, 1) for w in d]))\n",
+    "        return self.cp.search(dicts, return_similarity=False, k=1, k_clusters=1)\n",
+    "    \n",
+    "\n",
+    "t0 = time.time()\n",
+    "snn_search = SNNSearch(docs, datas)\n",
+    "print(time.time() - t0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "4.54637718201\n"
+     ]
+    }
+   ],
+   "source": [
+    "class LSHSearch:\n",
+    "    def __init__(self, docs):\n",
+    "        self.lshf = LSHForest(n_estimators=1, n_candidates=1,\n",
+    "                              n_neighbors=1)\n",
+    "        self.dv = DictVectorizer()\n",
+    "        dicts = []\n",
+    "        for d in docs:\n",
+    "            dicts.append(dict([(w, 1) for w in d]))\n",
+    "        self.dv.fit(dicts)\n",
+    "        self.lshf.fit(self.dv.transform(dicts))\n",
+    "    \n",
+    "    def search(self, docs):\n",
+    "        dicts = []\n",
+    "        for d in docs:\n",
+    "            dicts.append(dict([(w, 1) for w in d]))\n",
+    "        return self.lshf.kneighbors(self.dv.transform(dicts), return_distance=False)\n",
+    "    \n",
+    "t0 = time.time() \n",
+    "lsh = LSHSearch(docs) \n",
+    "print(time.time() - t0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Compare query speed and accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "import random\n",
+    "def accuracy(result, truth):\n",
+    "    ret = []\n",
+    "    for r, t in zip(result, truth):\n",
+    "        ret.append(1 if t in r else 0)\n",
+    "    return np.array(ret)\n",
+    "\n",
+    "\n",
+    "\n",
+    "def time_it(search_index, docs, query_index):\n",
+    "    t0 = time.time()\n",
+    "    neighbors = search_index.search(docs[query_index])\n",
+    "    delta = time.time() - t0\n",
+    "\n",
+    "    return delta, accuracy(neighbors, query_index).mean()\n",
+    "\n",
+    "def time_it_n(search_index, docs, n=100, k_docs=100):\n",
+    "\n",
+    "    times = []\n",
+    "    accuracies = []\n",
+    "    for i in range(n):\n",
+    "        query_index = random.sample(range(len(docs)), k_docs)\n",
+    "        # avoid shadowing the time module and the accuracy() helper\n",
+    "        t, acc = time_it(search_index, docs, query_index)\n",
+    "        times.append(t)\n",
+    "        accuracies.append(acc)\n",
+    "    return np.mean(times), np.mean(accuracies)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "LSH time per query: 0.290881130695\n",
+      "LSH average accuracy: 0.9998\n"
+     ]
+    }
+   ],
+   "source": [
+    "lsh_time, lsh_accuracy = time_it_n(lsh, docs)\n",
+    "print('LSH time per query: {0}'.format(lsh_time)) \n",
+    "print('LSH average accuracy: {0}'.format(lsh_accuracy)) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PySparNN time per query: 0.214360470772\n",
+      "PySparNN average accuracy: 0.9997\n"
+     ]
+    }
+   ],
+   "source": [
+    "snn_time, snn_accuracy = time_it_n(snn_search, docs)\n",
+    "print('PySparNN time per query: {0}'.format(snn_time)) \n",
+    "print('PySparNN average accuracy: {0}'.format(snn_accuracy)) "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..f83d651
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) 2016-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
diff --git a/tests/__init__.pyc b/tests/__init__.pyc
new file mode 100644
index 0000000..c002760
Binary files /dev/null and b/tests/__init__.pyc differ
diff --git a/tests/test_pysparnn.py b/tests/test_pysparnn.py
new file mode 100644
index 0000000..e30bce5
--- /dev/null
+++ b/tests/test_pysparnn.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2016-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+import unittest
+import pysparnn as snn
+
+class PysparnnTest(unittest.TestCase):
+    def test(self):
+        data = [
+            'hello world',
+            'oh hello there',
+            'Play it',
+            'Play it again Sam',
+        ]
+
+        features = [dict([(x, 1) for x in f.split()]) for f in data]
+
+        cp = snn.ClusterIndex(features, data)
+
+        ret = cp.search(features, threshold=0.50, k=1, k_clusters=1,
+                        return_similarity=False)
+
+        self.assertEqual([[d] for d in data], ret)
diff --git a/tests/test_pysparnn.pyc b/tests/test_pysparnn.pyc
new file mode 100644
index 0000000..206b7d1
Binary files /dev/null and b/tests/test_pysparnn.pyc differ