update readme with a code sample

fastforwardlabs · Nov 23, 2016 · 2cec748 · 2cec748
1 parent c432cd9
commit 2cec748
Show file tree

Hide file tree

Showing 5 changed files with 158 additions and 24 deletions.
diff --git a/LICENSE.md b/LICENSE.md
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) October 2016 Fast Forward Labs
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -1 +1,65 @@
-# cuckoofilter
+# Cuckoo Filter
+
+The Fast Forward Labs team explored probabilistic data structures
+in our "Probabilistic Methods for Real-time Streams" report and 
+prototype (contact us if you're interested in this topic). We 
+provided an update to that report [here](http://blog.fastforwardlabs.com/post/153566952648/cuckoo-filter), exploring
+Cuckoo filters, a [new](https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf) probabilistic data structure that improves upon the standard Bloom filter. The Cuckoo filter provides a few 
+advantages: 
+1) it enables dynamic deletion and addition of items 
+2) it can be easily implemented compared to Bloom filter variants with similar capabilities, and 
+3) for similar space constraints, the Cuckoo filter provides a lower false-positive rate, particularly at lower capacities. We provide a Python implementation of the Cuckoo filter here, and compare it to a counting Bloom filter (a Bloom filter variant).
+
+This repository contains a Python implementation of the Cuckoo
+filter, as well as a copy-paste of a counting Bloom filter from 
+the [fuggetaboutit](https://github.com/mynameisfiber/fuggetaboutit/) repository for benchmarking. 
+
+Please see our [post](http://blog.fastforwardlabs.com/post/153566952648/cuckoo-filter) for more details on the
+Cuckoo filter. 
+
+
+# Demo
+
+Below we show how to go about using this package. 
+
+```python
+>>> from cuckoofilter import CuckooFilter
+>>> c_filter = CuckooFilter(10000, 2)
+
+>>> c_filter.insert('James')
+>>> print("James in c_filter == {}".format("James" in c_filter))
+James in c_filter == True
+
+>>> c_filter.remove('James')
+>>> print("James in c_filter == {}".format("James" in c_filter))
+James in c_filter == False
+```
+
+Similarly the counting Bloom filter can be used as well.
+
+```python
+>>> from cuckoofilter import CountingBloomFilter
+>>> b_filter = CountingBloomFilter(10000)
+
+>>> b_filter.insert('James')
+>>> print("James in b_filter == {}".format("James" in b_filter))
+James in b_filter == True
+
+>>> b_filter.remove('James')
+>>> print("James in b_filter == {}".format("James" in b_filter))
+James in b_filter == False
+``` 
+
+## References
+Below we link to a few references that contributed to the work 
+shown here: 
+
+- Fan et al. [Cuckoo Filter: Practically Better Than Bloom](https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf)
+
+- CS 166 Stanford lecture [Cuckoo Hashing](http://web.stanford.edu/class/cs166/lectures/13/Small13.pdf)
+
+- Charles Ren, Course Notes. [An Overview of Cuckoo Hashing](http://cs.stanford.edu/~rishig/courses/ref/l13a.pdf)
+
+
+
+
diff --git a/cuckoofilter/counting_bloom_filter.py b/cuckoofilter/counting_bloom_filter.py
@@ -20,14 +20,17 @@
 BLOOM_FILENAME = 'bloom.npy'
 META_FILENAME = 'meta.json'
 
+
 def remove_recursive(path):
     if os.path.isdir(path):
         shutil.rmtree(path)
     elif os.path.exists(path):
         os.remove(path)
 
+
 class CountingBloomFilter(object):
     _ENTRIES_PER_8BYTE = 1
+
     def __init__(self, capacity, data_path=None, error=0.005, id=None):
         self.capacity = capacity
         self.error = error
@@ -47,7 +50,8 @@ def __init__(self, capacity, data_path=None, error=0.005, id=None):
             self.data = np.load(bloom_filename)
             self.num_non_zero = np.count_nonzero(self.data)
         else:
-            size = int(math.ceil(self.num_bytes / float(self._ENTRIES_PER_8BYTE)))
+            size = int(
+                math.ceil(self.num_bytes / float(self._ENTRIES_PER_8BYTE)))
             self.data = np.zeros((size,), dtype=np.uint8, order='C')
             self.num_non_zero = 0
 
@@ -115,7 +119,10 @@ def get_size(self):
         """
         Returns the density of the bloom which can be used to determine if the bloom is "full"
         """
-        return -self.num_bytes * math.log(1 - self.num_non_zero / float(self.num_bytes)) / float(self.num_hashes) 
+        return (-self.num_bytes *
+                math.log(1 - self.num_non_zero / float(self.num_bytes)) /
+                float(self.num_hashes)
+                )
 
     def get_meta(self):
         return {
@@ -133,7 +140,8 @@ def flush_data(self, data_path=None):
 
     def save(self, data_path=None):
         data_path, meta_path, bloom_path = self._get_paths(data_path)
-        tmp_data_path, tmp_meta_path, tmp_bloom_path = self._get_paths(data_path + '-tmp')
+        tmp_data_path, tmp_meta_path, tmp_bloom_path = self._get_paths(
+            data_path + '-tmp')
 
         remove_recursive(tmp_data_path)
         os.makedirs(tmp_data_path)
@@ -146,7 +154,8 @@ def save(self, data_path=None):
 
     def _get_paths(self, data_path):
         if not (data_path or self.data_path):
-            raise PersistenceDisabledException("You cannot save without having data_path set.")
+            raise PersistenceDisabledException(
+                "You cannot save without having data_path set.")
         if not data_path:
             data_path = self.data_path
 
@@ -183,7 +192,6 @@ def load(cls, data_path):
 
         return cls(capacity, **kwargs)
 
-
     def __contains__(self, key):
         return self.contains(key)
 
@@ -196,4 +204,4 @@ def __sub__(self, other):
         return self
 
     def __len__(self):
-        return self.get_size()
+        return self.get_size()
diff --git a/cuckoofilter/cuckoofilter.py b/cuckoofilter/cuckoofilter.py
@@ -1,4 +1,4 @@
-import mmh3  # used for hashing items
+import mmh3  # murmur hashing
 import random
 
 from . import cuckootable
@@ -16,28 +16,42 @@ def __init__(self, filter_capacity,
         self.cuckoo_size = 0
         self.table = []
 
-        # load factor
         # initialize the entire table.
         for i in range(self.filter_capacity):
             self.table.append(cuckootable.CuckooTable(size=self.bucket_size))
 
+    # the fingerprint of an item is a reduced bit string
+    # derived from the hash of the input string.
     def obtain_fingerprint(self, string_item):
         hash_value = mmh3.hash_bytes(string_item)
         fingerprint = hash_value[:self.item_fingerprint_size]
         return fingerprint
 
     def obtain_index_from_hash(self, string_item):
+
         hash_value = mmh3.hash_bytes(string_item)
+
+        # int.from_bytes is Python 3 only; it converts the
+        # hash bytes into an integer we can use as an index
         index = int.from_bytes(hash_value, byteorder="big")
+
+        # modulo the obtained index by the filter capacity
+        # this helps to restrict indices to 0 - filter_capacity
         index = index % self.filter_capacity
+
         return index
 
     def obtain_indices_from_item(self, string_item):
-        # insert into the cuckoo table
+
+        # obtain the first index
         index_1 = self.obtain_index_from_hash(string_item)
 
+        # obtain the fingerprint of the item
         fingerprint = self.obtain_fingerprint(string_item)
 
+        # derive the index from the fingerprint
+        # second index -> first_index xor index
+        # derived from hash(fingerprint)
         index_2 = index_1 ^ self.obtain_index_from_hash(fingerprint)
         index_2 = index_2 % self.filter_capacity
 
@@ -52,19 +66,26 @@ def add(self, item_to_insert):
         if not isinstance(item_to_insert, str):
             raise ValueError("Item being inserted not of type string")
 
+        # obtain the two possible indices where this item
+        # can be inserted.
         index_1, index_2 = self.obtain_indices_from_item(item_to_insert)
         item_fingerprint = self.obtain_fingerprint(item_to_insert)
 
+        # default is to insert into the first index.
         if self.table[index_1].insert(item_fingerprint):
             self.cuckoo_size += 1
             return index_1
 
+        # if the first location is occupied, then insert
+        # in the second location.
         if self.table[index_2].insert(item_fingerprint):
             self.cuckoo_size += 1
             return index_2
 
         # if both indices are full, now we need to swap all current entries.
         # first randomly pick btw index 1 and 2
+        # then swap one item in that bucket for its
+        # alternative location.
         random_index = random.choice((index_1, index_2))
 
         for swap in range(self.num_swaps):
@@ -79,21 +100,27 @@ def add(self, item_to_insert):
                 self.cuckoo_size += 1
                 return random_index
 
-        # this might not be necessary since the table is now full anyway
+        # Notifies that the table is now full.
         raise Exception("CuckooFilter has filled up!")
 
     def remove(self, item_to_remove):
+        # first hash the item and obtain its possible indices
         item_fingerprint = self.obtain_fingerprint(item_to_remove)
         index_1, index_2 = self.obtain_indices_from_item(item_to_remove)
 
+        # check the first index to see if item's fingerprint
+        # is in that bucket.
         if self.table[index_1].remove(item_fingerprint):
             self.cuckoo_size = self.cuckoo_size - 1
             return True
 
+        # item not in first index, so now check the second index
         if self.table[index_2].remove(item_fingerprint):
             self.cuckoo_size = self.cuckoo_size - 1
             return True
 
+        # the item is in neither bucket, so it is not in the
+        # cuckoo table. return false.
         return False
 
     def __contains__(self, item_to_test):
@@ -106,6 +133,16 @@ def __contains__(self, item_to_test):
 
         return bool_contains
 
+    """
+
+    The methods below are getters for various properties of the 
+    CuckooFilter. 
+    - load factor 
+    - size
+    - capacity 
+
+    """
+
     def get_load_factor(self):
         load_factor = self.cuckoo_size / \
             (self.filter_capacity * self.bucket_size)

diff --git a/cuckoofilter/cuckootable.py b/cuckoofilter/cuckootable.py
@@ -1,41 +1,44 @@
+"""
+We represent the buckets as lists. A numpy array with
+pre-specified length might be better, but the 
+python list is quite flexible. 
+"""
+
 import random
 
 
 class CuckooTable:
 
     def __init__(self, size=4):
         self.size = size
-
-        """
-        We represent the buckets as lists. A numpy array with
-        pre-specified length might be better, but the 
-        python list is quite flexible. 
-
-        """
         self.bucket = []
 
     def insert(self, item_fingerprint):
+        # to insert a fingerprint, check to make sure the
+        # current bucket is not full.
         if len(self.bucket) < self.size:
             self.bucket.append(item_fingerprint)
             return True
+
+        # bucket is full, so return false. cuckoo filter class
+        # handles the logic with a failed insert.
         return False
 
     def remove(self, item_fingerprint):
-        # get the hypothetical
+        # first check if the fingerprint is in this bucket.
+        # if yes, then return true, else return false.
         try:
             index = self.bucket.index(item_fingerprint)
             del self.bucket[index]
             return True
         except ValueError:
             return False
 
-    """
-    We implement swapping as a method of the bucket class
-    to make it easier when performing swaps. 
-    
-    """
+    # We implement swapping as a method of the bucket class
+    # to make it easier when performing swaps.
 
     def swap_fingerprints(self, item_fingerprint):
+        # we need to select the index of the fingerprint to swap.
         index_to_select = random.randrange(0, len(self.bucket))
         selected_fingerprint = self.bucket[index_to_select]
 
@@ -45,6 +48,7 @@ def swap_fingerprints(self, item_fingerprint):
 
         return item_fingerprint
 
+    # check if an item is in a bucket, i.e., the underlying list.
     def __contains__(self, item_fingerprint):
         if item_fingerprint in self.bucket:
             return True