diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..2e6fcc1 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) October 2016 Fast Forward Labs + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md index 2ea67fd..d5899c4 100644 --- a/README.md +++ b/README.md @@ -1 +1,65 @@ -# cuckoofilter +# Cuckoo Filter + +The Fast Forward Labs team explored probabilistic data structures +in our "Probabilistic Methods for Real-time Streams" report and +prototype (contact us if you're interested in this topic). We +provided an update to that report [here](http://blog.fastforwardlabs.com/post/153566952648/cuckoo-filter), exploring +Cuckoo filters, a [new](https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf) probabilistic data structure that improves upon the standard Bloom filter. 
The Cuckoo filter provides a few +advantages: +1) it enables dynamic deletion and addition of items +2) it can be easily implemented compared to Bloom filter variants with similar capabilities, and +3) for similar space constraints, the Cuckoo filter provides lower false positives, particularly at lower capacities. We provide a python implementation of the Cuckoo filter here, and compare it to a counting Bloom filter (a Bloom filter variant). + +This repository contains a python implementation of the Cuckoo +filter, as well as a copy-paste of a counting Bloom filter from +the [fuggetaboutit](https://github.com/mynameisfiber/fuggetaboutit/) repository for benchmarking. + +Please see our [post](http://blog.fastforwardlabs.com/post/153566952648/cuckoo-filter) for more details on the +Cuckoo filter. + + +# Demo + +Below we show how to go about using this package. + +```python +>>> from cuckoofilter import CuckooFilter +>>> c_filter = CuckooFilter(10000, 2) + +>>> c_filter.insert('James') +>>> print("James in c_filter == {}".format("James" in c_filter)) +James in c_filter == True + +>>> c_filter.remove('James') +>>> print("James in c_filter == {}".format("James" in c_filter)) +James in c_filter == False +``` + +Similarly the counting Bloom filter can be used as well. + +```python +>>> from cuckoofilter import CountingBloomFilter +>>> b_filter = CountingBloomFilter(10000) + +>>> b_filter.insert('James') +>>> print("James in b_filter == {}".format("James" in b_filter)) +James in b_filter == True + +>>> b_filter.remove('James') +>>> print("James in b_filter == {}".format("James" in b_filter)) +James in b_filter == False +``` + +## References +Below we link to a few references that contributed to the work +shown here: + +- Fan et al. 
[Cuckoo Filter: Practically Better Than Bloom](https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf) + +- CS 166 Stanford lecture [Cuckoo Hashing](http://web.stanford.edu/class/cs166/lectures/13/Small13.pdf) + +- Charles Ren, Course Notes. [An Overview of Cuckoo Hashing](http://cs.stanford.edu/~rishig/courses/ref/l13a.pdf) + + + + diff --git a/cuckoofilter/counting_bloom_filter.py b/cuckoofilter/counting_bloom_filter.py index 02ad81c..91a295e 100644 --- a/cuckoofilter/counting_bloom_filter.py +++ b/cuckoofilter/counting_bloom_filter.py @@ -20,14 +20,17 @@ BLOOM_FILENAME = 'bloom.npy' META_FILENAME = 'meta.json' + def remove_recursive(path): if os.path.isdir(path): shutil.rmtree(path) elif os.path.exists(path): os.remove(path) + class CountingBloomFilter(object): _ENTRIES_PER_8BYTE = 1 + def __init__(self, capacity, data_path=None, error=0.005, id=None): self.capacity = capacity self.error = error @@ -47,7 +50,8 @@ def __init__(self, capacity, data_path=None, error=0.005, id=None): self.data = np.load(bloom_filename) self.num_non_zero = np.count_nonzero(self.data) else: - size = int(math.ceil(self.num_bytes / float(self._ENTRIES_PER_8BYTE))) + size = int( + math.ceil(self.num_bytes / float(self._ENTRIES_PER_8BYTE))) self.data = np.zeros((size,), dtype=np.uint8, order='C') self.num_non_zero = 0 @@ -115,7 +119,10 @@ def get_size(self): """ Returns the density of the bloom which can be used to determine if the bloom is "full" """ - return -self.num_bytes * math.log(1 - self.num_non_zero / float(self.num_bytes)) / float(self.num_hashes) + return (-self.num_bytes * + math.log(1 - self.num_non_zero / float(self.num_bytes)) / + float(self.num_hashes) + ) def get_meta(self): return { @@ -133,7 +140,8 @@ def flush_data(self, data_path=None): def save(self, data_path=None): data_path, meta_path, bloom_path = self._get_paths(data_path) - tmp_data_path, tmp_meta_path, tmp_bloom_path = self._get_paths(data_path + '-tmp') + tmp_data_path, tmp_meta_path, tmp_bloom_path = 
self._get_paths( + data_path + '-tmp') remove_recursive(tmp_data_path) os.makedirs(tmp_data_path) @@ -146,7 +154,8 @@ def save(self, data_path=None): def _get_paths(self, data_path): if not (data_path or self.data_path): - raise PersistenceDisabledException("You cannot save without having data_path set.") + raise PersistenceDisabledException( + "You cannot save without having data_path set.") if not data_path: data_path = self.data_path @@ -183,7 +192,6 @@ def load(cls, data_path): return cls(capacity, **kwargs) - def __contains__(self, key): return self.contains(key) @@ -196,4 +204,4 @@ def __sub__(self, other): return self def __len__(self): - return self.get_size() \ No newline at end of file + return self.get_size() diff --git a/cuckoofilter/cuckoofilter.py b/cuckoofilter/cuckoofilter.py index edb78e6..a87be12 100644 --- a/cuckoofilter/cuckoofilter.py +++ b/cuckoofilter/cuckoofilter.py @@ -1,4 +1,4 @@ -import mmh3 # used for hashing items +import mmh3 # murmur hashing import random from . import cuckootable @@ -16,28 +16,42 @@ def __init__(self, filter_capacity, self.cuckoo_size = 0 self.table = [] - # load factor # initialize the entire table. for i in range(self.filter_capacity): self.table.append(cuckootable.CuckooTable(size=self.bucket_size)) + # fingerprint of an item is a reduced bit string of + # of an input string. def obtain_fingerprint(self, string_item): hash_value = mmh3.hash_bytes(string_item) fingerprint = hash_value[:self.item_fingerprint_size] return fingerprint def obtain_index_from_hash(self, string_item): + hash_value = mmh3.hash_bytes(string_item) + + # this is new for python 3, i.e. 
how you go from + # bytes/bits to int/index values index = int.from_bytes(hash_value, byteorder="big") + + # modulo the obtained index by the filter capacity + # this helps to restrict indices to 0 - filter_capacity index = index % self.filter_capacity + return index def obtain_indices_from_item(self, string_item): - # insert into the cuckoo table + + # obtain the first index index_1 = self.obtain_index_from_hash(string_item) + # obtain finger print of item fingerprint = self.obtain_fingerprint(string_item) + # derive the index from the fingerprint + # second index -> first_index xor index + # derived from hash(fingerprint) index_2 = index_1 ^ self.obtain_index_from_hash(fingerprint) index_2 = index_2 % self.filter_capacity @@ -52,19 +66,26 @@ def add(self, item_to_insert): if not isinstance(item_to_insert, str): raise ValueError("Item being inserted not of type string") + # obtain the two possible indices where this item + # can be inserted. index_1, index_2 = self.obtain_indices_from_item(item_to_insert) item_fingerprint = self.obtain_fingerprint(item_to_insert) + # default is to insert into the first index. if self.table[index_1].insert(item_fingerprint): self.cuckoo_size += 1 return index_1 + # if the first location is occupied, then insert + # in the second location. if self.table[index_2].insert(item_fingerprint): self.cuckoo_size += 1 return index_2 # if both indices are full, now we need to swap all current entries. # first randomly pick btw index 1 and 2 + # then swap one item in that bucket for its + # alternative location. random_index = random.choice((index_1, index_2)) for swap in range(self.num_swaps): @@ -79,21 +100,27 @@ def add(self, item_to_insert): self.cuckoo_size += 1 return random_index - # this might not be necessary since the table is now full anyway + # Notifies that the table is now full. 
raise Exception("CuckooFilter has filled up!") def remove(self, item_to_remove): + # first hash the item and obtain its possible indices item_fingerprint = self.obtain_fingerprint(item_to_remove) index_1, index_2 = self.obtain_indices_from_item(item_to_remove) + # check the first index to see if item's fingerprint + # is in that bucket. if self.table[index_1].remove(item_fingerprint): self.cuckoo_size = self.cuckoo_size - 1 return True + # item not in first index, so now check the second index if self.table[index_2].remove(item_fingerprint): self.cuckoo_size = self.cuckoo_size - 1 return True + # since item not in both indices, it is not in the + # cuckoo table. return false. return False def __contains__(self, item_to_test): @@ -106,6 +133,16 @@ def __contains__(self, item_to_test): return bool_contains + """ + + The methods below are getters for various properties of the + CuckooFilter. + - load factor + - size + - capacity + + """ + def get_load_factor(self): load_factor = self.cuckoo_size / \ (self.filter_capacity * self.bucket_size) diff --git a/cuckoofilter/cuckootable.py b/cuckoofilter/cuckootable.py index 9d74115..776f060 100644 --- a/cuckoofilter/cuckootable.py +++ b/cuckoofilter/cuckootable.py @@ -1,3 +1,9 @@ +""" +We represent the buckets as lists. A numpy array with +pre-specified length might be better, but the +python list is quite flexible. +""" + import random @@ -5,23 +11,22 @@ class CuckooTable: def __init__(self, size=4): self.size = size - - """ - We represent the buckets as lists. A numpy array with - pre-specified length might be better, but the - python list is quite flexible. - - """ self.bucket = [] def insert(self, item_fingerprint): + # to insert a fingerprint, check to make sure the + # current bucket is not full. if len(self.bucket) < self.size: self.bucket.append(item_fingerprint) return True + + # bucket is full, so return false. cuckoo filter class + # handles the logic with a failed insert. 
return False def remove(self, item_fingerprint): - # get the hypothetical + # first check if the fingerprint is in this bucket. + # if yes, then return true, else return false. try: index = self.bucket.index(item_fingerprint) del self.bucket[index] @@ -29,13 +34,11 @@ def remove(self, item_fingerprint): except ValueError: return False - """ - We implement swapping as a method of the bucket class - to make it easier when performing swaps. - - """ + # We implement swapping as a method of the bucket class + # to make it easier when performing swaps. def swap_fingerprints(self, item_fingerprint): + # we need to select the index of the fingerprint to swap. index_to_select = random.randrange(0, len(self.bucket)) selected_fingerprint = self.bucket[index_to_select] @@ -45,6 +48,7 @@ def swap_fingerprints(self, item_fingerprint): return item_fingerprint + # check if an item is in a bucket, i.e, list. def __contains__(self, item_fingerprint): if item_fingerprint in self.bucket: return True