Skip to content

Commit

Permalink
Implement frequency algorithms such as Count Sketch and Count-Min Ske…
Browse files Browse the repository at this point in the history
…tch (#15)

Count Sketch and Count–Min Sketch are simple space-efficient probabilistic data structures
that are used to estimate frequencies of elements in data streams and can address the Heavy hitters problem.

Count Sketch was proposed by Moses Charikar, Kevin Chen, and Martin Farach-Colton in 2002.
Count–Min Sketch was presented in 2003 by Graham Cormode and Shan Muthukrishnan and published in 2005.

In the current implementation, we support up to 2^{32} -1 counters (due to 32-bit hash functions) each of 32 bits.
  • Loading branch information
gakhov committed Aug 27, 2019
1 parent 528cfa6 commit c41d40c
Show file tree
Hide file tree
Showing 11 changed files with 749 additions and 8 deletions.
8 changes: 5 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,14 @@ cythonize.json


#PDSA
pdsa/cardinality/hyperloglog.cpp
pdsa/cardinality/linear_counter.cpp
pdsa/cardinality/probabilistic_counter.cpp
pdsa/helpers/hashing/mmh.cpp
pdsa/helpers/storage/bitvector.cpp
pdsa/helpers/storage/bitvector_counter.cpp
pdsa/cardinality/linear_counter.cpp
pdsa/cardinality/probabilistic_counter.cpp
pdsa/cardinality/hyperloglog.cpp
pdsa/membership/bloom_filter.cpp
pdsa/membership/counting_bloom_filter.cpp
pdsa/rank/qdigest.cpp
pdsa/frequency/count_min_sketch.cpp
pdsa/frequency/count_sketch.cpp
9 changes: 5 additions & 4 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@ global-exclude __pycache__/*
global-exclude *.so
global-exclude .DS_Store

global-exclude pdsa/cardinality/hyperloglog.cpp
global-exclude pdsa/cardinality/linear_counter.cpp
global-exclude pdsa/cardinality/probabilistic_counter.cpp
global-exclude pdsa/cardinality/hyperloglog.cpp
global-exclude pdsa/membership/bloom_filter.cpp
global-exclude pdsa/membership/counting_bloom_filter.cpp
global-exclude pdsa/frequency/count_min_sketch.cpp
global-exclude pdsa/frequency/count_sketch.cpp
global-exclude pdsa/helpers/hashing/mmh.cpp
global-exclude pdsa/helpers/storage/bitvector.cpp
global-exclude pdsa/helpers/storage/bitvector_counter.cpp
global-exclude pdsa/membership/bloom_filter.cpp
global-exclude pdsa/membership/counting_bloom_filter.cpp
global-exclude pdsa/rank/qdigest.cpp

4 changes: 4 additions & 0 deletions pdsa/frequency/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .count_min_sketch import CountMinSketch


__all__ = ["CountMinSketch", ]
19 changes: 19 additions & 0 deletions pdsa/frequency/count_min_sketch.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from libc.stdint cimport uint64_t, uint32_t, uint8_t


cdef class CountMinSketch:
cdef uint32_t _MAX_COUNTER_VALUE

cdef uint8_t num_of_counters
cdef uint32_t length_of_counter

cdef uint64_t _length
cdef uint8_t[:] _seeds
cdef uint32_t[:] _counter

cpdef void add(self, object element) except *
cpdef uint32_t frequency(self, object element) except *
cpdef size_t sizeof(self)

cdef bint _increment_counter(self, const uint64_t index)
cdef uint32_t _hash(self, object element, uint8_t seed)
261 changes: 261 additions & 0 deletions pdsa/frequency/count_min_sketch.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
"""
Count-Min Sketch.
Count–Min Sketch is a simple space-efficient probabilistic data structure
that is used to estimate frequencies of elements in data streams and can
address the Heavy hitters problem. It was presented in 2003 [1] by
Graham Cormode and Shan Muthukrishnan and published in 2005 [2].
References
----------
[1] Cormode, G., Muthukrishnan, S.
What's hot and what's not: Tracking most frequent items dynamically
Proceedings of the 22th ACM SIGMOD-SIGACT-SIGART symposium on Principles
of database systems, San Diego, California - June 09-11, 2003,
pp. 296–306, ACM New York, NY.
http://www.cs.princeton.edu/courses/archive/spr04/cos598B/bib/CormodeM-hot.pdf
[2] Cormode, G., Muthukrishnan, S.
An Improved Data Stream Summary: The Count–Min Sketch and its Applications
Journal of Algorithms, Vol. 55 (1), pp. 58–75.
http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf
"""
import cython

from cpython.array cimport array
from libc.math cimport ceil, log, M_E
from libc.stdint cimport uint64_t, uint32_t, uint8_t
from libc.stdint cimport UINT32_MAX, UINT8_MAX
from libc.stdlib cimport rand, RAND_MAX

from pdsa.helpers.hashing.mmh cimport mmh3_x86_32bit


cdef class CountMinSketch:
"""Count-Min Sketch.
Count-Min Sketch is simple data structure that allows for the indexing
of elements from the data stream, results in updating counters,
and can provide the number of times every element has been indexed.
Example
-------
>>> from pdsa.frequency.count_min_sketch import CountMinSketch
>>> cms = CountMinSketch(5, 2000)
>>> cms.add("hello")
>>> cms.frequency("hello")
Note
-----
This implementation uses MurmurHash3 family of hash functions
which yields a 32-bit hash value. Thus, the length of the counters
is expected to be smaller or equal to the (2^{32} - 1), since
we cannot access elements with indexes above this value.
Note
-----
This implementation uses 32-bits counters that freeze at their
maximal values (2^{32} - 1).
Attributes
----------
num_of_counters : :obj:`int`
The number of counter arrays used in the sketch.
length_of_counter : :obj:`int`
The number of counters in each counter array.
"""

@cython.cdivision(True)
def __cinit__(self, const uint8_t num_of_counters, const uint32_t length_of_counter):
"""Create sketch from its dimensions.
Parameters
----------
num_of_counters : :obj:`int`
The number of counter arrays used in the sketch.
length_of_counter : :obj:`int`
The number of counters in each counter array.
Raises
------
ValueError
If `num of counters` is less than 1.
ValueError
If `length_of_counter` is less than 1.
"""
if num_of_counters < 1:
raise ValueError("At least one counter array is required")

if length_of_counter < 1:
raise ValueError("The length of the counter array cannot be less then 1")

self.num_of_counters = num_of_counters
self.length_of_counter = length_of_counter

self._length = self.num_of_counters * self.length_of_counter

self._MAX_COUNTER_VALUE = UINT32_MAX
self._seeds = array('B', [
<uint8_t >((rand()/RAND_MAX) * UINT8_MAX)
for r in range(self.num_of_counters)
])
self._counter = array('I', range(self._length))

cdef uint64_t index
for index in xrange(self._length):
self._counter[index] = 0

@classmethod
def create_from_expected_error(cls, const float deviation, const float error):
"""Create sketch from the expected frequency deviation and error probability.
Parameters
----------
deviation : float
The error ε in answering the paricular query.
For example, if we expect 10^7 elements and allow
the fixed overestimate of 10, the deviation is 10/10^7 = 10^{-6}.
error : float
The standard error δ (0 < error < 1).
Note
----
The Count–Min Sketch is approximate and probabilistic at the same
time, therefore two parameters, the error ε in answering the paricular
query and the error probability δ, affect the space and time
requirements. In fact, it provides the guarantee that the estimation
error for frequencies will not exceed ε x n
with probability at least 1 – δ.
Raises
------
ValueError
If `deviation` is smaller than 10^{-10}.
ValueError
If `error` is not in range (0, 1).
"""
if deviation <= 0.0000000001:
raise ValueError("Deviation is too small. Not enough counters")

if error <= 0 or error >= 1:
raise ValueError("Error rate shell be in (0, 1)")

cdef uint8_t num_of_counters = <uint8_t > (ceil(-log(error)))
cdef uint32_t length_of_counter = <uint32_t > (ceil(M_E / deviation))

return cls(max(1, num_of_counters), max(1, length_of_counter))

cdef uint32_t _hash(self, object key, uint8_t seed):
return mmh3_x86_32bit(key, seed)

def __dealloc__(self):
pass

cdef bint _increment_counter(self, const uint64_t index):
"""Increment counter if the value doesn't exceed maximal allowed.
Parameters
----------
index : obj:`int`
The index of the counter to be incremented.
Note
----
When counter reaches its maximal value, we simple freeze it there.
"""
if self._counter[index] < self._MAX_COUNTER_VALUE:
self._counter[index] += 1
return True
return False

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cpdef void add(self, object element) except *:
"""Index element into the sketch.
Parameters
----------
element : obj
The element to be indexed into the sketch.
"""
cdef uint8_t counter_index
cdef uint32_t element_index
cdef uint8_t seed
cdef uint64_t index
for counter_index in range(self.num_of_counters):
seed = self._seeds[counter_index]
element_index = self._hash(element, seed) % self.length_of_counter
index = counter_index * (self.length_of_counter - 1) + element_index
self._increment_counter(index)

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cpdef uint32_t frequency(self, object element) except *:
"""Estimate frequency of element.
Parameters
----------
element : obj
The element to estimate the frequency for.
Returns
-------
uint32_t
The frequency of the element.
"""
cdef uint8_t counter_index
cdef uint32_t element_index
cdef uint8_t seed
cdef uint64_t index
cdef uint32_t frequency = self._MAX_COUNTER_VALUE
for counter_index in range(self.num_of_counters):
seed = self._seeds[counter_index]
element_index = self._hash(element, seed) % self.length_of_counter
index = counter_index * (self.length_of_counter - 1) + element_index
frequency = min(frequency, self._counter[index])
return frequency

cpdef size_t sizeof(self):
"""Size of the sketch in bytes.
Returns
-------
:obj:`int`
Number of bytes allocated for the sketch.
"""
return self._length * sizeof(uint32_t)

def __repr__(self):
return "<CountMinSketch ({} x {})>".format(
self.num_of_counters,
self.length_of_counter
)

def __len__(self):
"""Get length of the filter.
Returns
-------
:obj:`int`
The length of the filter.
"""
return self._length



def debug(self):
"""Return sketch for debug purposes."""
return self._counter
22 changes: 22 additions & 0 deletions pdsa/frequency/count_sketch.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from libc.stdint cimport uint64_t, uint32_t, uint8_t
from libc.stdint cimport int32_t


cdef class CountSketch:
cdef int32_t _MAX_COUNTER_VALUE
cdef int32_t _MIN_COUNTER_VALUE

cdef uint8_t num_of_counters
cdef uint32_t length_of_counter

cdef uint64_t _length
cdef uint8_t[:] _seeds
cdef uint8_t[:] _seeds_for_switcher
cdef int32_t[:] _counter

cpdef void add(self, object element) except *
cpdef uint32_t frequency(self, object element) except *
cpdef size_t sizeof(self)

cdef bint _update_counter(self, const uint64_t index, const bint reverse)
cdef uint32_t _hash(self, object element, uint8_t seed)

0 comments on commit c41d40c

Please sign in to comment.