Skip to content

Commit

Permalink
Update tests for probabilistic counters (#26)
Browse files Browse the repository at this point in the history
The main issue with the probabilistic counter is its
huge and unknown bias for small cardinalities. We try
to cover such a case with a proper test, but still, there is
some uncertainty.
  • Loading branch information
gakhov committed Oct 2, 2019
1 parent a217f47 commit cb63e80
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 4 deletions.
2 changes: 1 addition & 1 deletion docs/cardinality/probabilistic_counter.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ To build a counter, specify its length.
from pdsa.cardinality.probabilistic_counter import ProbabilisticCounter
pc = ProbabilisticCounter(
numbder_of_counters=256,
number_of_counters=256,
with_small_cardinality_correction=True)
Expand Down
2 changes: 1 addition & 1 deletion pdsa/cardinality/probabilistic_counter.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ cdef class ProbabilisticCounter:
[1] Flajolet, P., Martin, G.N.: Probabilistic Counting Algorithms
for Data Base Applications. Journal of Computer and System Sciences.
Vol. 31 (2), 182--209 (1985)
[2] Flajolet, P., Martin, G.N.: Near-Optimal Compression of Probabilistic
[2] Scheuermann, B., Mauve, M.: Near-Optimal Compression of Probabilistic
Counting Sketches for Networking Applications
In Dial M-POMC 2007: Proceedings of the 4th ACM SIGACT-SIGOPS
International Workshop on Foundation of Mobile Computing, 2007
Expand Down
37 changes: 35 additions & 2 deletions tests/cardinality/test_probabilistic_counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,16 @@ def test_count_small():
pc = ProbabilisticCounter(
num_of_counters, with_small_cardinality_correction=True)

# Actually, for small cardinalities we have no theoretical estimate.
# It just seems that the errors have to be bigger.
std = 0.78 / sqrt(num_of_counters)

boundary = 2 * num_of_counters

errors = []

cardinality = 0
for i in range(1000):
for i in range(boundary):
cardinality += 1
element = "element_{}".format(i)
pc.add(element)
Expand All @@ -77,7 +81,36 @@ def test_count_small():
avg_error = abs(sum(errors)) / float(len(errors))

assert avg_error >= 0
assert avg_error <= 2 * std # Even with correction, still not so good
assert avg_error <= 3 * std # There is no known theoretical expectation.


def test_correction():
    """Check that the small-cardinality correction lowers the error.

    Two counters built with the same size (256) receive the same 100
    distinct elements; one has ``with_small_cardinality_correction``
    enabled, the other uses the defaults. After every insertion the
    relative error of each estimate is recorded, and the test asserts
    that the mean absolute relative error of the corrected counter is
    strictly smaller than that of the uncorrected one.
    """
    pc_with_corr = ProbabilisticCounter(
        256, with_small_cardinality_correction=True)
    pc = ProbabilisticCounter(256)

    errors = []
    errors_with_corr = []

    for i in range(100):
        # Elements are distinct, so the true cardinality is the number
        # of elements added so far.
        cardinality = i + 1
        element = "element_{}".format(i)
        pc_with_corr.add(element)
        pc.add(element)

        # Use the absolute relative error for BOTH counters. Keeping
        # the sign for only one of them would let its over- and
        # under-estimates cancel out in the average and make the
        # comparison unfair.
        error_with_corr = abs(
            cardinality - pc_with_corr.count()) / float(cardinality)
        errors_with_corr.append(error_with_corr)

        error = abs(cardinality - pc.count()) / float(cardinality)
        errors.append(error)

    avg_error_with_corr = sum(errors_with_corr) / \
        float(len(errors_with_corr))
    avg_error = sum(errors) / float(len(errors))

    assert avg_error_with_corr < avg_error


def test_len():
Expand Down

0 comments on commit cb63e80

Please sign in to comment.