Update tests for probabilistic counters (#26)

The main issue with the probabilistic counter is its huge and unknown bias for small cardinalities. We try to cover such a case with a proper test, but still, there is some uncertainty.
gakhov · Oct 2, 2019 · cb63e80 · cb63e80
1 parent a217f47
commit cb63e80
Show file tree

Hide file tree

Showing 3 changed files with 37 additions and 4 deletions.
diff --git a/docs/cardinality/probabilistic_counter.rst b/docs/cardinality/probabilistic_counter.rst
@@ -60,7 +60,7 @@ To build a counter, specify its length.
         from pdsa.cardinality.probabilistic_counter import ProbabilisticCounter
 
         pc = ProbabilisticCounter(
-            numbder_of_counters=256,
+            number_of_counters=256,
             with_small_cardinality_correction=True)
 
 

diff --git a/pdsa/cardinality/probabilistic_counter.pyx b/pdsa/cardinality/probabilistic_counter.pyx
@@ -257,7 +257,7 @@ cdef class ProbabilisticCounter:
         [1] Flajolet, P., Martin, G.N.: Probabilistic Counting Algorithms
             for Data Base Applications. Journal of Computer and System Sciences.
             Vol. 31 (2), 182--209  (1985)
-        [2] Flajolet, P., Martin, G.N.: Near-Optimal Compression of Probabilistic
+        [2] Scheuermann, B., Mauve, M.: Near-Optimal Compression of Probabilistic
             Counting Sketches for Networking Applications
             In Dial M-POMC 2007: Proceedings of the 4th ACM SIGACT-SIGOPS
             International Workshop on Foundation of Mobile Computing, 2007

diff --git a/tests/cardinality/test_probabilistic_counter.py b/tests/cardinality/test_probabilistic_counter.py
@@ -61,12 +61,16 @@ def test_count_small():
     pc = ProbabilisticCounter(
         num_of_counters, with_small_cardinality_correction=True)
 
+    # Actually, for small cardinalities we have no estimate. It
+    # just seems that the errors have to be bigger.
     std = 0.78 / sqrt(num_of_counters)
 
+    boundary = 2 * num_of_counters
+
     errors = []
 
     cardinality = 0
-    for i in range(1000):
+    for i in range(boundary):
         cardinality += 1
         element = "element_{}".format(i)
         pc.add(element)
@@ -77,7 +81,36 @@ def test_count_small():
     avg_error = abs(sum(errors)) / float(len(errors))
 
     assert avg_error >= 0
-    assert avg_error <= 2 * std  # Even with correction, still not so good
+    assert avg_error <= 3 * std  # There is no known theoretical expectation.
+
+
+def test_correction():
+    """Check that the small-cardinality correction reduces the error.
+
+    Feeds the same 100 distinct elements to two 256-counter
+    ProbabilisticCounter instances, one built with
+    ``with_small_cardinality_correction=True`` and one without, and
+    compares the mean absolute relative error of ``count()`` measured
+    after every insertion.
+    """
+    pc_with_corr = ProbabilisticCounter(
+        256, with_small_cardinality_correction=True)
+    pc = ProbabilisticCounter(256)
+
+    errors = []
+    errors_with_corr = []
+
+    cardinality = 0
+    for i in range(100):
+        cardinality += 1
+        element = "element_{}".format(i)
+        pc_with_corr.add(element)
+        pc.add(element)
+
+        # Use the absolute relative error for BOTH counters: with signed
+        # errors on one side only, over- and under-estimates could cancel
+        # out for that counter and make the comparison unfair.
+        error_with_corr = abs(
+            cardinality - pc_with_corr.count()) / float(cardinality)
+        errors_with_corr.append(error_with_corr)
+
+        error = abs(cardinality - pc.count()) / float(cardinality)
+        errors.append(error)
+
+    # Every term is already non-negative, so a plain mean is enough.
+    avg_error_with_corr = sum(errors_with_corr) / float(len(errors_with_corr))
+    avg_error = sum(errors) / float(len(errors))
+
+    assert avg_error_with_corr < avg_error
 
 
 def test_len():