Permalink
Browse files

Fix massive documentation error about delta, add width/depth/size fun…

…ctions, add preliminary test script.

Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
  • Loading branch information...
1 parent cd0fd0d commit 83673ad0c013efe3894b0e8e62e664356c0c7a78 @ezyang committed Jan 9, 2011
Showing with 99 additions and 19 deletions.
  1. +1 −1 _tags
  2. +3 −15 cminsketch.ml
  3. +14 −3 cminsketch.mli
  4. +81 −0 test.ml
View
@@ -1 +1 @@
-<*>: pkg_batteries,pkg_threads
+<*>: pkg_batteries, pkg_threads, pkg_gsl, pkg_extlib
View
@@ -94,6 +94,9 @@ let make ~epsilon ~delta =
let epsilon s = euler /. (float_of_int (1 lsl s.lg_width))
let delta s = 1. /. exp (float_of_int (Array.length s.count))
+let width s = 1 lsl s.lg_width
+let depth s = Array.length s.count
+let size s = (width s) * (depth s)
let update s ~ix ~c =
Array.iteri (fun i a -> step_matrix s.count i (multiply_shift s.lg_width a ix) c) s.hash_functions
@@ -119,18 +122,3 @@ let dot_product_nquery a b = median (Array.map2 dot_product a.count b.count)
(* To implement: *)
(* range_query - needs hierarchical array of sketches *)
(* phi_quantiles, heavy_hitters *)
-
-let () =
- let x = make 1.5 0.9 in
- update x 3 4;
- update x 3 (-6);
- update x 24435 5;
- update x 2323434 1;
- update x 223434 1;
- print_int (nquery x 3);
- print_string "\n";
- print_float (epsilon x);
- print_string "\n";
- print_float (delta x);
- print_string "\n";
- ()
View
@@ -35,9 +35,10 @@ type sketch
val multiply_shift : m:int -> a:int -> x:int -> int
(** Create a count-min sketch for which the error in answering
- a query is within a factor of [epsilon] with probability [delta].
- You get more accurate results for small epsilon and large delta,
- but use less memory for larger epsilon and smaller delta.
+ a query is within a factor of [epsilon], and gets it totally wrong
+ only with probability [delta]. You get more accurate results for
+ small epsilon and small delta, but use less memory for larger
+ epsilon and larger delta.
More detailed bounds regarding [epsilon] and [delta] can be found
in the relevant estimation functions.
@@ -65,6 +66,16 @@ val epsilon : sketch -> float
(** Returns the true error probability for a sketch. *)
val delta : sketch -> float
+(** Returns the width of the sketch. *)
+val width : sketch -> int
+
+(** Returns the depth of the sketch. *)
+val depth : sketch -> int
+
+(** Returns the size of the sketch (width times depth). This should be smaller
+ than your input space, otherwise you're not getting any benefit! *)
+val size : sketch -> int
+
(** [O(log(1/delta))]. Updates a sketch adding [c] to the field [ix]. *)
val update : sketch -> ix:int -> c:int -> unit
View
@@ -0,0 +1,81 @@
+let rec repeat n thunk =
+ if n == 0 then ()
+ else (thunk (); repeat (n-1) thunk)
+
+(* RNG initialization and debugging output. You can modify the
+ * generator and seed using GSL_RNG_TYPE and GSL_RNG_SEED. We
+ * reseed OCaml's RNG with this, so our program is entirely
+ * deterministic based on these parameters. *)
+
+let () =
+ Gsl_error.init ();
+ Gsl_rng.env_setup ()
+
+let rng = Gsl_rng.make (Gsl_rng.default ())
+
+let () =
+ Printf.printf "\027[34m";
+ Printf.printf "gsl rng type=%s seed=%nu\n" (Gsl_rng.name rng) (Gsl_rng.default_seed());
+ let seed = Nativeint.to_int (Gsl_rng.get rng) in
+ Random.init seed;
+ Printf.printf "ocaml rng seed=%d (first gsl rng output)\n" seed;
+ Printf.printf "\027[0m";
+ Printf.printf "\n"
+
+(** Creates a sketch with parameters [epsilon] and [delta], increments
+ a random key (determined by the [kf] thunk) [n] times, and then
+ returns a tuple of this sketch and a hash table containing the
+ true counts of the values. *)
+let generate ~epsilon ~delta ~n ~kf =
+ let s = Cminsketch.make epsilon delta in
+ let t = Hashtbl.create (n/8) in (* Worst case needs to grow 3 times *)
+ (* It's not necessary to test higher increments, since they
+ * are equivalent to calling the function that many times
+ * (so we punt to the underlying random distribution in kf). *)
+ repeat n (fun () -> let k = kf () in
+ Cminsketch.update s k 1;
+ Hashtbl.replace t k (try Hashtbl.find t k + 1 with Not_found -> 1)
+ );
+ (s, t)
+
+(** List of distributions we'll test with. These functions are suitable
+ to be passed in as the [kf] parameter in [generate]. *)
+let distributions = [
+ fun () -> Gsl_randist.poisson rng 10.0;
+ ]
+let some_distribution = fun () -> Nativeint.to_int (Gsl_rng.get rng)
+
+let () =
+ let n = 50000
+ and input_epsilon = 0.0001
+ and input_delta = 0.8 in
+ let (s,t) = generate input_epsilon input_delta n some_distribution in
+ let epsilon = Cminsketch.epsilon s
+ and delta = Cminsketch.delta s in
+ let margin = float_of_int n *. epsilon in
+ Printf.printf "epsilon=%.5f delta=%.5f norm=%n, margin=%.0f\n" epsilon delta n margin;
+ Printf.printf "width=%d depth=%d size=%d\n" (Cminsketch.width s) (Cminsketch.depth s) (Cminsketch.size s);
+ let wrong =
+ let wrongref = ref 0 in
+ Hashtbl.iter
+ (fun k x ->
+ let y = Cminsketch.query s k in
+ let f = float_of_int (abs (x - y)) in
+ if f > margin then wrongref := !wrongref + 1
+ else ()
+ )
+ t;
+ !wrongref in
+ let domain_size = Hashtbl.length t in
+ Printf.printf "domain=%d\n" domain_size;
+ if Cminsketch.size s > domain_size
+ then Printf.printf "\027[33mWARNING: Sketch is bigger than hash table\027[0m\n"
+ else ();
+ let wrong_bound = delta
+ and wrong_rate = (float_of_int wrong /. float_of_int domain_size) in
+ Printf.printf "wrong: bound=%.5f actual=%.5f\n" wrong_bound wrong_rate;
+ Printf.printf "\n";
+ (* some statistics about the variance from true value would be cool *)
+ if wrong_bound > wrong_rate
+ then Printf.printf "\027[1m\027[32mOK\027[0m\n"
+ else Printf.printf "\027[1m\027[31mFAIL\027[0m\n"

0 comments on commit 83673ad

Please sign in to comment.