Permalink
Browse files

Implement untested dot product query, add batteries dep.

Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
  • Loading branch information...
1 parent 01dfeed commit 90fc039afd5d61ad4988af0e3994755594526b9d @ezyang committed Jan 8, 2011
Showing with 254 additions and 39 deletions.
  1. +3 −3 .gitignore
  2. +1 −0 .odocl
  3. +1 −0 _tags
  4. +31 −15 cminsketch.ml
  5. +33 −21 cminsketch.mli
  6. +185 −0 myocamlbuild.ml
View
6 .gitignore
@@ -1,3 +1,3 @@
-*.html
-*.cmi
-*.css
+*.native
+*.byte
+_build
View
1 .odocl
@@ -0,0 +1 @@
+Cminsketch
View
1 _tags
@@ -0,0 +1 @@
+<*>: pkg_batteries,pkg_threads
View
46 cminsketch.ml
@@ -1,3 +1,5 @@
+open Batteries
+
(* Multiply shift is done on normal ints. These are not necessarily
* 32-bit or 64-bit (usually, they have one bit less precision), so
* care must be taken. *)
@@ -57,29 +59,39 @@ let median a =
let () = Array.sort compare b in
b.(l/2)
+(** [dot_product a b] returns the dot product of the two int arrays
+    ("vectors") [a] and [b], i.e. the sum over [i] of [a.(i) * b.(i)].
+    Two empty arrays yield [0].
+    @raise Invalid_argument if [a] and [b] differ in length. *)
+let dot_product a b =
+  let len = Array.length a in
+  (* [<>] is structural inequality; [!=] compares physical identity. *)
+  if len <> Array.length b then invalid_arg "Cminsketch.dot_product: dimensions mismatch" else
+  let rec loop acc i =
+    if i = len then acc else loop (acc + a.(i) * b.(i)) (i + 1) in
+  loop 0 0
+
type sketch = { lg_width : int; (* base-2 log of the row width: each row holds [1 lsl lg_width] counters *)
count: int array array; (* counter matrix, created zero-filled with one row per hash function *)
hash_functions : int array } (* one multiplier per row, drawn with [random_odd_int] *)
-let make ~epsilon ~delta =
- if epsilon <= 0.0 then invalid_arg "Cminsketch.make: epsilon must be greater than 0.0" else
- if delta >= 1.0 then invalid_arg "Cminsketch.make: delta must be less than 1.0" else
- if delta <= 0.0 then invalid_arg "Cminsketch.make: delta must be greater than 0.0" else
+(** [make_int ~depth ~width] builds a zero-filled sketch with [depth] rows
+    and one freshly drawn [random_odd_int] multiplier per row. The row
+    width actually allocated is [1 lsl (int_ceil (lg width))].
+    @raise Invalid_argument if [depth <= 0] or [width <= 0]. *)
+let make_int ~depth ~width =
+ if depth <= 0 then invalid_arg "Cminsketch.make_int: depth must be greater than 0" else
+ if width <= 0 then invalid_arg "Cminsketch.make_int: width must be greater than 0" else
(* We fudge the width to be a little larger, ensuring that it
- * is a power of two for the benefit of our algorithm. This means
- * the actual epsilon you will get is smaller than what you
- * originally specified. *)
- let m = int_ceil (lg (euler /. epsilon)) in
- if m < 0 then failwith "Cminsketch.make: internal error, lg_width less than 0" else
- let width = 1 lsl m
- and depth = int_ceil (log (1. /. delta)) in
- if width <= 0 then failwith "Cminsketch.make: internal error, width less than 1" else
- if depth <= 0 then failwith "Cminsketch.make: internal error, depth less than 1" else
+ * is a power of two for the benefit of our hash family. *)
+ let m = int_ceil (lg (float_of_int width)) in
+ let rounded_width = 1 lsl m in
{ lg_width = m;
- count = Array.make_matrix depth width 0;
+ count = Array.make_matrix depth rounded_width 0;
hash_functions = Array.init depth (fun _ -> random_odd_int ());
}
+(** [make ~epsilon ~delta] builds a sketch sized from the requested error
+    factor [epsilon] and error probability [delta]: the depth is
+    [int_ceil (log (1. /. delta))] and the requested width is
+    [int_ceil (euler /. epsilon)]; both are handed to [make_int].
+    @raise Invalid_argument if [epsilon <= 0.0], [delta >= 1.0] or
+    [delta <= 0.0]. *)
+let make ~epsilon ~delta =
+  if epsilon <= 0.0 then invalid_arg "Cminsketch.make: epsilon must be greater than 0.0" else
+  if delta >= 1.0 then invalid_arg "Cminsketch.make: delta must be less than 1.0" else
+  if delta <= 0.0 then invalid_arg "Cminsketch.make: delta must be greater than 0.0" else
+  let depth = int_ceil (log (1. /. delta))
+  and width = int_ceil (euler /. epsilon) in
+  (* Pass the labels explicitly so the call does not depend on the
+     parameter order of [make_int]. *)
+  make_int ~depth ~width
+
(* Effective error factor: [euler] divided by the actual row width,
 * which is [1 lsl lg_width]. *)
let epsilon s =
  let width = 1 lsl s.lg_width in
  euler /. float_of_int width
(* Effective error probability: the reciprocal of [exp depth], where the
 * depth is the number of rows in the counter matrix. *)
let delta s =
  let depth = Array.length s.count in
  1. /. exp (float_of_int depth)
@@ -97,9 +109,13 @@ let get_counts s ~ix =
(* Point estimate for [ix]: [minimum] of the counters that [get_counts]
 * returns for it. *)
let query s ~ix =
  let counts = get_counts s ~ix in
  minimum counts
(* Point estimate for [ix] that takes the [median] of those counters
 * instead. *)
let nquery s ~ix =
  let counts = get_counts s ~ix in
  median counts
+(* We deviate from the original naming, because "inner product" is too
+ * general a term for the algorithm they've described. The estimate is the [minimum] of the row-wise [dot_product]s of the two count matrices. *)
+let dot_product_query a b =
+ minimum (Array.map2 dot_product a.count b.count)
+
(* To implement: *)
(* range_query - needs customized array of sketches *)
-(* inner_product_query *)
(* phi_quantiles, heavy_hitters *)
let () =
View
54 cminsketch.mli
@@ -1,6 +1,6 @@
(** This module implements the count-min sketch, a sublinear
space, probabilistic data structure
- invented by Graham Cormode and S. Muthukrishnan, described in
+ invented by Graham Cormode and S Muthukrishnan, described in
"An Improved Data Stream Summary: The Count-Min Sketch and its
Applications." It is well suited for summarizing data streams and
finding quantiles/frequent items. It has also found novel
@@ -18,7 +18,8 @@
type sketch
(** Multiply shift, a weak universal hash family that this implementation
- uses to back its sketches (a more conventional choice is Carter
+ uses to back its sketches (a more conventional choice that was not
+ used in this implementation is Carter
and Wegman's universal hash family). It was presented in
"A Reliable Randomized Algorithm for the Closest-Pair Problem"
and has the property that for all ints [x] and [y] such that
@@ -37,10 +38,6 @@ val multiply_shift : m:int -> a:int -> x:int -> int
a query is within a factor of [epsilon] with probability [1 - delta].
You get more accurate results for small epsilon and small delta,
but use less memory for larger epsilon and larger delta.
- You can only trade accuracy for memory so far: in one direction, if
- epsilon is sufficiently large ([> 2.72]) the sketch degenerates into
- a single-valued counter, in the opposite direction there's no point
- using a count-min sketch if you're going to demand perfect results.
More detailed bounds regarding [epsilon] and [delta] can be found
in the relevant estimation functions.
@@ -49,36 +46,51 @@ val multiply_shift : m:int -> a:int -> x:int -> int
the hash functions.
@raise Invalid_argument if [epsilon <= 0.0], [delta >= 1.0] or
- [delta <= 0.0] *)
+ [delta <= 0.0]. *)
val make : epsilon:float -> delta:float -> sketch
+(** Creates a count-min sketch, manually specifying the dimensions of
+ the backing matrix. The relationship between [depth]/[width] and
+ [epsilon]/[delta] is as follows: [width = ceil(e / epsilon)] and
+ [depth = ceil(ln(1/delta))] (where [e] denotes Euler's number).
+ You can determine the effective [epsilon] and [delta] using the
+ respective functions.
+
+ @raise Invalid_argument if [depth <= 0] or [width <= 0]. *)
+val make_int : depth:int -> width:int -> sketch
+
(** Returns the true error factor for a sketch. *)
val epsilon : sketch -> float
(** Returns the true error probability for a sketch. *)
val delta : sketch -> float
-(** O(). Updates a sketch adding [c] to the field [ix]. *)
+(** [O(log(1/delta))]. Updates a sketch adding [c] to the field [ix]. *)
val update : sketch -> ix:int -> c:int -> unit
-(** Estimates the count of the field [ix] if all the actual counts
- are non-negative. Use [nquery] if actual counts may be negative.
+(** [O(log(1/delta))]. Estimates the count of the field [ix] if all the
+ actual counts are non-negative. Use [nquery] if actual counts may be
+ negative.
- This estimate is never
- less than the true value and, with probability of at least
- [1 - delta], the overestimation is no greater than [epsilon * |a|1],
- where [|a|1] denotes the L1 (taxicab) norm on the actual vector
- [a] (i.e. the sum of all updates done to all keys in the sketch.) *)
+ This estimate is never less than the true value and, with
+ probability of at least [1 - delta], the overestimation is no
+ greater than [epsilon * |a|1], where [|a|1] denotes the L1 (taxicab)
+ norm on the actual vector [a] (i.e. the sum of all updates done to
+ all keys in the sketch.) *)
val query : sketch -> ix:int -> int
-(** Estimates the count of the field [ix]. This works in both negative
- and non-negative actual counts, but is not as accurate as [query]
- in the non-negative case.
+(** [~O(log(1/delta))]. Estimates the count of the
+ field [ix]. This works in both negative and non-negative actual
+ counts, but is not as accurate as [query] in the non-negative case.
With probability of at least [1 - delta^(1/4)], the estimate falls
- within [3 epsilon * |a|1] of the true value. *)
+ within [3 epsilon * |a|1] of the true value.
+
+ The true asymptotic time complexity of this particular
+ implementation is actually [O(log(1/delta) * log(log(1/delta)))]
+ due to the use of an [O(n * log(n))] median algorithm. *)
val nquery : sketch -> ix:int -> int
-(** Estimates the inner product of the two sketches, that is,
+(** Estimates the dot product of the two sketches, that is,
[sum_i(a.(i) * b.(i))]. *)
-(**val inner_product_query : sketch -> sketch -> int*)
+val dot_product_query : sketch -> sketch -> int
View
185 myocamlbuild.ml
@@ -0,0 +1,185 @@
+open Ocamlbuild_plugin
+open Command (* no longer needed for OCaml >= 3.10.2 *)
+
+(**
+Overview of tags:
+- [pkg_batteries] to use Batteries as a library, without syntax extensions
+- [use_batteries] and [use_batteries_r] to use both Batteries and all the non-destructive syntax extensions
+- [pkg_sexplib.syntax] with [syntax_camlp4o] or [syntax_camlp4r] for sexplib
+*)
+
+
+(**
+{1 OCamlFind}
+*)
+
+let run_and_read = Ocamlbuild_pack.My_unix.run_and_read
+
+let blank_sep_strings = Ocamlbuild_pack.Lexers.blank_sep_strings
+
+module OCamlFind =
+struct
+ (* Lists every package known to ocamlfind: the first column of [ocamlfind list]. *)
+ let find_packages () =
+ blank_sep_strings &
+ Lexing.from_string &
+ run_and_read "ocamlfind list | cut -d' ' -f1"
+
+ (* Supposed to list the available syntaxes; hard-coded because no way to query them is known. *)
+ let find_syntaxes () = ["camlp4o"; "camlp4r"]
+
+ (* Prefixes the command specification [x] with the [ocamlfind] driver. *)
+ let ocamlfind x = S[A"ocamlfind"; x]
+
+ let before_options () =
+ (* Using Before_options lets command-line options take priority over these settings; *)
+ (* using After_options instead would guarantee these settings the higher priority. *)
+
+ (* Override the default commands with their ocamlfind equivalents. *)
+ Options.ocamlc := ocamlfind & A"ocamlc";
+ Options.ocamlopt := ocamlfind & A"ocamlopt";
+ Options.ocamldep := ocamlfind & A"ocamldep";
+ Options.ocamldoc := ocamlfind & A"ocamldoc";
+ Options.ocamlmktop := ocamlfind & A"ocamlmktop"
+
+ let get_ocamldoc_directory () = (* output of [ocamlfind ocamldoc -customdir], minus at most one trailing newline or carriage return *)
+ let ocamldoc_directory = run_and_read "ocamlfind ocamldoc -customdir" in
+ let length = String.length ocamldoc_directory in
+ assert (length != 0);
+ let char = ocamldoc_directory.[length - 1] in
+if (char = '\n') || (char = '\r') then String.sub ocamldoc_directory 0 (length - 1)
+else ocamldoc_directory
+
+ let after_rules () =
+ (* When linking an OCaml program through ocamlfind, -linkpkg should be passed. *)
+ flag ["ocaml"; "byte"; "link"; "program"] & A"-linkpkg";
+ flag ["ocaml"; "native"; "link"; "program"] & A"-linkpkg";
+
+
+ (* For each ocamlfind package, inject the -package option when
+* compiling, computing dependencies, generating documentation and
+* linking. *)
+ List.iter begin fun pkg ->
+ flag ["ocaml"; "compile"; "pkg_"^pkg] & S[A"-package"; A pkg];
+ flag ["ocaml"; "ocamldep"; "pkg_"^pkg] & S[A"-package"; A pkg];
+ flag ["ocaml"; "doc"; "pkg_"^pkg] & S[A"-package"; A pkg];
+ flag ["ocaml"; "link"; "pkg_"^pkg] & S[A"-package"; A pkg];
+ end (find_packages ());
+
+ (* Like -package, but for syntax extensions. Moreover, -syntax is useless
+* when linking. *)
+ List.iter begin fun syntax ->
+ flag ["ocaml"; "compile"; "syntax_"^syntax] & S[A"-syntax"; A syntax];
+ flag ["ocaml"; "ocamldep"; "syntax_"^syntax] & S[A"-syntax"; A syntax];
+ flag ["ocaml"; "doc"; "syntax_"^syntax] & S[A"-syntax"; A syntax];
+ end (find_syntaxes ());
+
+ (* The default "thread" tag is not compatible with ocamlfind.
+Indeed, the default rules add the "threads.cma" or "threads.cmxa"
+options when using this tag. When using the "-linkpkg" option with
+ocamlfind, this module will then be added twice on the command line.
+
+To solve this, one approach is to add the "-thread" option when using
+the "threads" package using the previous plugin.
+*)
+ flag ["ocaml"; "pkg_threads"; "compile"] (S[A "-thread"]);
+ flag ["ocaml"; "pkg_threads"; "link"] (S[A "-thread"]);
+end
+
+(**
+{1 OCaml Batteries Included}
+*)
+
+module Batteries =
+struct
+ let before_options () = ()
+
+ let after_rules () =
+ flag ["ocaml"; "link"; "byte"; "use_ocamldoc_info"] (S[A "-I"; A "+ocamldoc"; A "odoc_info.cma"]);
+ flag ["ocaml"; "link"; "native"; "use_ocamldoc_info"] (S[A "-I"; A "+ocamldoc"(*; A "odoc_info.cmxa"*)]);
+ flag ["ocaml"; "docfile"; "use_ocamldoc_info"] (S[A "-I"; A "+ocamldoc"]);
+ flag ["ocaml"; "docdir"; "use_ocamldoc_info"] (S[A "-I"; A "+ocamldoc"]);
+ flag ["ocaml"; "doc"; "use_ocamldoc_info"] (S[A "-I"; A "+ocamldoc"]);
+
+ (* Command-line fragments shared by the [use_boilerplate*] and [use_batteries*] tags. *)
+
+ let cl_use_boilerplate = [A"-package"; A "batteries.pa_type_conv.syntax,batteries,sexplib.syntax"]
+ and cl_use_batteries = [A"-package"; A "batteries.pa_openin.syntax,batteries.pa_where.syntax,batteries.pa_batteries.syntax"; A "-package"; A "batteries"]
+ and cl_use_batteries_o = []
+ (*[cl_use_batteries_o]: extensions which only make sense in original syntax*)
+ and cl_camlp4o = [A"-syntax"; A "camlp4o"]
+ and cl_camlp4r = [A"-syntax"; A "camlp4r"] in
+
+ let cl_boilerplate_original = cl_use_boilerplate @ cl_camlp4o
+ and cl_boilerplate_revised = cl_use_boilerplate @ cl_camlp4r
+ and cl_batteries_original = cl_use_batteries @ cl_use_batteries_o @ cl_camlp4o
+ and cl_batteries_revised = cl_use_batteries @ cl_camlp4r in
+
+ (** Tag [use_boilerplate] provides boilerplate syntax extensions,
+in original syntax*)
+
+ flag ["ocaml"; "compile"; "use_boilerplate"] & S cl_boilerplate_original ;
+ flag ["ocaml"; "ocamldep"; "use_boilerplate"] & S cl_boilerplate_original ;
+ flag ["ocaml"; "doc"; "use_boilerplate"] & S cl_boilerplate_original ;
+ flag ["ocaml"; "link"; "use_boilerplate"] & S cl_boilerplate_original ;
+
+ (** Tag [use_boilerplate_r] provides boilerplate syntax extensions,
+in revised syntax*)
+
+ flag ["ocaml"; "compile"; "use_boilerplate_r"] & S cl_boilerplate_revised ;
+ flag ["ocaml"; "ocamldep"; "use_boilerplate_r"] & S cl_boilerplate_revised ;
+ flag ["ocaml"; "doc"; "use_boilerplate_r"] & S cl_boilerplate_revised ;
+ flag ["ocaml"; "link"; "use_boilerplate_r"] & S cl_boilerplate_revised ;
+
+ (** Tag [use_batteries] provides both package [batteries]
+and all syntax extensions, in original syntax. *)
+
+ flag ["ocaml"; "compile"; "use_batteries"] & S cl_batteries_original ;
+ flag ["ocaml"; "ocamldep"; "use_batteries"] & S cl_batteries_original ;
+ flag ["ocaml"; "doc"; "use_batteries"] & S cl_batteries_original ;
+ flag ["ocaml"; "link"; "use_batteries"] & S cl_batteries_original ;
+
+ (** Tag [use_batteries_r] provides both package [batteries]
+and all syntax extensions, in revised syntax. *)
+
+ flag ["ocaml"; "compile"; "use_batteries_r"] & S cl_batteries_revised;
+ flag ["ocaml"; "ocamldep"; "use_batteries_r"] & S cl_batteries_revised;
+ flag ["ocaml"; "doc"; "use_batteries_r"] & S cl_batteries_revised;
+ flag ["ocaml"; "link"; "use_batteries_r"] & S cl_batteries_revised
+
+
+(* flag ["ocaml"; "compile"; "use_batteries"] & S[A "-verbose";
+A"-package"; A "batteries.syntax.full";
+A"-syntax"; A "batteries.syntax.full"];
+flag ["ocaml"; "ocamldep"; "use_batteries"] & S[A "-verbose";
+A"-package"; A "batteries.syntax.full";
+A"-syntax"; A "batteries.syntax.full"];
+flag ["ocaml"; "doc"; "use_batteries"] & S[A "-verbose";
+A"-package"; A "batteries.syntax.full";
+A"-syntax"; A "batteries.syntax.full"];
+flag ["ocaml"; "link"; "use_batteries"] & S[A "-verbose";
+A"-package"; A "batteries.syntax.full";
+A"-syntax"; A "batteries.syntax.full"];*)
+
+
+end
+
+(* Register the plugin hooks: in both handled phases the OCamlFind setup
+   runs first, then the Batteries setup; every other hook is a no-op. *)
+let () =
+  dispatch (fun hook ->
+    match hook with
+    | Before_options ->
+      OCamlFind.before_options ();
+      Batteries.before_options ()
+    | After_rules ->
+      OCamlFind.after_rules ();
+      Batteries.after_rules ()
+    | _ -> ())
+
+
+
+(**
+which ocamlrun -> header
+
+print_backtrace -> add "-b" after the header
+**)

0 comments on commit 90fc039

Please sign in to comment.