Browse files

reservoir sampling

  • Loading branch information...
1 parent 3c8721f commit f110dd4d4463d64e6ce23ceb3ce6516f557471cf @stuarthalloway stuarthalloway committed Feb 3, 2013
Showing with 24 additions and 0 deletions.
  1. +14 −0 src/main/clojure/clojure/data/generators.clj
  2. +10 −0 src/test/clojure/clojure/data/generators_test.clj
View
14 src/main/clojure/clojure/data/generators.clj
@@ -282,6 +282,20 @@ instance you can get a repeatable basis for tests."
;; we'll get the same shuffle, given the same *rnd*.
(fisher-yates coll))
+(defn reservoir-sample
+  "Reservoir sample ct items from coll, using *rnd*.
+
+  Returns a vector that starts as the first ct items of coll; each
+  later item of coll may then overwrite one of those slots. If coll
+  has fewer than ct items, the vector holds all of them in their
+  original order."
+  [ct coll]
+  (loop [result (transient (core/vec (take ct coll)))
+         n ct
+         coll (drop ct coll)]
+    (if (seq coll)
+      ;; n items have been consumed before (first coll), so it is item
+      ;; number n+1 and must be kept with probability ct/(n+1). The
+      ;; slot is therefore drawn from the n+1 values 0..n. The bound is
+      ;; (inc n) on the assumption that uniform excludes its upper
+      ;; bound -- NOTE(review): confirm against uniform's definition.
+      (let [pos (uniform 0 (inc n))]
+        (recur (if (< pos ct)
+                 (assoc! result pos (first coll))
+                 result)
+               (inc n)
+               (rest coll)))
+      (persistent! result))))
View
10 src/test/clojure/clojure/data/generators_test.clj
@@ -9,3 +9,13 @@
shuf (gen/shuffle coll)]
(is (= (into #{} coll)
(into #{} shuf))))))
+
+;; Sampling twice from the same collection with identically seeded
+;; generators must produce the same sample, and the sample must have
+;; the requested size.
+(deftest test-reservoir-sample-consistency
+  (dotimes [n 50]
+    ;; n doubles as the seed, so 50 different seeds are exercised.
+    (let [coll (range 100)
+          sample-1 (binding [gen/*rnd* (java.util.Random. n)]
+                     (gen/reservoir-sample 10 coll))
+          sample-2 (binding [gen/*rnd* (java.util.Random. n)]
+                     (gen/reservoir-sample 10 coll))]
+      (is (= 10 (count sample-1)))
+      (is (= sample-1 sample-2)))))

0 comments on commit f110dd4

Please sign in to comment.