Updated convolve, added gradient1d, and updated rolling to include va…

…riable windows.
cnuernber · May 27, 2021 · 84f51fa · 84f51fa
1 parent 0c590a6
commit 84f51fa
Show file tree

Hide file tree

Showing 11 changed files with 372 additions and 81 deletions.
diff --git a/java/tech/v3/datatype/Convolve1D.java b/java/tech/v3/datatype/Convolve1D.java
@@ -13,24 +13,34 @@ public static enum Mode
     Same,
     Valid,
   };
+  public static enum EdgeMode {
+    Clamp,
+    Zero,
+  }
   //Precondition is that ndata is <= nwindow
-  public static double[] convolve
+  //The difference between convolve and correlate is convolve reverses
+  //the window.  Callers can reverse the window before passing it in.
+  //The reason this works with double-arrays is because it is an inherently
+  //n^2 operation so we want to minimize indexing costs.
+  public static double[] correlate
     (BiFunction<Long,BiFunction<Long,Long,Object>,Object> parallelizer,
-     Buffer data,
-     Buffer rev_window,
+     double[] data,
+     double[] window,
+     int stepsize,
      DoubleUnaryOperator in_finalize,
-     Mode mode) {
+     Mode mode,
+     EdgeMode edgeMode) {
 
-    final int nData = data.size();
-    final int nWin = rev_window.size();
+    final int nData = data.length;
+    final int nWin = window.length;
     if (nData < nWin)
       throw new RuntimeException("Data size must be <= window size");
     final int nWinDec = nWin - 1;
     final int nResult;
     switch (mode) {
-    case Full: nResult = nData + nWinDec; break;
-    case Same: nResult = nData; break;
-    case Valid: nResult = nData - nWin + 1; break;
+    case Full: nResult = (nData + nWinDec)/stepsize; break;
+    case Same: nResult = nData/stepsize; break;
+    case Valid: nResult = (nData - nWin + 1)/stepsize; break;
     default: throw new RuntimeException("Unrecognized mode.");
     }
 
@@ -40,31 +50,46 @@ public static enum Mode
 	};
     }
     final DoubleUnaryOperator finalize = in_finalize;
-    double[] window = new double[nWin];
-    for (int idx = 0; idx < nWin; ++idx) {
-      window[idx] = rev_window.readDouble(nWinDec - idx);
+
+
+    final int windowOffset;
+    switch(mode) {
+    case Full: windowOffset = nWinDec; break;
+    case Same: windowOffset = (nWinDec / 2); break;
+    case Valid: windowOffset = 0; break;
+    default: throw new RuntimeException("Unrecognized mode.");
     }
+
+
+    final double left_edge;
+    final double right_edge;
+    switch(edgeMode) {
+    case Clamp: left_edge=data[0]; right_edge=data[nData-1]; break;
+    case Zero: left_edge=0.0; right_edge=0.0; break;
+    default: throw new RuntimeException("Unrecognized edge mode");
+    }
+
+
     double[] retval = new double[nResult];
     parallelizer.apply( new Long(nResult), new BiFunction<Long,Long,Object> () {
 	public Object apply(Long sidx, Long glen) {
 	  int start_idx = RT.intCast(sidx);
 	  int group_len = RT.intCast(glen);
 	  int end_idx = start_idx + group_len;
-	  final int windowOffset;
-	  switch(mode) {
-	  case Full: windowOffset = nWinDec; break;
-	  case Same: windowOffset = (nWinDec / 2); break;
-	  case Valid: windowOffset = 0; break;
-	  default: throw new RuntimeException("Unrecognized mode.");
-	  }
+
+	  //This loop is an interesting case for a blog post on the vectorization API.
 	  for(int idx = start_idx; idx < end_idx; ++idx ) {
-	    int data_start_idx = idx - windowOffset;
+	    int data_start_idx = (idx*stepsize) - windowOffset;
 	    double sum = 0.0;
 	    for (int win_idx = 0; win_idx < nWin; ++win_idx) {
 	      int data_idx = data_start_idx + win_idx;
-	      double data_val = 0.0;
-	      if (data_idx < nData && data_idx >= 0)
-		data_val = data.readDouble(data_idx);
+	      double data_val;
+	      if (data_idx < 0)
+		data_val = left_edge;
+	      else if (data_idx >= nData)
+		data_val = right_edge;
+	      else
+		data_val = data[data_idx];
 	      sum += data_val * window[win_idx];
 	    }
 	    retval[idx] = finalize.applyAsDouble(sum);

diff --git a/project.clj b/project.clj
@@ -43,6 +43,7 @@
                                    tech.v3.datatype.datetime
                                    tech.v3.datatype.mmap
                                    tech.v3.datatype.convolve
+                                   tech.v3.datatype.gradient
                                    tech.v3.datatype.mmap-writer
                                    tech.v3.datatype.native-buffer
                                    tech.v3.datatype.sampling

diff --git a/src/tech/v3/datatype/convolve.clj b/src/tech/v3/datatype/convolve.clj
@@ -3,82 +3,121 @@
   convolutions are supported."
   (:require [tech.v3.datatype.base :as dt-base]
             [tech.v3.datatype.array-buffer :as array-buffer]
+            [tech.v3.datatype.copy-make-container :as dt-cmc]
             [tech.v3.parallel.for :as pfor])
-  (:import [tech.v3.datatype Convolve1D Convolve1D$Mode ArrayHelpers]
+  (:import [tech.v3.datatype Convolve1D Convolve1D$Mode ArrayHelpers DoubleReader
+            Convolve1D$EdgeMode]
            [java.util.function BiFunction]
            [org.apache.commons.math3.distribution NormalDistribution]))
 
 
-(defn convolve1d
+(defn correlate1d
   "Convolve window over data.  Window must be smaller than data.
   Returns result in `:float64` space.  Convolutions outside the valid
   space of data are supplied zeros to match numpy conventions.
 
   Options:
 
   * `:mode` - One of :full, :same, or :valid.
+  * `:edge-mode` - One of :zero or :clamp.  Clamp clamps the edge value to the start
+    or end respectively. Defaults to :zero.
   * `:finalizer` - A java.util.function.DoubleUnaryOperator that performs
   one more operation on the data after summation but before assigned to
   result.
   * `:force-serial` - For serial execution of the convolution.  Unnecessary except
   for profiling and comparison purposes.
+  * `:stepsize` - Defaults to 1, this steps across the input data in stepsize
+    increments.
+```"
+  ([data win {:keys [mode finalizer force-serial edge-mode stepsize]
+              :or {mode :full
+                   edge-mode :zero
+                   stepsize 1}}]
+   (let [data (dt-cmc/->double-array data)
+         win (dt-cmc/->double-array win)]
+     (-> (Convolve1D/correlate
+          (reify BiFunction
+            (apply [this n-elems applier]
+              (if force-serial
+                (.apply ^BiFunction applier 0 n-elems)
+                (pfor/indexed-map-reduce
+                 n-elems
+                 (fn [start-idx group-len]
+                   (.apply ^BiFunction applier start-idx group-len))
+                 dorun))))
+          data
+          win
+          (int stepsize)
+          finalizer
+          (case mode
+            :full Convolve1D$Mode/Full
+            :same Convolve1D$Mode/Same
+            :valid Convolve1D$Mode/Valid)
+          (case edge-mode
+            :zero Convolve1D$EdgeMode/Zero
+            :clamp Convolve1D$EdgeMode/Clamp))
+         (array-buffer/array-buffer))))
+  ([data win]
+   (correlate1d data win nil)))
 
-  Example:
+
+(defn convolve1d
+  "Convolve a window across a signal.  The only difference from correlate is the
+  window is reversed and then correlate is called.  See options for correlate, this
+  has the same defaults.
+
+  Examples:
 
 ```clojure
-user> (require '[tech.v3.datatype.convolve :as conv])
+user> (require '[tech.v3.datatype.convolve :as dt-conv])
 nil
-user> (conv/convolve1d [1, 2, 3], [0, 1, 0.5])
+user> (dt-conv/convolve1d [1, 2, 3], [0, 1, 0.5])
 #array-buffer<float64>[5]
 [0.000, 1.000, 2.500, 4.000, 1.500]
-user> (conv/convolve1d [1, 2, 3], [0, 1, 0.5] {:mode :same})
+user> (dt-conv/convolve1d [1, 2, 3], [0, 1, 0.5] {:mode :same})
 #array-buffer<float64>[3]
 [1.000, 2.500, 4.000]
-user> (conv/convolve1d [1, 2, 3], [0, 1, 0.5] {:mode :valid})
+user> (dt-conv/convolve1d [1, 2, 3], [0, 1, 0.5] {:mode :valid})
 #array-buffer<float64>[1]
 [2.500]
 ```"
-  ([data win {:keys [mode finalizer force-serial]
-              :or {mode :full}}]
-   (-> (Convolve1D/convolve
-        (reify BiFunction
-          (apply [this n-elems applier]
-            (if force-serial
-              (.apply ^BiFunction applier 0 n-elems)
-              (pfor/indexed-map-reduce
-               n-elems
-               (fn [start-idx group-len]
-                 (.apply ^BiFunction applier start-idx group-len))
-               dorun))))
-        (dt-base/->buffer data)
-        (dt-base/->buffer win)
-        finalizer
-        (case mode
-          :full Convolve1D$Mode/Full
-          :same Convolve1D$Mode/Same
-          :valid Convolve1D$Mode/Valid))
-       (array-buffer/array-buffer)))
+  ([data win options]
+   (let [win (dt-base/->buffer win)
+         n-win (.lsize win)
+         n-win-dec (dec n-win)]
+     (correlate1d data
+                  (reify DoubleReader
+                    (lsize [this] n-win)
+                    (readDouble [this idx]
+                      (.readDouble win (- n-win-dec idx))))
+                  options)))
   ([data win]
    (convolve1d data win nil)))
 
 
 (defn gaussian1d
   "Perform a 1d gaussian convolution.  Options are passed to convolve1d - but the
   default mode is `:same` as opposed to `:full`.  Values are drawn from a 0 centered
-  normal distribution with stddev of 1 on the range of -range,range and with
+  normal distribution with stddev of 1 on the range of -range*sigma,range*sigma at
   window-len increments. Then the window vector is normalized to length 1 and convolved
-  over the data using convolve1d.  This means with a range of '2' you are drawing
+  over the data using correlate1d.  This means with a range of '2' you are drawing
   from a 0-centered normal distribution 2 standard distributions so the edges will have
   a pre-normalized value of around 0.05.
 
   Options:
 
-  * `:mode` - default so `:same`, see options for convolve1d.
+  * `:mode` - defaults to `:same`, see options for correlate1d.
+  * `:edge-mode` - defaults to `:clamp`, see options for correlate1d.
   * `:range` - Defaults to 2, values are drawn from (-range, range) with
-               window-len increments."
-  ([data window-len {:keys [range mode]
+    window-len increments.
+  * `:stepsize` - Defaults to 1.  Increasing this decreases the size of the
+    result by a factor of stepsize.  So you can efficiently downsample your data
+    by using a gaussian window and increasing stepsize to the desired downsample
+    amount."
+  ([data window-len {:keys [range mode edge-mode]
                      :or {range 2
-                          mode :same}
+                          mode :same
+                          edge-mode :clamp}
                      :as options}]
    (let [dist (NormalDistribution.)
          range (double range)
@@ -104,7 +143,7 @@ user> (conv/convolve1d [1, 2, 3], [0, 1, 0.5] {:mode :valid})
      ;;Normalize the vector
      (dotimes [idx window-len]
        (ArrayHelpers/accumMul window idx inv-len))
-     (convolve1d data window (assoc options :mode mode))))
+     (correlate1d data window (assoc options :mode mode :edge-mode edge-mode))))
   ([data window-len]
    (gaussian1d data window-len nil)))
 
@@ -114,5 +153,6 @@ user> (conv/convolve1d [1, 2, 3], [0, 1, 0.5] {:mode :valid})
   (convolve1d [1, 2, 3], [0, 1, 0.5])
   (convolve1d [1, 2, 3], [0, 1, 0.5] {:mode :same})
   (convolve1d [1, 2, 3], [0, 1, 0.5] {:mode :valid})
-
+  (convolve1d (range 100), [0, 1, 0.5] {:edge-mode :zero
+                                        :stepsize 2})
   )
diff --git a/src/tech/v3/datatype/datetime.clj b/src/tech/v3/datatype/datetime.clj
@@ -106,4 +106,4 @@ A general outline is:
                 datetime->epoch
                 epoch->datetime
                 between
-                variable-rolling-window)
+                variable-rolling-window-indexes)
diff --git a/src/tech/v3/datatype/datetime/operations.clj b/src/tech/v3/datatype/datetime/operations.clj
@@ -5,6 +5,7 @@
             [tech.v3.datatype.packing :as packing]
             [tech.v3.datatype.datetime.packing :as dt-packing]
             [tech.v3.datatype.datetime.base :as dt-base]
+            [tech.v3.datatype.rolling :as dt-rolling]
             [tech.v3.datatype.functional :as dfn]
             [tech.v3.datatype.binary-op :as bin-op]
             [tech.v3.datatype.argops :as argops]
@@ -571,20 +572,18 @@
       retval)))
 
 
-(defn variable-rolling-window
+(defn variable-rolling-window-indexes
   "Given a reader of monotonically increasing source datetime data, return an
   iterable of ranges that describe the windows in index space.  There will be one
   window per source input and windows are applied in int64 microsecond space.  Be
-  aware that windows near the end cannot possibly satisfy the windowing requirement."
+  aware that windows near the end cannot possibly satisfy the windowing requirement.
+
+  See options for tech.v3.datatype.rolling/variable-rolling-window-indexes for further
+  options, of which stepsize may be of interest."
   ([src-data window-length units options]
    (let [^BinaryOperator tweener (between-op (dtype-base/elemwise-datatype src-data)
-                                             units)
-         src-data (dtype-base/->buffer src-data)
-         n-windows (long (:n-windows options (.lsize src-data)))]
-     (reify Iterable
-       (iterator [this]
-         (ObjectRollingIterator.  0 1 (long window-length)
-                                  n-windows (.lsize src-data)
-                                  tweener src-data)))))
+                                             units)]
+     (dt-rolling/variable-rolling-window-indexes
+      src-data window-length (merge {:comp-fn tweener} options))))
   ([src-data window-length units]
-   (variable-rolling-window src-data window-length units nil)))
+   (variable-rolling-window-indexes src-data window-length units nil)))