Merge pull request #143 from alanvgreen/avg11

Avg11
google · Mar 9, 2021 · 541d47f · 541d47f
2 parents 7cf2de8 + 7247443
commit 541d47f
Show file tree

Hide file tree

Showing 11 changed files with 602 additions and 52 deletions.
diff --git a/common/src/menu.c b/common/src/menu.c
@@ -69,9 +69,9 @@ void menu_run(struct Menu *menu)
             }
             else
             {
-                puts("\n");
+                printf("\nRunning %s\n", item->description );
                 item->fn();
-                puts("\n---");
+                puts("---");
             }
         }
     }

diff --git a/proj/mnv2_first/Makefile b/proj/mnv2_first/Makefile
@@ -17,7 +17,7 @@
 export DEFINES :=
 
 # Uncomment this line to use software defined CFU functions in software_cfu.cc
-#DEFINES += CFU_SOFTWARE_DEFINED
+DEFINES += CFU_SOFTWARE_DEFINED
 
 # Uncomment this line to skip debug code (large effect on performance)
 DEFINES += NDEBUG

diff --git a/proj/mnv2_first/gateware/mnv2_cfu.py b/proj/mnv2_first/gateware/mnv2_cfu.py
@@ -13,33 +13,75 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from nmigen_cfu import InstructionBase, Cfu
+from .post_process import SRDHMInstruction, RoundingDividebyPOTInstruction
+from .param_store import ParamStoreSetter
+from nmigen_cfu import Cfu
+from .getset import GetSetInstruction, RegisterSetter
 
+OUTPUT_CHANNEL_PARAM_DEPTH = 512
 
-class TemplateInstruction(InstructionBase):
-    """Template instruction
-    """
+
+class Mnv2RegisterInstruction(GetSetInstruction):
+    def __init__(self):
+        self.set_input_depth = RegisterSetter()
+        self.set_output_depth = RegisterSetter()
+        self.set_input_offset = RegisterSetter()
+        self.set_output_offset = RegisterSetter()
+        self.set_activation_min = RegisterSetter()
+        self.set_activation_max = RegisterSetter()
+        self.set_output_batch_size = RegisterSetter()
+        self.store_output_mutiplier = ParamStoreSetter(
+            width=32, depth=OUTPUT_CHANNEL_PARAM_DEPTH)
+        self.store_output_shift = ParamStoreSetter(
+            width=32, depth=OUTPUT_CHANNEL_PARAM_DEPTH)
+        self.store_output_bias = ParamStoreSetter(
+            width=32, depth=OUTPUT_CHANNEL_PARAM_DEPTH)
+        xetters = {
+            10: self.set_input_depth,
+            11: self.set_output_depth,
+            12: self.set_input_offset,
+            13: self.set_output_offset,
+            14: self.set_activation_min,
+            15: self.set_activation_max,
+            20: self.set_output_batch_size,
+            21: self.store_output_mutiplier,
+            22: self.store_output_shift,
+            23: self.store_output_bias,
+        }
+        super().__init__(xetters)
+
+    def register_xetters(self, m):
+        m.submodules['set_input_depth'] = self.set_input_depth
+        m.submodules['set_output_depth'] = self.set_output_depth
+        m.submodules['set_input_offset'] = self.set_input_offset
+        m.submodules['set_output_offset'] = self.set_output_offset
+        m.submodules['set_activation_min'] = self.set_activation_min
+        m.submodules['set_activation_max'] = self.set_activation_max
+        m.submodules['set_output_batch_size'] = self.set_output_batch_size
+        m.submodules['store_output_mutiplier'] = self.store_output_mutiplier
+        m.submodules['store_output_shift'] = self.store_output_shift
+        m.submodules['store_output_bias'] = self.store_output_bias
 
     def elab(self, m):
-        with m.If(self.start):
-            m.d.sync += self.output.eq(self.in0 + self.in1)
-            m.d.sync += self.done.eq(1)
-        with m.Else():
-            m.d.sync += self.done.eq(0)
+        self.register_xetters(m)
 
 
 class Mnv2Cfu(Cfu):
+    """Simple CFU for Mnv2.
+
+    Most functionality is provided through a single set of registers.
+    """
+
     def __init__(self):
         super().__init__({
-            0: TemplateInstruction(),
+            0: Mnv2RegisterInstruction(),
+            6: RoundingDividebyPOTInstruction(),
+            7: SRDHMInstruction(),
         })
 
     def elab(self, m):
         super().elab(m)
 
 
 def make_cfu():
-    return Cfu({
-        # Add instructions here...
-        0: TemplateInstruction(),
-    })
+    return Mnv2Cfu()
diff --git a/proj/mnv2_first/gateware/post_process.py b/proj/mnv2_first/gateware/post_process.py
@@ -0,0 +1,114 @@
+#!/bin/env python
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cfu import InstructionBase
+from nmigen_cfu import Mux, Signal, signed
+
+from nmigen_cfu import SimpleElaboratable
+
+
+INT32_MIN = 0x8000_0000
+INT32_MAX = 0x7fff_ffff
+
+
+class SRDHM(SimpleElaboratable):
+    """Implements gemmlowp::SaturatingRoundingDoublingHighMul
+
+    It multiplies two 32 bit numbers, then returns bits 62 to 31 of the
+    64 bit result. This is 2x the high word (allowing for saturating and
+    rounding).
+
+    Implemented as a pipeline so that results are always available 3
+    cycles after setting inputs.
+
+    Note that there is a bug to be investigated here. This implementation
+    matches the behavior of the compiled source; however, "nudge" may be
+    one of two values.
+
+    Public Interface
+    ----------------
+      a: Signal(signed(32)) input
+        First operand
+      b: Signal(signed(32)) input
+        Second operand
+      result: Signal(signed(32)) output
+        The saturating rounding doubling high multiply of a and b
+    """
+
+    def __init__(self):
+        self.a = Signal(signed(32))
+        self.b = Signal(signed(32))
+        self.result = Signal(signed(32))
+
+    def elab(self, m):
+        areg = Signal.like(self.a)
+        breg = Signal.like(self.b)
+        ab = Signal(signed(64))
+        overflow = Signal()
+
+        # for some reason negative nudge is not used
+        nudge = 1 << 30
+
+        # cycle 0, register a and b
+        m.d.sync += [
+            areg.eq(self.a),
+            breg.eq(self.b),
+        ]
+        # cycle 1, decide if this is an overflow and multiply
+        m.d.sync += [
+            overflow.eq((areg == INT32_MIN) & (breg == INT32_MIN)),
+            ab.eq(areg * breg),
+        ]
+        # cycle 2, apply nudge and determine result
+        m.d.sync += [
+            self.result.eq(Mux(overflow, INT32_MAX, (ab + nudge)[31:])),
+        ]
+
+
+class SRDHMInstruction(InstructionBase):
+    def elab(self, m):
+        m.submodules['srdhm'] = srdhm = SRDHM()
+        countdown = Signal(signed(3))
+        m.d.comb += self.done.eq(countdown == 0)
+
+        m.d.comb += [
+            srdhm.a.eq(self.in0),
+            srdhm.b.eq(self.in1),
+            self.output.eq(srdhm.result),
+        ]
+        with m.If(self.start):
+            m.d.sync += countdown.eq(2)
+        with m.Else():
+            m.d.sync += countdown.eq(Mux(countdown != -1, countdown - 1, -1))
+
+
+def rounding_divide_by_pot(x, exponent):
+    """Implements gemmlowp::RoundingDivideByPOT
+
+    This divides by a power of two, rounding to the nearest whole number.
+    """
+    mask = (1 << exponent) - 1
+    remainder = x & mask
+    threshold = (mask >> 1) + x[31]
+    rounding = Mux(remainder > threshold, 1, 0)
+    return (x >> exponent) + rounding
+
+
+class RoundingDividebyPOTInstruction(InstructionBase):
+    def elab(self, m):
+        m.d.comb += [
+            self.output.eq(rounding_divide_by_pot(self.in0s, self.in1[:5])),
+            self.done.eq(1),
+        ]
diff --git a/proj/mnv2_first/src/cpp_math.cc b/proj/mnv2_first/src/cpp_math.cc
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2021 The CFU-Playground Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cpp_math.h"
+
+#include "cfu.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+
+int32_t cpp_math_mul_by_quantized_mul_software(int32_t x,
+                                               int32_t quantized_multiplier,
+                                               int shift) {
+  return tflite::MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
+}
+
+// Expanded source of tflite::MultiplyByQuantizedMultiplier
+int32_t cpp_math_mul_by_quantized_mul_gateware1(int32_t x,
+                                                int32_t quantized_multiplier,
+                                                int shift) {
+  using gemmlowp::RoundingDivideByPOT;
+  int left_shift = shift > 0 ? shift : 0;
+  int right_shift = shift > 0 ? 0 : -shift;
+  int64_t left_shifted = x * (1 << left_shift);
+  int32_t srdhm = cfu_op7_hw(0, left_shifted, quantized_multiplier);
+  return RoundingDivideByPOT(srdhm, right_shift);
+}
+
+// Expanded source of tflite::MultiplyByQuantizedMultiplier
+int32_t cpp_math_mul_by_quantized_mul_gateware2(int32_t x,
+                                                int32_t quantized_multiplier,
+                                                int shift) {
+  int left_shift = shift > 0 ? shift : 0;
+  int right_shift = shift > 0 ? 0 : -shift;
+  int32_t left_shifted = x << left_shift;
+  int32_t srdhm = cfu_op7_hw(0, left_shifted, quantized_multiplier);
+  return cfu_op6_hw(0, srdhm, right_shift);
+}
+
+
+int32_t cpp_math_srdhm_software(int32_t a, int32_t b) {
+  return gemmlowp::SaturatingRoundingDoublingHighMul(a, b);
+}
+
+
+int32_t cpp_math_rdbpot_software(int32_t value, int shift) {
+  return gemmlowp::RoundingDivideByPOT(value, shift);
+}
diff --git a/proj/mnv2_first/src/cpp_math.h b/proj/mnv2_first/src/cpp_math.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2021 The CFU-Playground Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef _CPP_MATH_H
+#define _CPP_MATH_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// C++ (and other) math functions for C
+
+int32_t cpp_math_mul_by_quantized_mul_software(int32_t x, int32_t quantized_multiplier,
+                             int shift);
+
+int32_t cpp_math_mul_by_quantized_mul_gateware1(int32_t x, int32_t quantized_multiplier,
+                             int shift);
+
+int32_t cpp_math_mul_by_quantized_mul_gateware2(int32_t x, int32_t quantized_multiplier,
+                             int shift);
+
+int32_t cpp_math_srdhm_software(int32_t a, int32_t b);
+
+// Rounding divide by power of two
+int32_t cpp_math_rdbpot_software(int32_t value, int shift);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // _CPP_MATH_H
diff --git a/proj/mnv2_first/src/golden_op_tests.cc b/proj/mnv2_first/src/golden_op_tests.cc
@@ -1329,14 +1329,18 @@ void golden_op_run_1x1conv(void) {
   puts("");
 
   // Check results are ok
-  bool ok = true;
+  int fails = 0;
   for (size_t i = 0; i < 2400; i++) {
     if (actual_output[i] != expected_output[i]) {
-      printf("FAIL - output tensor mismatch at %u\n", i);
-      ok = false;
+      if (!fails) {
+        printf("FAIL - first output tensor mismatch at %u\n", i);
+      }
+      fails++;
     }
   }
-  if (ok) {
+  if (fails) {
+    printf("FAIL - %d fails\n", fails);
+  } else {
     puts("OK - output tensor matches");
   }
 }