Merge pull request #28 from alanvgreen/maccs

Maccs
google · Mar 16, 2021 · 977a650 · 977a650
2 parents 6b338ea + cacf49e
commit 977a650
Show file tree

Hide file tree

Showing 10 changed files with 214 additions and 38 deletions.
diff --git a/proj/mnv2_first/Makefile b/proj/mnv2_first/Makefile
@@ -37,7 +37,7 @@ RUN_MENU_ITEMS := 3 1
 
 # Uncomment to use smaller batches for debugging. This helps expose
 # bugs in the handling of multiple batches.
-DEFINES += USE_CONV_SMALL_BATCHES
+#DEFINES += USE_CONV_SMALL_BATCHES
 
 DEFINES += ACCEL_CONV
 

diff --git a/proj/mnv2_first/gateware/macc.py b/proj/mnv2_first/gateware/macc.py
@@ -0,0 +1,52 @@
+#!/bin/env python
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nmigen import Signal, signed
+
+from nmigen_cfu import SimpleElaboratable
+
+from .registerfile import Xetter
+
+
+class ExplicitMacc4(Xetter):
+    """A Macc4 that operates on in0 (input_vals) and in1 (filter_vals).
+
+    Public Interface
+    ----------------
+    input_offset: Signal(signed(9)) input
+        Offset to be added to all inputs.
+
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.input_offset = Signal(signed(9))  # 9 bits: the unit test drives +128, which signed(8) cannot hold
+
+    def elab(self, m):
+        muls = []  # one 32-bit product signal per byte lane
+        for n in range(4):
+            tmp = Signal(signed(9))  # offset-adjusted input; NOTE(review): signed(8) + signed(9) can need 10 bits - confirm the offset range
+            inval = self.in0.word_select(n, 8).as_signed()  # 8-bit word n of in0, reinterpreted as signed
+            fval = self.in1.word_select(n, 8).as_signed()  # 8-bit word n of in1, reinterpreted as signed
+            mul = Signal(signed(32))
+            m.d.comb += [
+                tmp.eq(inval + self.input_offset),
+                mul.eq(tmp * fval)
+            ]
+            muls.append(mul)
+        m.d.comb += [
+            self.output.eq(sum(muls)),  # output is the sum of the four lane products
+            self.done.eq(1),  # driven combinationally to 1, so done is always asserted
+        ]
diff --git a/proj/mnv2_first/gateware/mnv2_cfu.py b/proj/mnv2_first/gateware/mnv2_cfu.py
@@ -18,6 +18,7 @@
 from .post_process import PostProcessXetter, SRDHMInstruction, RoundingDividebyPOTInstruction
 from .store import CircularIncrementer, FilterValueGetter, InputStore, InputStoreGetter, InputStoreSetter, StoreSetter
 from .registerfile import RegisterFileInstruction, RegisterSetter
+from .macc import ExplicitMacc4
 
 OUTPUT_CHANNEL_PARAM_DEPTH = 512
 NUM_FILTER_DATA_WORDS = 512
@@ -101,10 +102,19 @@ def _make_input_store(self, m, name, restart_signal, input_depth):
             ins.r_finished.eq(read_finished),
         ]
 
+    def _make_explicit_macc_4(self, m, reg_num, name, input_offset):
+        """Constructs and registers an explicit macc4 instruction
+
+        """
+        xetter = ExplicitMacc4()
+        m.d.comb += xetter.input_offset.eq(input_offset)  # drive the macc's offset input from the supplied signal
+        m.submodules[name] = xetter  # attach as a submodule under the given name
+        self.register_xetter(reg_num, xetter)  # register it under reg_num
+
     def elab_xetters(self, m):
         input_depth, set_id = self._make_setter(m, 10, 'set_input_depth')
         self._make_setter(m, 11, 'set_output_depth')
-        self._make_setter(m, 12, 'set_input_offset')
+        input_offset, _ = self._make_setter(m, 12, 'set_input_offset')
         offset, _ = self._make_setter(m, 13, 'set_output_offset')
         activation_min, _ = self._make_setter(m, 14, 'set_activation_min')
         activation_max, _ = self._make_setter(m, 15, 'set_activation_max')
@@ -131,6 +141,9 @@ def elab_xetters(self, m):
 
         self._make_input_store(m, 'ins', set_id, input_depth)
 
+        # MACC 4
+        self._make_explicit_macc_4(m, 30, 'ex_m4', input_offset)
+
         m.d.comb += [
             ppx.offset.eq(offset),
             ppx.activation_min.eq(activation_min),

diff --git a/proj/mnv2_first/gateware/post_process.py b/proj/mnv2_first/gateware/post_process.py
@@ -27,16 +27,16 @@ class SRDHM(SimpleElaboratable):
     """Implements gemmlowp::SaturatingRoundingDoublingHighMul
 
     The function calculated is approximately ((a*b) >> 31) & 0xffffffff,
-    allowing for saturating and rounding. In other words,it multiplies 
-    two 32 bit numbers, then returns bits 62 to 31 of the 64 bit result. 
-    This is 2x the high word of a 64 bit multiplication (allowing for 
+    allowing for saturating and rounding. In other words, it multiplies
+    two 32 bit numbers, then returns bits 62 to 31 of the 64 bit result.
+    This is 2x the high word of a 64 bit multiplication (allowing for
     saturating and rounding).
 
     Implemented as a pipeline so that results are always available 3
     cycles after setting inputs.
 
     Note that there is a bug to be investigated here. This implementation
-    matches the behavior of the RISCV compiled source, however, it seems 
+    matches the behavior of the RISCV compiled source, however, it seems
     that "nudge" is only ever one of the two values in RISCV.
 
     Public Interface
@@ -226,7 +226,7 @@ def elab(self, m):
 class PostProcessXetter(Xetter):
     """Does post-processing of an accumulator value.
 
-    The output channel index is implied by processing order. This 
+    The output channel index is implied by processing order. This
     is mostly a wrapper around PostProcessor.
 
     Attributes

diff --git a/proj/mnv2_first/gateware/test_macc.py b/proj/mnv2_first/gateware/test_macc.py
@@ -0,0 +1,60 @@
+#!/bin/env python
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nmigen.sim import Delay
+
+from nmigen_cfu import TestBase
+
+from .macc import ExplicitMacc4
+
+
+def pack_vals(a, b, c, d):  # packs four values into one 32-bit word: a in bits 7..0, d in bits 31..24
+    return ((a & 0xff)  # & 0xff keeps the low 8 bits, so negative values become two's-complement bytes
+            + ((b & 0xff) << 8)
+            + ((c & 0xff) << 16)
+            + ((d & 0xff) << 24))
+
+
+class ExplicitMacc4Test(TestBase):  # simulation test that drives ExplicitMacc4 with a table of cases
+    def create_dut(self):
+        return ExplicitMacc4()
+
+    def test(self):
+        DATA = [  # each entry is ((input_offset, in0, in1), expected_output)
+            ((0, 0, 0), 0),
+            ((2, 1, 1), 3),  # lane 0: (1 + 2) * 1; the other lanes have a zero filter value
+            ((2, 1, 1), 3),  # NOTE(review): duplicate of the previous case - confirm it is intentional
+            ((128, pack_vals(-128, -128, -128, -128), pack_vals(1, 1, 1, 1)), 0),  # -128 + 128 == 0 in every lane
+            ((128, pack_vals(-128, -127, -126, -125),
+              pack_vals(10, 11, 12, 13)), 1 * 11 + 2 * 12 + 3 * 13),
+            ((128, pack_vals(127, 0, 0, 0),
+              pack_vals(10, 11, 12, 13)), 10 * 255 + 11 * 128 + 12 * 128 + 13 * 128),  # lane 0: 127 + 128 == 255
+        ]
+
+        def process():
+            for n, (inputs, expected) in enumerate(DATA):
+                input_offset, input_value, filter_value = inputs
+                yield self.dut.input_offset.eq(input_offset)
+                yield self.dut.in0.eq(input_value)
+                yield self.dut.in1.eq(filter_value)
+                yield self.dut.start.eq(1)
+                yield Delay(0.25)  # advance simulated time; presumably lets combinational outputs settle
+                yield self.dut.start.eq(0)
+                while not (yield self.dut.done):  # step until the DUT reports done
+                    yield
+                self.assertEqual(
+                    (yield self.dut.output.as_signed()), expected, f"case={n}")
+                yield
+        self.run_sim(process, True)  # NOTE(review): meaning of the second argument is not visible here - confirm True is intended
diff --git a/proj/mnv2_first/gateware/test_mnv2_cfu.py b/proj/mnv2_first/gateware/test_mnv2_cfu.py
@@ -51,11 +51,15 @@ def test(self):
             ((0, 110, 0, 0), 333),
             ((0, 110, 0, 0), 444),
             ((0, 110, 0, 0), 555),
+            # Set input offset to 5, then do a macc
+            ((0, 12, 5, 0), 0),
+            ((0, 30, 0x01020304, 0x02040608), 6 * 2 + 7 * 4 + 8 * 6 + 9 * 8),
+
         ]
         return self.run_ops(DATA, False)
 
     def test_input_store(self):
-        DATA = []
+        DATA= []
 
         def set_val(val):
             return ((0, 25, val, 0), 0)
@@ -69,12 +73,12 @@ def set_input_depth(val):
         def finish_read():
             return ((0, 112, 0, 0), 0)
 
-        DATA = (
+        DATA= (
             [set_input_depth(10)] +
             [set_val(v) for v in range(100, 110)] +
             [get_val(v) for v in range(100, 110)] +
             [set_val(v) for v in range(200, 210)] +
             [get_val(v) for v in range(100, 110)] +
-            [finish_read() ]+
+            [finish_read()]+
             [get_val(v) for v in range(200, 210)])
-        return self.run_ops(DATA, True)
+        return self.run_ops(DATA, False)
diff --git a/proj/mnv2_first/src/mnv2_cfu.h b/proj/mnv2_first/src/mnv2_cfu.h
@@ -43,6 +43,8 @@ extern "C" {
 #define CFU_STORE_FILTER_VALUE(in0) CFU_SET(24, in0)
 #define CFU_STORE_INPUT_VALUE(in0) CFU_SET(25, in0)
 
+#define CFU_MACC4_EXPLICIT(input_vals, filter_vals) cfu_op0(30, input_vals, filter_vals)
+
 // Supports incremental development
 #define CFU_GET_FILTER_VALUE() CFU_GET(110)
 #define CFU_GET_INPUT_VALUE() CFU_GET(111)

diff --git a/proj/mnv2_first/src/proj_menu.c b/proj/mnv2_first/src/proj_menu.c
@@ -118,6 +118,37 @@ static void do_rdbpot_tests() {
   }
 }
 
+static void do_explicit_macc4_tests() {  // compares cfu_op0_sw and cfu_op0_hw results for op 30 over 1024 generated cases
+  int failcount = 0;
+
+  // sim_trace_enable_write(1);
+  int64_t r = 0x0;  // generator state handed to next_random()
+  for (int i = 0; i < 1024; i++) {
+    int32_t in_val = next_random(&r);
+    int32_t f_val = next_random(&r);
+    int32_t offset = next_random(&r);
+    if (offset & 0x100) {  // bit 8 of the generated value selects the fixed offset 0x80 (+128)
+      offset = 0x80;
+    } else {
+      offset = (int8_t)offset;  // otherwise use the low byte, sign-extended
+    }
+
+    cfu_op0_sw(12, offset, 0);  // op 12 receives the offset on both implementations before the macc
+    cfu_op0_hw(12, offset, 0);
+    int32_t sw = cfu_op0_sw(30, in_val, f_val);
+    int32_t hw = cfu_op0_hw(30, in_val, f_val);
+
+    if (hw != sw) {
+      printf("off=%08lx in=%08lx filt=%08lx -->sw=%08lx, hw=%08lx\n", offset,  // NOTE(review): %lx with int32_t arguments assumes int32_t is long - confirm for the target
+             in_val, f_val, sw, hw);
+      failcount++;
+    }
+  }
+  if (!failcount) {  // on failure nothing more is printed; each mismatch was reported above
+    printf("All OK\n");
+  }
+}
+
 static struct Menu MENU = {
     "Project Menu",
     "mnv2_first",
@@ -127,6 +158,7 @@ static struct Menu MENU = {
         MENU_ITEM('3', "srdhm tests", do_srdhm_tests),
         MENU_ITEM('4', "rdbpot tests", do_rdbpot_tests),
         MENU_ITEM('5', "mbqm tests", do_mbqm_tests),
+        MENU_ITEM('6', "explicit macc 4", do_explicit_macc4_tests),
         MENU_END,
     },
 };

diff --git a/proj/mnv2_first/src/software_cfu.c b/proj/mnv2_first/src/software_cfu.c
@@ -272,6 +272,36 @@ static uint32_t input_store_dump(struct InputStore* is) {
   return 0;
 }
 
+static inline int32_t macc(const int8_t input_val, int8_t filter_val) {  // returns filter_val * (input_val + reg_input_offset)
+  // Assumes filter values being used sequentially
+  int32_t sum = ((int32_t)filter_val) * ((int32_t)input_val + reg_input_offset);  // reg_input_offset is declared outside this block
+
+#if 0
+  static int dbg_ctr = 0;
+  if (dbg_ctr >= 96 * 24 && dbg_ctr < 96 * 25) {
+    printf("%6d, %4d, %6ld, %6ld\n", filter_val, input_val, input_offset, sum);  // NOTE(review): input_offset is not a parameter of this function; use reg_input_offset if this is re-enabled
+  }
+  dbg_ctr++;
+#endif
+  return sum;
+}
+
+static int32_t macc4_explicit_inputs(uint32_t input_vals,  // four packed input bytes, lane 0 in bits 7..0
+                                     uint32_t filter_vals) {  // four packed filter bytes, same layout
+  int32_t result = 0;
+  result += macc(input_vals & 0xff, filter_vals & 0xff);  // lane 0; macc's int8_t parameters convert each masked byte
+  filter_vals >>= 8;  // move the next lane into the low byte
+  input_vals >>= 8;
+  result += macc(input_vals & 0xff, filter_vals & 0xff);  // lane 1
+  filter_vals >>= 8;
+  input_vals >>= 8;
+  result += macc(input_vals & 0xff, filter_vals & 0xff);  // lane 2
+  filter_vals >>= 8;
+  input_vals >>= 8;
+  result += macc(input_vals & 0xff, filter_vals & 0xff);  // lane 3
+  return result;  // sum of the four per-lane macc() results
+}
+
 // Set register instruction
 static uint32_t set_reg(int funct7, uint32_t in0, uint32_t in1) {
   switch (funct7) {
@@ -312,6 +342,9 @@ static uint32_t set_reg(int funct7, uint32_t in0, uint32_t in1) {
     case 25:
       return input_store_set(&input_store, in0);
 
+    case 30:
+      return macc4_explicit_inputs(in0, in1);
+
     case 110:
       return filter_store_read(&filter_store);
     case 111:

diff --git a/proj/mnv2_first/src/tensorflow/lite/kernels/internal/reference/integer_ops/mnv2_conv.cc b/proj/mnv2_first/src/tensorflow/lite/kernels/internal/reference/integer_ops/mnv2_conv.cc
@@ -26,37 +26,17 @@ limitations under the License.
 namespace tflite {
 namespace reference_integer_ops {
 
-static inline int32_t macc(const int8_t input_val, int8_t filter_val,
-                           int32_t input_offset) {
-  // Assumes filter values being used sequentially
-  int32_t sum = ((int32_t)filter_val) * ((int32_t)input_val + input_offset);
-
-#if 0
-  static int dbg_ctr = 0;
-  if (dbg_ctr >= 96 * 24 && dbg_ctr < 96 * 25) {
-    printf("%6d, %4d, %6ld, %6ld\n", filter_val, input_val, input_offset, sum);
-  }
-  dbg_ctr++;
-#endif
-  return sum;
-}
-
 static inline int32_t accumulate(int input_depth, int32_t input_offset) {
   int32_t acc = 0;
-  for (int in_channel = 0; in_channel < input_depth; in_channel += 4) {
-    // Fetch 4 filter values and 4 input vals
+  for (int in_channel = 0; in_channel < input_depth; in_channel += 8) {
+    // Fetch 4 filter values and 4 input vals and Macc
     uint32_t filter_vals = CFU_GET_FILTER_VALUE();
     uint32_t input_vals = CFU_GET_INPUT_VALUE();
-    acc += macc(input_vals & 0xff, filter_vals & 0xff, input_offset);
-    filter_vals >>= 8;
-    input_vals >>= 8;
-    acc += macc(input_vals & 0xff, filter_vals & 0xff, input_offset);
-    filter_vals >>= 8;
-    input_vals >>= 8;
-    acc += macc(input_vals & 0xff, filter_vals & 0xff, input_offset);
-    filter_vals >>= 8;
-    input_vals >>= 8;
-    acc += macc(input_vals & 0xff, filter_vals & 0xff, input_offset);
+    acc += CFU_MACC4_EXPLICIT(input_vals, filter_vals);
+    // Do it again
+    filter_vals = CFU_GET_FILTER_VALUE();
+    input_vals = CFU_GET_INPUT_VALUE();
+    acc += CFU_MACC4_EXPLICIT(input_vals, filter_vals);
   }
   return acc;
 }