Skip to content

Commit

Permalink
Merge pull request #28 from alanvgreen/maccs
Browse files Browse the repository at this point in the history
Maccs
  • Loading branch information
tcal-x committed Mar 16, 2021
2 parents 6b338ea + cacf49e commit 977a650
Show file tree
Hide file tree
Showing 10 changed files with 214 additions and 38 deletions.
2 changes: 1 addition & 1 deletion proj/mnv2_first/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ RUN_MENU_ITEMS := 3 1

# Uncomment to use smaller batches for debugging. This helps expose
# bugs in the handling of multiple batches.
DEFINES += USE_CONV_SMALL_BATCHES
#DEFINES += USE_CONV_SMALL_BATCHES

DEFINES += ACCEL_CONV

Expand Down
52 changes: 52 additions & 0 deletions proj/mnv2_first/gateware/macc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/env python
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nmigen import Signal, signed

from nmigen_cfu import SimpleElaboratable

from .registerfile import Xetter


class ExplicitMacc4(Xetter):
    """A Macc4 that operates on in0 (input_vals) and in1 (filter_vals).

    in0 and in1 are each read as four 8 bit lanes, interpreted as signed
    values. For every lane, input_offset is added to the input value and
    the sum is multiplied by the filter value of the same lane. The
    output is the sum of the four products. The calculation is purely
    combinational and done is held high.

    in0, in1, output and done are not defined here; they are presumably
    provided by Xetter.

    Public Interface
    ----------------
    input_offset: Signal(signed(9)) input
        Offset to be added to all inputs.
    """

    def __init__(self):
        super().__init__()
        self.input_offset = Signal(signed(9))

    def elab(self, m):
        muls = []
        for n in range(4):
            # NOTE(review): a signed 8 bit input plus a signed 9 bit offset
            # only fits in these 9 bits while the offset is within
            # [-128, 128]; other offsets can be truncated here. Confirm
            # that callers stay within that range.
            tmp = Signal(signed(9))
            inval = self.in0.word_select(n, 8).as_signed()
            fval = self.in1.word_select(n, 8).as_signed()
            mul = Signal(signed(32))
            m.d.comb += [
                tmp.eq(inval + self.input_offset),
                mul.eq(tmp * fval),
            ]
            muls.append(mul)
        m.d.comb += [
            self.output.eq(sum(muls)),
            # The result is available without waiting.
            self.done.eq(1),
        ]
15 changes: 14 additions & 1 deletion proj/mnv2_first/gateware/mnv2_cfu.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .post_process import PostProcessXetter, SRDHMInstruction, RoundingDividebyPOTInstruction
from .store import CircularIncrementer, FilterValueGetter, InputStore, InputStoreGetter, InputStoreSetter, StoreSetter
from .registerfile import RegisterFileInstruction, RegisterSetter
from .macc import ExplicitMacc4

OUTPUT_CHANNEL_PARAM_DEPTH = 512
NUM_FILTER_DATA_WORDS = 512
Expand Down Expand Up @@ -101,10 +102,19 @@ def _make_input_store(self, m, name, restart_signal, input_depth):
ins.r_finished.eq(read_finished),
]

def _make_explicit_macc_4(self, m, reg_num, name, input_offset):
    """Builds an ExplicitMacc4 and registers it under reg_num.

    The new xetter's input_offset is driven combinationally from the
    given input_offset, and the xetter is added to m as the submodule
    called name.
    """
    macc4 = ExplicitMacc4()
    m.submodules[name] = macc4
    m.d.comb += macc4.input_offset.eq(input_offset)
    self.register_xetter(reg_num, macc4)

def elab_xetters(self, m):
input_depth, set_id = self._make_setter(m, 10, 'set_input_depth')
self._make_setter(m, 11, 'set_output_depth')
self._make_setter(m, 12, 'set_input_offset')
input_offset, _ = self._make_setter(m, 12, 'set_input_offset')
offset, _ = self._make_setter(m, 13, 'set_output_offset')
activation_min, _ = self._make_setter(m, 14, 'set_activation_min')
activation_max, _ = self._make_setter(m, 15, 'set_activation_max')
Expand All @@ -131,6 +141,9 @@ def elab_xetters(self, m):

self._make_input_store(m, 'ins', set_id, input_depth)

# MACC 4
self._make_explicit_macc_4(m, 30, 'ex_m4', input_offset)

m.d.comb += [
ppx.offset.eq(offset),
ppx.activation_min.eq(activation_min),
Expand Down
10 changes: 5 additions & 5 deletions proj/mnv2_first/gateware/post_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,16 @@ class SRDHM(SimpleElaboratable):
"""Implements gemmlowp::SaturatingRoundingDoublingHighMul
The function calculated is approximately ((a*b) >> 31) & 0xffffffff,
allowing for saturating and rounding. In other words,it multiplies
two 32 bit numbers, then returns bits 62 to 31 of the 64 bit result.
This is 2x the high word of a 64 bit multiplication (allowing for
allowing for saturating and rounding. In other words,it multiplies
two 32 bit numbers, then returns bits 62 to 31 of the 64 bit result.
This is 2x the high word of a 64 bit multiplication (allowing for
saturating and rounding).
Implemented as a pipeline so that results are always available 3
cycles after setting inputs.
Note that there is a bug to be investigated here. This implementation
matches the behavior of the RISCV compiled source, however, it seems
matches the behavior of the RISCV compiled source, however, it seems
that "nudge" is only ever one of the two values in RISCV.
Public Interface
Expand Down Expand Up @@ -226,7 +226,7 @@ def elab(self, m):
class PostProcessXetter(Xetter):
"""Does post-processing of an accumulator value.
The output channel index is implied by processing order. This
The output channel index is implied by processing order. This
is mostly a wrapper around PostProcessor.
Attributes
Expand Down
60 changes: 60 additions & 0 deletions proj/mnv2_first/gateware/test_macc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/bin/env python
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nmigen.sim import Delay

from nmigen_cfu import TestBase

from .macc import ExplicitMacc4


def pack_vals(a, b, c, d):
    """Pack four values into one 32 bit word, with a in the lowest byte.

    Only the low 8 bits of each value are kept, so negative values are
    stored in their two's complement form.
    """
    word = 0
    for lane, val in enumerate((a, b, c, d)):
        word |= (val & 0xff) << (8 * lane)
    return word


class ExplicitMacc4Test(TestBase):
    """Runs ExplicitMacc4 in simulation over a table of inputs and outputs."""

    def create_dut(self):
        return ExplicitMacc4()

    def test(self):
        # Each entry is ((input_offset, in0, in1), expected_output).
        cases = [
            ((0, 0, 0), 0),
            ((2, 1, 1), 3),
            ((2, 1, 1), 3),
            ((128,
              pack_vals(-128, -128, -128, -128),
              pack_vals(1, 1, 1, 1)),
             0),
            ((128,
              pack_vals(-128, -127, -126, -125),
              pack_vals(10, 11, 12, 13)),
             1 * 11 + 2 * 12 + 3 * 13),
            ((128,
              pack_vals(127, 0, 0, 0),
              pack_vals(10, 11, 12, 13)),
             10 * 255 + 11 * 128 + 12 * 128 + 13 * 128),
        ]

        def process():
            dut = self.dut
            for n, (stimulus, expected) in enumerate(cases):
                offset, in0_word, in1_word = stimulus
                yield dut.input_offset.eq(offset)
                yield dut.in0.eq(in0_word)
                yield dut.in1.eq(in1_word)
                # Raise start, wait 0.25 time units, then lower it again.
                yield dut.start.eq(1)
                yield Delay(0.25)
                yield dut.start.eq(0)
                while not (yield dut.done):
                    yield
                actual = yield dut.output.as_signed()
                self.assertEqual(actual, expected, f"case={n}")
                yield

        self.run_sim(process, True)
12 changes: 8 additions & 4 deletions proj/mnv2_first/gateware/test_mnv2_cfu.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,15 @@ def test(self):
((0, 110, 0, 0), 333),
((0, 110, 0, 0), 444),
((0, 110, 0, 0), 555),
# Set input offset to 5, then do a macc
((0, 12, 5, 0), 0),
((0, 30, 0x01020304, 0x02040608), 6 * 2 + 7 * 4 + 8 * 6 + 9 * 8),

]
return self.run_ops(DATA, False)

def test_input_store(self):
DATA = []
DATA= []

def set_val(val):
return ((0, 25, val, 0), 0)
Expand All @@ -69,12 +73,12 @@ def set_input_depth(val):
def finish_read():
return ((0, 112, 0, 0), 0)

DATA = (
DATA= (
[set_input_depth(10)] +
[set_val(v) for v in range(100, 110)] +
[get_val(v) for v in range(100, 110)] +
[set_val(v) for v in range(200, 210)] +
[get_val(v) for v in range(100, 110)] +
[finish_read() ]+
[finish_read()]+
[get_val(v) for v in range(200, 210)])
return self.run_ops(DATA, True)
return self.run_ops(DATA, False)
2 changes: 2 additions & 0 deletions proj/mnv2_first/src/mnv2_cfu.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ extern "C" {
#define CFU_STORE_FILTER_VALUE(in0) CFU_SET(24, in0)
#define CFU_STORE_INPUT_VALUE(in0) CFU_SET(25, in0)

#define CFU_MACC4_EXPLICIT(input_vals, filter_vals) cfu_op0(30, input_vals, filter_vals)

// Supports incremental development
#define CFU_GET_FILTER_VALUE() CFU_GET(110)
#define CFU_GET_INPUT_VALUE() CFU_GET(111)
Expand Down
32 changes: 32 additions & 0 deletions proj/mnv2_first/src/proj_menu.c
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,37 @@ static void do_rdbpot_tests() {
}
}

// Compares the software and hardware implementations of the explicit macc4
// instruction (30) over 1024 sets of pseudo-random inputs. Before each
// comparison the same input offset is written to both implementations via
// instruction 12. Every mismatch is printed; "All OK" is printed when there
// were none.
static void do_explicit_macc4_tests() {
  int mismatches = 0;

  // sim_trace_enable_write(1);
  int64_t rand_state = 0x0;
  for (int iter = 0; iter < 1024; iter++) {
    int32_t in_val = next_random(&rand_state);
    int32_t f_val = next_random(&rand_state);
    int32_t offset = next_random(&rand_state);
    // Use 0x80 when bit 8 of the random value is set, otherwise the value's
    // low byte interpreted as a signed 8 bit number.
    offset = (offset & 0x100) ? 0x80 : (int8_t)offset;

    cfu_op0_sw(12, offset, 0);
    cfu_op0_hw(12, offset, 0);
    int32_t sw = cfu_op0_sw(30, in_val, f_val);
    int32_t hw = cfu_op0_hw(30, in_val, f_val);

    if (hw == sw) {
      continue;
    }
    printf("off=%08lx in=%08lx filt=%08lx -->sw=%08lx, hw=%08lx\n", offset,
           in_val, f_val, sw, hw);
    mismatches++;
  }
  if (mismatches == 0) {
    printf("All OK\n");
  }
}

static struct Menu MENU = {
"Project Menu",
"mnv2_first",
Expand All @@ -127,6 +158,7 @@ static struct Menu MENU = {
MENU_ITEM('3', "srdhm tests", do_srdhm_tests),
MENU_ITEM('4', "rdbpot tests", do_rdbpot_tests),
MENU_ITEM('5', "mbqm tests", do_mbqm_tests),
MENU_ITEM('6', "explicit macc 4", do_explicit_macc4_tests),
MENU_END,
},
};
Expand Down
33 changes: 33 additions & 0 deletions proj/mnv2_first/src/software_cfu.c
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,36 @@ static uint32_t input_store_dump(struct InputStore* is) {
return 0;
}

// Returns filter_val * (input_val + reg_input_offset) for one pair of
// values. The offset is read from reg_input_offset rather than being
// passed in as a parameter.
static inline int32_t macc(const int8_t input_val, int8_t filter_val) {
  // Assumes filter values being used sequentially
  int32_t sum = ((int32_t)filter_val) * ((int32_t)input_val + reg_input_offset);

#if 0
  // Debug aid: prints the operands of calls 96*24 .. 96*25-1. It refers to
  // reg_input_offset (not a removed input_offset parameter) so that it still
  // compiles when enabled.
  static int dbg_ctr = 0;
  if (dbg_ctr >= 96 * 24 && dbg_ctr < 96 * 25) {
    printf("%6d, %4d, %6ld, %6ld\n", filter_val, input_val,
           (long)reg_input_offset, (long)sum);
  }
  dbg_ctr++;
#endif
  return sum;
}

// Sums macc() over the four byte lanes of input_vals and filter_vals,
// pairing lanes by position and starting from the least significant byte.
static int32_t macc4_explicit_inputs(uint32_t input_vals,
                                     uint32_t filter_vals) {
  int32_t total = 0;
  for (int lane = 0; lane < 4; lane++) {
    total += macc(input_vals & 0xff, filter_vals & 0xff);
    // Move the next lane into the low byte of each word.
    input_vals >>= 8;
    filter_vals >>= 8;
  }
  return total;
}

// Set register instruction
static uint32_t set_reg(int funct7, uint32_t in0, uint32_t in1) {
switch (funct7) {
Expand Down Expand Up @@ -312,6 +342,9 @@ static uint32_t set_reg(int funct7, uint32_t in0, uint32_t in1) {
case 25:
return input_store_set(&input_store, in0);

case 30:
return macc4_explicit_inputs(in0, in1);

case 110:
return filter_store_read(&filter_store);
case 111:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,37 +26,17 @@ limitations under the License.
namespace tflite {
namespace reference_integer_ops {

static inline int32_t macc(const int8_t input_val, int8_t filter_val,
int32_t input_offset) {
// Assumes filter values being used sequentially
int32_t sum = ((int32_t)filter_val) * ((int32_t)input_val + input_offset);

#if 0
static int dbg_ctr = 0;
if (dbg_ctr >= 96 * 24 && dbg_ctr < 96 * 25) {
printf("%6d, %4d, %6ld, %6ld\n", filter_val, input_val, input_offset, sum);
}
dbg_ctr++;
#endif
return sum;
}

static inline int32_t accumulate(int input_depth, int32_t input_offset) {
int32_t acc = 0;
for (int in_channel = 0; in_channel < input_depth; in_channel += 4) {
// Fetch 4 filter values and 4 input vals
for (int in_channel = 0; in_channel < input_depth; in_channel += 8) {
// Fetch 4 filter values and 4 input vals and Macc
uint32_t filter_vals = CFU_GET_FILTER_VALUE();
uint32_t input_vals = CFU_GET_INPUT_VALUE();
acc += macc(input_vals & 0xff, filter_vals & 0xff, input_offset);
filter_vals >>= 8;
input_vals >>= 8;
acc += macc(input_vals & 0xff, filter_vals & 0xff, input_offset);
filter_vals >>= 8;
input_vals >>= 8;
acc += macc(input_vals & 0xff, filter_vals & 0xff, input_offset);
filter_vals >>= 8;
input_vals >>= 8;
acc += macc(input_vals & 0xff, filter_vals & 0xff, input_offset);
acc += CFU_MACC4_EXPLICIT(input_vals, filter_vals);
// Do it again
filter_vals = CFU_GET_FILTER_VALUE();
input_vals = CFU_GET_INPUT_VALUE();
acc += CFU_MACC4_EXPLICIT(input_vals, filter_vals);
}
return acc;
}
Expand Down

0 comments on commit 977a650

Please sign in to comment.