Skip to content

Commit

Permalink
Merge pull request #31 from alanvgreen/maccs2
Browse files Browse the repository at this point in the history
use 4-wide multiply-add with implicit inputs
  • Loading branch information
tcal-x committed Mar 19, 2021
2 parents c2555a9 + 07f1312 commit ad5f236
Show file tree
Hide file tree
Showing 10 changed files with 301 additions and 111 deletions.
124 changes: 105 additions & 19 deletions proj/mnv2_first/gateware/macc.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,55 @@

from nmigen import Signal, signed

from nmigen_cfu import Sequencer
from nmigen_cfu import all_words, Sequencer, SimpleElaboratable, tree_sum

from .registerfile import Xetter


class Madd4Pipeline(SimpleElaboratable):
    """Four-lane multiply-add with a fixed two-cycle latency.

    ``i_data`` and ``f_data`` each carry four signed 8 bit values. The
    value delivered on ``result`` is:

        sum((i_data[n] + offset) * f_data[n] for n in range(4))

    Public Interface
    ----------------
    offset: Signal(signed(9)) input
        Offset added to every input value before it is multiplied.
    f_data: Signal(32) input
        4 bytes of filter data to use next.
    i_data: Signal(32) input
        4 bytes of input data to use next.
    result: Signal(signed(32)) output
        Result of the multiply and add. It passes through two register
        stages (operands, then sum), hence PIPELINE_CYCLES.
    """
    PIPELINE_CYCLES = 2

    def __init__(self):
        super().__init__()
        self.offset = Signal(signed(9))
        self.f_data = Signal(32)
        self.i_data = Signal(32)
        self.result = Signal(signed(32))

    def elab(self, m):
        # NOTE(review): all_words() is assumed to yield the consecutive
        # 8-bit words of a signal - confirm against nmigen_cfu.
        i_words = all_words(self.i_data, 8)
        f_words = all_words(self.f_data, 8)

        # Each product is 17 bits wide: 8 bit filter value * 9 bit
        # offset-adjusted input value.
        products = [Signal(signed(17), name=f"product_{n}") for n in range(4)]

        for product, i_val, f_val in zip(products, i_words, f_words):
            # Stage 1: register both operands; the offset is applied to
            # the input value on the way into its register.
            f_tmp = Signal(signed(9))
            i_tmp = Signal(signed(9))
            m.d.sync += [
                f_tmp.eq(f_val.as_signed()),
                i_tmp.eq(i_val.as_signed() + self.offset),
            ]
            # The multiply itself is combinational, between the stages.
            m.d.comb += product.eq(i_tmp * f_tmp)

        # Stage 2: register the sum of the four products.
        m.d.sync += self.result.eq(tree_sum(products))


class ExplicitMacc4(Xetter):
"""A Macc4 that operates on in0 (input_vals) and in1 (filter_vals).
Expand All @@ -35,24 +79,66 @@ def __init__(self):
self.input_offset = Signal(signed(9))

def elab(self, m):

muls = []
for n in range(4):
tmp = Signal(signed(9))
inval = self.in0.word_select(n, 8).as_signed()
fval = self.in1.word_select(n, 8).as_signed()
mul = Signal(signed(17)) # 8bits * 9 bits = 17 bits
m.d.comb += tmp.eq(inval + self.input_offset)
m.d.sync += mul.eq(tmp * fval)
muls.append(mul)
sum_muls = Signal(signed(19)) # 4 lots of 17 bits = 19 bits
m.d.comb += sum_muls.eq(sum(muls))

# Use a sequencer to count one cycle between start and end
m.submodules['seq'] = seq = Sequencer(1)
m.submodules['madd4'] = madd4 = Madd4Pipeline()
m.d.comb += [
madd4.offset.eq(self.input_offset),
madd4.f_data.eq(self.in1),
madd4.i_data.eq(self.in0),
self.output.eq(madd4.result),
]
m.submodules['seq'] = seq = Sequencer(Madd4Pipeline.PIPELINE_CYCLES)
m.d.comb += seq.inp.eq(self.start)
with m.If(seq.sequence[-1]):
m.d.comb += self.done.eq(seq.sequence[-1])


class ImplicitMacc4(Xetter):
    """A Macc4 that operates on input_vals and filter_vals provided
    from within the CFU.

    Public Interface
    ----------------
    input_offset: Signal(signed(9)) input
        Offset to be added to all inputs.
    f_data: Signal(32) input
        Filter data to use next
    f_next: Signal() output
        Indicates filter data has been used
    i_data: Signal(32) input
        Input data to use next
    i_next: Signal() output
        Indicates input data has been used
    i_ready: Signal() input
        Whether or not i_data is valid.
    """

    def __init__(self):
        super().__init__()
        self.input_offset = Signal(signed(9))
        self.f_data = Signal(32)
        self.f_next = Signal()
        self.i_data = Signal(32)
        self.i_next = Signal()
        self.i_ready = Signal()

    def elab(self, m):
        # The arithmetic lives in the shared pipeline; this class only
        # wires it up and handles the start/done handshake.
        m.submodules['madd4'] = madd4 = Madd4Pipeline()
        m.d.comb += [
            madd4.offset.eq(self.input_offset),
            madd4.f_data.eq(self.f_data),
            madd4.i_data.eq(self.i_data),
            self.output.eq(madd4.result),
        ]

        # Kick off the sequencer only once i_ready has been seen; done is
        # then taken from the end of the sequence so that it lines up
        # with the pipeline result.
        # NOTE(review): presumably seq.sequence[-1] goes high
        # PIPELINE_CYCLES cycles after seq.inp - confirm in Sequencer.
        m.submodules['seq'] = seq = Sequencer(Madd4Pipeline.PIPELINE_CYCLES)
        m.d.comb += self.done.eq(seq.sequence[-1])

        # Remembers a start that arrived while the input was not ready.
        waiting_for_i_ready = Signal()
        with m.If(self.i_ready & (waiting_for_i_ready | self.start)):
            # Consume one word from each store and start the countdown.
            # done and output are deliberately NOT driven here: they come
            # from the sequencer and the pipeline above.
            m.d.comb += [
                self.f_next.eq(1),
                self.i_next.eq(1),
                seq.inp.eq(1),
            ]
            m.d.sync += waiting_for_i_ready.eq(0)
        with m.Elif(self.start & ~self.i_ready):
            m.d.sync += waiting_for_i_ready.eq(1)
55 changes: 42 additions & 13 deletions proj/mnv2_first/gateware/mnv2_cfu.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@
from nmigen_cfu import Cfu, DualPortMemory, is_sim_run

from .post_process import PostProcessXetter, SRDHMInstruction, RoundingDividebyPOTInstruction
from .store import CircularIncrementer, FilterValueGetter, InputStore, InputStoreGetter, InputStoreSetter, StoreSetter
from .store import CircularIncrementer, FilterValueFetcher, InputStore, InputStoreSetter, NextWordGetter, StoreSetter
from .registerfile import RegisterFileInstruction, RegisterSetter
from .macc import ExplicitMacc4
from .macc import ExplicitMacc4, ImplicitMacc4

OUTPUT_CHANNEL_PARAM_DEPTH = 512
NUM_FILTER_DATA_WORDS = 512
Expand Down Expand Up @@ -91,16 +91,13 @@ def _make_input_store(self, m, name, restart_signal, input_depth):
m.submodules[f'{name}_set'] = insset = InputStoreSetter()
m.d.comb += insset.connect(ins)
self.register_xetter(25, insset)
m.submodules[f'{name}_get'] = insget = InputStoreGetter()
m.d.comb += insget.connect(ins)
self.register_xetter(111, insget)

_, read_finished = self._make_setter(m, 112, 'mark_read')
m.d.comb += [
ins.restart.eq(restart_signal),
ins.input_depth.eq(input_depth),
ins.r_finished.eq(read_finished),
]
return ins

def _make_explicit_macc_4(self, m, reg_num, name, input_offset):
"""Constructs and registers an explicit macc4 instruction
Expand All @@ -111,6 +108,16 @@ def _make_explicit_macc_4(self, m, reg_num, name, input_offset):
m.submodules[name] = xetter
self.register_xetter(reg_num, xetter)

def _make_implicit_macc_4(self, m, reg_num, name, input_offset):
    """Build an ImplicitMacc4, attach it to ``m`` and register it.

    The new instruction is added as submodule ``name``, its
    ``input_offset`` is driven from ``input_offset``, and it is
    registered under ``reg_num``. The instance is returned so the
    caller can connect its remaining ports.
    """
    macc = ImplicitMacc4()
    m.submodules[name] = macc
    m.d.comb += macc.input_offset.eq(input_offset)
    self.register_xetter(reg_num, macc)
    return macc

def elab_xetters(self, m):
input_depth, set_id = self._make_setter(m, 10, 'set_input_depth')
self._make_setter(m, 11, 'set_output_depth')
Expand All @@ -128,21 +135,43 @@ def elab_xetters(self, m):
fv_mems, fv_count = self._make_filter_value_store(
m, 24, 'store_filter_values', restart)

m.submodules['fvg'] = fvg = FilterValueGetter(4, NUM_FILTER_DATA_WORDS)
m.d.comb += fvg.connect_read_port(fv_mems)
m.submodules['fvf'] = fvf = FilterValueFetcher(
4, NUM_FILTER_DATA_WORDS)
m.d.comb += fvf.connect_read_port(fv_mems)
m.d.comb += [
fvg.limit.eq(fv_count),
fvg.restart.eq(restart)
fvf.limit.eq(fv_count),
fvf.restart.eq(restart)
]

self.register_xetter(110, fvg)
m.submodules['ppx'] = ppx = PostProcessXetter()
self.register_xetter(120, ppx)

self._make_input_store(m, 'ins', set_id, input_depth)
ins = self._make_input_store(m, 'ins', set_id, input_depth)

m.submodules['fvg'] = fvg = NextWordGetter()
m.d.comb += [
fvf.next.eq(fvg.next),
fvg.data.eq(fvf.data),
fvg.ready.eq(1),
]
self.register_xetter(110, fvg)
m.submodules['insget'] = insget = NextWordGetter()
m.d.comb += [
insget.data.eq(ins.r_data),
insget.ready.eq(ins.r_ready),
ins.r_next.eq(insget.next),
]
self.register_xetter(111, insget)

# MACC 4
self._make_explicit_macc_4(m, 30, 'ex_m4', input_offset)
im4 = self._make_implicit_macc_4(m, 31, 'im4_m4', input_offset)
m.d.comb += [
im4.f_data.eq(fvf.data),
fvf.next.eq(im4.f_next | fvg.next),
im4.i_data.eq(ins.r_data),
im4.i_ready.eq(ins.r_ready),
ins.r_next.eq(im4.i_next | insget.next),
]

m.d.comb += [
ppx.offset.eq(offset),
Expand Down
97 changes: 44 additions & 53 deletions proj/mnv2_first/gateware/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,8 @@ def elab(self, m):
self.limit - 1, 0, last_addr + 1))


class FilterValueGetter(Xetter):
"""Gets next word from 4-way filter value store.
This is a temporary getter. The full accelerator will use filter values internally.
class FilterValueFetcher(SimpleElaboratable):
"""Fetches next single word from a 4-way filter value store.
Parameters
----------
Expand All @@ -153,9 +151,15 @@ class FilterValueGetter(Xetter):
Current read pointer
r_bank: Signal(range(num_memories)) output
Which of the memories to get from
limit: Signal(range(depth)) input
Number of entries currently contained in the filter store
restart: Signal() input
Soft reset signal to restart all processing
data: Signal(32) output
Value fetched
next: Signal() input
Indicates that fetched value has been read.
"""

def __init__(self, num_memories, depth):
super().__init__()
assert 0 == ((num_memories - 1) &
Expand All @@ -165,6 +169,8 @@ def __init__(self, num_memories, depth):
self.r_bank = Signal(range(num_memories))
self.limit = Signal(range(self.num_memories * depth))
self.restart = Signal()
self.data = Signal(32)
self.next = Signal()

def connect_read_port(self, dual_port_memories):
"""Connect read ports of a list of dual port memories to self.
Expand All @@ -173,24 +179,54 @@ def connect_read_port(self, dual_port_memories):
"""
result = [dp.r_addr.eq(self.r_addr) for dp in dual_port_memories]
r_datas = Array(dp.r_data for dp in dual_port_memories)
result.append(self.output.eq(r_datas[self.r_bank]))
result.append(self.data.eq(r_datas[self.r_bank]))
return result

def elab(self, m):
num_memories_bits = (self.num_memories - 1).bit_length()
count = Signal.like(self.limit)
count_at_limit = count == (self.limit - 1)
m.d.comb += [
self.done.eq(True),
self.r_addr.eq(count[num_memories_bits:]),
self.r_bank.eq(count[:num_memories_bits]),
]
with m.If(self.restart):
m.d.sync += count.eq(0)
with m.Elif(self.start):
with m.Elif(self.next):
m.d.sync += count.eq(Mux(count_at_limit, 0, count + 1))


class NextWordGetter(Xetter):
    """Reads the next word out of a store, waiting until it is ready.

    Public Interface
    ----------------
    data: Signal(32) input
        The word currently offered by the store.
    next: Signal() output
        Pulsed when the offered word has been taken.
    ready: Signal() input
        Signal from the store that data is valid. A read requested
        while this is low is held pending and completes once it goes
        high.
    """

    def __init__(self):
        super().__init__()
        self.data = Signal(32)
        self.next = Signal()
        self.ready = Signal()

    def elab(self, m):
        # Set while a start has been seen but not yet satisfied.
        waiting = Signal()
        read_requested = waiting | self.start

        with m.If(self.ready & read_requested):
            # Data is valid and wanted: hand it over and finish.
            m.d.comb += self.output.eq(self.data)
            m.d.comb += self.next.eq(1)
            m.d.comb += self.done.eq(1)
            m.d.sync += waiting.eq(0)
        with m.Elif(self.start & ~self.ready):
            # Started too early: remember the request for later.
            m.d.sync += waiting.eq(1)


class InputStore(SimpleElaboratable):
"""Stores one "pixel" of input values for processing.
Expand Down Expand Up @@ -363,51 +399,6 @@ def elab(self, m):
self._elab_read(m, dps, r_full)


class InputStoreGetter(Xetter):
    """Reads the next word out of an input store.

    Public Interface
    ----------------
    r_data: Signal(32) input
        Data from the input value store
    r_next: Signal() output
        Indicates the data has been read
    r_ready: Signal() input
        Indicates the data is ready to be read
    """

    def __init__(self):
        super().__init__()
        self.r_data = Signal(32)
        self.r_next = Signal()
        self.r_ready = Signal()

    def connect(self, input_store):
        """Connect self to input_store.

        Returns a list of statements that performs the connection.
        """
        statements = []
        statements.append(self.r_data.eq(input_store.r_data))
        statements.append(self.r_ready.eq(input_store.r_ready))
        statements.append(input_store.r_next.eq(self.r_next))
        return statements

    def elab(self, m):
        # Set while a start has been seen but not yet satisfied.
        waiting = Signal()
        read_requested = waiting | self.start

        with m.If(self.r_ready & read_requested):
            # Data is valid and wanted: hand it over and finish.
            m.d.comb += self.output.eq(self.r_data)
            m.d.comb += self.r_next.eq(1)
            m.d.comb += self.done.eq(1)
            m.d.sync += waiting.eq(0)
        with m.Elif(self.start & ~self.r_ready):
            # Started too early: remember the request for later.
            m.d.sync += waiting.eq(1)


class InputStoreSetter(Xetter):
"""Puts a word into the input store.
Expand Down

0 comments on commit ad5f236

Please sign in to comment.