In [None]:
from pynq import Overlay, allocate
import numpy as np

# RTL-defined Matrix Parameters
# NOTE(review): presumably these must mirror the values the bitstream was
# built with -- confirm against the RTL before changing any of them.
I_OUTER_DIM, W_OUTER_DIM = 8, 6
INNER_DIM = 4
BLOCK_SIZE = 2
NUM_CORES = 2

# --- FPGA and Design Parameters ---
DATA_UNIT_BITS = 32  # uint32
WEIGHT_WIDTH_BITS = 64
# Input and output stream widths both scale with the number of cores.
INPUT_WIDTH_BITS = NUM_CORES * 64
OUTPUT_WIDTH_BITS = NUM_CORES * 64
# Number of uint32 elements that make up one output-width word.
WORDS_PER_OUTPUT = OUTPUT_WIDTH_BITS // DATA_UNIT_BITS

# Derived output size
# All divisions below floor; a dimension that is not an exact multiple is
# silently truncated.
ROWS, COLS = I_OUTER_DIM // BLOCK_SIZE, W_OUTER_DIM // BLOCK_SIZE
TOTAL_OUTPUT_WORDS = ROWS * COLS // NUM_CORES
OUTPUT_BUFFER_WORDS = WORDS_PER_OUTPUT * TOTAL_OUTPUT_WORDS

# File paths
INPUT_MEM_FILE, WEIGHT_MEM_FILE, OUTPUT_MEM_FILE = "i.mem", "w.mem", "o.mem"

# --- Load Overlay ---
# Build the PYNQ Overlay object from the design's bitstream; the DMA engines
# used in the following cells are looked up as attributes of this object.
overlay = Overlay(
    "/home/xilinx/jupyter_notebooks/Matrix_Multiplier/design_1.bit"
)
print("Overlay loaded.")

In [None]:
# One DMA engine per stream: axi_dma_0 carries the inputs, axi_dma_1 the
# weights and axi_dma_2 the results coming back.
dma_i, dma_w, dma_o = overlay.axi_dma_0, overlay.axi_dma_1, overlay.axi_dma_2

In [None]:
# --- Load HEX .mem File ---
def load_mem_file(filename, word_bits):
    """Read a hexadecimal ``.mem`` file into a flat array of uint32 values.

    The file is treated as a sequence of whitespace-separated hex tokens
    (spaces, tabs, ``\\n`` and ``\\r\\n`` all count as separators). Each token
    holds one ``word_bits``-wide word; a token shorter than the word width is
    left-padded with zeros, and a token longer than the word width is cut
    into consecutive word-sized pieces, so a file containing one long
    unbroken hex string still loads.

    Every word is emitted as 32-bit units, least-significant unit first.

    Args:
        filename: Path of the text file to read.
        word_bits: Width of one word in bits. Widths that are not a multiple
            of 32 are zero-extended up to the next 32-bit boundary.

    Returns:
        A 1-D ``np.uint32`` array; empty when the file holds no tokens.

    Raises:
        ValueError: If ``word_bits`` is smaller than 4, or a token contains
            characters that are not hexadecimal digits.
        OSError: If the file cannot be opened.
    """
    word_hex_len = word_bits // 4  # hex digits per word
    if word_hex_len < 1:
        raise ValueError(f"word_bits must be at least 4, got {word_bits}")
    # Round up to whole 32-bit (8 hex digit) units so the slicing below never
    # produces an empty or partial piece.
    padded_len = -(-word_hex_len // 8) * 8

    with open(filename, "r") as f:
        tokens = f.read().split()

    data = []
    for token in tokens:
        for start in range(0, len(token), word_hex_len):
            word = token[start:start + word_hex_len].zfill(padded_len)
            # Walk from the right-hand (least-significant) end of the word.
            for end in range(padded_len, 0, -8):
                data.append(int(word[end - 8:end], 16))
    return np.array(data, dtype=np.uint32)

In [None]:
# --- Load Inputs ---
# Each file is parsed with the word width of the stream it will be sent on.
input_data = load_mem_file(INPUT_MEM_FILE, word_bits=INPUT_WIDTH_BITS)
weight_data = load_mem_file(WEIGHT_MEM_FILE, word_bits=WEIGHT_WIDTH_BITS)

In [None]:
# --- Allocate Buffers ---
# The input and weight buffers are sized from the loaded data; the output
# buffer is sized from the configured OUTPUT_BUFFER_WORDS.
input_buffer = allocate(shape=input_data.shape, dtype=np.uint32)
weight_buffer = allocate(shape=weight_data.shape, dtype=np.uint32)
output_buffer = allocate(shape=(OUTPUT_BUFFER_WORDS,), dtype=np.uint32)

# Fill each source buffer and flush it straight away.
# (Presumably flush() is what makes the CPU-side writes visible to the DMA
# engine -- see the PYNQ buffer documentation.)
input_buffer[:] = input_data
input_buffer.flush()

weight_buffer[:] = weight_data
weight_buffer.flush()

In [None]:
# Report the byte size and element count of every DMA buffer.
_SIZE_REPORT = "{} buffer size: {} bytes ({} words)"
print(_SIZE_REPORT.format("Input", input_buffer.nbytes, input_buffer.shape[0]))
print(_SIZE_REPORT.format("Weight", weight_buffer.nbytes, weight_buffer.shape[0]))
print(_SIZE_REPORT.format("Output", output_buffer.nbytes, output_buffer.shape[0]))

In [None]:
def print_hex_chunks(data_array, bits_per_word, label):
    """Print a flat array of 32-bit values as wide hexadecimal words.

    The array is regrouped into rows of ``bits_per_word // 32`` elements.
    Each row is printed on its own line with its elements in reverse order,
    every element rendered as eight zero-padded lowercase hex digits.

    Args:
        data_array: 1-D array whose length is a multiple of the row size.
        bits_per_word: Width in bits of one printed word.
        label: Text used in the heading and as the prefix of every line.
    """
    print(f"\n{label} contents ({bits_per_word}-bit words):")
    units_per_row = bits_per_word // 32
    rows = data_array.reshape((-1, units_per_row))
    for row_index in range(len(rows)):
        # Reverse so the last element of the row is printed first.
        digits = [f"{value:08x}" for value in rows[row_index][::-1]]
        print(f"{label} {row_index}: {''.join(digits)}")

# Dump both source buffers at the word width of their stream.
print_hex_chunks(input_buffer, bits_per_word=INPUT_WIDTH_BITS, label="Input")
print_hex_chunks(weight_buffer, bits_per_word=WEIGHT_WIDTH_BITS, label="Weight")

In [None]:
# --- Start Transfers ---
# The receive channel is started before the two send channels, presumably so
# the result path is ready before any data reaches the core.
dma_o.recvchannel.transfer(output_buffer)
dma_i.sendchannel.transfer(input_buffer)
dma_w.sendchannel.transfer(weight_buffer)

# transfer() only starts a DMA operation and returns immediately. Block here
# until all three channels have finished, so that later cells never read
# output_buffer while the hardware is still writing to it.
dma_i.sendchannel.wait()
dma_w.sendchannel.wait()
dma_o.recvchannel.wait()

In [None]:
# Invalidate the buffer before reading it on the CPU side.
# (Presumably this discards stale cached copies so the values written by the
# DMA are the ones read back -- see the PYNQ buffer documentation.)
output_buffer.invalidate()
print_hex_chunks(output_buffer, bits_per_word=OUTPUT_WIDTH_BITS, label="Output")

In [None]:
# --- Reconstruct Output Data ---
# Regroup the flat uint32 buffer into rows of WORDS_PER_OUTPUT elements, one
# row per output-width word.
output_chunks = output_buffer.reshape((-1, WORDS_PER_OUTPUT))
# Elements are joined in reverse order, the same convention load_mem_file
# uses when it splits a hex word (right-most 8 digits become element 0).
output_words = [
    ''.join(f"{x:08x}" for x in reversed(chunk)) for chunk in output_chunks
]

# The width in the banner is taken from the configuration, so it stays
# correct when NUM_CORES (and with it OUTPUT_WIDTH_BITS) changes.
print(f"Output received ({OUTPUT_WIDTH_BITS}-bit hex):")
for i, word in enumerate(output_words):
    print(f"Output {i}: {word}")


In [None]:
# Write the reconstructed hex words to the output file, one word per line.
with open(OUTPUT_MEM_FILE, "w") as out_file:
    out_file.writelines(f"{hex_word}\n" for hex_word in output_words)

print(f"Saved output to {OUTPUT_MEM_FILE}")

In [None]:
# Drop the notebook's references to the DMA buffers to prevent a memory leak.
# NOTE(review): other objects that still refer to a buffer (for example the
# output_chunks view created from output_buffer) may keep its memory alive
# -- confirm whether those need to be deleted as well.
del input_buffer
del weight_buffer
del output_buffer