In [None]:
# Imports
from random import randint

import numpy as np
import pandas as pd

from src import *
from src.simulator import SIMULATOR

In [None]:
# Simulator instance that every later cell configures, loads and runs.
sim = SIMULATOR()

# --------------------------------------------
#               KERNEL CONFIGURATION
# --------------------------------------------
# Kernel directory; also passed to compileAsmToHex and kernel_load further down.
kernel_path = './kernels/mmul/'
# Kernel identifier; reused by kernel_config, compileAsmToHex, kernel_load and run.
kernel_number = 1 
# NOTE(review): presumably one enable flag per CGRA column (only the first one used) - confirm.
column_usage = [True, False] 
# NOTE(review): presumably the number of instructions per column of this kernel - confirm.
nInstrPerCol = 11 
# NOTE(review): presumably the start address of the kernel in instruction memory - confirm.
imem_add_start = 0 
# NOTE(review): presumably the SPM line holding the SRF values (the load cell
# writes the SRF data to SPM line 0 as well) - confirm.
srf_spm_addres = 0 
# Version string passed to compileAsmToHex and, with "_autogen" appended, to kernel_load.
version=""

# Register the configuration above with the simulator.
sim.kernel_config(column_usage, nInstrPerCol, imem_add_start, srf_spm_addres, kernel_number)

In [None]:
# --------------------------------------------
#                DATA SIZES
# --------------------------------------------
# DISCO-CGRA Configuration
nRCs = 4
nElementsPerVWRSlice = 32

# Basic Block
basic_block_rows_A = 4
basic_black_cols_A = 32
basic_block_cols_B = 32

# Our test
nRowsA = 2
nColsA = 14
nColsB = 32

nRowsAOriginal = nRowsA # Auxiliar for one specific case

# Seeded local generator: the operands are still random-looking, but a failing
# comparison at the end of the notebook can be reproduced exactly on re-run.
# Change SEED to exercise a different pair of matrices.
SEED = 42
rng = np.random.default_rng(SEED)

# Operand values are drawn from [1, 15) (upper bound exclusive); C starts as zeros.
matrix_A = rng.integers(1, 15, size=(nRowsA, nColsA))
matrix_B = rng.integers(1, 15, size=(nColsA, nColsB))
matrix_C = np.zeros((nRowsA, nColsB), dtype=int)

In [None]:
# --------------------------------------------
#                LOAD SPM DATA
# --------------------------------------------
# SPM[0] = SRF (zeros)
# SPM[1] -- SPM[32] = B_blocks (duplicated the cols for each RC)
# SPM[33] = C_block
# SPM[34] = A_block
# --------------------------------------------
# SRF[0] = Last index of the number of cols on the A block (= nColsA - 1)
# SRF[1] = Last index of the number of elements of C on each RC slice (= nElemsOfCPerRC - 1)
# SRF[2] = Line of the SPM where the block of C is stored (= 33)
# --------------------------------------------

# Default SPM lines (layout: SRF, then the B lines, then C, then A)
srf_spm_line = 0
b_spm_first_line = srf_spm_line + 1
c_spm_line = b_spm_first_line + basic_block_cols_B # B will always use 32 lines independent of the number of columns
a_spm_line = c_spm_line + 1

# Default SRF values (one full SPM line, zero everywhere except the entries below)
srf = [0] * SPM_NWORDS
srf[0] = nColsA - 1 # Last index of the number of cols on the A block
srf[1] = nColsB - 1 # Last index of the number of elements of C on each RC slice
srf[2] = c_spm_line # Line of the SPM where the block of C is stored (kept in sync with the layout above)

# Default VWRS values; the special cases below may replace vector_A / vector_C
# and raise the flags when they have already prepared A or B themselves.
vector_C = matrix_C.flatten()
vector_A = matrix_A.flatten()
special_case_for_b = 0
special_case_for_a = 0

# ------------------------------------
# Check data dims and prepare data
# ------------------------------------
# CASE 1: nRowsA < basic_block_rows_A
# ------------------------------------
if nRowsA < basic_block_rows_A:
    print("SPECIAL CASE 1")
    # In this case C is loaded as one full, zero-initialised SPM line.
    vector_C = np.zeros(SPM_NWORDS, dtype=int)
    if nRowsA == 3: # Special case
        # Pad A with one all-zero row so that it becomes a regular 4-row block.
        matrix_A = np.vstack([matrix_A, np.zeros(nColsA, dtype=int)])
        nRowsA = 4
        vector_A = matrix_A.flatten()
    else:
        # Each row of A is replicated nDuplicateA times, and each replica is
        # paired with a different column of B.
        nDuplicateA = int(basic_block_rows_A / nRowsA)

        # Prepare matrix_A: zero-pad every row to a full slice, then repeat it.
        special_case_for_a = 1
        a_pieces = [np.zeros(shape=0, dtype=int)]
        for row_idx in range(nRowsA):
            a_row = matrix_A[row_idx, :]
            zero_tail = np.zeros(nElementsPerVWRSlice - len(a_row), dtype=int)
            a_pieces.append(np.tile(np.concatenate((a_row, zero_tail)), nDuplicateA))
        vector_A = np.concatenate(a_pieces)

        # Prepare matrix_B: SPM line `first_col` packs the zero-padded columns
        # first_col, first_col + nColsBPerRC, ... and repeats that group until
        # the line covers all RCs.
        special_case_for_b = 1
        nColsBPerRC = int(nColsB / nDuplicateA)
        group_repeats = int(nRCs / nDuplicateA)
        b_spm_line = b_spm_first_line
        for first_col in range(nColsBPerRC):
            padded_cols = []
            for dup in range(nDuplicateA):
                b_col = matrix_B[:, first_col + dup * nColsBPerRC]
                zero_tail = np.zeros(nElementsPerVWRSlice - len(b_col), dtype=int)
                padded_cols.append(np.concatenate((b_col, zero_tail)))
            vector_B = np.tile(np.concatenate(padded_cols), group_repeats)
            sim.setSPMLine(b_spm_line, vector_B.copy())
            b_spm_line += 1

        # Prepare loops: each RC now produces only a share of the C elements.
        nElemsOfCPerRC = int(nRowsA*nColsB/nRCs)
        srf[1] = nElemsOfCPerRC - 1

# ------------------------------------
# CASE 2: nColsA < basic_black_cols_A
# ------------------------------------
if nColsA < basic_black_cols_A:
    print("SPECIAL CASE 2")
    # B is zero-padded later, when its columns are written to the SPM.
    # A only needs padding here if CASE 1 has not already rebuilt vector_A.
    if not special_case_for_a:
        padded_rows = [np.zeros(shape=0, dtype=int)]
        for row_idx in range(nRowsA):
            a_row = matrix_A[row_idx, :]
            zero_tail = np.zeros(nElementsPerVWRSlice - len(a_row), dtype=int)
            padded_rows.append(np.concatenate((a_row, zero_tail)))
        # One full slice per row of A, laid out back to back.
        vector_A = np.concatenate(padded_rows)

# ------------------------------------
# CASE 3: nColsB < basic_block_cols_B
# -----------------------------------
# TODO: Implement this case
# -

# Load SRF
sim.setSPMLine(srf_spm_line, srf.copy())

# Load matrices A and C
sim.setSPMLine(a_spm_line, vector_A.copy())
sim.setSPMLine(c_spm_line, vector_C.copy())

# Load matrix B (prepared to be filled with 0s).
# Skipped when CASE 1 has already written its own packing of B.
if special_case_for_b == 0:
    b_spm_line = b_spm_first_line
    for col in range(nColsB):
        col_b = matrix_B[:, col]
        # Zero-pad the column to a full slice, then replicate it once per RC.
        col_b_filled = np.concatenate((col_b, np.zeros(nElementsPerVWRSlice - len(col_b), dtype=int)))
        vector_aux = np.tile(col_b_filled, nRCs)
        sim.setSPMLine(b_spm_line, vector_aux.copy())
        b_spm_line += 1

# Show the loaded lines, addressed through the layout variables so the display
# follows any change to the SPM layout.
sim.displaySPMLine(srf_spm_line)      # SRF
sim.displaySPMLine(b_spm_first_line)  # B0
sim.displaySPMLine(c_spm_line)        # C
sim.displaySPMLine(a_spm_line)        # A

In [None]:
# --------------------------------------------
#              COMPILE ASM TO HEX
# --------------------------------------------
# Compile the assembly of kernel `kernel_number` located under `kernel_path`.
# NOTE(review): the generated hex presumably carries an "_autogen" suffix, since
# the load cell passes version + "_autogen" to kernel_load - confirm in SIMULATOR.
sim.compileAsmToHex(kernel_path, kernel_number, version=version)

Finally, we load the kernel into the internal memory of the specialized units and run it.

In [None]:
# --------------------------------------------
#                 LOAD KERNEL
# --------------------------------------------

# This needs the hex instructions; if you don't provide them, generate them by compiling the asm
# (see the COMPILE ASM TO HEX cell). The version string gets "_autogen" appended here.
sim.kernel_load(kernel_path, version=version + "_autogen", kernel_number=kernel_number)

# --------------------------------------------
#               SIMULATE EXECUTION
# --------------------------------------------
# Display selections handed to sim.run, all empty here.
# NOTE(review): presumably each list names what to print for that unit during
# the run, and show_rcs holds one list per RC (four of them) - confirm in SIMULATOR.run.
show_lcu = []
show_srf = []
show_lsu = []
show_rcs = [[],[],[],[]]
show_mxcu = []
# Order passed to sim.run: LCU, LSU, MXCU, RCs, SRF (differs from the declaration order above).
display_ops = [show_lcu, show_lsu, show_mxcu, show_rcs, show_srf]

sim.run(kernel_number, display_ops=display_ops)

We can check the result more rigorously: we define the same matrix multiplication in Python and verify that its output matches the DISCO-CGRA output.

In [None]:
def mmul(in_A, in_B, nRowsA, nColsA, nColsB, out):
    """Reference matrix multiplication, written into a flat output buffer.

    Computes C = A x B for the top-left (nRowsA x nColsA) part of ``in_A`` and
    the (nColsA x nColsB) part of ``in_B``, both indexed as ``m[row][col]``.
    Element C[i][j] is stored at ``out[i*nColsB + j]`` (row-major order).

    ``out`` must be a mutable sequence with at least nRowsA*nColsB entries; it
    is modified in place and nothing is returned.
    """
    for row in range(nRowsA):
        row_offset = row * nColsB
        for col in range(nColsB):
            # Dot product of row `row` of A with column `col` of B.
            out[row_offset + col] = sum(
                in_A[row][k] * in_B[k][col] for k in range(nColsA)
            )

In [None]:
# Get output from the CGRA
# Reads back SPM line `c_spm_line`, the same line vector_C was loaded into before the run.
disco_cgra_res = sim.getSPMLine(c_spm_line)

In [None]:
# Prepare the output in some non-standard cases
# Non-zero entries of the raw result line; nothing in this cell reads it.
test = [value for value in disco_cgra_res if value != 0]
if nRowsA < basic_block_rows_A - 1:
    # Keep only the first nElemsOfCPerRC entries of each of the nRCs slices
    # (one slice every nElementsPerVWRSlice words) and join them in RC order.
    nElemsOfCPerRC = int(nRowsA*nColsB/nRCs)
    slice_starts = [rc * nElementsPerVWRSlice for rc in range(nRCs)]
    real_C = [
        value
        for start in slice_starts
        for value in disco_cgra_res[start:start + nElemsOfCPerRC]
    ]
    disco_cgra_res = real_C

In [None]:
# Compute the reference result in Python and compare it, element by element,
# with what was read back from the CGRA.
expected_output = [0] * (nRowsA*nColsB)
mmul(matrix_A, matrix_B, nRowsA, nColsA, nColsB, expected_output)

# Flat indices (row-major in C) where the CGRA result differs from the reference.
errors_idx = [
    idx
    for idx, expected in enumerate(expected_output)
    if expected != disco_cgra_res[idx]
]

if errors_idx:
    print("Oops, something went wrong. There are " + str(len(errors_idx)) + " errors.")
    print(errors_idx)
    print("DISCO-CGRA result:")
    print(disco_cgra_res)
    print("Expected result:")
    print(expected_output)
else:
    print("The result is correct!")