In [239]:
# Imports
import pandas as pd
from random import randint
from src import *
from src.simulator import SIMULATOR

In [240]:
# Simulator instance used by every following cell
sim = SIMULATOR()

# --------------------------------------------
#               KERNEL CONFIGURATION
# --------------------------------------------
# Directory holding the kernel files, plus the version string that is later
# passed to the compile and load steps.
kernel_path = "./kernels/mmul/"
version = ""

# Values forwarded to sim.kernel_config below.
# NOTE(review): meanings are inferred from the names only -- confirm against src.simulator.
kernel_number = 1
column_usage = [True, False]
nInstrPerCol = 11
imem_add_start = 0
srf_spm_addres = 0

sim.kernel_config(
    column_usage,
    nInstrPerCol,
    imem_add_start,
    srf_spm_addres,
    kernel_number,
)

In [241]:
# --------------------------------------------
#                DATA SIZES
# --------------------------------------------
# DISCO-CGRA configuration
nRCs = 4                   # number of RC slices each SPM line is split into below
nElementsPerVWRSlice = 32  # rows of A / columns of B are zero-padded to this length

# Basic block: the full-size problem (rows of A, cols of A, cols of B).
# Any smaller dimension is handled as a "special case" further down.
basic_block_rows_A, basic_block_cols_A, basic_block_cols_B = 4, 32, 32

In [242]:
# Our test: A is (nRowsA x nColsA), B is (nColsA x nColsB)
nRowsA = 1
nColsA = 13
nColsB = 5

# Which special cases each (nRowsA, nColsA, nColsB) combination triggers:
# Only 1 case: (4,32,32), (3,32,32), (2,32,32), (1,32,32), (4,13,32), (4,32,5)
# Case 1 and 2: (3,13,32), (2,13,32), (1,13,32)
# Case 2 and 3: (4,13,5)
# Case 1 and 3: (3,32,5), (2,32,5), (1,32,5)
# Case 1, 2 and 3: (3,13,5), (2,13,5), (1,13,5)

# Keep the requested row count: the nRowsA == 3 exception later pads A and changes nRowsA
nRowsAOriginal = nRowsA

# Random integer operands in [1, 15); C starts as all zeros
matrix_A = np.random.randint(low=1, high=15, size=(nRowsA, nColsA))
matrix_B = np.random.randint(low=1, high=15, size=(nColsA, nColsB))
matrix_C = np.zeros(shape=(nRowsA, nColsB), dtype=int)

In [243]:
# --------------------------------------------
#                LOAD SPM DATA
# --------------------------------------------
# SPM[0] = SRF (zeros)
# SPM[1] -- SPM[32] = B_blocks (duplicated the cols for each RC)
# SPM[33] = C_block
# SPM[34] = A_block
# --------------------------------------------
# SRF[0] = Last index of the number of cols on the A block (= nColsA - 1)
# SRF[1] = Last index of the number of elements of C on each RC slice (= nElemsOfCPerRC - 1)
# SRF[2] = Line of the SPM where the block of C is stored (= c_spm_line, i.e. 33)
# --------------------------------------------

# Default SPM lines
srf_spm_line = 0
b_spm_first_line = srf_spm_line + 1
c_spm_line = b_spm_first_line + basic_block_cols_B # B will always use 32 lines independent of the number of columns
a_spm_line = c_spm_line + 1

# Default SRF values (one full SPM line, everything else stays 0)
srf = [0] * SPM_NWORDS
srf[0] = nColsA - 1   # Last index of the number of cols on the A block
srf[1] = nColsB - 1   # Last index of the number of elements of C on each RC slice
srf[2] = c_spm_line   # Line of the SPM where the block of C is stored (kept in sync with the layout above)


# ------------------------------------
# Check data dims and prepare data
# ------------------------------------
# A dimension smaller than the basic block raises the matching flag (kept as 0/1):
#   CASE 1: nRowsA < basic_block_rows_A
#   CASE 2: nColsA < basic_block_cols_A
#   CASE 3: nColsB < basic_block_cols_B
# ------------------------------------
special_case_1 = int(nRowsA < basic_block_rows_A)
special_case_2 = int(nColsA < basic_block_cols_A)
special_case_3 = int(nColsB < basic_block_cols_B)

# Report the active cases, in order
for case_flag, case_banner in (
    (special_case_1, "SPECIAL CASE 1"),
    (special_case_2, "SPECIAL CASE 2"),
    (special_case_3, "SPECIAL CASE 3"),
):
    if case_flag:
        print(case_banner)


# ----------------------
# Exception (nRowsA = 3)
# ----------------------
# Three rows are padded with one all-zero row and then handled as a 4-row block,
# so CASE 1 is cleared again (after its banner was already printed).
if nRowsA == 3:
    matrix_A = np.vstack([matrix_A, np.zeros(nColsA, dtype=int)])
    nRowsA = 4
    vector_A = matrix_A.flatten()
    special_case_1 = 0


# Auxiliaries
# C comes back interleaved with zeros whenever CASE 1 or CASE 3 is active
c_filled_w_zeros = int(bool(special_case_1 or special_case_3))
# Set to 1 further down when B gets the CASE 1 specific layout
special_case_for_b = 0


# ----------------------
# Matrix C
# ----------------------
# C starts as one SPM line full of zeros
vector_C = np.zeros(SPM_NWORDS, dtype=int)
if special_case_3:
    # Every RC slice holds nColsB elements of C
    # (recomputed in the Matrix B section when CASE 1 is active as well)
    nElemsOfCPerRC = [nColsB] * nRCs

# ----------------------
# Matrix A
# ----------------------
# Default: rows of A back to back, without padding
vector_A = matrix_A.flatten()

if special_case_2:
    # Zero-pad every row of A up to a full VWR slice
    padded_rows = [np.zeros(shape=0, dtype=int)]
    for row in range(nRowsA):
        row_a = matrix_A[row, :]
        zero_tail = np.zeros(nElementsPerVWRSlice - len(row_a), dtype=int)
        padded_rows.extend((row_a, zero_tail))
    vector_A = np.concatenate(padded_rows)

if special_case_1:
    # Fewer rows than the basic block: pad each row to a full slice and repeat it
    # nDuplicateA times (this replaces whatever CASE 2 built above)
    nDuplicateA = basic_block_rows_A // nRowsA
    replicated_rows = [np.zeros(shape=0, dtype=int)]
    for row in range(nRowsA):
        row_a = matrix_A[row, :]
        row_a_filled = np.concatenate((row_a, np.zeros(nElementsPerVWRSlice - len(row_a), dtype=int)))
        replicated_rows.append(np.tile(row_a_filled, nDuplicateA))
    vector_A = np.concatenate(replicated_rows)

# ----------------------
# Loops
# ----------------------
# With CASE 1 the elements of C are spread over the RCs, so SRF[1] becomes the
# last index of the per-RC element count: floor(nRowsA*nColsB / nRCs) - 1,
# plus one more when CASE 3 leaves a remainder.
if special_case_1:
    full_iters, leftover = divmod(nRowsA * nColsB, nRCs)
    srf[1] = full_iters - 1
    if special_case_3 and leftover != 0:
        srf[1] += 1

# ----------------------
# Matrix B
# ----------------------
# CASE 1 layout: each row of A is shared by nDuplicateA consecutive RCs, and the
# columns of B (hence the elements of C) are split between the RCs of that group.
if special_case_1:
    special_case_for_b = 1
    nDuplicateA = int (nRCs / nRowsA) # 4/2 or 4/1
    nElemsOfCPerRC = [int(nColsB/nDuplicateA) for i in range(nRCs)] # 32/2 or 32/4
    if special_case_3: # 1 & 3
        # It might happen that different RCs have different number of elements of C.
        # Hand out the leftover columns one per RC inside every row group, so that no
        # RC ends up with more than ceil(nColsB / nDuplicateA) columns -- the count the
        # loop bound in srf[1] is derived from. (Giving every leftover column to the
        # first RC of the group overflowed that bound when nColsB % nDuplicateA > 1.)
        extraElems = nColsB % nDuplicateA
        for extra in range(extraElems):
            for row in range(nRowsA):
                nElemsOfCPerRC[row*nDuplicateA + extra] += 1

        # First column of B handled by each RC of a group: running sum of the counts
        auxIndexBForRCs = np.zeros(nDuplicateA, dtype=int)
        for rc in range(1, nDuplicateA):
            auxIndexBForRCs[rc] = auxIndexBForRCs[rc-1] + nElemsOfCPerRC[rc-1]
        # Every row group walks the same columns of B
        indexBForRCs = np.tile(auxIndexBForRCs,nRowsA)

    else: # Not 3
        # All RCs hold the same number of columns, so the start is count * position in group
        indexBForRCs = np.tile([nElemsOfCPerRC[rc]*rc for rc in range(nDuplicateA)],nRowsA)


    # One SPM line per iteration: slice `rc` holds the next column of B for that RC,
    # or zeros once the RC has no columns left
    b_spm_line = b_spm_first_line
    itLoop = np.max(nElemsOfCPerRC)
    auxElemsCPerRC = nElemsOfCPerRC.copy()
    print(nElemsOfCPerRC)
    for i in range(itLoop):
        vector_B = np.zeros(shape=0,dtype=int)
        for rc in range(nRCs):
            if auxElemsCPerRC[rc] > 0:
                print(indexBForRCs[rc])
                col_b = matrix_B[:,indexBForRCs[rc]]
                indexBForRCs[rc] += 1
                auxElemsCPerRC[rc] -= 1
                # Zero-pad the column up to a full VWR slice
                col_b_filled = np.concatenate((col_b, np.zeros(nElementsPerVWRSlice - len(col_b), dtype=int)))
            else:
                col_b_filled = np.zeros(nElementsPerVWRSlice, dtype=int)
            vector_B = np.concatenate((vector_B, col_b_filled))
        sim.setSPMLine(b_spm_line, vector_B.copy())
        b_spm_line+=1


# Load SRF
sim.setSPMLine(srf_spm_line, srf.copy())

# Load matrices A and C
sim.setSPMLine(a_spm_line, vector_A.copy())
sim.setSPMLine(c_spm_line, vector_C.copy())

# Load matrix B (prepared to be filled with 0s)
# Default layout, used unless the CASE 1 layout was already written:
# one SPM line per column of B, zero-padded to a slice and repeated for every RC
if special_case_for_b == 0:
    b_spm_line = b_spm_first_line
    for col in range(nColsB):
        col_b = matrix_B[:,col]
        # Pad with the configured slice size instead of a literal 32
        col_b_filled = np.concatenate((col_b, np.zeros(nElementsPerVWRSlice - len(col_b), dtype=int)))
        vector_aux = np.tile(col_b_filled, nRCs)
        print(vector_aux)
        sim.setSPMLine(b_spm_line, vector_aux.copy())
        b_spm_line+=1

# Show the lines that were just written, addressed through the layout variables
sim.displaySPMLine(srf_spm_line)          # SRF
sim.displaySPMLine(b_spm_first_line)      # B
sim.displaySPMLine(b_spm_first_line + 1)  # B
sim.displaySPMLine(c_spm_line)            # C
sim.displaySPMLine(a_spm_line)            # A

SPECIAL CASE 1
SPECIAL CASE 2
SPECIAL CASE 3
[2, 1, 1, 1]
0
2
3
4
1
SPM 0: [12, 1, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]
SPM 1: [1, 6, 8, 13, 4, 7, 4, 13, 3, 2, 6, 4, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 2, 2, 6, 11, 10, 9, 1, 7, 14, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 9, 13, 4, 8, 13, 12, 7, 7, 7, 12, 10, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 14, 5, 2, 4, 5, 7, 2, 11, 6, 13, 3, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]
SPM 2: [5, 2, 7, 11, 11, 7, 10, 8, 11, 14, 14, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [244]:
# --------------------------------------------
#              COMPILE ASM TO HEX
# --------------------------------------------
# Assemble the kernel stored under kernel_path. In the recorded run this reads
# instructions_asm.csv and creates dsip_bitstream.h and
# instructions_hex_autogen.csv in the same directory.
sim.compileAsmToHex(kernel_path, kernel_number, version=version)

ASM to Hex
Processing file: ./kernels/mmul/instructions_asm.csv...
Creating file: ./kernels/mmul/dsip_bitstream.h
Creating file: ./kernels/mmul/instructions_hex_autogen.csv


Finally, we load the kernel into the internal memory of the specialized units and run it.

In [245]:
# --------------------------------------------
#                 LOAD KERNEL
# --------------------------------------------

# This needs the hex instructions; if you don't provide them, generate them by compiling the asm.
# The "_autogen" suffix selects the file produced by the compile step.
sim.kernel_load(
    kernel_path,
    version=version + "_autogen",
    kernel_number=kernel_number,
)

# --------------------------------------------
#               SIMULATE EXECUTION
# --------------------------------------------
# Display options handed to sim.run, in the order LCU, LSU, MXCU, RCs, SRF.
# NOTE(review): what the (empty) lists select is defined by the simulator -- confirm there.
show_lcu, show_lsu, show_mxcu, show_srf = [], [], [], []
show_rcs = [[] for _ in range(4)]
display_ops = [show_lcu, show_lsu, show_mxcu, show_rcs, show_srf]

sim.run(kernel_number, display_ops=display_ops)

Processing file: ./kernels/mmul/instructions_hex_autogen.csv...
---------------------
       PC: 0
---------------------
LSU: LOR R7, ZERO, ZERO/NOP --> ALU res = 0
RC0: NOP --> ALU res = 0
RC1: NOP --> ALU res = 0
RC2: NOP --> ALU res = 0
RC3: NOP --> ALU res = 0
MXCU: LOR R5, LAST, ZERO (VWR selected: 0, not writting SRF, R0: 0) --> ALU res = 31
LCU: NOP --> ALU res = 0
---------------------
       PC: 1
---------------------
LSU: LOR R7, ZERO, SRF(2)/LD.VWR SRF --> ALU res = 33
RC0: NOP --> ALU res = 0
RC1: NOP --> ALU res = 0
RC2: NOP --> ALU res = 0
RC3: NOP --> ALU res = 0
MXCU: LOR R6, LAST, ZERO (VWR selected: 0, not writting SRF, R0: 0) --> ALU res = 31
LCU: NOP --> ALU res = 0
---------------------
       PC: 2
---------------------
LSU: NOP/NOP --> ALU res = 0
RC0: NOP --> ALU res = 0
RC1: NOP --> ALU res = 0
RC2: NOP --> ALU res = 0
RC3: NOP --> ALU res = 0
MXCU: LOR R7, LAST, ZERO (VWR selected: 0, not writting SRF, R0: 0) --> ALU res = 31
LCU: SADD R1, ZERO, SRF(1) --> AL

We can check the result more rigorously: we define a reference matrix multiplication in Python and verify that its output matches the CGRA output.

In [246]:
def mmul(in_A, in_B, nRowsA, nColsA, nColsB, out):
    """Reference matrix multiplication, written into a flat output buffer.

    Computes ``in_A (nRowsA x nColsA) @ in_B (nColsA x nColsB)`` with plain
    nested loops and stores element (i, j) at ``out[i*nColsB + j]``
    (row-major order).

    Args:
        in_A: Left operand, indexable as ``in_A[i][k]``.
        in_B: Right operand, indexable as ``in_B[k][j]``.
        nRowsA: Number of rows of ``in_A`` to use.
        nColsA: Number of columns of ``in_A`` (= rows of ``in_B``) to use.
        nColsB: Number of columns of ``in_B`` to use.
        out: Mutable sequence with at least ``nRowsA * nColsB`` slots; it is
            modified in place.

    Returns:
        None. The result is delivered through ``out``.
    """
    for i in range(nRowsA):
        for j in range(nColsB):
            # Named `acc` so the built-in sum() is not shadowed
            acc = 0
            for k in range(nColsA):
                acc += in_A[i][k] * in_B[k][j]
            out[i*nColsB + j] = acc

In [247]:
# Get output from the CGRA: read back the SPM line where the block of C was loaded
disco_cgra_res = sim.getSPMLine(c_spm_line)

In [248]:
# Prepare output in some non-standard cases
print(disco_cgra_res)
# TODO
if c_filled_w_zeros:
    # Keep only the first nElemsOfCPerRC[rc] words of every RC slice, in RC order
    real_C = [
        word
        for rc in range(nRCs)
        for word in disco_cgra_res[rc*nElementsPerVWRSlice:rc*nElementsPerVWRSlice + nElemsOfCPerRC[rc]]
    ]
    disco_cgra_res = real_C
    print(disco_cgra_res)
if nRowsA != nRowsAOriginal:
    # A was padded with extra rows: drop the results that belong to the padding
    disco_cgra_res = disco_cgra_res[:nRowsAOriginal*nColsB]

[ 680  780    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0  637    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0 1024    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0  671    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
[680, 780, 637, 1024, 671]


In [249]:
# Reference result computed in Python, as a flat row-major list
expected_output = [0] * (nRowsAOriginal*nColsB)
mmul(matrix_A, matrix_B, nRowsAOriginal, nColsA, nColsB, expected_output)
print(len(expected_output))
print(len(disco_cgra_res))

# Positions where the CGRA result differs from the reference
errors_idx = [
    idx
    for idx in range(len(expected_output))
    if expected_output[idx] != disco_cgra_res[idx]
]

if errors_idx:
    print("Oops, something went wrong. There are " + str(len(errors_idx)) + " errors.")
    print(errors_idx)
    print("DISCO-CGRA result:")
    print(disco_cgra_res)
    print("Expected result:")
    print(expected_output)
else:
    print("The result is correct!")

5
5
The result is correct!
