In [None]:
# runtime of the lu block in seconds
def lu_block(bsize, sbsize, f_kernel, f_channel=156e6, f_mem=300e6):
    """Model the time, in seconds, spent on one LU block.

    Args:
        bsize: Block width in elements.
        sbsize: Sub-block width in elements.
        f_kernel: Kernel clock frequency in Hz.
        f_channel: Clock frequency of the channel to the network kernel in Hz.
        f_mem: Global memory clock frequency in Hz.

    Returns:
        The sum of the stage times: load, LU sub-block update, update of all
        sub-blocks, sending rows/columns to the network kernel, and store.
    """
    # Global <-> local memory transfers run at the slower of the two clocks,
    # and loading costs exactly as much as storing.
    transfer_clock = min(f_mem, f_kernel)
    block_transfer = (bsize * bsize / sbsize) / transfer_clock

    # LU sub-block updates; the extra 100 cycles model the latency of the
    # non-pipelined logic.
    lu_subblocks = (bsize * (sbsize + 100)) / f_kernel

    # Update of all sub-blocks.
    all_subblocks = ((bsize - sbsize) * (bsize / sbsize) * bsize / sbsize / 2) / f_kernel

    # Rows and columns handed to the network kernel, limited by the slower
    # of the kernel and channel clocks.
    network_send = (bsize * sbsize / 2) / min(f_kernel, f_channel)

    return block_transfer + lu_subblocks + all_subblocks + network_send + block_transfer


In [None]:
# runtime for top, left block update in seconds
def topleft_block(bsize, sbsize, f_kernel, f_channel=156e6, f_mem=300e6):
    """Model the time, in seconds, of one top or left block update.

    Args:
        bsize: Block width in elements.
        sbsize: Sub-block width in elements.
        f_kernel: Kernel clock frequency in Hz.
        f_channel: External channel clock frequency in Hz.
        f_mem: Global memory clock frequency in Hz.

    Returns:
        Load time + receive time + sub-block update time + store time.
    """
    # Load and store are symmetric and bound by the slower of memory/kernel.
    block_transfer = (bsize * bsize / sbsize) / min(f_mem, f_kernel)

    # Receiving the row from outside is bound by the slowest clock involved;
    # the 240 extra cycles model the high latency of the global memory access.
    slowest_clock = min(f_kernel, f_channel, f_mem)
    receive = (bsize * (bsize / sbsize + 240)) / slowest_clock

    # Update of all sub-blocks (half the work of a full inner block update).
    subblock_update = (bsize * (bsize / sbsize) * bsize / sbsize / 2) / f_kernel

    return block_transfer + receive + subblock_update + block_transfer

In [None]:
# runtime for inner block update in seconds
def inner_block(bsize, sbsize, f_kernel, f_channel=156e6, f_mem=300e6):
    """Model the time, in seconds, of one inner block update.

    Args:
        bsize: Block width in elements.
        sbsize: Sub-block width in elements.
        f_kernel: Kernel clock frequency in Hz.
        f_channel: External channel clock frequency in Hz.
        f_mem: Global memory clock frequency in Hz.

    Returns:
        Accumulated time of the load, receive, update and store stages.
    """
    memory_bound_clock = min(f_mem, f_kernel)
    stage_times = (
        # load block from global memory into the local memory buffer
        (bsize * bsize / sbsize) / memory_bound_clock,
        # receive row from extern: external-channel bound in the worst case,
        # plus 240 cycles of latency caused by the global memory access
        (bsize * (bsize / sbsize + 240)) / min(f_kernel, f_channel, f_mem),
        # update all sub-blocks
        (bsize * (bsize / sbsize) * bsize / sbsize) / f_kernel,
        # store block from local memory back to global memory
        (bsize * bsize / sbsize) / memory_bound_clock,
    )
    elapsed = 0
    for stage_time in stage_times:
        elapsed += stage_time
    return elapsed

In [None]:
# runtime of the whole calculation for a matrix with multiple blocks in seconds
def combined_single_fpga(size_in_blocks, f_kernel, bsize = 1024, f_mem=300e6):
    """Model the runtime, in seconds, of the whole calculation on one FPGA.

    Args:
        size_in_blocks: Matrix width expressed in blocks.
        f_kernel: Kernel clock frequency in Hz.
        bsize: Block width in elements.
        f_mem: Global memory clock frequency in Hz; it is also passed as the
            channel frequency of the per-block models.

    Returns:
        Accumulated time of all inner block updates, the first LU iteration
        of every block row, and one final LU block. The sub-block width is
        fixed to 8.
    """
    # Both per-row contributions are independent of the row index.
    inner_update_time = inner_block(bsize, 8, f_kernel, f_mem, f_mem)
    # First iteration of the LU update over all sub-blocks; the LU block load
    # is neglected because the first inner update already accounts for it.
    first_lu_iteration_time = ((bsize / 8) ** 2) / f_kernel

    elapsed = 0
    for block_row in range(1, size_in_blocks):
        # The inner block update is the bottleneck; everything else is
        # pipelined, so only the inner block time is counted.
        elapsed += (block_row ** 2) * inner_update_time
        elapsed += first_lu_iteration_time

    # One additional LU block is needed at the very end.
    elapsed += lu_block(bsize, 8, f_kernel, f_mem, f_mem)
    return elapsed

In [None]:
def gflops_single_fpga(size_in_blocks, f_kernel, bsize):
    """Return the modelled GFLOP/s for a matrix of `size_in_blocks` blocks.

    The operation count is 2/3 * n^3 with n = size_in_blocks * bsize, and the
    runtime comes from `combined_single_fpga`.
    """
    flop_count = 2*(size_in_blocks * bsize)**3/3
    runtime_seconds = combined_single_fpga(size_in_blocks, f_kernel, bsize)
    return flop_count / runtime_seconds * 1.0e-9

In [None]:
# Modelled GFLOP/s for an 8x8 grid of 1024-wide blocks at a 150 MHz kernel clock.
gflops_single_fpga(size_in_blocks=8, f_kernel=150e6, bsize=1024)

In [None]:
import matplotlib.pyplot as plt

# Modelled performance at a 150 MHz kernel clock for block sizes 128 .. 1024.
# Every curve gets a legend entry and the axes are labelled, so the four
# lines can be told apart when the figure is read on its own.
block_counts = list(range(16))
for exponent in range(7, 11):
    block_size = 2 ** exponent
    plt.plot(block_counts,
             [gflops_single_fpga(n, 150e6, block_size) for n in block_counts],
             label="block {}".format(block_size))
plt.xlabel("Matrix width in blocks")
plt.ylabel("GFLOP/s")
plt.legend()

In [None]:
# runtime for an additional inner update which uses matrix multiplication
def inner_block_mm(bsize, sbsize, f_kernel, f_mem=300e6, load_stalls=0.0):
    """Model the time, in seconds, of an inner update done by matrix multiplication.

    Args:
        bsize: Block width in elements.
        sbsize: Sub-block width in elements.
        f_kernel: Kernel clock frequency in Hz.
        f_mem: Global memory clock frequency in Hz.
        load_stalls: Fraction of load cycles lost to stalls, in [0, 1).
            0.0 means the load runs at full speed.

    Returns:
        Load time + sub-block update time + store time.

    Raises:
        ValueError: If `load_stalls` is outside [0, 1). A value of 1 would
            divide by zero and anything else yields a meaningless time.
    """
    if not 0.0 <= load_stalls < 1.0:
        raise ValueError(
            "load_stalls must be in the range [0, 1), got {!r}".format(load_stalls)
        )

    transfer_clock = min(f_mem, f_kernel)
    t_total = 0
    # Three blocks have to be loaded from global memory now; because of
    # memory interleaving this may happen simultaneously. Stalls reduce the
    # effective load throughput.
    t_total += (bsize * bsize / sbsize) / (transfer_clock * (1.0 - load_stalls))
    # total time needed to update all sub-blocks
    t_total += ((bsize / sbsize) ** 3) / f_kernel
    # only a single block has to be stored
    t_total += (bsize * bsize / sbsize) / transfer_clock
    return t_total

In [None]:
# runtime of the whole calculation for a matrix with multiple blocks in seconds using also the additional inner update block
def combined_single_fpga_mm(size_in_blocks, f_kernel, bsize = 1024, f_mem=300e6):
    """Model the whole-matrix runtime, in seconds, with the extra MM update kernel.

    Args:
        size_in_blocks: Matrix width expressed in blocks.
        f_kernel: Kernel clock frequency in Hz.
        bsize: Block width in elements.
        f_mem: Global memory clock frequency in Hz; it is also passed as the
            channel frequency of the per-block models.

    Returns:
        Accumulated time over all block rows plus one final LU block. The
        sub-block width is fixed to 8.
    """
    diagonal_update_time = inner_block(bsize, 8, f_kernel, f_mem, f_mem)
    mm_update_time = inner_block_mm(bsize, 8, f_kernel, f_mem)
    # First iteration of the LU update over all sub-blocks; the LU block load
    # is neglected because the first inner update already accounts for it.
    first_lu_iteration_time = ((bsize / 8) ** 2) / f_kernel

    elapsed = 0
    for block_row in range(1, size_in_blocks):
        # Inner block updates are the bottleneck, everything else is
        # pipelined. The old inner update is only used for diagonal blocks.
        elapsed += block_row * diagonal_update_time
        # All remaining blocks go through the faster MM-based update.
        elapsed += ((block_row - 1) * block_row) * mm_update_time
        elapsed += first_lu_iteration_time

    # One additional LU block is needed at the very end.
    elapsed += lu_block(bsize, 8, f_kernel, f_mem, f_mem)
    return elapsed

def gflops_single_fpga_mm(size_in_blocks, f_kernel, bsize):
    """Return the modelled GFLOP/s when the additional MM update kernel is used.

    The operation count is 2/3 * n^3 with n = size_in_blocks * bsize, and the
    runtime comes from `combined_single_fpga_mm`.
    """
    flop_count = 2*(size_in_blocks * bsize)**3/3
    runtime_seconds = combined_single_fpga_mm(size_in_blocks, f_kernel, bsize)
    return flop_count / runtime_seconds * 1.0e-9

In [None]:
import matplotlib.pyplot as plt

# Parameters shared by both curves. The title is built from them so the
# caption always matches the plotted data (it previously claimed 150MHz
# while the curves were computed for 300e6).
# NOTE(review): if 150 MHz was the intended operating point, change
# f_kernel_hz rather than the title.
f_kernel_hz = 300e6
block_size = 512
block_counts = list(range(0, 32))

plt.title("Expected performance on single FPGA with kernel frequency {:.0f}MHz and block size {}".format(
    f_kernel_hz / 1e6, block_size))
plt.ylabel("GFLOP/s")
# The x values are block counts, not element counts.
plt.xlabel("Matrix width in blocks of {} elements".format(block_size))
plt.plot(block_counts,
         [gflops_single_fpga_mm(n, f_kernel_hz, block_size) for n in block_counts],
         label="Approach using additional MM kernel")
plt.plot(block_counts,
         [gflops_single_fpga(n, f_kernel_hz, block_size) for n in block_counts],
         label= "Current approach")
plt.legend()

In [None]:
import matplotlib.pyplot as plt

plt.title("Expected performance on single FPGA with kernel frequency 150MHz an MM kernel")
plt.ylabel("GFLOP/s")
plt.xlabel("Matrix size")
# One curve per block size; the number of blocks is chosen so that every
# curve spans the same range of matrix sizes.
for block_size, block_count, curve_label in (
    (512, 16, "block 512"),
    (256, 32, "block 256"),
    (128, 64, "block 128"),
):
    matrix_sizes = [n * block_size for n in range(0, block_count)]
    gflops = [gflops_single_fpga_mm(n, 150e6, block_size) for n in range(0, block_count)]
    plt.plot(matrix_sizes, gflops, label=curve_label)
plt.legend()

In [None]:
# Modelled GFLOP/s for 16 blocks of width 512 at a 219.45 MHz kernel clock.
gflops_single_fpga_mm(size_in_blocks=16, f_kernel=219.45e6, bsize=512)

In [None]:
import math

# runtime of the whole calculation for a matrix with multiple blocks in seconds using also the additional inner update block which can be replicated
def combined_single_fpga_mm_multi(size_in_blocks, f_kernel, bsize = 1024,  mm_blocks=1, f_mem=300e6, load_stalls=0.0):
    """Model the whole-matrix runtime, in seconds, with a replicated MM update kernel.

    Args:
        size_in_blocks: Matrix width expressed in blocks.
        f_kernel: Kernel clock frequency in Hz.
        bsize: Block width in elements.
        mm_blocks: Number of replications of the MM update kernel; must be
            at least 1.
        f_mem: Global memory clock frequency in Hz; it is also passed as the
            channel frequency of the per-block models.
        load_stalls: Stall fraction forwarded to `inner_block_mm`.

    Returns:
        Accumulated time over all block rows plus one final LU block. The
        sub-block width is fixed to 8.

    Raises:
        ValueError: If `mm_blocks` is smaller than 1. Zero would divide by
            zero and negative values would yield negative runtimes.
    """
    if mm_blocks < 1:
        raise ValueError("mm_blocks must be at least 1, got {!r}".format(mm_blocks))

    # These per-block times do not depend on the block row.
    diagonal_update_time = inner_block(bsize, 8, f_kernel, f_mem, f_mem)
    mm_update_time = inner_block_mm(bsize, 8, f_kernel, f_mem, load_stalls)
    # First iteration of the LU update over all sub-blocks; the LU block load
    # is neglected because the first inner update already accounts for it.
    first_lu_iteration_time = ((bsize / 8) ** 2) / f_kernel

    t_total = 0
    for brow in range(1, size_in_blocks):
        # Inner block updates are the bottleneck, everything else is
        # pipelined. The old inner update is only used for diagonal blocks.
        t_total += brow * diagonal_update_time
        # The remaining blocks are spread over the replicated MM kernels;
        # a partially filled round still costs a full update.
        t_total += math.ceil(((brow - 1) * brow) / mm_blocks) * mm_update_time
        t_total += first_lu_iteration_time

    # One additional LU block is needed at the very end.
    t_total += lu_block(bsize, 8, f_kernel, f_mem, f_mem)
    return t_total

def gflops_single_fpga_mm_multi(size_in_blocks, f_kernel, bsize, mm_blocks, load_stalls=0.0):
    """Return the modelled GFLOP/s with `mm_blocks` replicated MM update kernels.

    The operation count is 2/3 * n^3 with n = size_in_blocks * bsize, and the
    runtime comes from `combined_single_fpga_mm_multi`.
    """
    flop_count = 2*(size_in_blocks * bsize)**3/3
    runtime_seconds = combined_single_fpga_mm_multi(
        size_in_blocks, f_kernel, bsize, mm_blocks, load_stalls=load_stalls
    )
    return flop_count / runtime_seconds * 1.0e-9

In [None]:
# Same kernel clock and block count, two block-size / replication combinations.
print(gflops_single_fpga_mm_multi(size_in_blocks=32, f_kernel=219.45e6, bsize=512, mm_blocks=3))
print(gflops_single_fpga_mm_multi(size_in_blocks=32, f_kernel=219.45e6, bsize=256, mm_blocks=5))

In [None]:
plt.title("Performance of 3 and 4 replications over the matrix size")
plt.xlabel("Matrix width in elements")
plt.ylabel("GFLOP/s")
# Each configuration pairs a replication count with its own kernel frequency.
for kernel_frequency, replications, curve_label in (
    (157.14e6, 3, "3 replications"),
    (116.67e6, 4, "4 replications"),
):
    widths_in_elements = list(range(512, 512 * 512, 512))
    gflops = [gflops_single_fpga_mm_multi(n, kernel_frequency, 512, replications) for n in range(1, 512)]
    plt.plot(widths_in_elements, gflops, label=curve_label)
plt.legend()


In [None]:
import pandas as pd
import numpy as np
# Measurement table: one row per matrix size, one column per measured
# variant. np.nan marks combinations without a value.
df_measurements = pd.DataFrame(dict(
    matrix_size=[512, 1024, 2048, 4096, 8192, 16384, 20480, 22528, 24576, 32768, 87040],
    initial=[11.09, 32.16, 56.68, 92.23, 141.45, 198.47, 214.97, 212.65, 202.10, 136.20, np.nan],
    multi_queue=[np.nan, np.nan, np.nan, 92.3, np.nan, np.nan, np.nan, 213.44, np.nan, 128.57, np.nan],
    reduce_events=[np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 216.32, 223.76, 212.05, 151.03, np.nan],
    sliding_window=[np.nan, np.nan, np.nan, 92.20, np.nan, np.nan, 216.58, np.nan, 230.58, 251.35, 302.91],
))
df_measurements

In [None]:
import numpy as np
plt.title("Compare performance of Synthesized Design to Model")
plt.xlabel("Matrix width in blocks of 512 elements")
plt.ylabel("GFLOP/s")

# Model curves: without load stalls and with 50% load stalls.
model_block_counts = list(range(1, 200))
plt.plot(model_block_counts,
         [gflops_single_fpga_mm_multi(n, 157.14e6, 512, 3) for n in model_block_counts],
         label="Model")
plt.plot(model_block_counts,
         [gflops_single_fpga_mm_multi(n, 157.14e6, 512, 3, load_stalls=0.5) for n in model_block_counts],
         label="Model, load_stalls=50%")

# Measured series, drawn in the same order as the columns of the table.
for column, series_label in (
    ("initial", "Measurements"),
    ("multi_queue", "Multi Queue Measurements"),
    ("reduce_events", "Reduced Events Measurements"),
    ("sliding_window", "JIT Release Measurements"),
):
    # Convert the matrix width from elements to blocks of 512 to match the x-axis.
    plt.scatter(df_measurements["matrix_size"] / 512, df_measurements[column], label=series_label)
plt.legend()

In [None]:
# Model value for 40 blocks of width 512, 3 MM replications and 50% load stalls.
gflops_single_fpga_mm_multi(size_in_blocks=40, f_kernel=157.14e6, bsize=512, mm_blocks=3, load_stalls=0.5)

In [None]:
# Time of a single MM-based inner update at 175 MHz with 47% load stalls.
inner_block_mm(bsize=512, sbsize=8, f_kernel=175e6, load_stalls=0.47)

In [None]:
# Noted reference value: 58.21
# NOTE(review): presumably a GFLOP/s figure to compare the model against — confirm its origin.
gflops_single_fpga_mm_multi(size_in_blocks=16, f_kernel=175e6, bsize=512, mm_blocks=3)