# Benchmark

In [None]:
!python -m pip install pandas plotly pyyaml

In [None]:
import pandas as pd, numpy as np
import os, glob, re, datetime, time, shutil
import pathlib
from pathlib import Path
import plotly as plotly, plotly.express as px
import hjson
from ruamel.yaml import YAML ## use instead of pyyaml
yaml = YAML()
# yaml.default_flow_style = False

# Helper methods
def slugify(text: str):
    """Turn arbitrary text into an identifier-like slug.

    Quotes, colons and braces are dropped, every whitespace character and
    every comma becomes an underscore, and finally all remaining non-word
    characters are removed.

    Args:
        text: The string to convert.

    Returns:
        The slug, made up of word characters only.
    """
    text = re.sub(r'[\':{\}]', '', text)
    # Whitespace and commas are both mapped to '_' (one substitution per character).
    text = re.sub(r'[\s,]', '_', text)
    return re.sub(r'\W', '', text)

# yaml dump np types
def represent_numpy_float64(self, value):
    """Represent a numpy float64 by delegating to the representer's float handler.

    Registered below via ``add_representer``; ``self`` is the representer
    instance doing the dump.
    """
    # alternatively dump as a tagged float
    float_node = self.represent_float(value)
    return float_node

def represent_numpy_int64(self, value):
    """Represent a numpy int64 by delegating to the representer's int handler.

    Registered below via ``add_representer``; ``self`` is the representer
    instance doing the dump.
    """
    # alternatively dump as a tagged int
    int_node = self.represent_int(value)
    return int_node

def represent_numpy_array(self, array, flow_style=None):
    """Represent a numpy array as a YAML sequence node.

    Each element of ``array`` is represented through ``self.represent_data``
    and collected into one ``SequenceNode`` with an empty tag.

    Args:
        self: The representer instance doing the dump.
        array: The array (any iterable of representable elements) to dump.
        flow_style: Passed to the node; when ``None`` the node is forced to
            flow style (inline ``[a, b, c]`` form).

    Returns:
        The populated ``SequenceNode``.
    """
    # Imported here because the notebook only does `from ruamel.yaml import YAML`,
    # which does not bind the name `ruamel` (the original lookup raised NameError).
    from ruamel.yaml.nodes import SequenceNode

    tag = ''  # '!numpy.ndarray'
    value = []
    # The node keeps a reference to `value`, so appending below fills the node.
    node = SequenceNode(tag, value, flow_style=flow_style)
    for elem in array:
        value.append(self.represent_data(elem))
    if flow_style is None:
        node.flow_style = True
    return node

# Register the helper functions defined above on the YAML instance's representer class,
# so numpy arrays and numpy float64/int64 scalars can be dumped.
yaml.Representer.add_representer(np.ndarray, represent_numpy_array)
yaml.Representer.add_representer(np.float64, represent_numpy_float64)
yaml.Representer.add_representer(np.int64, represent_numpy_int64)

# Compatibility with running outside of Jupyter
import subprocess
def run(cmd, env=None, dryrun=False):
    """Run a shell command, or only print it when ``dryrun`` is set.

    Args:
        cmd: The command line, passed to the shell as a single string.
        env: Optional environment mapping for the child process; ``None``
            inherits the current environment.
        dryrun: When true, print ``cmd`` instead of executing it.

    Raises:
        SystemExit: With the command's return code when it is non-zero.
    """
    # Imported locally: this notebook only imports `sys` in a later cell, so the
    # module-level name may not exist yet when this function is called.
    import sys

    if dryrun:
        print(cmd)
        return

    # shell=True because cmd is a complete shell command line; pass trusted strings only.
    retcode = subprocess.run(cmd, env=env, shell=True).returncode
    if retcode != 0:
        sys.exit(retcode)

def extend_environment(env=None, **kwargs):
    """Return an environment mapping extended with the given variables.

    Args:
        env: Base environment mapping. ``None`` means "start from a copy of
            the current process environment"; an explicitly passed empty
            mapping is respected as an empty base.
        **kwargs: Variables to add or override.

    Returns:
        A new dict; neither ``env`` nor ``os.environ`` is modified.
    """
    # Test for None explicitly: `not env` would also replace an intentionally empty dict.
    base = os.environ.copy() if env is None else dict(env)
    base.update(kwargs)
    return base

## Workflow for simulating a single app

In [None]:
# Compile hardware for Questa (vsim)
!questa-2022.3 make bin/snitch_cluster.vsim

In [None]:
# Compile software
!make DEBUG=ON sw

In [None]:
# Post process traces
!make -j traces
!make logs/perf.csv

In [None]:
# Read profile data
# The first CSV column is used as the index.
perf = pd.read_csv('logs/perf.csv', index_col=0)
# Display only the columns whose name matches the regex "1_." (an unanchored search,
# so any column name containing "1_" followed by one more character is kept).
perf.filter(regex=("1_."))

In [None]:
# Plot some results
# Scatter plot of four "1_*" columns of `perf` against the index.
fig = px.scatter(perf, y=['1_total_ipc', '1_fpss_occupancy', '1_fpss_fpu_occupancy', '1_snitch_occupancy'])
# Fix the y axis to [0, 1]; the returned figure is displayed as the cell output.
fig.update_layout(yaxis_range=[0,1])

In [None]:
# NOTE(review): `cfg_file` is interpolated from the Python namespace, but no visible cell
# assigns it -- define it before running this cell.
!make CFG_OVERRIDE={cfg_file} rtl

# Benchmark Configuration

In [None]:
# Load top-level benchmark config, where all sweep information is stored
bench_config_name = Path('bench/bench.yaml')
with open(bench_config_name) as f:
    # `yaml` is the ruamel.yaml YAML() instance created in the setup cell
    bench_config = yaml.load(f)
# Display the parsed config as the cell output
bench_config

In [None]:
# flatten into a table
# The 'hw' and 'sw' sections are normalized separately and their columns
# get a 'hw.' / 'sw.' prefix.
hw = pd.json_normalize(bench_config['hw']).add_prefix('hw.').convert_dtypes()
sw = pd.json_normalize(bench_config['sw']).add_prefix('sw.').convert_dtypes()
# Cross join: every hw row is paired with every sw row.
configs = hw.merge(sw, how='cross')

In [None]:
# Evaluate expressions, any property ending in .eval is executed
# NOTE(security): eval() executes arbitrary Python taken from the benchmark config;
# only run this cell on config files you trust.
# The pattern is anchored with `$` so that only names that really END in '.eval' are
# selected (DataFrame.filter searches, so an unanchored pattern also matches names that
# merely contain '.eval').
eval_cols = configs.filter(regex=r'\.eval$').columns.tolist()
eval_cols_short = [x.removesuffix('.eval') for x in eval_cols]
for col, short in zip(eval_cols, eval_cols_short):
    print(short)
    # Only strings are expressions; other values (numbers, missing entries) pass through.
    configs[col] = configs[col].apply(lambda x: eval(x) if isinstance(x, str) else x)

# Drop the '.eval' suffix from the evaluated columns
configs.rename(dict(zip(eval_cols, eval_cols_short)), axis=1, inplace=True)

In [None]:
# Explode sweep arrays to get all combinations to run
# each row is now a single test
# Columns are exploded one after another, so several sweep columns
# multiply into every combination of their values.
sweep_columns = [name for name in configs.columns.tolist() if 'sweep.' in name]
for name in sweep_columns:
    configs = configs.explode(name)
    # (the 'sweep.' part of the column name is intentionally kept)

# Renumber the rows and let pandas pick the best dtypes again
configs = configs.reset_index(drop=True).convert_dtypes()
configs

In [None]:
def compile_hw(config: str):
    """Announce the hardware configuration being compiled.

    Currently this only prints a message; no build is started here.
    """
    message = f'Compiling hw: {config}'
    print(message)

def compile_sw(config: str):
    """Announce the software configuration being compiled.

    Currently this only prints a message; no build is started here.

    Args:
        config: Name/description of the software configuration; it is
            included in the message, matching ``compile_hw``.
    """
    print(f'Compiling sw: {config}')

def write_test_configs(test: dict, app_config: dict, destination: Path):
    """Write the YAML config files of one test into ``destination``.

    The directory is created if needed. Two files are written with the
    notebook's ``yaml`` instance:

    * ``test-config.yaml`` -- the full config row (``test``)
    * ``config.yaml``      -- only the sw config (``app_config``), for datagen
    """
    os.makedirs(destination, exist_ok=True)

    # (file name, content) pairs, written in this order
    outputs = (
        ('test-config.yaml', test),
        ('config.yaml', app_config),
    )
    for file_name, content in outputs:
        with open(destination / file_name, 'w') as handle:
            yaml.dump(content, handle)
    
def prepare_output(output_dir: Path, bench_config_name: Path):
    """Create the output directory and copy the benchmark config into it.

    The copy keeps the file name of ``bench_config_name``; an existing
    directory is reused.
    """
    os.makedirs(output_dir, exist_ok=True)
    config_copy = output_dir / bench_config_name.name
    shutil.copy(bench_config_name, config_copy)

In [None]:
# Setup output directory
output_dir = Path('output')
prepare_output(output_dir, bench_config_name)

# Iterate over all tests, create the configs, compile, run and post process
# Nesting: hw config -> app -> sweep point -> test row.
for hw_config, hw_config_df in configs.groupby(by='hw.config'):

    compile_hw(hw_config)
    for app_config, app_config_df in hw_config_df.groupby(by='sw.app'):

        # Columns that mention this app, and the subset of them that is swept
        app_cols   = [col for col in app_config_df.columns if f'{app_config}'       in col]
        sweep_cols = [col for col in app_config_df.columns if f'{app_config}.sweep' in col]
        print(f'{app_config} sweeps: {sweep_cols}')

        # Without sweep columns the whole app frame forms a single group named 'test'.
        # A list of (name, frame) pairs is used so it unpacks like groupby() does
        # (iterating a dict would only yield its keys).
        if sweep_cols:
            sweep_groups = app_config_df.groupby(by=sweep_cols)
        else:
            sweep_groups = [('test', app_config_df)]

        for sweep, sweep_df in sweep_groups:
            for i, test in sweep_df.iterrows(): # (should be a df with one entry)
                # Remove unused properties
                test = test.dropna()

                # dropna() may have removed some of this app's columns from the row;
                # selecting a missing label would raise KeyError.
                test_app_cols   = [col for col in app_cols   if col in test.index]
                test_sweep_cols = [col for col in sweep_cols if col in test.index]

                # Get dicts for app/sweep config and remove prefixes
                app_prefix = f'sw.{test["sw.app"]}.'
                app_config_short = {name.removeprefix(app_prefix).removeprefix('sweep.'): value
                                    for name, value in test[test_app_cols].to_dict().items()}
                sweep_short      = {name.removeprefix(f'{app_prefix}sweep.'): value
                                    for name, value in test[test_sweep_cols].to_dict().items()}

                # Calculate output path for specific test
                test_path = output_dir / hw_config / app_config / slugify(str(sweep))
                test['path'] = str(test_path)

                # Print the config and sw specific config to the corresponding directory
                write_test_configs(test.to_dict(), app_config_short, test_path)

                # compile_sw(test, sw_config, output_dir)
                # run_test()
                # post_process()

In [None]:
# Display all test rows whose 'hw.config' value is 'full'
# NOTE(review): raises KeyError if the benchmark config has no hw config named 'full'.
configs.groupby(by='hw.config').get_group('full')

# Verify.py

In [None]:
# Inputs for the verification cells below, written as one literal
# (same keys, values and insertion order as assigning them one by one).
args = {
    'sim_bin':     "bin/occamy_top.vsim",
    'snitch_bin':  "sw/host/apps/offload/build/offload-gemm.elf",
    'symbols_bin': "sw/device/apps/blas/gemm/build/gemm.elf",
    'log':         None,
    'hw_config':   'cfg/1Q4C.hjson',  # 'cfg/1Q2C.hjson'
}

In [None]:
import sys
import os
from pathlib import Path
import numpy as np

sys.path.append(os.path.join(os.path.abspath(''), "../../working_dir/snitch_cluster/sw/blas/gemm"))
from data.datagen import golden_model

sys.path.append(os.path.join(os.path.abspath(''), "../../working_dir/snitch_cluster/util/sim/"))
import verification  # noqa: E402
from elf import Elf  # noqa: E402
from data_utils import bytes_to_doubles, bytes_to_uint32s  # noqa: E402

In [None]:
!make CFG_OVERRIDE={args['hw_config']} DEBUG=ON sw

In [None]:
%%time
# Run simulation and get outputs
# All paths come from the `args` dict; output_uids=['c'] requests the data that the
# next cell reads back as raw_results['c'].
raw_results = verification.simulate(sim_bin=args['sim_bin'],
                                    snitch_bin=args['snitch_bin'],
                                    symbols_bin=args['symbols_bin'],
                                    log=args['log'],
                                    output_uids=['c'])

In [None]:
# Extract input operands from ELF file
# The symbols binary is used when one is configured, otherwise the snitch binary.
elf = Elf(args['symbols_bin'] if args['symbols_bin'] else args['snitch_bin'])

alpha = 1
# Scalar parameters: each symbol is decoded as uint32 words and the first word is taken
beta, m, n, k, ta, tb = (
    bytes_to_uint32s(elf.get_symbol_contents(symbol))[0]
    for symbol in ('BETA', 'M', 'N', 'K', 'TA', 'TB')
)

# Matrices are decoded as doubles; a and b stay flat until their layout is known
a = np.array(bytes_to_doubles(elf.get_symbol_contents('a')))
b = np.array(bytes_to_doubles(elf.get_symbol_contents('b')))
c = np.array(bytes_to_doubles(elf.get_symbol_contents('c'))).reshape((m, n))
result = np.array(bytes_to_doubles(elf.get_symbol_contents('result'))).reshape((m, n))

# Extract results in output_uids
c_actual = np.array(bytes_to_doubles(raw_results['c'])).reshape((m, n))

# A set transpose flag means the operand is stored transposed: reshape to the
# stored shape first, then transpose back.
a = a.reshape((k, m)).transpose() if ta else a.reshape((m, k))
b = b.reshape((n, k)).transpose() if tb else b.reshape((k, n))

# Verify results
c_golden = golden_model(alpha, a, b, beta, c)

ERR_THRESHOLD = 0.001
absolute_err = np.abs(c_golden - c_actual)
# Number of elements whose absolute error exceeds the threshold
errors = np.count_nonzero(absolute_err > ERR_THRESHOLD)

if not errors:
    print(f'SUCCESS. Actual C matches result for dim {m} x {n}.')
else:
    print(f'Failed with {errors} errors.')
    # verification.dump_results_to_csv([c_golden, c_actual, absolute_err],
    #                                  Path.cwd() / 'gemm_results.csv')

In [None]:
# Display a 0/1 mask with a 1 wherever the absolute error exceeds the threshold
((absolute_err > ERR_THRESHOLD)*1)

In [None]:
# Display the C matrix read back from the simulation
c_actual

In [None]:
# Display the contents of the 'result' symbol extracted from the ELF file
result

In [None]:
!make -j annotate BINARY=sw/device/apps/blas/gemm/build/gemm.elf

# 2D Pipeline Indexing
Index calculations for determining the source cluster for cluster-to-cluster (C2C) DMA transfers.

In [None]:
# Works for PI == PJ
PI = 3
PJ = 3
P  = PI * PJ
# Number the P grid positions 0 .. P-1 row by row in a PI x PJ array
p = np.arange(P, dtype=int).reshape((PI, PJ))
p

In [None]:
# Row index of every position: p divided by the row length PJ, truncated to int
pi = (p / PJ).astype(int)
pi

In [None]:
# Column index of every position: remainder of p modulo the row length PJ
pj = (p % PJ).astype(int)
pj

In [None]:
# Third index derived from row and column, wrapped into 0 .. PJ-1
# (the 2*PJ offset keeps the left operand of % non-negative)
pk = (2*PJ - pi - pj -1) % PJ  # Or if k flipped: (PJ -pi + pj) % PJ
pk

In [None]:
# Position in the same row (pi) whose column is (2*PJ - pi - pk) % PJ
# presumably the source cluster for operand A -- confirm against the kernel
srca = pi * PJ + ((2*PJ - pi - pk) % PJ)
srca

In [None]:
# Position in the same column (pj) whose row is (2*PJ - pj - pk) % PJ
# presumably the source cluster for operand B -- confirm against the kernel
srcb = pj + PJ * ((2*PJ - pj - pk) % PJ)
srcb

In [None]:
# Same-row position computed directly from pi and pj (no pk):
# column is (PJ - pi - pj) % PJ
sa = pi * PJ + ((PJ - pi - pj) % PJ)
sa

In [None]:
# Position with row (PJ - pi + pj) % PJ and column offset pi
# NOTE(review): srcb uses pj as the column offset while this uses pi -- confirm this is intended
sb = pi + PJ *((PJ - pi + pj) % PJ)
sb

In [None]:
# Display only the row term of the sb formula (row index times PJ)
PJ * ((PJ - pi + pj) % PJ)

In [None]:
# Boolean mask that is True for the positions in the last column (p % PJ == PJ - 1)
# presumably these positions load from DRAM instead of another cluster -- confirm
dram = ((p+1) % PJ == 0)
dram

In [None]:
# Replace the entries selected by `dram` with -1 and keep all other entries of sa
sa = sa * (1-dram) + dram * -1
sa

In [None]:
# Replace the entries selected by `dram` with -1 and keep all other entries of sb
sb = sb * (1-dram) + dram * -1
sb

In [None]:
# Equals PJ - 1 - (p % PJ): the number of columns between a position and the last column
pipeStep = (PJ - p - 1) % PJ
pipeStep