# Vivado - 4bit toy network

## Libraries

In [1]:
# Master switch for the long-running cells in the "Run HLS" section:
# the convert_model(...) and print_report(...) cells only execute when this is true.
RUN_HLS = True

In [2]:
# Quiet some TensorFlow console warnings on the ASIC-group servers.
# Both variables are set before TensorFlow is imported in the next cell.
import os
os.environ.update({
    'TF_XLA_FLAGS': '--tf_xla_enable_xla_devices',
    'TF_CPP_MIN_LOG_LEVEL': '3',
})

In [3]:
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras import datasets, layers, models
import keras
from keras.models import Sequential, Model
from keras.layers import *
from keras.utils import Sequence
from keras.layers import Conv2D, MaxPooling2D
from qkeras import *
from keras.callbacks import CSVLogger
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd
from pandas import read_csv
import math
import seaborn as sns
import glob
import hls4ml
#import kerop

import pyaml

from typing import Union, List, Tuple
import random

ModuleNotFoundError: No module named 'tensorflow_probability'

## Create qkeras model

In [None]:
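# Model construction helpers.
# NOTE(review): an earlier description said the model has 5 separate branches for
# x, y, cotA, cotB and covariance predictions; CreateModel below builds a single
# conv -> pooling -> dense stack with 14 outputs. Confirm which is intended.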

def var_network(var, hidden=10, output=2):
    """Build the quantized dense head on top of the tensor ``var``.

    The input is flattened, then passed through two ``QDense(hidden)`` layers,
    each followed by a ``quantized_tanh(8, 0, 1)`` activation, and finally
    through a ``QDense(output)`` layer that has neither an activation nor an
    activity regularizer.

    Args:
        var: Input Keras tensor.
        hidden: Number of units in each of the two hidden dense layers.
        output: Number of units in the final dense layer.

    Returns:
        The output tensor of the final ``QDense`` layer.
    """

    def make_dense(units, regularize_activity):
        # Fresh quantizer/regularizer objects are created for every layer,
        # exactly as when each layer is spelled out by hand.
        extra = {}
        if regularize_activity:
            extra["activity_regularizer"] = tf.keras.regularizers.L2(0.01)
        return QDense(
            units,
            kernel_quantizer=quantized_bits(8, 1, alpha=1),
            bias_quantizer=quantized_bits(8, 1, alpha=1),
            kernel_regularizer=tf.keras.regularizers.L1L2(0.01),
            **extra,
        )

    features = Flatten()(var)
    for _ in range(2):
        features = make_dense(hidden, True)(features)
        features = QActivation("quantized_tanh(8, 0, 1)")(features)
    return make_dense(output, False)(features)

def conv_network(var, kernel_size=3):
    """Build the quantized convolutional front-end on top of ``var``.

    Two ``QSeparableConv2D(5, kernel_size)`` layers are followed by one
    ``QConv2D(5, 1)`` layer; every convolution is followed by a
    ``quantized_tanh(4, 0, 1)`` activation.

    Args:
        var: Input Keras tensor.
        kernel_size: Kernel size of the two separable convolutions.

    Returns:
        The tensor produced by the last activation.
    """

    def weight_reg():
        return tf.keras.regularizers.L1L2(0.01)

    def activity_reg():
        return tf.keras.regularizers.L2(0.01)

    def tanh4():
        return QActivation("quantized_tanh(4, 0, 1)")

    feature_map = var
    for _ in range(2):
        separable = QSeparableConv2D(
            5,
            kernel_size,
            depthwise_quantizer=quantized_bits(4, 1, 1, alpha=1),
            pointwise_quantizer=quantized_bits(4, 1, 1, alpha=1),
            bias_quantizer=quantized_bits(4, 1, alpha=1),
            depthwise_regularizer=weight_reg(),
            pointwise_regularizer=weight_reg(),
            activity_regularizer=activity_reg(),
        )
        feature_map = tanh4()(separable(feature_map))

    pointwise = QConv2D(
        5,
        1,
        kernel_quantizer=quantized_bits(4, 1, alpha=1),
        bias_quantizer=quantized_bits(4, 1, alpha=1),
        kernel_regularizer=weight_reg(),
        activity_regularizer=activity_reg(),
    )
    return tanh4()(pointwise(feature_map))

def CreateModel(shape):
    """Assemble the full network for inputs of the given ``shape``.

    The Python, NumPy and TensorFlow random generators are all seeded with 42
    before any layer is created, so repeated calls start from the same seeds.

    Pipeline: ``conv_network`` -> 2x2 average pooling ->
    ``quantized_bits(8, 1, alpha=1)`` activation ->
    ``var_network(hidden=16, output=14)``.

    Args:
        shape: Shape passed to ``Input`` (without the batch dimension).

    Returns:
        A Keras ``Model`` mapping the input tensor to the 14-unit output.
    """
    # Generate the same random values on every call
    import random
    import numpy as np
    import tensorflow as tf

    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    x_in = Input(shape)

    features = conv_network(x_in)
    pooling = AveragePooling2D(
        pool_size=(2, 2),
        strides=None,
        padding="valid",
        data_format=None,
    )
    features = pooling(features)
    features = QActivation("quantized_bits(8, 1, alpha=1)")(features)
    outputs = var_network(features, hidden=16, output=14)

    return Model(inputs=x_in, outputs=outputs)

In [None]:
# Build the network for a hard-coded input shape of (13, 21, 20) and print its summary.
# NOTE: no model.compile() call and no loss are configured in this cell;
# the model is only constructed.
model=CreateModel((13,21,20))  # previously: train_3Dx.shape[1:]
model.summary()

## Show weights

In [None]:
# Print the first N_WEIGHTS values of every weight array on the console
N_WEIGHTS = 10

weights = model.get_weights()

# np.printoptions applies the options only inside the `with` block and puts the
# previous settings back on exit, including when printing raises.
with np.printoptions(threshold=np.inf, linewidth=np.inf):
    # `i` indexes the arrays returned by get_weights(), not the model's layers.
    for i, w in enumerate(weights):
        print(f"Layer {i}:")
        print(w.flatten()[:N_WEIGHTS])
        print("-----------")

## Show model

In [None]:
from tensorflow.keras.utils import plot_model

# Render the model graph to vivado_model.png
plot_model(
    model,
    to_file='vivado_model.png',
    show_shapes=True,
    show_layer_names=True,
    expand_nested=True,
    show_layer_activations=True,
)

In [None]:
# NOTE(review): print_qstats is presumably provided by the `from qkeras import *`
# star-import and prints quantization statistics for the model — confirm
# against the QKeras documentation.
print_qstats(model)

## Configure hls4ml model

In [None]:
import hls4ml

# Per-layer ('name' granularity) hls4ml configuration derived from the Keras model.
config = hls4ml.utils.config_from_keras_model(
    model,
    granularity='name',
    default_precision='fixed<16,3>',  # alternative tried: 'fixed<17,7>'
)

# The input layer result gets its own, narrower precision.
config['LayerName']['input_1']['Precision']['result'] = 'fixed<4,1>'
# Overrides tried earlier for q_dense_2 (result, accum) and q_dense_2_linear (result):
#   'fixed<25,9>' and 'fixed<19,9>'

config['Model'].update({
    # LineBuffer produces smaller implementations (Phil)
    # Encoded    may produce larger implementations (Vladimir)
    #            may be deprecated
    'ConvImplementation': 'Encoded',  # 'LineBuffer' is the default
    'Strategy': 'Latency',  # alternative: 'Resource'
})

# NOTE(review): the original author marked this conversion with "Don't need this...";
# the tracing cell further down builds its own hls4ml model from hconfig.
hls_model = hls4ml.converters.convert_from_keras_model(
    model,
    hls_config=config,
    output_dir='noslice_vivado_hls4ml_prj',
    part='xcu250-figd2104-2L-e',  # alternative: 'xc7z020clg400-1'
    io_type="io_stream",  # alternative: "io_parallel"
)

hls_model.compile()

In [None]:
# print(pyaml.dump(config))

In [None]:
from copy import deepcopy
# Work on an independent copy so that enabling tracing later does not modify `config`.
hconfig = deepcopy(config)
# `config` already carries this value from the configuration cell, so after the
# deep copy this assignment re-applies the same setting.
hconfig['LayerName']['input_1']['Precision']['result'] = 'fixed<4,1>'

In [None]:
# Show the configuration that the traced hls4ml model will be built from.
print(pyaml.dump(hconfig))

## Run qkeras and hls4ml simulation

In [None]:
# Fixed seed -> identical input traces on every run
np.random.seed(42)

# 10000 random toy inputs of shape (13, 21, 20), passed through a
# quantized_bits(4, 0, alpha=1) quantizer
input_quantizer = quantized_bits(4, 0, alpha=1)
toy_data = input_quantizer(np.random.rand(10000, 13, 21, 20)).numpy()

# Switch tracing on for every layer listed in the hls4ml configuration
for layer_name, layer_cfg in hconfig['LayerName'].items():
    print('Enable tracing for layer:', layer_name)
    layer_cfg['Trace'] = True

# NOTE(review): an older comment labelled this part "ZCU216, engineering sample"
# (with 'xczu49dr-ffvf1760-2-e' as the ZCU216 alternative); the string used here
# is the same one passed in the Alveo U250 section — confirm the intended board.
hmodel = hls4ml.converters.convert_from_keras_model(
    model,
    hls_config=hconfig,
    output_dir='noslice_hls4ml_vivado_trace_prj',
    part='xcu250-figd2104-2L-e',
    io_type="io_stream",  # alternative: "io_parallel"
)
hmodel.compile()

# Trace the hls4ml model (fixed-point precision) on the toy data
hls4ml_pred, hls4ml_trace = hmodel.trace(toy_data)

# Collect the per-layer outputs of the Keras model on the same toy data
keras_trace = hls4ml.model.profiling.get_ymodel_keras(model, toy_data)

In [None]:
# Save the first input sample and the matching Keras 'q_dense_2' output
# as space-separated values for further debugging
with open('noslice_hls4ml_vivado_trace_prj/tb_data/vivado_inputs.dat', 'w') as inputs_file:
    first_input = toy_data[0].flatten()
    inputs_file.write(' '.join(str(value) for value in first_input))

with open('noslice_hls4ml_vivado_trace_prj/tb_data/vivado_outputs.dat', 'w') as outputs_file:
    expected_output = keras_trace['q_dense_2'][0].flatten()
    outputs_file.write(' '.join(str(value) for value in expected_output))

## Show traces

In [None]:
# Print the first N_ELEMENTS traced values of every layer (first sample only)
N_ELEMENTS = 5

# np.printoptions applies the options only inside the `with` block and puts the
# previous settings back on exit, including when printing raises.
with np.printoptions(threshold=np.inf, linewidth=np.inf):
    print('input', toy_data[0][0][0][:20])
    for key, hls_values in hls4ml_trace.items():
        print('-------')
        print(key, hls_values.shape)
        print('[hls4ml]', key, hls_values[0].flatten()[:N_ELEMENTS])

        # The hls4ml trace can contain layers with no Keras layer of the same
        # name (the correlation-plot cell handles '*_linear' and '*_alpha').
        # Prefer an exact match, then the name without '_linear'; if neither
        # exists, skip the Keras line instead of raising KeyError.
        keras_key = key if key in keras_trace else key.replace('_linear', '')
        if keras_key in keras_trace:
            print('[keras] ', keras_key, keras_trace[keras_key][0].flatten()[:N_ELEMENTS])

## Plot correlation qkeras and hls4ml

In [None]:
# One scatter plot per traced layer: hls4ml values (x) against QKeras values (y)
for layer, hls_values in hls4ml_trace.items():
    print(layer)
    # '*_alpha' layers are not plotted
    if '_alpha' in layer:
        continue

    plt.figure()

    # '*_linear' layers are compared against the Keras layer without the suffix;
    # str.replace leaves names without the suffix untouched.
    klayer = layer.replace('_linear', '')
    keras_values = keras_trace[klayer]

    plt.scatter(hls_values.flatten(), keras_values.flatten(), s=0.2)

    # Diagonal reference line spanning the combined value range
    lower = min(np.amin(hls_values), np.amin(keras_values))
    upper = max(np.amax(hls_values), np.amax(keras_values))
    plt.plot([lower, upper], [lower, upper], c='gray')

    plt.xlabel(f'hls4ml {layer}')
    plt.ylabel(f'QKeras {klayer}')

## Run HLS

In [None]:
def _write_samples_dat(samples, path):
    """Write ``samples`` to ``path``, one flattened sample per line.

    Every value is followed by a single space and every sample by a newline;
    trailing whitespace at the very end of the file is stripped.
    """
    lines = (
        ''.join(str(val) + ' ' for val in sample.flatten()) + '\n'
        for sample in samples
    )
    with open(path, 'w') as f:
        f.write(''.join(lines).rstrip())


def convert_model(keras_model,
                  hls_backend='Vivado',
                  io_type='io_stream',
                  conv_implementation='LineBuffer',
                  fpga_part='xcu250-figd2104-2L-e',
                  output_dir='hls4ml_vivado_{}_{}_{}_{}_prj'
                 ):
    """Convert ``keras_model`` with hls4ml, write the project and run the build.

    Args:
        keras_model: Keras model to convert. Its hls4ml config is expected to
            contain a layer named ``'input_1'``.
        hls_backend: Name of the hls4ml backend.
        io_type: hls4ml IO type passed to the backend's initial config.
        conv_implementation: Value stored in ``config['Model']['ConvImplementation']``.
        fpga_part: FPGA part string passed to the backend's initial config.
        output_dir: Format string with four ``{}`` slots, filled in order with the
            lower-cased backend, IO type, conv implementation and FPGA part.

    Returns:
        Tuple ``(report, fpga_part)``, where ``report`` is the dictionary returned
        by the hls4ml build with ``'CSimResults'`` and ``'CosimResults'`` removed.

    Side effects:
        Creates the output directory, writes ``input_data.dat`` and
        ``output_data.dat`` into it, writes the HLS project, and prints the report.
        The build is started with ``reset=True``.
    """
    from pathlib import Path

    yaml_config = hls4ml.backends.get_backend(hls_backend).create_initial_config(
        part=fpga_part,
        io_type=io_type,
    )
    yaml_config['Backend'] = hls_backend
    yaml_config['ProjectName'] = 'smartpixels'

    config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name', default_precision='fixed<16,3>')
    config['LayerName']['input_1']['Precision']['result'] = 'fixed<4,1>'

    # LineBuffer produces smaller implementations (Phil)
    # Encoded    may produce larger implementations (Vladimir)
    #            may be deprecated
    config['Model']['ConvImplementation'] = conv_implementation

    yaml_config['HLSConfig'] = config

    # NOTE(review): the optimizer name is hard-coded to the 'vivado' prefix
    # regardless of hls_backend — confirm before using another backend.
    hls4ml.model.optimizer.get_optimizer('vivado:fifo_depth_optimization').configure(profiling_fifo_depth=100_000)
    yaml_config['HLSConfig']['Flows'] = ['vivado:fifo_depth_optimization']

    yaml_config['KerasModel'] = keras_model
    project_dir = output_dir.format(hls_backend.lower(), io_type.lower(), conv_implementation.lower(), fpga_part.lower())
    yaml_config['OutputDir'] = project_dir

    # At this point you should verify that the yaml matches the one you already
    # have (check the output directory of the existing project)

    # About 10 samples should be enough; more samples make cosimulation take
    # **A LOT** longer. The shape is hard-coded to this notebook's model input.
    x = quantized_bits(4, 0, alpha=1)(np.random.rand(10,13,21,20)).numpy()

    Path(project_dir).mkdir(parents=True, exist_ok=True)

    # Cosimulation needs both the samples and the expected predictions in the testbench.
    input_data_path = project_dir + '/input_data.dat'
    _write_samples_dat(x, input_data_path)
    yaml_config['InputData'] = input_data_path

    y = keras_model.predict(x)
    output_data_path = project_dir + '/output_data.dat'
    _write_samples_dat(y, output_data_path)
    yaml_config['OutputPredictions'] = output_data_path

    hls_model = hls4ml.converters.keras_to_hls(yaml_config)
    hls_model.write()

    # reset=True will nuke any existing synthesis, use with care
    report = hls_model.build(csim=False, synth=True, cosim=True, validation=False, export=True, vsynth=True, reset=True)
    # These entries are not needed and spam the output
    report.pop('CSimResults', None)
    report.pop('CosimResults', None)
    print(report)
    return report, fpga_part

In [None]:
import subprocess

def run_command(cmd):
    """Run ``cmd`` through the shell and return its captured output.

    Args:
        cmd: Command line as a single string (executed with ``shell=True``).

    Returns:
        Tuple ``(stdout, stderr)`` of decoded text. The exit status is not
        checked, so a failing command does not raise.
    """
    completed = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    return completed.stdout, completed.stderr

def print_report(report, fpga_part='?', vivado_version='?', hls=True, syn=True, cosim=True):
    """Pretty-print selected sections of an hls4ml build report.

    Args:
        report: Dictionary holding ``'CSynthesisReport'``, ``'VivadoSynthReport'``
            and ``'CosimReport'`` entries. An entry is only read when a section
            that needs it is enabled.
        fpga_part: FPGA part string shown in the HLS header.
        vivado_version: Vivado version string shown in the HLS header.
        hls: Print the HLS (C synthesis) section.
        syn: Print the Vivado synthesis section. It also reads
            ``'CSynthesisReport'`` for the ``Available*`` resource totals.
        cosim: Print the cosimulation section.

    Raises:
        KeyError: If an enabled section needs an entry missing from ``report``.
    """
    separator = '-----------------------------------'

    # (label padded to the value column, HLS report key, synthesis report key)
    resource_rows = (
        ('BRAM_18K:           ', 'BRAM_18K', 'BRAM_18K'),
        ('DSP:                ', 'DSP', 'DSP48E'),
        ('FF:                 ', 'FF', 'FF'),
        ('LUT:                ', 'LUT', 'LUT'),
    )

    def print_usage(label, used, available):
        percent = float(used) * 100.0 / int(available)
        print('{}{}, {:0.1f}% (Aval. {})'.format(label, used, percent, available))

    # The synthesis section also needs the HLS report for the available totals.
    hls_results = report['CSynthesisReport'] if (hls or syn) else None

    if hls:
        print(separator)
        print('Vivado version: {}'.format(vivado_version))
        print('FPGA part:      {}'.format(fpga_part))
        print(separator)
        print('HLS')
        print('Target Clock Period:    {} ns'.format(hls_results['TargetClockPeriod']))
        print('Estimated Clock Period: {} ns'.format(hls_results['EstimatedClockPeriod']))
        print('Best/Worst Latency: {} / {}'.format(hls_results['BestLatency'], hls_results['WorstLatency']))
        print('Interval Min/Max:   {} / {}'.format(hls_results['IntervalMin'], hls_results['IntervalMax']))
        for label, hls_key, _ in resource_rows:
            print_usage(label, hls_results[hls_key], hls_results['Available' + hls_key])
        # URAM is intentionally not reported.
    if syn:
        syn_results = report['VivadoSynthReport']
        print(separator)
        print('Synthesis')
        for label, hls_key, syn_key in resource_rows:
            print_usage(label, syn_results[syn_key], hls_results['Available' + hls_key])
    # Guarded by `cosim`; this section used to be tied to `syn` by mistake.
    if cosim:
        cosim_results = report['CosimReport']
        print(separator)
        print('Cosimulation')
        print('Max/Min Latency:    {} / {}'.format(cosim_results['LatencyMax'], cosim_results['LatencyMin']))
        print('Avg Latency:        {}'.format(cosim_results['LatencyAvg']))
        print('Max/Min Interval:   {} / {}'.format(cosim_results['IntervalMax'], cosim_results['IntervalMin']))
        print('Avg Interval:       {}'.format(cosim_results['IntervalAvg']))
    print(separator)

In [None]:
# Query the installed Vivado version; the version is taken from the second
# whitespace-separated token of the command output.
stdout, stderr = run_command('vivado -version')
version_tokens = stdout.split()
# Fall back to '?' (the same placeholder print_report defaults to) when the
# command produced no usable output, e.g. because vivado is not on the PATH.
vivado_version = version_tokens[1] if len(version_tokens) > 1 else '?'

### Alveo U250

In [None]:
%%time 
# Build for the 'xcu250-figd2104-2L-e' part (skipped unless RUN_HLS is set)
if RUN_HLS:
    report, fpga_part = convert_model(
        model,
        hls_backend='Vivado',
        io_type='io_stream',
        conv_implementation='LineBuffer',  # alternative: 'Encoded'
        fpga_part='xcu250-figd2104-2L-e',
        output_dir='noslice_hls4ml_{}_{}_{}_{}_prj',
    )

In [None]:
# Summarise the report of the preceding 'xcu250-figd2104-2L-e' build
if RUN_HLS:
    print_report(
        report,
        fpga_part='xcu250-figd2104-2L-e',
        vivado_version=vivado_version,
    )

#### Previous results

### PYNQ-Z2

In [None]:
%%time 
# Build for the 'xc7z020clg400-1' part (skipped unless RUN_HLS is set)
if RUN_HLS:
    report, fpga_part = convert_model(
        model,
        hls_backend='Vivado',
        io_type='io_stream',
        conv_implementation='LineBuffer',  # alternative: 'Encoded'
        fpga_part='xc7z020clg400-1',
        output_dir='noslice_hls4ml_{}_{}_{}_{}_prj',
    )

In [None]:
# Summarise the report of the preceding 'xc7z020clg400-1' build
if RUN_HLS:
    print_report(
        report,
        fpga_part="xc7z020clg400-1",
        vivado_version=vivado_version,
    )

#### Previous results