# Tensorflow to Qkeras to Hls4ml


In [1]:
import numpy as np

seed = 0
np.random.seed(seed)

import tensorflow as tf

tf.random.set_seed(seed)


import sys

sys.path.append("/home/webphy/Desktop/dnn_processor/")  # just to enable `dataset`
sys.path.append(
    "/home/webphy/Desktop/dnn_processor/dataset/"
)  # just to enable `dataset`

import dataset
import plotting
import qkeras
import keras

2024-02-04 10:26:14.786514: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-04 10:26:14.812774: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-04 10:26:14.812797: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-04 10:26:14.814318: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-04 10:26:14.820570: I tensorflow/core/platform/cpu_feature_guar

# fetch dataset


In [2]:
BATCH_SIZE = 64
train_images_rgx = "../dataset/resized_dataset_texturas_v2/*/*.jpeg"
valid_images_rgx = "../dataset/resized_dataset_texturas_v2_test/*/*.jpeg"

train_ds, val_ds, nclasses = dataset.create_datasets(
    train_images_rgx, valid_images_rgx, BATCH_SIZE
)

2024-02-04 10:26:16.660495: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-04 10:26:16.711052: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-04 10:26:16.711173: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

# load the model


In [3]:
exp_id = "exp7_no_depth_w_GAP_multiObj"
trial_id = 719

model = keras.models.load_model(
    f"../neural_network/automl/saved_models/striped_models/{exp_id}/{trial_id}.h5",
    compile=False,
)
model.compile(
    "adam", loss=tf.keras.losses.categorical_crossentropy, metrics=["accuracy"]
)

x = x_in = keras.Input(shape=(256, 256, 3))
x = keras.layers.Activation("linear")(x)
for l in model.layers[1:]:
    x = l(x)


new_model = keras.models.Model(x_in, x)
new_model.compile(
    "adam", loss=tf.keras.losses.categorical_crossentropy, metrics=["accuracy"]
)

score = model.evaluate(val_ds)
score = new_model.evaluate(val_ds)
print("Accuracy: {}".format(score[1]))

2024-02-04 10:26:19.474551: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904


Accuracy: 0.8600000143051147


# quantize model


In [4]:
import qkeras
from qkeras import *
from keras.layers import Conv2D, DepthwiseConv2D, Activation, Dense, Flatten


BIT_WIDTH = 8

def create_qkeras_model_from_keras_model(keras_model):
    x = x_in = keras.Input(shape=keras_model.layers[0].output.shape[1:])

    x = QActivation(quantized_relu(BIT_WIDTH, 0))(x)

    for l in keras_model.layers:
        if isinstance(l, Conv2D):
            qconv2d = QConv2D(
                l.filters,
                l.kernel_size,
                l.strides,
                l.padding,
                kernel_quantizer=quantized_bits(BIT_WIDTH, 0, 1),
                bias_quantizer=quantized_bits(BIT_WIDTH, 0, 1),
            )
            x = qconv2d(x)
            qconv2d.set_weights(l.get_weights())

        elif isinstance(l, DepthwiseConv2D):
            qDepthwiseConv2D = QDepthwiseConv2D(
                l.kernel_size,
                l.strides,
                l.padding,
                depthwise_quantizer=quantized_bits(BIT_WIDTH, 0, 1),
                bias_quantizer=quantized_bits(BIT_WIDTH, 0, 1),
            )
            x = qDepthwiseConv2D(x)
            qDepthwiseConv2D.set_weights(l.get_weights())

        elif isinstance(l, Activation):
            activation_name = l.get_config()["activation"]
            print(activation_name)
            if activation_name == "tanh":
                x = QActivation(quantized_tanh(BIT_WIDTH, True))(x)
            elif activation_name == "relu":
                x = QActivation(quantized_relu(BIT_WIDTH, 0))(x)
            elif activation_name == "linear":
                x = QActivation(quantized_bits(BIT_WIDTH, 0, 1))(x)
            else:
                print(f"{l.name} was not quantized")

        elif isinstance(l, Flatten):
            # necessary because the layer before Flatten doesnt have avtivation
            x = QActivation(quantized_bits(BIT_WIDTH, 0, 1))(x)
            x = Flatten()(x)

        elif isinstance(l, Dense):
            qdense = QDense(
                l.units,
                kernel_quantizer=quantized_bits(BIT_WIDTH, 0, 1),
                bias_quantizer=quantized_bits(BIT_WIDTH, 0, 1),
            )
            x = qdense(x)
            qdense.set_weights(l.get_weights())
        
        elif isinstance(l, GlobalAveragePooling2D):
            globalAveragePooling2D = GlobalAveragePooling2D()
            x = globalAveragePooling2D(x)
            x = QActivation(quantized_bits(BIT_WIDTH, 0, 1))(x)
        else:
            print(f"{l.name} layer was not quantized...")    

    qkeras_model = keras.models.Model(x_in, x)

    return qkeras_model



qkeras_model = create_qkeras_model_from_keras_model(model)
qkeras_model.compile(
    keras.optimizers.Adam(0.0005),
    loss=keras.losses.CategoricalCrossentropy(True),
    metrics=["accuracy"],
)
qkeras_model.summary()

input_1 layer was not quantized...
linear
relu
relu
softmax
activation_2237 was not quantized
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 256, 256, 3)]     0         
                                                                 
 q_activation (QActivation)  (None, 256, 256, 3)       0         
                                                                 
 q_conv2d (QConv2D)          (None, 51, 51, 8)         392       
                                                                 
 q_activation_1 (QActivatio  (None, 51, 51, 8)         0         
 n)                                                              
                                                                 
 q_conv2d_1 (QConv2D)        (None, 17, 17, 24)        216       
                                                                 
 q_activation_2 (QActivatio  (N

In [5]:
from qkeras.estimate import print_qstats

qkeras_model.evaluate(val_ds)
print_qstats(qkeras_model)

Instructions for updating:
Use ref() instead.


Instructions for updating:
Use ref() instead.



Number of operations in model:
    q_conv2d                      : 998784 (smult_8_8)
    q_conv2d_1                    : 55488 (smult_8_8)
    q_conv2d_2                    : 243000 (smult_8_8)

Number of operation types in model:
    smult_8_8                     : 1297272

Weight profiling:
    q_conv2d_weights               : 384   (8-bit unit)
    q_conv2d_bias                  : 8     (8-bit unit)
    q_conv2d_1_weights             : 192   (8-bit unit)
    q_conv2d_1_bias                : 24    (8-bit unit)
    q_conv2d_2_weights             : 1080  (8-bit unit)
    q_conv2d_2_bias                : 5     (8-bit unit)

Weight sparsity:
... quantizing model
    q_conv2d                       : 0.0102
    q_conv2d_1                     : 0.0093
    q_conv2d_2                     : 0.0074
    ----------------------------------------
    Total Sparsity                 : 0.0083


In [6]:
# free memory
del model

# fine tune quantized model


In [7]:
qkeras_model.fit(train_ds, batch_size=256, epochs=5, validation_data=val_ds)

Epoch 1/5


2024-02-04 10:26:23.964619: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape ingradient_tape/model_1/q_activation_3/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer
2024-02-04 10:26:24.817699: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f8f608a19d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-04 10:26:24.817718: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2024-02-04 10:26:24.820776: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1707053184.877581  413581 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f94dc1542b0>

In [8]:
qkeras_model.save(f"saved_qmodels/{trial_id}.h5")

  saving_api.save_model(


In [9]:
from qkeras.utils import _add_supported_quantized_objects

co = {}
_add_supported_quantized_objects(co)

loaded_qkeras_model = keras.models.load_model(f"saved_qmodels/{trial_id}.h5", co)
loaded_qkeras_model.evaluate(val_ds)

# free memory
del loaded_qkeras_model



# convert quantized model to HLS4ml


In [10]:
import hls4ml

config = hls4ml.utils.config_from_keras_model(
    qkeras_model,
    granularity="name",
    default_precision="fixed<8,1>",
    default_reuse_factor=4096,
)

config["Model"]["Strategy"] = "Resource"

for layer in config["LayerName"].keys():
    config["LayerName"][layer]["Strategy"] = "Resource"


Interpreting Model
Topology:
Layer name: input_2, layer type: InputLayer, input shapes: [[None, 256, 256, 3]], output shape: [None, 256, 256, 3]
Layer name: q_activation, layer type: Activation, input shapes: [[None, 256, 256, 3]], output shape: [None, 256, 256, 3]
Layer name: q_conv2d, layer type: QConv2D, input shapes: [[None, 256, 256, 3]], output shape: [None, 51, 51, 8]
Layer name: q_activation_1, layer type: Activation, input shapes: [[None, 51, 51, 8]], output shape: [None, 51, 51, 8]
Layer name: q_conv2d_1, layer type: QConv2D, input shapes: [[None, 51, 51, 8]], output shape: [None, 17, 17, 24]
Layer name: q_activation_2, layer type: Activation, input shapes: [[None, 17, 17, 24]], output shape: [None, 17, 17, 24]
Layer name: q_conv2d_2, layer type: QConv2D, input shapes: [[None, 17, 17, 24]], output shape: [None, 15, 15, 5]
Layer name: q_activation_3, layer type: Activation, input shapes: [[None, 15, 15, 5]], output shape: [None, 15, 15, 5]
Layer name: global_average_pooling2d,



In [11]:

plotting.print_dict(config)

Model
  Precision:         fixed<8,1>
  ReuseFactor:       4096
  Strategy:          Resource
  BramFactor:        1000000000
  TraceOutput:       False
LayerName
  input_2
    Trace:           False
    Precision
      result:        fixed<8,1>
    Strategy:        Resource
  q_activation
    Trace:           False
    Precision
      result:        ufixed<8,0,RND_CONV,SAT>
    Strategy:        Resource
  q_conv2d
    Trace:           False
    Precision
      result:        fixed<8,1>
      weight:        fixed<8,1>
      bias:          fixed<8,1>
    Strategy:        Resource
  q_conv2d_linear
    Trace:           False
    Precision
      result:        fixed<8,1>
    Strategy:        Resource
  q_activation_1
    Trace:           False
    Precision
      result:        fixed<8,1,RND_CONV,SAT>
    Strategy:        Resource
  q_conv2d_1
    Trace:           False
    Precision
      result:        fixed<8,1>
      weight:        fixed<8,1>
      bias:          fixed<8,1>
    Strate

In [None]:

hls_model = hls4ml.converters.convert_from_keras_model(
    qkeras_model,
    hls_config=config,
    output_dir=f"model_q{trial_id}/hls4ml_prj",
    part="xc7z020-clg400-1",
    io_type="io_stream",
)

In [None]:
hls4ml.utils.plot_model(hls_model, show_shapes=True, show_precision=True, to_file=None)

In [None]:
hls_model.compile()

In [None]:
hls_model.build(csim=False)

In [None]:
hls4ml.report.read_vivado_report(f"model_q{trial_id}/hls4ml_prj")

# build for Pynq

In [None]:

hls_model = hls4ml.converters.convert_from_keras_model(
    qkeras_model,
    hls_config=config,
    output_dir=f"deploy/qmodel_{trial_id}/hls4ml_prj",
    part="xc7z020-clg400-1",
    io_type="io_stream",
    backend='VivadoAccelerator',
    board='pynq-z2'
)

hls_model.compile()
hls_model.build(csim=False, export=True, bitfile=True)

In [None]:
hls4ml.report.read_vivado_report(f"deploy/qmodel_{trial_id}/hls4ml_prj")

# FIFO depth optimization

In [None]:
import hls4ml


config["Flows"] = ["vivado:fifo_depth_optimization"]
hls4ml.model.optimizer.get_optimizer("vivado:fifo_depth_optimization").configure(
    profiling_fifo_depth=100_000
)


hls_model = hls4ml.converters.convert_from_keras_model(
    qkeras_model,
    hls_config=config,
    io_type="io_stream",
    output_dir=f"deploy/qmodel_{trial_id}_fifo_opt/hls4ml_prj",
    part="xc7z020clg400-1",
    backend="Vivado",
)

hls_model.build(reset=False, csim=True, synth=True, cosim=True)

In [None]:
hls4ml.report.read_vivado_report(f"deploy/qmodel_{trial_id}_fifo_opt/hls4ml_prj")

# FIFO depth optimization for PYNQ

In [12]:
hls_model = hls4ml.converters.convert_from_keras_model(
    qkeras_model,
    hls_config=config,
    output_dir=f"deploy/qmodel_{trial_id}_fifo_opt_pynq/hls4ml_prj",
    part="xc7z020clg400-1",
    io_type="io_stream",
    backend='VivadoAccelerator',
    board='pynq-z2'
)

Interpreting Model
Topology:
Layer name: input_2, layer type: InputLayer, input shapes: [[None, 256, 256, 3]], output shape: [None, 256, 256, 3]
Layer name: q_activation, layer type: Activation, input shapes: [[None, 256, 256, 3]], output shape: [None, 256, 256, 3]
Layer name: q_conv2d, layer type: QConv2D, input shapes: [[None, 256, 256, 3]], output shape: [None, 51, 51, 8]
Layer name: q_activation_1, layer type: Activation, input shapes: [[None, 51, 51, 8]], output shape: [None, 51, 51, 8]
Layer name: q_conv2d_1, layer type: QConv2D, input shapes: [[None, 51, 51, 8]], output shape: [None, 17, 17, 24]
Layer name: q_activation_2, layer type: Activation, input shapes: [[None, 17, 17, 24]], output shape: [None, 17, 17, 24]
Layer name: q_conv2d_2, layer type: QConv2D, input shapes: [[None, 17, 17, 24]], output shape: [None, 15, 15, 5]
Layer name: q_activation_3, layer type: Activation, input shapes: [[None, 15, 15, 5]], output shape: [None, 15, 15, 5]
Layer name: global_average_pooling2d,

In [13]:
hls_model.compile()

Writing HLS project
Done


  saving_api.save_model(


In [14]:
hls_model.build(
    reset=False, csim=True, synth=True, cosim=True, export=True, bitfile=True
)


****** Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC v2020.1 (64-bit)
  **** SW Build 2902540 on Wed May 27 19:54:35 MDT 2020
  **** IP Build 2902112 on Wed May 27 22:43:36 MDT 2020
    ** Copyright 1986-2020 Xilinx, Inc. All Rights Reserved.

source /opt/Xilinx/Vivado/2020.1/scripts/vivado_hls/hls.tcl -notrace
INFO: [HLS 200-10] Running '/opt/Xilinx/Vivado/2020.1/bin/unwrapped/lnx64.o/vivado_hls'
INFO: [HLS 200-10] For user 'webphy' on host 'wesley-XPS-8950' (Linux_x86_64 version 6.5.0-15-generic) on Sun Feb 04 10:28:44 -03 2024
INFO: [HLS 200-10] On os Ubuntu 22.04.3 LTS
INFO: [HLS 200-10] In directory '/home/webphy/Desktop/dnn_processor/hls4ml/deploy/qmodel_719_fifo_opt_pynq/hls4ml_prj'
Sourcing Tcl script 'build_prj.tcl'
INFO: [HLS 200-10] Creating and opening project '/home/webphy/Desktop/dnn_processor/hls4ml/deploy/qmodel_719_fifo_opt_pynq/hls4ml_prj/myproject_prj'.
INFO: [HLS 200-10] Adding design file 'firmware/myproject_axi.cpp' to the project
INFO: [HLS 200-1

ERROR: [Common 17-69] Command failed: Run 'impl_1' failed. Unable to open


INFO: [Common 17-206] Exiting Vivado at Sun Feb  4 10:39:52 2024...
Vivado synthesis report not found.
Implementation report not found.
Timing report not found.


{'CSimResults': [['{', 'data:', '0,', 'last:', '0', '}'],
  ['{', 'data:', '0,', 'last:', '0', '}'],
  ['{', 'data:', '0,', 'last:', '0', '}'],
  ['{', 'data:', '0,', 'last:', '0', '}'],
  ['{', 'data:', '0,', 'last:', '1', '}'],
  []],
 'CosimResults': [['{', 'data:', '0,', 'last:', '0', '}'],
  ['{', 'data:', '0.0078125,', 'last:', '0', '}'],
  ['{', 'data:', '0,', 'last:', '0', '}'],
  ['{', 'data:', '0,', 'last:', '0', '}'],
  ['{', 'data:', '0,', 'last:', '1', '}'],
  []],
 'CSynthesisReport': {'TargetClockPeriod': '5.00',
  'EstimatedClockPeriod': '6.887',
  'BestLatency': '25755284',
  'WorstLatency': '25756366',
  'IntervalMin': '1703938',
  'IntervalMax': '25755650',
  'BRAM_18K': '331',
  'DSP': '5',
  'FF': '19930',
  'LUT': '37522',
  'URAM': '0',
  'AvailableBRAM_18K': '280',
  'AvailableDSP': '220',
  'AvailableFF': '106400',
  'AvailableLUT': '53200',
  'AvailableURAM': '0'},
 'CosimReport': {'RTL': 'Verilog',
  'Status': 'Pass',
  'LatencyMin': 1703993,
  'LatencyMax': 

In [15]:
hls4ml.report.read_vivado_report(f"deploy/qmodel_{trial_id}_fifo_opt_pynq/hls4ml_prj")

Found 1 solution(s) in deploy/qmodel_719_fifo_opt_pynq/hls4ml_prj/myproject_prj.
Reports for solution "solution1":

C SIMULATION RESULT:
INFO: [SIM 2] *************** CSIM start ***************
INFO: [SIM 4] CSIM will launch GCC as the compiler.
   Compiling ../../../../myproject_test.cpp in debug mode
   Compiling ../../../../firmware/myproject.cpp in debug mode
   Compiling ../../../../firmware/myproject_axi.cpp in debug mode
   Generating csim.exe
INFO: Unable to open input/predictions file, using default input.
{ data: 0, last: 0 }
 { data: 0, last: 0 }
 { data: 0, last: 0 }
 { data: 0, last: 0 }
 { data: 0, last: 1 }
 
INFO: Saved inference results to file: tb_data/csim_results.log
INFO: [SIM 1] CSim done with 0 errors.
INFO: [SIM 3] *************** CSIM finish ***************

SYNTHESIS REPORT:
== Vivado HLS Report for 'myproject_axi'
* Date:           Sun Feb  4 10:29:57 2024

* Version:        2020.1 (Build 2897737 on Wed May 27 20:21:37 MDT 2020)
* Project:        myproject_pr