In [2]:
import argparse
import math
import os
import subprocess
import tempfile
import xml.etree.ElementTree as ET
from copy import deepcopy
from datetime import datetime
from pathlib import Path
from typing import List, Union

import keras_cv
import numpy as np
import onnx
import onnx2keras
import onnx2tf
import tensorflow as tf
import tensorflow_model_optimization as tfmot
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
import torch_pruning as tp
import ultralytics
from keras_cv import bounding_box, visualization
from matplotlib import pyplot as plt
from onnx2keras import onnx_to_keras
from rich.pretty import pprint
from tensorflow import keras
from tqdm.auto import tqdm
from ultralytics import YOLO, __version__

# from ultralytics.engine.model import TASK_MAP
from ultralytics.engine.trainer import BaseTrainer
from ultralytics.nn.modules import Bottleneck, C2f, Conv, Detect
from ultralytics.nn.tasks import attempt_load_one_weight
from ultralytics.utils import (
    DEFAULT_CFG_DICT,
    DEFAULT_CFG_KEYS,
    LOGGER,
    RANK,
    yaml_load,
)
from ultralytics.utils.checks import check_yaml
from ultralytics.utils.torch_utils import de_parallel, initialize_weights

2024-05-11 20:46:54.426877: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-11 20:46:54.486307: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-11 20:46:54.900496: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using TensorFlow backend


  from .autonotebook import tqdm as notebook_tqdm


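TensorFlow does not detect the GPU, whereas Ultralytics (PyTorch) uses it without problems. I have not found the root cause yet; the TensorFlow log below reports `Cannot dlopen some GPU libraries`, which suggests the GPU libraries TensorFlow needs are missing from this environment.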


In [4]:
# Ask TensorFlow how many GPUs it can see (this is independent of what PyTorch detects).
print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))

Num GPUs Available:  0


2024-05-11 20:46:59.884315: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-11 20:46:59.891471: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [5]:
# Print the Ultralytics environment summary (library version, Python, torch, device).
ultralytics.checks()

Ultralytics YOLOv8.2.12 🚀 Python-3.10.12 torch-2.1.2+cu121 CUDA:0 (NVIDIA GeForce RTX 3060 Ti, 8192MiB)
Setup complete ✅ (12 CPUs, 23.4 GB RAM, 142.2/250.9 GB disk)


# Helper Functions


In [6]:
def get_file_size(file_path):
    """Return the size, in bytes, of the file located at ``file_path``."""
    return os.path.getsize(file_path)


def sparsity(model):
    """Return the global sparsity of ``model``.

    Global sparsity is the number of parameter entries that are exactly zero
    divided by the total number of parameter entries, taken over everything
    yielded by ``model.parameters()``.
    """
    params = list(model.parameters())
    total_entries = sum(p.numel() for p in params)
    zero_entries = sum((p == 0).sum() for p in params)
    return zero_entries / total_entries


def convert_bytes(size, unit=None):
    """Print ``size`` (a byte count) as a human-readable line.

    ``unit`` selects the scale: "KB" divides by 1024, "MB" by 1024 * 1024,
    anything else prints the raw byte count. Scaled values are rounded to
    three decimals. The function only prints; its return value is ``None``.
    """
    if unit == "KB":
        message = "File size: " + str(round(size / 1024, 3)) + " Kilobytes"
    elif unit == "MB":
        message = "File size: " + str(round(size / (1024 * 1024), 3)) + " Megabytes"
    else:
        message = "File size: " + str(size) + " bytes"
    return print(message)


def c_style_hexdump(input, ouput, name):
    """Write the bytes of a file as a C array definition.

    Reads the file at ``input`` in binary mode and writes a header to ``ouput``
    containing ``#pragma once``, an ``<stdalign.h>`` include and a 16-byte
    aligned ``const unsigned char`` array called ``name`` whose initializer
    lists every byte as a two-digit lowercase hex literal.

    Returns:
        The number of bytes emitted into the array.
    """
    with open(input, "rb") as source:
        payload = source.read()

    hex_literals = ["0x" + format(byte, "02x") for byte in payload]
    initializer = ",".join(hex_literals)

    header_text = (
        "#pragma once\n"
        "#include <stdalign.h>\n"
        f"alignas(16) const unsigned char {name}[] = {{{initializer}}};"
    )
    with open(ouput, "w") as target:
        target.write(header_text)

    return len(hex_literals)


def build_header(output, names_with_sizes):
    """Write a C/C++ header declaring externally defined byte arrays.

    ``names_with_sizes`` is an iterable of ``(name, size)`` pairs; each pair
    becomes one ``alignas(16) extern const unsigned char name[size];`` line.
    The declarations are wrapped in an ``extern "C"`` guard for C++ consumers.
    """
    prologue = (
        '#pragma once\n#ifdef __cplusplus\nextern "C"\n{\n#endif\n'
        "#include <stdalign.h>\n\n"
    )
    epilogue = "\n#ifdef __cplusplus\n}\n#endif\n"

    with open(output, "w") as header:
        header.write(prologue)
        header.writelines(
            f"alignas(16) extern const unsigned char {symbol}[{length}];\n"
            for symbol, length in names_with_sizes
        )
        header.write(epilogue)


def evaluate_model(interpreter, x_test, y_test):
    """Return the top-1 accuracy of an interpreter over ``x_test``.

    Each sample gets a leading batch axis and is cast to float32 before being
    written to the interpreter's first input tensor. After ``invoke()``, the
    argmax of the first output tensor (batch axis removed) is the predicted
    label. The result is the mean of ``predictions == y_test``.
    """
    in_idx = interpreter.get_input_details()[0]["index"]
    out_idx = interpreter.get_output_details()[0]["index"]

    def _classify(sample):
        # Add the batch dimension and match the float32 input format.
        batch = np.expand_dims(sample, axis=0).astype(np.float32)
        interpreter.set_tensor(in_idx, batch)
        interpreter.invoke()
        # interpreter.tensor() hands back a callable that yields the output array.
        read_output = interpreter.tensor(out_idx)
        return np.argmax(read_output()[0])

    predicted = np.array([_classify(sample) for sample in x_test])

    print("\n")
    # Fraction of samples whose predicted label equals the ground truth.
    return (predicted == y_test).mean()

# Load the Model


It is not possible to quantize the model directly using `model.export(format="tflite", imgsz=640, int8=True, data="coco.yaml")`, see issue https://github.com/ultralytics/ultralytics/issues/11722

As a workaround, I first export the model to `.onnx` format and then convert it to `.tflite` using `onnx2tf`.


In [25]:
# Scratch directory that receives every artifact onnx2tf writes for FastestDet.
saved_model_fastest_det = tempfile.mkdtemp()

# Convert the ONNX graph. Besides the default outputs this also requests the
# integer-quantized .tflite variants and a Keras .h5 export.
onnx2tf.convert(
    "FastestDet.onnx",
    output_integer_quantized_tflite=True,
    output_h5=True,
    output_folder_path=saved_model_fastest_det,
)


Simplifying[33m...[0m
Finish! Here is the difference:
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1m           [0m[1m [0m┃[1m [0m[1mOriginal Model[0m[1m [0m┃[1m [0m[1mSimplified Model[0m[1m [0m┃
┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
│ Add         │ 1              │ 1                │
│ AveragePool │ 1              │ 1                │
│ Concat      │ 19             │ 19               │
│ Constant    │ 150            │ 150              │
│ Conv        │ 70             │ 70               │
│ Gather      │ 26             │ 26               │
│ MaxPool     │ 1              │ 1                │
│ Relu        │ 48             │ 48               │
│ Reshape     │ 26             │ 26               │
│ Resize      │ 1              │ 1                │
│ Sigmoid     │ 1              │ 1                │
│ Softmax     │ 1              │ 1                │
│ Transpose   │ 15             │ 15               │
│ Model Size  │ 972.6KiB       │ 972.6K

2024-05-11 22:14:44.985352: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-11 22:14:44.985413: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1
2024-05-11 22:14:44.985525: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2024-05-11 22:14:44.985883: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-11 22:14:44.985900: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/

[32mFloat32 tflite output complete![0m


2024-05-11 22:14:46.156393: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
2024-05-11 22:14:46.156449: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2024-05-11 22:14:46.238954: I tensorflow/compiler/mlir/lite/flatbuffer_export.cc:2138] Estimated count of arithmetic ops: 250.424 M  ops, equivalently 125.212 M  MACs


[32mFloat16 tflite output complete![0m
[34mInput signature information for quantization[0m
[34msignature_name[0m: serving_default
[34minput_name.0[0m: input_1 [34mshape[0m: (1, 352, 352, 3) [34mdtype[0m: <dtype: 'float32'>


2024-05-11 22:14:47.964535: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
2024-05-11 22:14:47.964593: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2024-05-11 22:14:47.964747: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /tmp/tmpqakc845g
2024-05-11 22:14:47.967468: I tensorflow/cc/saved_model/reader.cc:91] Reading meta graph with tags { serve }
2024-05-11 22:14:47.967482: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmpqakc845g
2024-05-11 22:14:47.974766: I tensorflow/cc/saved_model/loader.cc:231] Restoring SavedModel bundle.
2024-05-11 22:14:48.005701: I tensorflow/cc/saved_model/loader.cc:215] Running initialization op on SavedModel bundle at path: /tmp/tmpqakc845g
2024-05-11 22:14:48.038239: I tensorflow/cc/saved_model/loader.cc:314] SavedModel load for tags { serve }; Status: success: OK. Took 73491 m

[32mDynamic Range Quantization tflite output complete![0m


2024-05-11 22:14:48.411462: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
2024-05-11 22:14:48.411517: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2024-05-11 22:14:48.411703: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /tmp/tmpqakc845g
2024-05-11 22:14:48.415515: I tensorflow/cc/saved_model/reader.cc:91] Reading meta graph with tags { serve }
2024-05-11 22:14:48.415542: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmpqakc845g
2024-05-11 22:14:48.422915: I tensorflow/cc/saved_model/loader.cc:231] Restoring SavedModel bundle.
2024-05-11 22:14:48.467812: I tensorflow/cc/saved_model/loader.cc:215] Running initialization op on SavedModel bundle at path: /tmp/tmpqakc845g
2024-05-11 22:14:48.498756: I tensorflow/cc/saved_model/loader.cc:314] SavedModel load for tags { serve }; Status: success: OK. Took 87051 m

[32mINT8 Quantization tflite output complete![0m


2024-05-11 22:14:50.897538: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
2024-05-11 22:14:50.897604: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2024-05-11 22:14:50.897788: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /tmp/tmpqakc845g
2024-05-11 22:14:50.902594: I tensorflow/cc/saved_model/reader.cc:91] Reading meta graph with tags { serve }
2024-05-11 22:14:50.902647: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmpqakc845g
2024-05-11 22:14:50.912120: I tensorflow/cc/saved_model/loader.cc:231] Restoring SavedModel bundle.
2024-05-11 22:14:50.948501: I tensorflow/cc/saved_model/loader.cc:215] Running initialization op on SavedModel bundle at path: /tmp/tmpqakc845g
2024-05-11 22:14:50.987392: I tensorflow/cc/saved_model/loader.cc:314] SavedModel load for tags { serve }; Status: success: OK. Took 89605 m

[32mFull INT8 Quantization tflite output complete![0m


2024-05-11 22:14:53.369011: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
2024-05-11 22:14:53.369074: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2024-05-11 22:14:53.369244: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /tmp/tmpqakc845g
2024-05-11 22:14:53.372493: I tensorflow/cc/saved_model/reader.cc:91] Reading meta graph with tags { serve }
2024-05-11 22:14:53.372521: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmpqakc845g
2024-05-11 22:14:53.379722: I tensorflow/cc/saved_model/loader.cc:231] Restoring SavedModel bundle.
2024-05-11 22:14:53.417877: I tensorflow/cc/saved_model/loader.cc:215] Running initialization op on SavedModel bundle at path: /tmp/tmpqakc845g
2024-05-11 22:14:53.446453: I tensorflow/cc/saved_model/loader.cc:314] SavedModel load for tags { serve }; Status: success: OK. Took 77209 m

[32mINT8 Quantization with int16 activations tflite output complete![0m


2024-05-11 22:14:57.572973: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
2024-05-11 22:14:57.573022: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2024-05-11 22:14:57.573172: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /tmp/tmpqakc845g
2024-05-11 22:14:57.576166: I tensorflow/cc/saved_model/reader.cc:91] Reading meta graph with tags { serve }
2024-05-11 22:14:57.576197: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmpqakc845g
2024-05-11 22:14:57.583602: I tensorflow/cc/saved_model/loader.cc:231] Restoring SavedModel bundle.
2024-05-11 22:14:57.610895: I tensorflow/cc/saved_model/loader.cc:215] Running initialization op on SavedModel bundle at path: /tmp/tmpqakc845g
2024-05-11 22:14:57.637044: I tensorflow/cc/saved_model/loader.cc:314] SavedModel load for tags { serve }; Status: success: OK. Took 63870 m

[32mFull INT8 Quantization with int16 activations tflite output complete![0m


<keras.src.engine.functional.Functional at 0x7ff74b7aa200>

In [26]:
# Show where the converted artifacts were written (a fresh temp dir on every run).
print(saved_model_fastest_det)

/tmp/tmpqakc845g


In [38]:
def representative_dataset():
    """Yield 100 calibration samples for the TFLite converter.

    Every sample is a one-element list holding a float32 array of shape
    (1, 352, 352, 3) filled with uniform random values from ``np.random.rand``.
    NOTE(review): random noise stands in for real images here, so the
    calibrated quantization ranges may not reflect real inputs.
    """
    sample_shape = (1, 352, 352, 3)
    remaining = 100
    while remaining > 0:
        yield [np.random.rand(*sample_shape).astype(np.float32)]
        remaining -= 1


# Full-integer post-training quantization of the FastestDet SavedModel.
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_fastest_det)

converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Calibration samples the converter uses to estimate activation ranges.
converter.representative_dataset = representative_dataset

# Restrict the model to int8 builtin kernels and make its input/output uint8.
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.uint8
converter.inference_output_type = tf.uint8


file = converter.convert()

# mkstemp() returns an already-open OS-level descriptor together with the path.
# Write through that descriptor so it is closed by the `with` block instead of
# being leaked (the previous code discarded it and reopened the path).
fd, fastest_det_quant_file = tempfile.mkstemp(".tflite")

with os.fdopen(fd, "wb") as f:
    f.write(file)

convert_bytes(get_file_size(fastest_det_quant_file), "MB")

2024-05-11 23:15:28.413722: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
2024-05-11 23:15:28.413777: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2024-05-11 23:15:28.413936: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /tmp/tmpqakc845g
2024-05-11 23:15:28.416982: I tensorflow/cc/saved_model/reader.cc:91] Reading meta graph with tags { serve }
2024-05-11 23:15:28.417023: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmpqakc845g
2024-05-11 23:15:28.658135: I tensorflow/cc/saved_model/loader.cc:231] Restoring SavedModel bundle.
2024-05-11 23:15:28.860968: I tensorflow/cc/saved_model/loader.cc:215] Running initialization op on SavedModel bundle at path: /tmp/tmpqakc845g
2024-05-11 23:15:28.890203: I tensorflow/cc/saved_model/loader.cc:314] SavedModel load for tags { serve }; Status: success: OK. Took 476267 

File size: 0.402 Megabytes


In [39]:
# Show the temp path of the full-integer model so it can be located on disk.
print(fastest_det_quant_file)

/tmp/tmpqibri787.tflite


In [40]:
# Dump the op/tensor structure of the full-integer model to check which ops were quantized.
tf.lite.experimental.Analyzer.analyze(fastest_det_quant_file)

=== /tmp/tmpqibri787.tflite ===

Your TFLite model has '1' subgraph(s). In the subgraph description below,
T# represents the Tensor numbers. For example, in Subgraph#0, the QUANTIZE op takes
tensor #0 as input and produces tensor #154 as output.

Subgraph#0 main(T#0) -> [T#364]
  Op#0 QUANTIZE(T#0) -> [T#154]
  Op#1 PAD(T#154, T#1[0, 0, 1, 1, 1, ...]) -> [T#155]
  Op#2 CONV_2D(T#155, T#153, T#152[96096, 32069, 39896, 44835, 4403, ...]) -> [T#156]
  Op#3 PAD(T#156, T#1[0, 0, 1, 1, 1, ...]) -> [T#157]
  Op#4 MAX_POOL_2D(T#157) -> [T#158]
  Op#5 PAD(T#158, T#1[0, 0, 1, 1, 1, ...]) -> [T#159]
  Op#6 DEPTHWISE_CONV_2D(T#159, T#151, T#150[3620, -14626, -4446, -6719, 2775, ...]) -> [T#160]
  Op#7 CONV_2D(T#158, T#149, T#148[-20606, -9339, 8208, -5232, 18590, ...]) -> [T#161]
  Op#8 PAD(T#161, T#1[0, 0, 1, 1, 1, ...]) -> [T#162]
  Op#9 DEPTHWISE_CONV_2D(T#162, T#147, T#146[217, 9713, -5683, -4704, 366, ...]) -> [T#163]
  Op#10 CONV_2D(T#160, T#145, T#144[3635, -574, 1858, 3546, 877, ...]) -> [

In [31]:
# Dynamic-range-quantized model written by the onnx2tf conversion into its output folder.
FASTEST_DET = f"{saved_model_fastest_det}/FastestDet_dynamic_range_quant.tflite"

# Dump its op/tensor structure for comparison with the full-integer model.
tf.lite.experimental.Analyzer.analyze(FASTEST_DET)

=== /tmp/tmpqakc845g/FastestDet_dynamic_range_quant.tflite ===

Your TFLite model has '1' subgraph(s). In the subgraph description below,
T# represents the Tensor numbers. For example, in Subgraph#0, the PAD op takes
tensor #0 and tensor #13 as input and produces tensor #154 as output.

Subgraph#0 main(T#0) -> [T#360]
  Op#0 PAD(T#0, T#13[0, 0, 1, 1, 1, ...]) -> [T#154]
  Op#1 CONV_2D(T#154, T#99, T#29) -> [T#155]
  Op#2 PAD(T#155, T#13[0, 0, 1, 1, 1, ...]) -> [T#156]
  Op#3 MAX_POOL_2D(T#156) -> [T#157]
  Op#4 PAD(T#157, T#13[0, 0, 1, 1, 1, ...]) -> [T#158]
  Op#5 DEPTHWISE_CONV_2D(T#158, T#100, T#76) -> [T#159]
  Op#6 CONV_2D(T#157, T#101, T#30) -> [T#160]
  Op#7 PAD(T#160, T#13[0, 0, 1, 1, 1, ...]) -> [T#161]
  Op#8 DEPTHWISE_CONV_2D(T#161, T#102, T#77) -> [T#162]
  Op#9 CONV_2D(T#159, T#103, T#31) -> [T#163]
  Op#10 CONV_2D(T#162, T#104, T#32) -> [T#164]
  Op#11 CONCATENATION(T#163, T#164) -> [T#165]
  Op#12 TRANSPOSE(T#165, T#9[0, 3, 1, 2]) -> [T#166]
  Op#13 RESHAPE(T#166, T#6[24

In [41]:
# Output paths for the dynamic-range model: MODEL receives the C array definition;
# HEADER is only referenced by the commented-out build_header call below.
MODEL = "models/fastest_det_rom_dyn_range.h"
HEADER = "models/fastest_det_dyn.h"

# Size of the .tflite file on disk, in bytes.
print(get_file_size(FASTEST_DET))

# Emit the model as a C array named "model_data"; returns the number of bytes written.
model_size = c_style_hexdump(FASTEST_DET, MODEL, "model_data")

# Should equal the .tflite size printed above.
print(model_size)

# Size of the generated header text.
print(get_file_size(MODEL))
# Declaration header generation is currently disabled:
# build_header(HEADER, [])

392864
392864
1964404


In [42]:
# https://github.com/tensorflow/tensorflow/issues/43749

# Destination for the full-integer model's C array. This assignment used to be
# misspelled `ODEL`, which left MODEL pointing at the dynamic-range header from
# the previous cell, so the dump below silently overwrote that file.
MODEL = "models/fastest_det_rom_full_int.h"
HEADER = "models/fastest_det_full.h"

# Size of the full-integer .tflite file on disk, in bytes.
print(get_file_size(fastest_det_quant_file))

# Emit the model as a C array named "model_data"; returns the number of bytes written.
model_size = c_style_hexdump(fastest_det_quant_file, MODEL, "model_data")

# Should equal the .tflite size printed above.
print(model_size)

# Size of the generated header text.
print(get_file_size(MODEL))

421784
421784
2109004


# Convert from `saved_model` to `.tflite`


In [11]:
def representative_dataset():
    """Yield 100 calibration samples for the TFLite converter.

    Every sample is a one-element list holding a float32 array of shape
    (1, 640, 640, 3) filled with uniform random values from ``np.random.rand``.
    NOTE(review): random noise stands in for real images here, so the
    calibrated quantization ranges may not reflect real inputs.
    """
    sample_shape = (1, 640, 640, 3)
    remaining = 100
    while remaining > 0:
        yield [np.random.rand(*sample_shape).astype(np.float32)]
        remaining -= 1

In [12]:
# Post-training quantization of the YOLOv8 SavedModel stored in ./saved_model.
converter = tf.lite.TFLiteConverter.from_saved_model("saved_model")

# Only the default optimization is enabled. With no representative dataset
# this is presumably dynamic-range quantization rather than full INT8.
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Full-integer settings, currently disabled:
# converter.representative_dataset = representative_dataset

# converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
# converter.inference_input_type = tf.uint8
# converter.inference_output_type = tf.uint8

file = converter.convert()

# mkstemp() returns an already-open OS-level descriptor together with the path.
# Write through that descriptor so it is closed by the `with` block instead of
# being leaked (the previous code discarded it and reopened the path).
fd, baseline_quantized = tempfile.mkstemp(".tflite")

with os.fdopen(fd, "wb") as f:
    f.write(file)

convert_bytes(get_file_size(baseline_quantized), "MB")

File size: 3.34 Megabytes


2024-05-11 20:49:38.787659: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
2024-05-11 20:49:38.787707: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2024-05-11 20:49:38.787850: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: saved_model
2024-05-11 20:49:38.792667: I tensorflow/cc/saved_model/reader.cc:91] Reading meta graph with tags { serve }
2024-05-11 20:49:38.792695: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: saved_model
2024-05-11 20:49:38.802549: I tensorflow/cc/saved_model/loader.cc:231] Restoring SavedModel bundle.
2024-05-11 20:49:38.829788: I tensorflow/cc/saved_model/loader.cc:215] Running initialization op on SavedModel bundle at path: saved_model
2024-05-11 20:49:38.854778: I tensorflow/cc/saved_model/loader.cc:314] SavedModel load for tags { serve }; Status: success: OK. Took 66927 microseconds.
20

In [13]:
# Dump the op/tensor structure of the quantized YOLOv8 model.
tf.lite.experimental.Analyzer.analyze(baseline_quantized)

=== /tmp/tmp315mnjlf.tflite ===

Your TFLite model has '1' subgraph(s). In the subgraph description below,
T# represents the Tensor numbers. For example, in Subgraph#0, the PAD op takes
tensor #0 and tensor #13 as input and produces tensor #157 as output.

Subgraph#0 main(T#0) -> [T#409]
  Op#0 PAD(T#0, T#13[0, 0, 1, 1, 1, ...]) -> [T#157]
  Op#1 CONV_2D(T#157, T#94, T#18) -> [T#158]
  Op#2 LOGISTIC(T#158) -> [T#159]
  Op#3 MUL(T#158, T#159) -> [T#160]
  Op#4 PAD(T#160, T#13[0, 0, 1, 1, 1, ...]) -> [T#161]
  Op#5 CONV_2D(T#161, T#95, T#19) -> [T#162]
  Op#6 LOGISTIC(T#162) -> [T#163]
  Op#7 MUL(T#162, T#163) -> [T#164]
  Op#8 CONV_2D(T#164, T#96, T#20) -> [T#165]
  Op#9 LOGISTIC(T#165) -> [T#166]
  Op#10 MUL(T#165, T#166) -> [T#167]
  Op#11 STRIDED_SLICE(T#167, T#93[0, 0, 0, 0], T#92[0, 0, 0, 16], T#91[1, 1, 1, 1]) -> [T#168]
  Op#12 STRIDED_SLICE(T#167, T#92[0, 0, 0, 16], T#90[0, 0, 0, 32], T#91[1, 1, 1, 1]) -> [T#169]
  Op#13 CONV_2D(T#169, T#97, T#21) -> [T#170]
  Op#14 LOGISTIC(T#1

In [14]:
# Validate the float32 TFLite export on the coco128 subset.
# NOTE(review): `saved_model` is not assigned in any visible cell; it must
# already hold the export directory from an earlier kernel state.
results_converted = YOLO(f"{saved_model}/yolov8n_float32.tflite").val(
    data="coco128.yaml"
)

# Validate the quantized TFLite model on the same subset.
# NOTE(review): `baseline_quantized` was converted with only Optimize.DEFAULT
# (the int8-specific settings are commented out), so "INT8" may be a misnomer.
results_quant = YOLO(baseline_quantized).val(data="coco128.yaml")

Ultralytics YOLOv8.2.12 🚀 Python-3.10.12 torch-2.1.2+cu121 CUDA:0 (NVIDIA GeForce RTX 3060 Ti, 8192MiB)
Loading /tmp/tmp5l314eq0/yolov8n_float32.tflite for TensorFlow Lite inference...
Forcing batch=1 square inference (1,3,640,640) for non-PyTorch models


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
[34m[1mval: [0mScanning /home/eduard/Github/datasets/coco128/labels/train2017.cache... 126 images, 2 backgrounds, 0 corrupt: 100%|██████████| 128/128 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 128/128 [00:16<00:00,  7.58it/s]

                   all        128        929          0          0          0          0





Speed: 0.7ms preprocess, 123.9ms inference, 0.0ms loss, 1.3ms postprocess per image
Results saved to [1m/home/eduard/Github/x-heep-femu-tflite-sdk/runs/detect/val57[0m
Ultralytics YOLOv8.2.12 🚀 Python-3.10.12 torch-2.1.2+cu121 CUDA:0 (NVIDIA GeForce RTX 3060 Ti, 8192MiB)
Loading /tmp/tmp315mnjlf.tflite for TensorFlow Lite inference...
Forcing batch=1 square inference (1,3,640,640) for non-PyTorch models


[34m[1mval: [0mScanning /home/eduard/Github/datasets/coco128/labels/train2017.cache... 126 images, 2 backgrounds, 0 corrupt: 100%|██████████| 128/128 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 128/128 [00:13<00:00,  9.58it/s]


                   all        128        929      0.669      0.536      0.609      0.454
                person        128        254       0.81      0.661      0.757      0.541
               bicycle        128          6      0.655      0.322      0.334      0.239
                   car        128         46       0.77      0.196      0.271      0.184
            motorcycle        128          5       0.63        0.8      0.866      0.709
              airplane        128          6      0.747      0.667      0.823      0.623
                   bus        128          7      0.552      0.714      0.722      0.637
                 train        128          3      0.547      0.667      0.863      0.784
                 truck        128         12      0.783      0.306      0.435      0.258
                  boat        128          6      0.198      0.167      0.342      0.145
         traffic light        128         14          1      0.182       0.22      0.141
             stop sig

# Compare results


In [15]:
# Print each model's metric dictionary under a plain-text label. The labels all
# go through print(); pprint() on a string renders its quoted repr, which made
# the headings inconsistent with the first one.
# NOTE(review): `results` (metrics of the original model) is not assigned in
# any visible cell and must come from earlier kernel state.
print("Original")
pprint(results.results_dict)

print("FP32")
pprint(results_converted.results_dict)

print("INT8")
pprint(results_quant.results_dict)

Original


# Sparsity and cluster preserving quantization aware training (PCQAT)


## Prune and fine-tune the model to 50% sparsity

I use the Torch-Pruning approach and code (https://github.com/VainF/Torch-Pruning) for this step. I first tried the Ultralytics pruning method (https://docs.ultralytics.com/yolov5/tutorials/model_pruning_and_sparsity/#test-normally, https://github.com/ultralytics/ultralytics/issues/3507), but with little success: mAP dropped quickly as the pruning amount increased.


In [16]:
# Settings handed to the pruning routine below.
# NOTE(review): the routine that consumes these keys is not visible in this
# notebook, so the per-key meanings are inferred from the names — confirm.
config = {
    "model": "yolov8n.pt",
    "epoch": 10,
    "cfg": "default.yaml",
    "iterative_steps": 16,
    "target_prune_rate": 0.1,
    "max_map_drop": 0.2,
    "data": "coco128.yaml",
}


class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    The instance ``__dict__`` is aliased to the dictionary itself, so
    ``d.key`` and ``d["key"]`` always refer to the same entry.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Share storage between attribute access and item access.
        self.__dict__ = self


# Wrap the config so its keys are reachable as attributes (args.model, args.data, ...).
args = AttrDict(config)
pprint(args)

In [19]:
# NOTE(review): the import cell binds `prune` to the torch.nn.utils.prune module,
# which is not callable. This call therefore relies on a `prune` function defined
# in a cell that is not part of this notebook — restore it (ideally under another
# name, since later cells use `prune.l1_unstructured`) before running.
prune(args)

Ultralytics YOLOv8.2.12 🚀 Python-3.10.12 torch-2.1.2+cu121 CUDA:0 (NVIDIA GeForce RTX 3060 Ti, 8192MiB)
YOLOv8n summary (fused): 185 layers, 3151904 parameters, 31936 gradients, 8.7 GFLOPs


[34m[1mval: [0mScanning /home/eduard/Github/datasets/coco128/labels/train2017.cache... 126 images, 2 backgrounds, 0 corrupt: 100%|██████████| 128/128 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 128/128 [00:03<00:00, 33.54it/s]


                   all        128        929      0.651      0.533      0.606      0.452
                person        128        254      0.805      0.667      0.764      0.543
               bicycle        128          6      0.661      0.328      0.329      0.232
                   car        128         46      0.819      0.196      0.269      0.181
            motorcycle        128          5      0.603        0.8       0.88      0.672
              airplane        128          6      0.755      0.667      0.845      0.619
                   bus        128          7      0.539      0.714      0.698      0.625
                 train        128          3      0.525      0.667       0.83      0.764
                 truck        128         12       0.69       0.25       0.42      0.239
                  boat        128          6      0.197      0.167      0.327      0.144
         traffic light        128         14      0.684      0.158      0.202      0.139
             stop sig

[34m[1mval: [0mScanning /home/eduard/Github/datasets/coco128/labels/train2017.cache... 126 images, 2 backgrounds, 0 corrupt: 100%|██████████| 128/128 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 128/128 [00:03<00:00, 33.99it/s]


                   all        128        929       0.52    0.00201    0.00282    0.00152
                person        128        254          0          0     0.0255    0.00965
               bicycle        128          6          1          0          0          0
                   car        128         46          1          0    0.00346    0.00234
            motorcycle        128          5          0          0          0          0
              airplane        128          6          1          0          0          0
                   bus        128          7      0.887      0.143      0.151     0.0888
                 train        128          3          0          0    0.00104   0.000311
                 truck        128         12          1          0    0.00332   0.000996
                  boat        128          6          1          0          0          0
         traffic light        128         14          0          0          0          0
             stop sig

[34m[1mtrain: [0mScanning /home/eduard/Github/datasets/coco128/labels/train2017.cache... 126 images, 2 backgrounds, 0 corrupt: 100%|██████████| 128/128 [00:00<?, ?it/s]
[34m[1mval: [0mScanning /home/eduard/Github/datasets/coco128/labels/train2017.cache... 126 images, 2 backgrounds, 0 corrupt: 100%|██████████| 128/128 [00:00<?, ?it/s]


Plotting labels to /home/eduard/Github/x-heep-femu-tflite-sdk/runs/detect/train17/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.000119, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 640 train, 640 val
Using 8 dataloader workers
Logging results to [1m/home/eduard/Github/x-heep-femu-tflite-sdk/runs/detect/train17[0m
Starting training for 100 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      1/100      3.31G      3.865      6.577      3.282        217        640: 100%|██████████| 8/8 [00:01<00:00,  4.16it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 4/4 [00:00<00:00,  7.24it/s]

                   all        128        929          0          0          0          0





AttributeError: type object 'DetectionTrainer' has no attribute 'last'

In [None]:
# Fraction of each Conv2d layer's weights to prune in the loop below.
PRUNING_AMOUNT = 0.1

In [None]:
# Apply L1-magnitude unstructured pruning to the weight of every Conv2d layer.
# NOTE(review): `model` is not assigned in any visible cell, and `prune` must be
# the torch.nn.utils.prune module here (not a user-defined function).
for name, m in model.named_modules():
    if isinstance(m, nn.Conv2d):
        prune.l1_unstructured(m, name="weight", amount=PRUNING_AMOUNT)  # prune
        prune.remove(m, "weight")  # make permanent

In [None]:
# Report the fraction of zero-valued parameters across the wrapped torch model.
print(f"Model pruned to {sparsity(model.model):.3} global sparsity")

Model pruned to 0.0998 global sparsity


In [None]:
# Minimal checkpoint dictionary wrapping the pruned torch model.
# NOTE(review): presumably the layout YOLO() expects when loading a .pt file — confirm.
ckpt = {
    "model": model.model,
    "train_args": {},  # save as dict
}

In [None]:
# mkstemp() returns an open OS-level descriptor together with the path. Close it
# right away (torch.save reopens the path itself) so the descriptor is not leaked.
fd, pruned_model_path = tempfile.mkstemp(".pt")
os.close(fd)

torch.save(ckpt, pruned_model_path)

# Keep the path and the loaded model under separate names; `pruned_model` used
# to be rebound from a path string to a YOLO object.
pruned_model = YOLO(pruned_model_path)

results_pruned = pruned_model.val(data="coco128.yaml")


# Plain print for the label; pprint() on a string renders its quoted repr.
print("PRUNED")
pprint(results_pruned.results_dict)

Ultralytics YOLOv8.2.11 🚀 Python-3.10.12 torch-2.1.2+cu121 CUDA:0 (NVIDIA GeForce RTX 3060 Ti, 8192MiB)


[34m[1mval: [0mScanning /home/eduard/Github/datasets/coco128/labels/train2017.cache... 126 images, 2 backgrounds, 0 corrupt: 100%|██████████| 128/128 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 8/8 [00:03<00:00,  2.51it/s]


                   all        128        929      0.483      0.429      0.462      0.317
                person        128        254      0.337      0.531      0.407      0.242
               bicycle        128          6      0.625      0.333      0.301      0.221
                   car        128         46      0.213      0.152      0.109     0.0482
            motorcycle        128          5       0.52        0.8      0.832      0.651
              airplane        128          6      0.518      0.667      0.683       0.41
                   bus        128          7      0.487      0.714       0.68      0.498
                 train        128          3      0.364      0.667      0.684      0.504
                 truck        128         12      0.773      0.288      0.376       0.16
                  boat        128          6      0.546      0.167      0.238      0.101
         traffic light        128         14      0.128     0.0714     0.0808     0.0223
             stop sig

In [21]:
# Load the Keras .h5 export of the float32 model.
# NOTE(review): `saved_model` is not assigned in any visible cell — it must come from earlier kernel state.
tf_model = keras.saving.load_model(f"{saved_model}/yolov8n_float32.h5")

In [None]:
prune_low_magnitude = tfmot.sparsity.keras.prune_low_magnitude

# Hold sparsity at a constant 0.5 from step 0, re-evaluating every 100 steps.
pruning_params = {
    "pruning_schedule": tfmot.sparsity.keras.ConstantSparsity(
        0.5, begin_step=0, frequency=100
    )
}

# Must be passed to fit() so the pruning step counter advances during training.
callbacks = [tfmot.sparsity.keras.UpdatePruningStep()]

# NOTE(review): in the recorded run this call raised ValueError because the
# converted model contains TFOpLambda layers, which tfmot cannot wrap.
pruned_model = prune_low_magnitude(tf_model, **pruning_params)

# Use smaller learning rate for fine-tuning
opt = keras.optimizers.Adam(learning_rate=1e-5)

# Compile with the fine-tuning optimizer defined above. Previously `opt` was
# never used and a second Adam with learning_rate=1e-3 was passed instead.
# NOTE(review): sparse categorical cross-entropy / accuracy look like
# classification placeholders; confirm they suit this detection model.
pruned_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=opt,
    metrics=["accuracy"],
)

ValueError: Please initialize `Prune` with a supported layer. Layers should either be supported by the PruneRegistry (built-in keras layers) or should be a `PrunableLayer` instance, or should has a customer defined `get_prunable_weights` method. You passed: <class 'keras.src.layers.core.tf_op_layer.TFOpLambda'>

# Fit a YOLOv8 model with keras directly

https://keras.io/examples/vision/yolov8/


## Hyperparameters


In [None]:
# Training hyperparameters for the keras_cv YOLOv8 experiment.
# NOTE(review): the dataset-loading cells are not visible, so the uses of
# SPLIT_RATIO and BATCH_SIZE cannot be confirmed from this notebook.
SPLIT_RATIO = 0.2  # presumably the validation fraction — confirm
BATCH_SIZE = 4
LEARNING_RATE = 0.001  # Adam learning rate
EPOCH = 5  # intended number of training epochs
GLOBAL_CLIPNORM = 10.0  # global gradient-norm clipping threshold for the optimizer

In [None]:
# Class names, ordered by their integer label id.
class_ids = ["car", "pedestrian", "trafficLight", "biker", "truck"]

# Lookup table from integer label id to class name.
class_mapping = dict(enumerate(class_ids))

## Build the model

I start from pre-trained COCO weights.


In [None]:
# Load the extra-small YOLOv8 backbone preset that ships with COCO-pretrained weights.
backbone = keras_cv.models.YOLOV8Backbone.from_preset("yolo_v8_xs_backbone_coco")
# Related upstream issue: https://github.com/keras-team/keras-cv/issues/1886

Downloading from https://www.kaggle.com/api/v1/models/keras/yolov8/keras/yolo_v8_xs_backbone_coco/2/download/model.weights.h5...
100%|██████████| 5.11M/5.11M [00:01<00:00, 5.35MB/s]


In [None]:
# Build the detector on top of the pretrained backbone: one output class per
# entry in class_mapping, boxes expressed in "xyxy" format.
yolo = keras_cv.models.YOLOV8Detector(
    num_classes=len(class_mapping),
    bounding_box_format="xyxy",
    backbone=backbone,
    fpn_depth=1,  # NOTE(review): presumably the feature-pyramid depth — confirm against keras_cv docs
)

## Compile the model

- **Classification loss**: each prediction either belongs to a given class or does not, so classification is treated as a binary problem (binary cross-entropy)
- **Box loss**: Complete IoU metric not only measures the overlap between predicted and ground truth bounding boxes but also considers the difference in aspect ratio, center distance, and box size


In [None]:
# Adam with global gradient-norm clipping, both taken from the hyperparameter cell.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=LEARNING_RATE,
    global_clipnorm=GLOBAL_CLIPNORM,
)

# Binary cross-entropy for the class predictions, Complete-IoU for the boxes.
yolo.compile(
    optimizer=optimizer,
    classification_loss="binary_crossentropy",
    box_loss="ciou",
)

## Train the model


### COCO Metric Callback

Used to calculate the mAP (Mean Average Precision) score, Recall and Precision and to save the model when the mAP score improves


In [None]:
class EvaluateCOCOMetricsCallback(keras.callbacks.Callback):
    """Score the model with COCO box metrics at the end of every epoch.

    After each epoch the callback runs the model over ``data``, feeds the
    predictions to a ``keras_cv.metrics.BoxCOCOMetrics`` instance, merges the
    resulting metrics into the epoch ``logs`` and saves the model to
    ``save_path`` whenever the "MaP" value improves on the best seen so far.

    Args:
        data: Iterable of batches where ``batch[0]`` holds the images and
            ``batch[1]`` the ground-truth boxes.
        save_path: Destination passed to ``self.model.save``.
    """

    def __init__(self, data, save_path):
        super().__init__()
        self.data = data
        self.metrics = keras_cv.metrics.BoxCOCOMetrics(
            bounding_box_format="xyxy",
            # NOTE(review): presumably large enough to disable periodic
            # evaluation; results are requested explicitly with force=True.
            evaluate_freq=1e9,
        )

        self.save_path = save_path
        # Start below any achievable score so the first epoch always saves.
        self.best_map = -1.0

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on ``self.data``, update ``logs`` and checkpoint on improvement.

        ``logs`` defaults to ``None`` like the base ``Callback`` hook; a fresh
        dict is used in that case so the method no longer fails on ``None``.
        Returns the (updated) logs dictionary.
        """
        if logs is None:
            logs = {}

        self.metrics.reset_state()
        for batch in self.data:
            images, y_true = batch[0], batch[1]
            y_pred = self.model.predict(images, verbose=0)
            self.metrics.update_state(y_true, y_pred)

        metrics = self.metrics.result(force=True)
        logs.update(metrics)

        current_map = metrics["MaP"]
        if current_map > self.best_map:
            self.best_map = current_map
            self.model.save(self.save_path)  # Save the model when mAP improves

        return logs

In [None]:
# Train for the number of epochs set in the hyperparameter cell. The call
# previously hard-coded epochs=3 and ignored EPOCH.
# NOTE(review): `train_ds` and `val_ds` are not created in any visible cell.
yolo.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCH,
    # Evaluate COCO metrics on the validation set each epoch and keep the best model.
    callbacks=[EvaluateCOCOMetricsCallback(val_ds, "model.h5")],
)

In [None]:
def visualize_detections(model, dataset, bounding_box_format):
    """Plot ground-truth and predicted boxes for the first batch of ``dataset``.

    Takes one batch of ``(images, y_true)`` from ``dataset``, runs
    ``model.predict`` on the images, converts the predictions to ragged form
    and shows a 2x2 gallery. Class ids are labelled through the module-level
    ``class_mapping``.
    """
    first_batch = dataset.take(1)
    images, y_true = next(iter(first_batch))

    predictions = bounding_box.to_ragged(model.predict(images))

    gallery_options = dict(
        value_range=(0, 255),
        bounding_box_format=bounding_box_format,
        y_true=y_true,
        y_pred=predictions,
        scale=4,
        rows=2,
        cols=2,
        show=True,
        font_scale=0.7,
        class_mapping=class_mapping,
    )
    visualization.plot_bounding_box_gallery(images, **gallery_options)


# Show predictions against ground truth for one validation batch.
visualize_detections(yolo, dataset=val_ds, bounding_box_format="xyxy")