In [2]:
import io
from iree import runtime
from iree.compiler.transforms import ireec
# The compiler re-exports API access to a number of dialects. If one of these
# fails to import, it indicates a build issue.
import numpy as np
from iree.compiler import tools
from iree.runtime.benchmark import benchmark_module

# Performance of IREE (Keras)

In [4]:
import tensorflow as tf
import time
import numpy as np
from tensorflow.keras import backend
from iree.tf.support import module_utils
from vit_keras import vit
from keras_def import SequentialCNN, ConcatenatedCNN, NestedCNN, ShuffleNet, SqueezeNet_11, ResNet18

from tensorflow.keras.applications import ResNet50, ResNet50V2, MobileNetV3Small, DenseNet121, InceptionV3, VGG16
channels_first = False

model_list = {"MobileNetV3":MobileNetV3Small, "ResNet50":ResNet50, "SqueezeNet":SqueezeNet_11, "ShuffleNet":ShuffleNet,
              "Inception3":InceptionV3, "DenseNet":DenseNet121, 
             "Vit":vit.vit_b16}
# model_list = {"ResNet18":ResNet18}

INPUT_SHAPE = [1, 3, 224, 224] if channels_first else [1, 224, 224, 3]
backend.set_image_data_format('channels_first' if channels_first else 'channels_last')
x_train = np.random.uniform(low=0.0, high=1.0, size=(1, 3, 224, 224) if channels_first else (1, 224, 224, 3)).astype(np.float32)

for modelname, Model in model_list.items():
    
    if modelname == "Vit":
        model = Model(image_size=224, activation='relu', pretrained=False, include_top=True, pretrained_top=False, channel_first=channels_first)
    elif modelname == "SqueezeNet":
        model = Model(input_shape=tuple(INPUT_SHAPE[1:]), nb_classes=1000, channel_first=channels_first)
    elif modelname == "ShuffleNet":
        model = Model(include_top=True, input_shape=tuple(INPUT_SHAPE[1:]))
    elif modelname == "ResNet18":
        model = Model(classes=1000, input_shape=tuple(INPUT_SHAPE[1:]))
    else:
        model = Model(weights=None, include_top=True, input_shape=tuple(INPUT_SHAPE[1:]))
        
    t1_start = time.perf_counter()
    # Wrap the model in a tf.Module to compile it with IREE.
    class Module(tf.Module):
      def __init__(self):
        super(Module, self).__init__()
        self.m = model
        self.m.predict = lambda x: self.m.call(x, training=False)
        self.predict = tf.function(
            input_signature=[tf.TensorSpec(INPUT_SHAPE, tf.float32)])(model.predict)
    
    backend_choice = "iree_llvmcpu (CPU)" #@param [ "iree_vmvx (CPU)", "iree_llvmcpu (CPU)", "iree_vulkan (GPU/SwiftShader)" ]
    backend_choice = backend_choice.split(" ")[0]
    backend = module_utils.BackendInfo(backend_choice)
    compiled_model = backend.compile_from_class(Module, ["predict"])
    t1_stop = time.perf_counter()
    print("**** Model {} - Total Time: {:.3f}s".format(modelname, t1_stop - t1_start)) # print performance indicator

    print("Performing benchmark...")
    print("Calculating forward latency:\n  ", end="")
    %timeit -n 100 compiled_model.predict(x_train)

INFO:tensorflow:Assets written to: /tmp/tmpnski96_3.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpnski96_3.sm/assets


**** Model MobileNetV3 - Total Time: 22.685s
Performing benchmark...
Calculating forward latency:
  77.8 ms ± 299 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmpeopriwdk.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpeopriwdk.sm/assets


**** Model ResNet50 - Total Time: 18.991s
Performing benchmark...
Calculating forward latency:
  204 ms ± 305 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmp2i93g1jp.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmp2i93g1jp.sm/assets


**** Model SqueezeNet - Total Time: 6.337s
Performing benchmark...
Calculating forward latency:
  82.1 ms ± 466 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmpt9b_fdzd.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpt9b_fdzd.sm/assets


**** Model ShuffleNet - Total Time: 17.519s
Performing benchmark...
Calculating forward latency:
  83.5 ms ± 336 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmp9lb0taga.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmp9lb0taga.sm/assets


**** Model Inception3 - Total Time: 30.343s
Performing benchmark...
Calculating forward latency:
  166 ms ± 798 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmpu0959bvn.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpu0959bvn.sm/assets


**** Model DenseNet - Total Time: 44.449s
Performing benchmark...
Calculating forward latency:
  178 ms ± 836 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmpkzhyg36a.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpkzhyg36a.sm/assets


**** Model Vit - Total Time: 25.910s
Performing benchmark...
Calculating forward latency:
  728 ms ± 1.55 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [2]:
from bert import BertModelLayer
from tensorflow import keras
import tensorflow as tf
import numpy as np
from ufront.keras.model import UFrontKeras
import iree.compiler as ireec
from iree import runtime
import time
from tensorflow.keras import backend
from iree.tf.support import module_utils
import iree
GPU = False

l_bert = BertModelLayer(**BertModelLayer.Params(
  vocab_size               = 16000,        # embedding params
  use_token_type           = True,
  use_position_embeddings  = True,
  token_type_vocab_size    = 16000,

  num_layers               = 12,           # transformer encoder params
  hidden_size              = 768,
  hidden_dropout           = 0.1,
  intermediate_size        = 4*768,
  intermediate_activation  = "gelu",

  adapter_size             = None,         # see arXiv:1902.00751 (adapter-BERT)

  shared_layer             = False,        # True for ALBERT (arXiv:1909.11942)
  embedding_size           = None,         # None for BERT, wordpiece embedding size for ALBERT
  num_heads = 12,
  # name                     = "bert"        # any other Keras layer params
))

input_ids = np.array([[31, 51, 99], [15, 5, 0]], dtype='int32')
input_mask = np.array([[1, 1, 1], [1, 1, 0]], dtype='int32')
token_type_ids = np.array([[0, 0, 1], [0, 1, 0]], dtype='int32')

INPUT_SHAPE = [2, 3]
max_seq_len = 3
l_input_ids      = keras.layers.Input(shape=(max_seq_len,), dtype='int32')
l_token_type_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32')

output = l_bert([l_input_ids, l_token_type_ids])          # [batch_size, max_seq_len, hidden_size]
model = keras.Model(inputs=[l_input_ids, l_token_type_ids], outputs=output)

t1_start = time.perf_counter()


modelname = "Bert"
BATCH_SIZE = INPUT_SHAPE[0]
SEQUENCE_LENGTH = INPUT_SHAPE[1]
bert_input = [tf.TensorSpec(shape=[BATCH_SIZE,SEQUENCE_LENGTH],dtype=tf.int32),
            tf.TensorSpec(shape=[BATCH_SIZE,SEQUENCE_LENGTH], dtype=tf.int32)]
class BertModule(tf.Module):
    def __init__(self):
        super(BertModule, self).__init__()
        dict_outputs = False
        self.m = model
        self.m.predict = lambda x: self.m.call(x, training=False)
        
    @tf.function(input_signature=bert_input)
    def predict(self,input_word_ids, segment_ids):
        inputs = [input_word_ids, segment_ids]
        return self.m.predict(inputs)
if GPU:
    binary = iree.compiler.tf.compile_module(
            BertModule(),
            target_backends=["cuda"],
            exported_names=["predict"])
    
    t1_stop = time.perf_counter()
    print("**** Model {} - Total Time: {:.3f}s".format(modelname, t1_stop - t1_start)) # print performance indicator

    
    module = runtime.load_vm_flatbuffer(binary, driver="cuda")
    print("Calculating forward latency:\n  ", end="")
    %timeit -n 100 module.predict(input_ids, token_type_ids)
    # tms = []
    # for i in range(10):
    #     ret = benchmark_module(module.vm_module, entry_functiong="predict", inputs=["1x3x224x224xf32=1"], device="cuda")
    #     tm = ret[0].time
    #     tms.append(float(tm[0:-3]))
    # print("{} - {:.3f} ± {:.3f} ms".format(modelname, np.mean(tms), np.std(tms)))
else:
    backend_choice = "iree_llvmcpu (CPU)" #@param [ "iree_vmvx (CPU)", "iree_llvmcpu (CPU)", "iree_vulkan (GPU/SwiftShader)" ]
    backend_choice = backend_choice.split(" ")[0]
    backend = module_utils.BackendInfo(backend_choice)
    compiled_model = backend.compile_from_class(BertModule, ["predict"])
    t1_stop = time.perf_counter()
    print("**** Model {} - Total Time: {:.3f}s".format(modelname, t1_stop - t1_start)) # print performance indicator

    print("Performing benchmark...")
    print("Calculating forward latency:\n  ", end="")
    %timeit -n 100 compiled_model.predict(input_ids, token_type_ids)
    

INFO:tensorflow:Assets written to: /tmp/tmp0xzo9men.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmp0xzo9men.sm/assets


**** Model Bert - Total Time: 23.804s
Performing benchmark...
Calculating forward latency:
  68.3 ms ± 219 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [19]:
# Sanity-check one BERT forward pass on the sample inputs. The BERT cell binds
# `module` on its GPU path and `compiled_model` on its CPU path, so pick the
# handle that matches the GPU flag instead of assuming `module` exists.
bert_runner = module if GPU else compiled_model
bert_runner.predict(input_ids, token_type_ids)

<IREE DeviceArray: shape=[2, 3, 768], dtype=<class 'numpy.float32'>>

## GPU

In [3]:
import tensorflow as tf
import time
import iree
import numpy as np
from iree import runtime
from tensorflow.keras import backend
from iree.tf.support import module_utils
from vit_keras import vit
from keras_def import SequentialCNN, ConcatenatedCNN, NestedCNN, ShuffleNet, SqueezeNet_11, ResNet18

from tensorflow.keras.applications import ResNet50, ResNet50V2, MobileNetV3Small, DenseNet121, InceptionV3, VGG16
channels_first = True
GPU = True
model_list = {"ResNet18":ResNet18, "MobileNetV3":MobileNetV3Small, "ResNet50":ResNet50, "SqueezeNet":SqueezeNet_11, "ShuffleNet":ShuffleNet,
              "Inception3":InceptionV3, "DenseNet":DenseNet121, 
             "Vit":vit.vit_b16} 
input = np.random.uniform(low=0.0, high=1.0, size=(1, 3, 224, 224) if channels_first else (1, 224, 224, 3)).astype(np.float32)

INPUT_SHAPE = [1, 3, 224, 224] if channels_first else [1, 224, 224, 3]
backend.set_image_data_format('channels_first' if channels_first else 'channels_last')

for modelname, Model in model_list.items():
    
    if modelname == "Vit":
        model = Model(image_size=224, activation='relu', pretrained=False, include_top=True, pretrained_top=False, channel_first=channels_first)
    elif modelname == "SqueezeNet":
        model = Model(input_shape=tuple(INPUT_SHAPE[1:]), nb_classes=1000, channel_first=channels_first)
    elif modelname == "ShuffleNet":
        model = Model(include_top=True, input_shape=tuple(INPUT_SHAPE[1:]))
    elif modelname == "ResNet18":
        model = Model(classes=1000, input_shape=tuple(INPUT_SHAPE[1:]))
    else:
        model = Model(weights=None, include_top=True, input_shape=tuple(INPUT_SHAPE[1:]))
        
    t1_start = time.perf_counter()
    # Wrap the model in a tf.Module to compile it with IREE.
    class Module(tf.Module):
      def __init__(self):
        super(Module, self).__init__()
        self.m = model
        self.m.predict = lambda x: self.m.call(x, training=False)
        self.predict = tf.function(
            input_signature=[tf.TensorSpec(INPUT_SHAPE, tf.float32)])(model.predict)
    
    if GPU:
        binary = iree.compiler.tf.compile_module(
                Module(),
                target_backends=["cuda"],
                exported_names=["predict"])
        compiled_model = runtime.load_vm_flatbuffer(binary, driver="cuda")
    else:
        backend_choice = "iree_llvmcpu (CPU)" #@param [ "iree_vmvx (CPU)", "iree_llvmcpu (CPU)", "iree_vulkan (GPU/SwiftShader)" ]
        backend_choice = backend_choice.split(" ")[0]
        backend = module_utils.BackendInfo(backend_choice)
        compiled_model = backend.compile_from_class(Module, ["predict"])
    t1_stop = time.perf_counter()
    print("**** Model {} - Total Time: {:.3f}s".format(modelname, t1_stop - t1_start)) # print performance indicator

    print("Calculating forward latency:\n  ", end="")
    # tms = []
    # for i in range(10):
    #     ret = benchmark_module(module.vm_module, entry_functiong="predict", inputs=["1x3x224x224xf32=1"], device="cuda")
    #     tm = ret[0].time
    #     tms.append(float(tm[0:-3]))
    # print("{} - {:.3f} ± {:.3f} ms".format(modelname, np.mean(tms), np.std(tms)))
    %timeit -n 100 compiled_model.predict(input)

INFO:tensorflow:Assets written to: /tmp/tmp48pzv4te.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmp48pzv4te.sm/assets


**** Model ResNet18 - Total Time: 8.131s
Calculating forward latency:
  4.2 ms ± 27.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmprseqvwkc.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmprseqvwkc.sm/assets


**** Model MobileNetV3 - Total Time: 16.621s
Calculating forward latency:
  2.44 ms ± 46.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmpwi5kmuir.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpwi5kmuir.sm/assets


**** Model ResNet50 - Total Time: 17.069s
Calculating forward latency:
  7.17 ms ± 50.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmpcs2s15al.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpcs2s15al.sm/assets


**** Model SqueezeNet - Total Time: 5.435s
Calculating forward latency:
  2.24 ms ± 38.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmpcrchwiv4.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpcrchwiv4.sm/assets


**** Model ShuffleNet - Total Time: 15.593s
Calculating forward latency:
  3.37 ms ± 48 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmpyqa34p21.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpyqa34p21.sm/assets


**** Model Inception3 - Total Time: 27.603s
Calculating forward latency:
  11.7 ms ± 58.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmp5chpi6ja.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmp5chpi6ja.sm/assets


**** Model DenseNet - Total Time: 39.795s
Calculating forward latency:
  8.17 ms ± 120 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmpvskkpbfl.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpvskkpbfl.sm/assets


**** Model Vit - Total Time: 25.975s
Calculating forward latency:
  35.2 ms ± 119 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Performance of Ufront (Keras Models)

In [4]:
import tensorflow as tf
import time
import iree
from iree import runtime
import numpy as np
from tensorflow.keras import backend
from iree.tf.support import module_utils
from vit_keras import vit
from keras_def import SequentialCNN, ConcatenatedCNN, NestedCNN, ShuffleNet, SqueezeNet_11, ResNet18
from ufront.keras.model import UFrontKeras
import iree.compiler as ireec

from tensorflow.keras.applications import ResNet50, ResNet50V2, MobileNetV3Small, DenseNet121, InceptionV3, VGG16
channels_first = True
GPU = True
model_list = {"ResNet18":ResNet18, "MobileNetV3":MobileNetV3Small, "ResNet50":ResNet50, "SqueezeNet":SqueezeNet_11, "ShuffleNet":ShuffleNet,
              "Inception3":InceptionV3, "DenseNet":DenseNet121, 
             "Vit":vit.vit_b16} 


INPUT_SHAPE = [1, 3, 224, 224] if channels_first else [1, 224, 224, 3]
backend.set_image_data_format('channels_first' if channels_first else 'channels_last')
input = np.random.uniform(low=0.0, high=1.0, size=(1, 3, 224, 224) if channels_first else (1, 224, 224, 3)).astype(np.float32)

for modelname, Model in model_list.items():
    
    if modelname == "Vit":
        base_model = Model(image_size=224, activation='relu', pretrained=False, include_top=True, pretrained_top=False, channel_first=channels_first)
    elif modelname == "SqueezeNet":
        base_model = Model(input_shape=tuple(INPUT_SHAPE[1:]), nb_classes=1000, channel_first=channels_first)
    elif modelname == "ShuffleNet":
        base_model = Model(include_top=True, input_shape=tuple(INPUT_SHAPE[1:]))
    elif modelname == "ResNet18":
        base_model = Model(classes=1000, input_shape=tuple(INPUT_SHAPE[1:]))
    else:
        base_model = Model(weights=None, include_top=True, input_shape=tuple(INPUT_SHAPE[1:]))
        
    model_name = base_model.name

    transformer = True if model_name.find("Transformer") > 0 or model_name.find("vit") >= 0 else False
    t1_start = time.perf_counter()

    model = UFrontKeras(base_model, inputs = [input], batch_size = 1, transformer=transformer, pass_weights=True)

    if transformer:
      last_op = model.get_output_operator()
      output = model.umodel().softmax(input=last_op.get_output(0), name="softmax_out")

    model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                        loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])
    
    # modelir= model.dump_ir()

    tosa_ir= model.dump_tosa_ir()
    t1_stop = time.perf_counter()
    print("Compiling TOSA model...")
    if GPU:
        binary = ireec.compile_str(tosa_ir,
                        target_backends=["cuda"], 
                        input_type=ireec.InputType.TOSA)
        module = runtime.load_vm_flatbuffer(binary, driver="cuda")
    else:
        binary = ireec.compile_str(tosa_ir,
                        target_backends=["llvm-cpu"], 
                        input_type=ireec.InputType.TOSA)
        module = runtime.load_vm_flatbuffer(binary,backend="llvm-cpu") 

    t2_stop = time.perf_counter()
    print(modelname + "****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator

    %timeit -n 100 ufront_ret = module.forward(input)

    

2023-10-24 17:41:29.489666: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:41:29.489813: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2023-10-24 17:41:30.070867: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:41:30.071026: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
ResNet18****Ufront->TOSA Time: 2.586s, TOSA->Binary Time: 15.149s, Total Time: 17.734s
3.65 ms ± 22.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


2023-10-24 17:41:50.552071: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:41:50.552224: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2023-10-24 17:41:51.058447: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:41:51.058586: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
MobileNetV3****Ufront->TOSA Time: 3.936s, TOSA->Binary Time: 7.248s, Total Time: 11.184s
1.81 ms ± 25.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


2023-10-24 17:42:03.647954: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:42:03.648097: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2023-10-24 17:42:05.539141: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:42:05.539295: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
ResNet50****Ufront->TOSA Time: 6.498s, TOSA->Binary Time: 29.435s, Total Time: 35.933s
6.81 ms ± 25.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


2023-10-24 17:42:44.260989: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:42:44.261126: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2023-10-24 17:42:44.421330: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:42:44.421466: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
SqueezeNet****Ufront->TOSA Time: 0.616s, TOSA->Binary Time: 3.592s, Total Time: 4.208s
2.19 ms ± 39.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


2023-10-24 17:42:51.300706: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:42:51.300878: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2023-10-24 17:42:51.884569: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:42:51.884744: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
ShuffleNet****Ufront->TOSA Time: 2.824s, TOSA->Binary Time: 4.442s, Total Time: 7.265s
2.84 ms ± 50.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


2023-10-24 17:43:01.656539: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:43:01.656715: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2023-10-24 17:43:02.940418: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:43:02.940584: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
Inception3****Ufront->TOSA Time: 6.673s, TOSA->Binary Time: 28.789s, Total Time: 35.462s
10.8 ms ± 18.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


2023-10-24 17:43:46.701273: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:43:46.701442: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2023-10-24 17:43:48.062444: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:43:48.062608: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
DenseNet****Ufront->TOSA Time: 6.985s, TOSA->Binary Time: 16.366s, Total Time: 23.351s
7.11 ms ± 44.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


2023-10-24 17:44:17.513027: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:44:17.513197: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2023-10-24 17:44:23.491630: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2023-10-24 17:44:23.491811: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
Vit****Ufront->TOSA Time: 24.979s, TOSA->Binary Time: 5.934s, Total Time: 30.913s
30.1 ms ± 105 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
