In [1]:
# Report the GPU, driver version and CUDA version visible to this kernel.
!nvidia-smi

Wed May 22 15:00:55 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:21:00.0 Off |                  N/A |
|  0%   29C    P8    16W / 240W |     46MiB /  8192MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Install dependencies

In [2]:
# For CUDA 11
!pip install iree-compiler==20230524.529 iree-runtime==20230524.529 
!pip install iree-tools-tf==20230524.529  iree-tools-tflite==20230524.529

# For CUDA 12

# !pip install iree-compiler==20230815.614 iree-runtime==20230815.614
# !pip install iree-tools-tf==20230815.614  iree-tools-tflite==20230815.614
# fix issue of iree-benchmark-module for iree-compiler (v20230815.614), depend on the installation of IREE package
# ls /opt/conda/lib/python3.10/site-packages/iree/_runtime_libs/
# cp /opt/conda/lib/python3.10/site-packages/iree/_runtime_libs/iree-benchmark-module /opt/conda/lib/python3.10/site-packages/iree/runtime/


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting iree-tools-tflite==20230524.529
  Downloading iree_tools_tflite-20230524.529-py3-none-any.whl (3.2 kB)
Installing collected packages: iree-tools-tflite
Successfully installed iree-tools-tflite-20230524.529
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# Other dependencies for IREE-TF.
# Left commented out; uncomment the lines below to install them.
# !pip install tensorflow-cpu==2.13.0 
# !pip install tensorflow-addons
# !pip install validators
# !pip install scipy
# !pip install opencv-python
# !apt install libgl1 -y

In [3]:
# List the installed IREE packages to confirm the pinned versions.
!pip list | grep iree

iree-compiler                      20230524.529
iree-runtime                       20230524.529
iree-tools-tf                      20230524.529
iree-tools-tflite                  20230524.529

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
# List the installed TensorFlow packages and their versions.
!pip list | grep tensorflow

tensorflow-addons                  0.21.0
tensorflow-cpu                     2.13.0
tensorflow-datasets                4.5.2
tensorflow-estimator               2.13.0
tensorflow-io-gcs-filesystem       0.24.0
tensorflow-metadata                1.7.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# IREE-TF (CPU)

In [4]:
import tensorflow as tf
import time
import numpy as np
from tensorflow.keras import backend
from iree.tf.support import module_utils
from vit_keras import vit
from keras_def import SequentialCNN, ConcatenatedCNN, NestedCNN, ShuffleNet, SqueezeNet_11, ResNet18

from tensorflow.keras.applications import ResNet50, ResNet50V2, MobileNetV3Small, DenseNet121, InceptionV3, VGG16
channels_first = False

model_list = {"ResNet18":ResNet18, "MobileNetV3":MobileNetV3Small, "ResNet50":ResNet50, "SqueezeNet":SqueezeNet_11, "ShuffleNet":ShuffleNet,
              "Inception3":InceptionV3, "DenseNet":DenseNet121, 
             "Vit":vit.vit_b16}

INPUT_SHAPE = [1, 3, 224, 224] if channels_first else [1, 224, 224, 3]
backend.set_image_data_format('channels_first' if channels_first else 'channels_last')
x_train = np.random.uniform(low=0.0, high=1.0, size=(1, 3, 224, 224) if channels_first else (1, 224, 224, 3)).astype(np.float32)

for modelname, Model in model_list.items():
    
    if modelname == "Vit":
        model = Model(image_size=224, activation='relu', pretrained=False, include_top=True, pretrained_top=False, channel_first=channels_first)
    elif modelname == "SqueezeNet":
        model = Model(input_shape=tuple(INPUT_SHAPE[1:]), nb_classes=1000, channel_first=channels_first)
    elif modelname == "ShuffleNet":
        model = Model(include_top=True, input_shape=tuple(INPUT_SHAPE[1:]))
    elif modelname == "ResNet18":
        model = Model(classes=1000, input_shape=tuple(INPUT_SHAPE[1:]))
    else:
        model = Model(weights=None, include_top=True, input_shape=tuple(INPUT_SHAPE[1:]))
        
    t1_start = time.perf_counter()
    # Wrap the model in a tf.Module to compile it with IREE.
    class Module(tf.Module):
      def __init__(self):
        super(Module, self).__init__()
        self.m = model
        self.m.predict = lambda x: self.m.call(x, training=False)
        self.predict = tf.function(
            input_signature=[tf.TensorSpec(INPUT_SHAPE, tf.float32)])(model.predict)
    
    backend_choice = "iree_llvmcpu (CPU)" #@param [ "iree_vmvx (CPU)", "iree_llvmcpu (CPU)", "iree_vulkan (GPU/SwiftShader)" ]
    backend_choice = backend_choice.split(" ")[0]
    backend = module_utils.BackendInfo(backend_choice)
    compiled_model = backend.compile_from_class(Module, ["predict"])
    t1_stop = time.perf_counter()
    print("**** Model {} - Total Time: {:.3f}s".format(modelname, t1_stop - t1_start)) # print performance indicator

    print("Performing benchmark...")
    print("Calculating forward latency:\n  ", end="")
    %timeit -n 100 compiled_model.predict(x_train)

INFO:tensorflow:Assets written to: /tmp/tmpnski96_3.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpnski96_3.sm/assets


**** Model MobileNetV3 - Total Time: 22.685s
Performing benchmark...
Calculating forward latency:
  77.8 ms ± 299 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmpeopriwdk.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpeopriwdk.sm/assets


**** Model ResNet50 - Total Time: 18.991s
Performing benchmark...
Calculating forward latency:
  204 ms ± 305 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmp2i93g1jp.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmp2i93g1jp.sm/assets


**** Model SqueezeNet - Total Time: 6.337s
Performing benchmark...
Calculating forward latency:
  82.1 ms ± 466 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmpt9b_fdzd.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpt9b_fdzd.sm/assets


**** Model ShuffleNet - Total Time: 17.519s
Performing benchmark...
Calculating forward latency:
  83.5 ms ± 336 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmp9lb0taga.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmp9lb0taga.sm/assets


**** Model Inception3 - Total Time: 30.343s
Performing benchmark...
Calculating forward latency:
  166 ms ± 798 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmpu0959bvn.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpu0959bvn.sm/assets


**** Model DenseNet - Total Time: 44.449s
Performing benchmark...
Calculating forward latency:
  178 ms ± 836 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
INFO:tensorflow:Assets written to: /tmp/tmpkzhyg36a.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpkzhyg36a.sm/assets


**** Model Vit - Total Time: 25.910s
Performing benchmark...
Calculating forward latency:
  728 ms ± 1.55 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


# IREE-TF (GPU)

#### Vision Models

In [1]:
import tensorflow as tf
import time
import iree
import numpy as np
from iree import runtime
from tensorflow.keras import backend
from iree.tf.support import module_utils
from vit_keras import vit
from keras_def import SequentialCNN, ConcatenatedCNN, NestedCNN, ShuffleNet, SqueezeNet_11, ResNet18
from benchmark import benchmark_module
from tensorflow.keras.applications import ResNet50, ResNet50V2, MobileNetV3Small, DenseNet121, InceptionV3, VGG16
# Benchmark every model in `model_list` with IREE-TF on the CUDA backend:
# compile each Keras model to an IREE binary, report the compile time, then
# run the compiled `predict` entry point through `benchmark_module`.
channels_first = True
# Display name -> constructor for each architecture to benchmark.
model_list = {"ResNet18":ResNet18, "MobileNetV3":MobileNetV3Small, "ResNet50":ResNet50, "SqueezeNet":SqueezeNet_11, "ShuffleNet":ShuffleNet,
              "Inception3":InceptionV3, "DenseNet":DenseNet121, 
             "Vit":vit.vit_b16} 

INPUT_SHAPE = [1, 3, 224, 224] if channels_first else [1, 224, 224, 3]
backend.set_image_data_format('channels_first' if channels_first else 'channels_last')

# NOTE(review): `input` shadows the Python builtin and is not read anywhere
# in this cell; it is kept so the notebook namespace stays as it was.
input = np.random.uniform(low=0.0, high=1.0, size=tuple(INPUT_SHAPE)).astype(np.float32)

# Input description handed to `benchmark_module`, derived from INPUT_SHAPE so
# it follows `channels_first` ("1x3x224x224xf32=1" for channels-first).
benchmark_input = "x".join(str(dim) for dim in INPUT_SHAPE) + "xf32=1"

for modelname, Model in model_list.items():

    # The constructors do not share a signature, so each family is built
    # with its own keyword arguments.
    if modelname == "Vit":
        model = Model(image_size=224, activation='relu', pretrained=False, include_top=True, pretrained_top=False, channel_first=channels_first)
    elif modelname == "SqueezeNet":
        model = Model(input_shape=tuple(INPUT_SHAPE[1:]), nb_classes=1000, channel_first=channels_first)
    elif modelname == "ShuffleNet":
        model = Model(include_top=True, input_shape=tuple(INPUT_SHAPE[1:]))
    elif modelname == "ResNet18":
        model = Model(classes=1000, input_shape=tuple(INPUT_SHAPE[1:]))
    else:
        model = Model(weights=None, include_top=True, input_shape=tuple(INPUT_SHAPE[1:]))

    # The timer starts after model construction: "Total Time" covers the
    # wrapping and compilation steps only.
    t1_start = time.perf_counter()

    class Module(tf.Module):
        """Wrap the current `model` in a tf.Module so IREE can compile it.

        `predict` is exported as a tf.function with a fixed float32
        signature of shape INPUT_SHAPE.
        """

        def __init__(self):
            super(Module, self).__init__()
            self.m = model
            # Replace the Keras `predict` attribute with a direct forward
            # call in inference mode (training=False).
            self.m.predict = lambda x: self.m.call(x, training=False)

        # The signature uses INPUT_SHAPE instead of a literal shape, so it
        # stays consistent with the layout the model was built with.
        @tf.function(input_signature=[tf.TensorSpec(shape=INPUT_SHAPE, dtype=tf.float32)])
        def predict(self, x):
            return self.m.predict(x)

    binary = iree.compiler.tf.compile_module(
            Module(),
            target_backends=["cuda"],
            exported_names=["predict"])

    t1_stop = time.perf_counter()
    print("**** Model {} - Total Time: {:.3f}s".format(modelname, t1_stop - t1_start)) # print performance indicator

    print("Calculating forward latency:\n  ", end="")
    module = runtime.load_vm_flatbuffer(binary, driver="cuda")
    tms = []
    for i in range(10):
        ret = benchmark_module(module.vm_module, entry_function="predict", inputs=[benchmark_input], device="cuda")
        tm = ret[0].time
        # Assumes `time` reads "<value> <unit>" with the unit in ms, as the
        # printed label implies - TODO confirm against the `benchmark` module.
        tms.append(float(tm.split()[0]))
    print("{} - {:.3f} ± {:.3f} ms".format(modelname, np.mean(tms), np.std(tms)))

2024-05-21 18:01:11.822700: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



INFO:tensorflow:Assets written to: /tmp/tmps93v7um8.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmps93v7um8.sm/assets
2024-05-21 18:01:18.248501: I tensorflow/cc/saved_model/bundle_v2.cc:44] Reading SavedModel from: /tmp/tmps93v7um8.sm
2024-05-21 18:01:18.259966: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmps93v7um8.sm
2024-05-21 18:01:19.107231: W tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc:81] no entry function is found


**** Model ResNet18 - Total Time: 7.177s
Calculating forward latency:
  ResNet18 - 3.232 ± 0.009 ms
INFO:tensorflow:Assets written to: /tmp/tmpd4x5z3_7.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpd4x5z3_7.sm/assets
2024-05-21 18:01:45.498474: I tensorflow/cc/saved_model/bundle_v2.cc:44] Reading SavedModel from: /tmp/tmpd4x5z3_7.sm
2024-05-21 18:01:45.537594: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmpd4x5z3_7.sm
2024-05-21 18:01:46.395873: W tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc:81] no entry function is found


**** Model MobileNetV3 - Total Time: 15.049s
Calculating forward latency:
  MobileNetV3 - 1.520 ± 0.000 ms
INFO:tensorflow:Assets written to: /tmp/tmp7lauxz4e.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmp7lauxz4e.sm/assets
2024-05-21 18:02:13.745904: I tensorflow/cc/saved_model/bundle_v2.cc:44] Reading SavedModel from: /tmp/tmp7lauxz4e.sm
2024-05-21 18:02:13.807671: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmp7lauxz4e.sm
2024-05-21 18:02:15.882209: W tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc:81] no entry function is found


**** Model ResNet50 - Total Time: 19.173s
Calculating forward latency:
  ResNet50 - 6.133 ± 0.016 ms
INFO:tensorflow:Assets written to: /tmp/tmpsueqp2un.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpsueqp2un.sm/assets
2024-05-21 18:02:36.367478: I tensorflow/cc/saved_model/bundle_v2.cc:44] Reading SavedModel from: /tmp/tmpsueqp2un.sm
2024-05-21 18:02:36.377542: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmpsueqp2un.sm
2024-05-21 18:02:36.595692: W tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc:81] no entry function is found


**** Model SqueezeNet - Total Time: 3.894s
Calculating forward latency:
  SqueezeNet - 1.390 ± 0.000 ms
INFO:tensorflow:Assets written to: /tmp/tmpg2_qwvfp.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpg2_qwvfp.sm/assets
2024-05-21 18:03:00.231870: I tensorflow/cc/saved_model/bundle_v2.cc:44] Reading SavedModel from: /tmp/tmpg2_qwvfp.sm
2024-05-21 18:03:00.276356: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmpg2_qwvfp.sm
2024-05-21 18:03:01.214831: W tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc:81] no entry function is found


**** Model ShuffleNet - Total Time: 15.533s
Calculating forward latency:
  ShuffleNet - 2.480 ± 0.000 ms
INFO:tensorflow:Assets written to: /tmp/tmpvkbxwqx7.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpvkbxwqx7.sm/assets
2024-05-21 18:03:35.728424: I tensorflow/cc/saved_model/bundle_v2.cc:44] Reading SavedModel from: /tmp/tmpvkbxwqx7.sm
2024-05-21 18:03:35.813632: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmpvkbxwqx7.sm
2024-05-21 18:03:38.444018: W tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc:81] no entry function is found


**** Model Inception3 - Total Time: 28.349s
Calculating forward latency:
  Inception3 - 11.900 ± 0.000 ms
INFO:tensorflow:Assets written to: /tmp/tmp3db1ueu5.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmp3db1ueu5.sm/assets
2024-05-21 18:04:28.626842: I tensorflow/cc/saved_model/bundle_v2.cc:44] Reading SavedModel from: /tmp/tmp3db1ueu5.sm
2024-05-21 18:04:28.755830: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmp3db1ueu5.sm
2024-05-21 18:04:31.389322: W tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc:81] no entry function is found


**** Model DenseNet - Total Time: 41.106s
Calculating forward latency:
  DenseNet - 7.166 ± 0.007 ms
INFO:tensorflow:Assets written to: /tmp/tmporxwsvkb.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmporxwsvkb.sm/assets
2024-05-21 18:05:16.128203: I tensorflow/cc/saved_model/bundle_v2.cc:44] Reading SavedModel from: /tmp/tmporxwsvkb.sm
2024-05-21 18:05:16.247636: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmporxwsvkb.sm
2024-05-21 18:05:22.984023: W tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc:81] no entry function is found


**** Model Vit - Total Time: 34.160s
Calculating forward latency:
  Vit - 33.380 ± 0.087 ms


#### Bert

In [3]:
from bert import BertModelLayer
from tensorflow import keras
import tensorflow as tf
import numpy as np
from ufront.keras.model import UFrontKeras
import iree.compiler as ireec
from iree import runtime
import time
from tensorflow.keras import backend
from iree.tf.support import module_utils
import iree
GPU = True

l_bert = BertModelLayer(**BertModelLayer.Params(
  vocab_size               = 16000,        # embedding params
  use_token_type           = True,
  use_position_embeddings  = True,
  token_type_vocab_size    = 16000,

  num_layers               = 12,           # transformer encoder params
  hidden_size              = 768,
  hidden_dropout           = 0.1,
  intermediate_size        = 4*768,
  intermediate_activation  = "gelu",

  adapter_size             = None,         # see arXiv:1902.00751 (adapter-BERT)

  shared_layer             = False,        # True for ALBERT (arXiv:1909.11942)
  embedding_size           = None,         # None for BERT, wordpiece embedding size for ALBERT
  num_heads = 12,
  # name                     = "bert"        # any other Keras layer params
))

input_ids = np.array([[31, 51, 99], [15, 5, 0]], dtype='int32')
input_mask = np.array([[1, 1, 1], [1, 1, 0]], dtype='int32')
token_type_ids = np.array([[0, 0, 1], [0, 1, 0]], dtype='int32')

INPUT_SHAPE = [2, 3]
max_seq_len = 3
l_input_ids      = keras.layers.Input(shape=(max_seq_len,), dtype='int32')
l_token_type_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32')

output = l_bert([l_input_ids, l_token_type_ids])          # [batch_size, max_seq_len, hidden_size]
model = keras.Model(inputs=[l_input_ids, l_token_type_ids], outputs=output)

t1_start = time.perf_counter()


modelname = "Bert"
BATCH_SIZE = INPUT_SHAPE[0]
SEQUENCE_LENGTH = INPUT_SHAPE[1]
bert_input = [tf.TensorSpec(shape=[BATCH_SIZE,SEQUENCE_LENGTH],dtype=tf.int32),
            tf.TensorSpec(shape=[BATCH_SIZE,SEQUENCE_LENGTH], dtype=tf.int32)]
class BertModule(tf.Module):
    def __init__(self):
        super(BertModule, self).__init__()
        dict_outputs = False
        self.m = model
        self.m.predict = lambda x: self.m.call(x, training=False)
        
    @tf.function(input_signature=bert_input)
    def predict(self,input_word_ids, segment_ids):
        inputs = [input_word_ids, segment_ids]
        return self.m.predict(inputs)
if GPU:
    binary = iree.compiler.tf.compile_module(
            BertModule(),
            target_backends=["cuda"],
            exported_names=["predict"])
    
    t1_stop = time.perf_counter()
    print("**** Model {} - Total Time: {:.3f}s".format(modelname, t1_stop - t1_start)) # print performance indicator

    
    module = runtime.load_vm_flatbuffer(binary, driver="cuda")
    print("Calculating forward latency:\n  ", end="")
    %timeit -n 100 module.predict(input_ids, token_type_ids)
else:
    backend_choice = "iree_llvmcpu (CPU)" #@param [ "iree_vmvx (CPU)", "iree_llvmcpu (CPU)", "iree_vulkan (GPU/SwiftShader)" ]
    backend_choice = backend_choice.split(" ")[0]
    backend = module_utils.BackendInfo(backend_choice)
    compiled_model = backend.compile_from_class(BertModule, ["predict"])
    t1_stop = time.perf_counter()
    print("**** Model {} - Total Time: {:.3f}s".format(modelname, t1_stop - t1_start)) # print performance indicator

    print("Performing benchmark...")
    print("Calculating forward latency:\n  ", end="")
    %timeit -n 100 compiled_model.predict(input_ids, token_type_ids)
    

INFO:tensorflow:Assets written to: /tmp/tmpfnz6cdfj.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmpfnz6cdfj.sm/assets
2024-05-21 18:10:25.461128: I tensorflow/cc/saved_model/bundle_v2.cc:44] Reading SavedModel from: /tmp/tmpfnz6cdfj.sm
2024-05-21 18:10:25.542762: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmpfnz6cdfj.sm
2024-05-21 18:10:33.175330: W tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc:81] no entry function is found


**** Model Bert - Total Time: 33.487s
Calculating forward latency:
  3.34 ms ± 57.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### LSTM

In [4]:
import time
import iree
import numpy as np
from iree import runtime
import numpy as np
import tensorflow as tf
from iree.compiler import tools
from benchmark import benchmark_module
from tensorflow.keras import backend
from iree.tf.support import module_utils
from keras_def import KerasLSTM

channels_first = True
modelname = "LSTM"  
batch_size = 8
hidden_size = 128
seq_size = 32
input_size = 256

t1_start = time.perf_counter()
INPUT_SHAPE = (batch_size, seq_size,hidden_size)
input = np.random.randn(batch_size, seq_size,hidden_size).astype(np.float32)
h0 = np.zeros((batch_size, hidden_size), dtype=np.float32)
c0 = np.zeros((batch_size, hidden_size), dtype=np.float32)
model = KerasLSTM(input_shape=input.shape[1:], seq_size = seq_size, hidden_size = hidden_size)
# Wrap the model in a tf.Module to compile it with IREE.
class Module(tf.Module):
    def __init__(self):
        super(Module, self).__init__()
        self.m = model
        self.m.predict = lambda x: self.m.call(x, training=False)
        self.predict = tf.function(
            input_signature=[tf.TensorSpec(INPUT_SHAPE, tf.float32)])(model.predict)

binary = iree.compiler.tf.compile_module(
        Module(),
        target_backends=["cuda"],
        exported_names=["predict"])

t1_stop = time.perf_counter()
print("**** Model {} - Total Time: {:.3f}s".format(modelname, t1_stop - t1_start)) # print performance indicator
compiled_model = runtime.load_vm_flatbuffer(binary, driver="cuda")

print("Calculating forward latency:\n  ", end="")
%timeit -n 100 compiled_model.predict(input)

INFO:tensorflow:Assets written to: /tmp/tmp8yzyoaw9.sm/assets


INFO:tensorflow:Assets written to: /tmp/tmp8yzyoaw9.sm/assets
2024-05-21 18:11:38.181184: I tensorflow/cc/saved_model/bundle_v2.cc:44] Reading SavedModel from: /tmp/tmp8yzyoaw9.sm
2024-05-21 18:11:38.250725: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/tmp8yzyoaw9.sm
2024-05-21 18:11:39.588854: W tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc:81] no entry function is found


**** Model LSTM - Total Time: 14.603s
Calculating forward latency:
  1.69 ms ± 41.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# UFront Test (GPU)

#### Vision Models

In [3]:
import tensorflow as tf
import time
import iree
from iree import runtime
import numpy as np
from tensorflow.keras import backend
from iree.tf.support import module_utils
from vit_keras import vit
from keras_def import SequentialCNN, ConcatenatedCNN, NestedCNN, ShuffleNet, SqueezeNet_11, ResNet18
from ufront.keras.model import UFrontKeras
import iree.compiler as ireec
from benchmark import benchmark_module

from tensorflow.keras.applications import ResNet50, ResNet50V2, MobileNetV3Small, DenseNet121, InceptionV3, VGG16
# Benchmark every model in `model_list` through UFront on the CUDA backend:
# convert each Keras model to TOSA IR with UFrontKeras, compile the IR with
# IREE, report both timings, then run `forward` through `benchmark_module`.
channels_first = True
GPU = True   # not read in this cell
# Display name -> constructor for each architecture to benchmark.
model_list = {"ResNet18":ResNet18, "MobileNetV3":MobileNetV3Small, "ResNet50":ResNet50, "SqueezeNet":SqueezeNet_11, "ShuffleNet":ShuffleNet,
              "Inception3":InceptionV3, "DenseNet":DenseNet121, 
             "Vit":vit.vit_b16} 


INPUT_SHAPE = [1, 3, 224, 224] if channels_first else [1, 224, 224, 3]
backend.set_image_data_format('channels_first' if channels_first else 'channels_last')
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# it is passed to UFrontKeras below.
input = np.random.uniform(low=0.0, high=1.0, size=tuple(INPUT_SHAPE)).astype(np.float32)

# Input description handed to `benchmark_module`, derived from INPUT_SHAPE so
# it follows `channels_first` ("1x3x224x224xf32=1" for channels-first).
benchmark_input = "x".join(str(dim) for dim in INPUT_SHAPE) + "xf32=1"

for modelname, Model in model_list.items():

    # The constructors do not share a signature, so each family is built
    # with its own keyword arguments.
    if modelname == "Vit":
        base_model = Model(image_size=224, activation='relu', pretrained=False, include_top=True, pretrained_top=False, channel_first=channels_first)
    elif modelname == "SqueezeNet":
        base_model = Model(input_shape=tuple(INPUT_SHAPE[1:]), nb_classes=1000, channel_first=channels_first)
    elif modelname == "ShuffleNet":
        base_model = Model(include_top=True, input_shape=tuple(INPUT_SHAPE[1:]))
    elif modelname == "ResNet18":
        base_model = Model(classes=1000, input_shape=tuple(INPUT_SHAPE[1:]))
    else:
        base_model = Model(weights=None, include_top=True, input_shape=tuple(INPUT_SHAPE[1:]))

    model_name = base_model.name

    # Substring test: unlike `find(...) > 0`, this also matches a model name
    # that starts with "Transformer".
    transformer = "Transformer" in model_name or "vit" in model_name

    # The timer starts after the Keras model is built: it covers the UFront
    # conversion and the IREE compilation only.
    t1_start = time.perf_counter()

    model = UFrontKeras(base_model, inputs = [input], batch_size = 1, transformer=transformer, pass_weights=True)

    if transformer:
        # Append a softmax on the output of the last operator.
        last_op = model.get_output_operator()
        output = model.umodel().softmax(input=last_op.get_output(0), name="softmax_out")

    model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                        loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])

    tosa_ir = model.dump_tosa_ir()
    t1_stop = time.perf_counter()      # end of the UFront -> TOSA stage
    print("Compiling TOSA model...")
    binary = ireec.compile_str(tosa_ir,
                    target_backends=["cuda"], 
                    input_type=ireec.InputType.TOSA)

    t2_stop = time.perf_counter()      # end of the TOSA -> binary stage
    print(modelname + "****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator

    module = runtime.load_vm_flatbuffer(binary, driver="cuda")
    tms = []
    for i in range(10):
        ret = benchmark_module(module.vm_module, entry_function="forward", inputs=[benchmark_input], device="cuda")
        tm = ret[0].time
        # Assumes `time` reads "<value> <unit>" with the unit in ms, as the
        # printed label implies - TODO confirm against the `benchmark` module.
        tms.append(float(tm.split()[0]))
    print("{} - {:.3f} ± {:.3f} ms".format(modelname, np.mean(tms), np.std(tms)))

    

2024-05-21 16:06:07.625910: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:06:07.626056: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2024-05-21 16:06:08.152926: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:06:08.153075: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
ResNet18****Ufront->TOSA Time: 2.403s, TOSA->Binary Time: 1.935s, Total Time: 4.337s
ResNet18 - 2.878 ± 0.004 ms


2024-05-21 16:06:27.413748: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:06:27.413900: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2024-05-21 16:06:27.870573: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:06:27.870714: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
MobileNetV3****Ufront->TOSA Time: 2.591s, TOSA->Binary Time: 2.500s, Total Time: 5.091s
MobileNetV3 - 1.151 ± 0.003 ms


2024-05-21 16:06:45.077633: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:06:45.077795: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2024-05-21 16:06:46.021675: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:06:46.021807: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
ResNet50****Ufront->TOSA Time: 5.025s, TOSA->Binary Time: 3.051s, Total Time: 8.076s
ResNet50 - 6.045 ± 0.025 ms


2024-05-21 16:07:06.529522: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:07:06.529687: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2024-05-21 16:07:06.671457: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:07:06.671596: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
SqueezeNet****Ufront->TOSA Time: 0.902s, TOSA->Binary Time: 1.601s, Total Time: 2.503s
SqueezeNet - 1.131 ± 0.003 ms


2024-05-21 16:07:21.089181: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:07:21.089381: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2024-05-21 16:07:21.558486: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:07:21.558631: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
ShuffleNet****Ufront->TOSA Time: 2.285s, TOSA->Binary Time: 1.965s, Total Time: 4.250s
ShuffleNet - 2.220 ± 0.000 ms


2024-05-21 16:07:39.019604: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:07:39.019760: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2024-05-21 16:07:40.088663: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:07:40.088818: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
Inception3****Ufront->TOSA Time: 6.085s, TOSA->Binary Time: 4.092s, Total Time: 10.177s
Inception3 - 11.420 ± 0.040 ms


2024-05-21 16:08:04.820175: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:08:04.820336: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2024-05-21 16:08:06.242510: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:08:06.242697: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
DenseNet****Ufront->TOSA Time: 6.415s, TOSA->Binary Time: 4.439s, Total Time: 10.855s
DenseNet - 6.551 ± 0.033 ms


2024-05-21 16:08:29.261620: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:08:29.261780: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2024-05-21 16:08:34.388363: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:08:34.388528: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


Compiling TOSA model...
Vit****Ufront->TOSA Time: 23.300s, TOSA->Binary Time: 4.824s, Total Time: 28.124s
Vit - 29.000 ± 0.063 ms


#### Bert

In [1]:
from bert import BertModelLayer
from tensorflow import keras
import numpy as np
from ufront.keras.model import UFrontKeras
import iree.compiler as ireec
from iree import runtime
import time

l_bert = BertModelLayer(**BertModelLayer.Params(
  vocab_size               = 16000,        # embedding params
  use_token_type           = True,
  use_position_embeddings  = True,
  token_type_vocab_size    = 16000,

  num_layers               = 12,           # transformer encoder params
  hidden_size              = 768,
  hidden_dropout           = 0.1,
  intermediate_size        = 4*768,
  intermediate_activation  = "gelu",

  adapter_size             = None,         # see arXiv:1902.00751 (adapter-BERT)

  shared_layer             = False,        # True for ALBERT (arXiv:1909.11942)
  embedding_size           = None,         # None for BERT, wordpiece embedding size for ALBERT
  num_heads = 12,
  # name                     = "bert"        # any other Keras layer params
))

input_ids = np.array([[31, 51, 99], [15, 5, 0]], dtype='int32')
input_mask = np.array([[1, 1, 1], [1, 1, 0]], dtype='int32')
token_type_ids = np.array([[0, 0, 1], [0, 1, 0]], dtype='int32')

max_seq_len = 3
l_input_ids      = keras.layers.Input(shape=(max_seq_len,), dtype='int32')
l_token_type_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32')

output = l_bert([l_input_ids, l_token_type_ids])          # [batch_size, max_seq_len, hidden_size]
net = keras.Model(inputs=[l_input_ids, l_token_type_ids], outputs=output)

t1_start = time.perf_counter()
model = UFrontKeras(net, inputs = [input_ids, token_type_ids], batch_size = 1, transformer=True, pass_weights=True)

model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                        loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])


print("Compiling TOSA model...")
tosa_ir= model.dump_tosa_ir()
t1_stop = time.perf_counter()

print("Compiling Binary...")
binary = ireec.compile_str(tosa_ir,
                target_backends=["cuda"], 
                input_type=ireec.InputType.TOSA)
t2_stop = time.perf_counter()

print("Bert****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator
module = runtime.load_vm_flatbuffer(binary, driver="cuda")
%timeit -n 100 module.forward(input_ids, token_type_ids)


2024-05-21 16:26:18.516202: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Could not search for non-variable resources. Concrete function internal representation may have changed.
2024-05-21 16:26:24.396774: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:26:24.396970: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2024-05-21 16:26:31.450491: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-21 16:26:31.450689: I tensorflow/core/grappler/clusters/sin

Compiling TOSA model...
Compiling Binary...
Bert****Ufront->TOSA Time: 27.892s, TOSA->Binary Time: 4.408s, Total Time: 32.300s
3.29 ms ± 41.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### LSTM

In [1]:
import iree.compiler as ireec
from iree import runtime
import time
import numpy as np
from ufront.keras.model import UFrontKeras
from keras_def import KerasLSTM
batch_size = 8
hidden_size = 128
seq_size = 32
input_size = 256
input = np.random.randn(batch_size, seq_size,hidden_size).astype(np.float32)
h0 = np.zeros((batch_size, hidden_size), dtype=np.float32)
c0 = np.zeros((batch_size, hidden_size), dtype=np.float32)
t1_start = time.perf_counter()
lstm = KerasLSTM(input_shape=input.shape[1:], seq_size = seq_size, hidden_size = hidden_size)
model = UFrontKeras(lstm, inputs = [input], batch_size = 1, pass_weights=True)
output_tensors = model(inputs = [input])

#This will trigger model compilation, i.e., convert Rust computation graph to a unified high-level IR and lower it to TOSA IR
model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                      loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])


# modelir= model.dump_ir()
# print(modelir)

tosa_ir = model.dump_tosa_ir()
t1_stop = time.perf_counter()
binary = ireec.compile_str(tosa_ir,
                target_backends=["cuda"], 
                input_type=ireec.InputType.TOSA)
t2_stop = time.perf_counter()
print("LSTM****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator

module = runtime.load_vm_flatbuffer(binary, driver="cuda")
%timeit -n 100 module.forward(input)

2024-05-22 16:27:03.095086: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Could not search for non-variable resources. Concrete function internal representation may have changed.
2024-05-22 16:27:06.569871: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-22 16:27:06.570062: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2024-05-22 16:27:08.334241: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
2024-05-22 16:27:08.334436: I tensorflow/core/grappler/clusters/sin

LSTM****Ufront->TOSA Time: 6.917s, TOSA->Binary Time: 1.008s, Total Time: 7.925s
1.61 ms ± 28.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
