In [1]:
import pathlib
import time
import torch
from torchvision.models import resnet18, resnet50, squeezenet1_1, regnet_x_32gf, maxvit_t, shufflenet_v2_x1_5, inception_v3, mobilenet_v3_small, efficientnet_v2_s, densenet121, convnext_small
import torchvision.models as models
from ufront.pytorch.model import UFrontTorch
import argparse
import ctypes
from iree.compiler import tools
from iree import runtime
import iree.runtime as ireert
import iree.compiler as ireec
from typing import Optional

import torch
import iree.runtime as ireert
import iree.compiler as ireec

In [None]:
# !pip install iree-compiler==20230512.517 iree-runtime==20230512.517 -f https://openxla.github.io/iree/pip-release-links.html


In [4]:
# Benchmark cell: for every model below, measure
#   (1) torch module -> UFront graph -> TOSA IR   ("Ufront->TOSA Time")
#   (2) TOSA IR -> IREE binary for llvm-cpu       ("TOSA->Binary Time")
batch_size = 1
# NOTE(review): `input` shadows the Python builtin of the same name; the name is
# kept unchanged here so nothing else in the notebook is affected.
input = torch.ones((batch_size, 3, 224, 224), dtype=torch.float32)

# Randomly initialised networks only (no checkpoint download).
# `weights=None` is the torchvision multi-weight spelling of "no pretrained
# weights". The former `pretrained=False` / `weights=False` arguments were only
# accepted through torchvision's deprecated legacy-argument handling.
model_list = {
    "MobileNetV3": mobilenet_v3_small(weights=None),
    "ShuffleNetV2": shufflenet_v2_x1_5(weights=None),
    "ResNet18": resnet18(weights=None),
    "ResNet50": resnet50(weights=None),
    "SqueezeNet": squeezenet1_1(weights=None),
    "DenseNet121": densenet121(weights=None),
    "InceptionV3": inception_v3(weights=None),
    "ViT_B16": models.vision_transformer.vit_b_16(weights=None, dropout=0.1),
}

for modelname, net in model_list.items():
    net.eval()  # inference mode (equivalent to net.train(False))

    t1_start = time.perf_counter()

    # Convert the torch model to a UFront model.
    model = UFrontTorch(net, batch_size=batch_size, pass_weights=True)
    # Calling the wrapper triggers the Rust frontend, which performs the actual
    # model conversion and graph building. Operators can also be managed from
    # the Python side (each one corresponds to an operator in the Rust graph).
    output_tensors = model(inputs=[input])

    # The forward pass has not been executed at this point; this only appends a
    # softmax operator to the graph. Transformer-style models are skipped.
    if model.model.__class__.__name__ not in ["MaxVit", "SwinTransformer", "VisionTransformer", "MultiHeadAttention"]:
        output = model.softmax(input=output_tensors[0], name="softmax_out")

    # Triggers model compilation: the Rust computation graph is converted to a
    # unified high-level IR and lowered to TOSA IR.
    model.compile(
        optimizer={"type": "sgd", "lr": "0.01", "momentum": "0", "nesterov": "False", "weight_decay": "0"},
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy', 'sparse_categorical_crossentropy'],
    )
    tosa_ir = model.dump_tosa_ir()

    t1_stop = time.perf_counter()

    # Compile the TOSA IR to an IREE binary for the CPU backend.
    compiled_module = ireec.compile_str(
        tosa_ir,
        target_backends=["llvm-cpu"],
        input_type=ireec.InputType.TOSA,
    )

    t2_stop = time.perf_counter()

    # Performance summary for this model.
    print(modelname + "****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start))


MobileNetV3****Ufront->TOSA Time: 0.260s, TOSA->Binary Time: 6.524s, Total Time: 6.784s
ShuffleNetV2****Ufront->TOSA Time: 0.373s, TOSA->Binary Time: 4.092s, Total Time: 4.465s
ResNet18****Ufront->TOSA Time: 0.410s, TOSA->Binary Time: 2.868s, Total Time: 3.278s
ResNet50****Ufront->TOSA Time: 0.904s, TOSA->Binary Time: 6.228s, Total Time: 7.132s
SqueezeNet****Ufront->TOSA Time: 0.076s, TOSA->Binary Time: 3.409s, Total Time: 3.485s
DenseNet121****Ufront->TOSA Time: 1.260s, TOSA->Binary Time: 17.486s, Total Time: 18.746s
InceptionV3****Ufront->TOSA Time: 1.103s, TOSA->Binary Time: 7.922s, Total Time: 9.025s
ViT_B16****Ufront->TOSA Time: 2.637s, TOSA->Binary Time: 5.656s, Total Time: 8.293s


In [2]:
# NOTE(review): this cell repeats the compile-time benchmark cell that appears
# earlier in the notebook (same models, same measurements); it only yields a
# second set of timings. Consider keeping a single copy.
#
# For every model below, measure
#   (1) torch module -> UFront graph -> TOSA IR   ("Ufront->TOSA Time")
#   (2) TOSA IR -> IREE binary for llvm-cpu       ("TOSA->Binary Time")
batch_size = 1
# NOTE(review): `input` shadows the Python builtin of the same name; the name is
# kept unchanged here so nothing else in the notebook is affected.
input = torch.ones((batch_size, 3, 224, 224), dtype=torch.float32)

# Randomly initialised networks only (no checkpoint download).
# `weights=None` is the torchvision multi-weight spelling of "no pretrained
# weights". The former `pretrained=False` / `weights=False` arguments were only
# accepted through torchvision's deprecated legacy-argument handling.
model_list = {
    "MobileNetV3": mobilenet_v3_small(weights=None),
    "ShuffleNetV2": shufflenet_v2_x1_5(weights=None),
    "ResNet18": resnet18(weights=None),
    "ResNet50": resnet50(weights=None),
    "SqueezeNet": squeezenet1_1(weights=None),
    "DenseNet121": densenet121(weights=None),
    "InceptionV3": inception_v3(weights=None),
    "ViT_B16": models.vision_transformer.vit_b_16(weights=None, dropout=0.1),
}

for modelname, net in model_list.items():
    net.eval()  # inference mode (equivalent to net.train(False))

    t1_start = time.perf_counter()

    # Convert the torch model to a UFront model.
    model = UFrontTorch(net, batch_size=batch_size, pass_weights=True)
    # Calling the wrapper triggers the Rust frontend, which performs the actual
    # model conversion and graph building. Operators can also be managed from
    # the Python side (each one corresponds to an operator in the Rust graph).
    output_tensors = model(inputs=[input])

    # The forward pass has not been executed at this point; this only appends a
    # softmax operator to the graph. Transformer-style models are skipped.
    if model.model.__class__.__name__ not in ["MaxVit", "SwinTransformer", "VisionTransformer", "MultiHeadAttention"]:
        output = model.softmax(input=output_tensors[0], name="softmax_out")

    # Triggers model compilation: the Rust computation graph is converted to a
    # unified high-level IR and lowered to TOSA IR.
    model.compile(
        optimizer={"type": "sgd", "lr": "0.01", "momentum": "0", "nesterov": "False", "weight_decay": "0"},
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy', 'sparse_categorical_crossentropy'],
    )
    tosa_ir = model.dump_tosa_ir()

    t1_stop = time.perf_counter()

    # Compile the TOSA IR to an IREE binary for the CPU backend.
    compiled_module = ireec.compile_str(
        tosa_ir,
        target_backends=["llvm-cpu"],
        input_type=ireec.InputType.TOSA,
    )

    t2_stop = time.perf_counter()

    # Performance summary for this model.
    print(modelname + "****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start))




MobileNetV3****Ufront->TOSA Time: 0.356s, TOSA->Binary Time: 6.222s, Total Time: 6.577s
ShuffleNetV2****Ufront->TOSA Time: 0.374s, TOSA->Binary Time: 3.782s, Total Time: 4.156s
ResNet18****Ufront->TOSA Time: 0.419s, TOSA->Binary Time: 2.739s, Total Time: 3.158s
ResNet50****Ufront->TOSA Time: 0.940s, TOSA->Binary Time: 5.989s, Total Time: 6.929s
SqueezeNet****Ufront->TOSA Time: 0.079s, TOSA->Binary Time: 3.159s, Total Time: 3.239s
DenseNet121****Ufront->TOSA Time: 1.194s, TOSA->Binary Time: 16.150s, Total Time: 17.344s
InceptionV3****Ufront->TOSA Time: 1.130s, TOSA->Binary Time: 7.642s, Total Time: 8.772s
ViT_B16****Ufront->TOSA Time: 2.725s, TOSA->Binary Time: 5.781s, Total Time: 8.506s


# Compile & Run PyTorch BERT Model

In [3]:
# import torch, torchtext
from ufront.pytorch.model import UFrontTorch 
import iree.compiler as ireec
from iree import runtime
from torch_bert import BertModel, BertConfig
import torch
import time
# Target selection: True compiles/loads for CUDA, False for llvm-cpu.
GPU = True

# Sample inputs: three integer tensors of shape (2, 3).
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

config = BertConfig(
    vocab_size_or_config_json_file=16000,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
)

net = BertModel(config=config)
net.eval()

# ---- Stage 1: torch -> UFront -> TOSA IR (timed from t1_start to t1_stop) ----
t1_start = time.perf_counter()

# Wrap the torch model as a UFront model.
# NOTE(review): batch_size=1 although the sample tensors have two rows — confirm
# what UFrontTorch expects here.
model = UFrontTorch(net, batch_size=1, pass_weights=True)

# Calling the wrapper triggers the Rust frontend (model conversion and graph
# building); operators can also be managed from the Python side.
output_tensors = model(inputs=[input_ids, token_type_ids, input_mask])

# Model compilation: Rust computation graph -> unified high-level IR -> TOSA IR.
model.compile(
    optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy', 'sparse_categorical_crossentropy'],
)

print("Compiling TOSA model...")
tosa_ir = model.dump_tosa_ir()
t1_stop = time.perf_counter()

# Size of the textual TOSA IR, in characters.
print(len(tosa_ir))

# ---- Stage 2: TOSA IR -> IREE binary, then load it (timed up to t2_stop) ----
print("Compiling Binary...")

# Pick the compiler target and the matching loader keyword once, then run a
# single compile/load path instead of two duplicated branches.
if GPU:
    target_backend, loader_kwargs = "cuda", {"driver": "cuda"}
else:
    target_backend, loader_kwargs = "llvm-cpu", {"backend": "llvm-cpu"}

binary = ireec.compile_str(
    tosa_ir,
    target_backends=[target_backend],
    input_type=ireec.InputType.TOSA,
)
module = runtime.load_vm_flatbuffer(binary, **loader_kwargs)

t2_stop = time.perf_counter()

# Performance summary.
print("Bert****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start))


Compiling TOSA model...
785802350
Compiling Binary...
Bert****Ufront->TOSA Time: 2.970s, TOSA->Binary Time: 5.258s, Total Time: 8.227s


In [14]:
# Time `module.forward` on the sample inputs, 100 calls per timing run (-n 100).
# `module` is whatever binary the most recently executed compile cell loaded.
%timeit -n 100 module.forward(input_ids, token_type_ids, input_mask)

3.32 ms ± 69.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Compile & Run ONNX BERT Model

In [3]:
# import torch, torchtext
from ufront.pytorch.model import UFrontTorch 
import iree.compiler as ireec
from iree import runtime
from torch_bert import BertModel, BertConfig
import torch
import time
from ufront.onnx.model import ONNXModel, ONNXModelKeras, UFrontONNX
from torch.onnx import TrainingMode
import onnx
import io
# Target selection: True compiles/loads for CUDA, False for llvm-cpu.
GPU = True

# Sample inputs: three integer tensors of shape (2, 3).
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

config = BertConfig(
    vocab_size_or_config_json_file=16000,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
)

net = BertModel(config=config)
net.eval()

# ---- Stage 1: torch -> ONNX -> UFront -> TOSA IR (t1_start .. t1_stop) ----
t1_start = time.perf_counter()

# Export to an in-memory ONNX model.
onnx_buffer = io.BytesIO()
model_name = net.__class__.__name__
# The positional example inputs follow the same order as every other call on
# this model in the notebook: (input_ids, token_type_ids, input_mask). The
# export previously passed the last two tensors swapped.
# NOTE(review): assumes BertModel.forward takes its tensors in that order —
# confirm against torch_bert.
# NOTE(review): for any class other than "Inception3" the export runs with
# TrainingMode.TRAINING even though net.eval() was called above — confirm this
# is intended.
torch.onnx.export(
    model=net,
    args=(input_ids, token_type_ids, input_mask),
    f=onnx_buffer,
    export_params=True,
    training=TrainingMode.EVAL if model_name == "Inception3" else TrainingMode.TRAINING,
    opset_version=17,
)
onnx_model = onnx.load_model_from_string(onnx_buffer.getvalue())

# Wrap the ONNX graph as a UFront model (transformer handling enabled).
# NOTE(review): batch_size=1 although the sample tensors have two rows — confirm
# what UFrontONNX expects here.
model = UFrontONNX(onnx_model=onnx_model, batch_size=1, simplify=True, pass_weights=True, transformer=True)

# Operators can also be managed from the Python side (each operator corresponds
# to an operator in the Rust computation graph).
output_tensors = model(inputs=[input_ids, token_type_ids, input_mask])

# Model compilation: Rust computation graph -> unified high-level IR -> TOSA IR.
model.compile(
    optimizer={"type": "sgd", "lr": "0.01", "momentum": "0", "nesterov": "False", "weight_decay": "0"},
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy', 'sparse_categorical_crossentropy'],
)

print("Compiling TOSA model...")
tosa_ir = model.dump_tosa_ir()
t1_stop = time.perf_counter()

# Size of the textual TOSA IR, in characters.
print(len(tosa_ir))

# ---- Stage 2: TOSA IR -> IREE binary, then load it (timed up to t2_stop) ----
print("Compiling Binary...")

# Pick the compiler target and the matching loader keyword once, then run a
# single compile/load path instead of two duplicated branches.
if GPU:
    target_backend, loader_kwargs = "cuda", {"driver": "cuda"}
else:
    target_backend, loader_kwargs = "llvm-cpu", {"backend": "llvm-cpu"}

binary = ireec.compile_str(
    tosa_ir,
    target_backends=[target_backend],
    input_type=ireec.InputType.TOSA,
)
module = runtime.load_vm_flatbuffer(binary, **loader_kwargs)

t2_stop = time.perf_counter()

# Performance summary.
print("Bert****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start))





Compiling TOSA model...
782623906
Compiling Binary...
Bert****Ufront->TOSA Time: 19.005s, TOSA->Binary Time: 5.031s, Total Time: 24.036s


In [4]:

# Time `module.forward` on the sample inputs, 100 calls per timing run (-n 100).
# `module` is whatever binary the most recently executed compile cell loaded.
%timeit -n 100 module.forward(input_ids, token_type_ids, input_mask)


3.63 ms ± 36.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Compile & Run TF/Keras BERT Model

In [None]:
# pip install bert-for-tf2  (note: its InputSpec issue must be fixed manually when using TF 2.6+)

In [3]:
from bert import BertModelLayer
from tensorflow import keras
import numpy as np
from ufront.keras.model import UFrontKeras
import iree.compiler as ireec
from iree import runtime
import time
# Target selection: True compiles/loads for CUDA, False for llvm-cpu.
GPU = True

# BERT layer hyper-parameters, grouped by the part of the model they configure.
# NOTE(review): token_type_vocab_size equals vocab_size (16000) — confirm this
# is intended.
bert_params = BertModelLayer.Params(
    # -- embedding params --
    vocab_size=16000,
    use_token_type=True,
    use_position_embeddings=True,
    token_type_vocab_size=16000,
    # -- transformer encoder params --
    num_layers=12,
    hidden_size=768,
    hidden_dropout=0.1,
    intermediate_size=4 * 768,
    intermediate_activation="gelu",
    # -- adapter-BERT (see arXiv:1902.00751) --
    adapter_size=None,
    # -- ALBERT (arXiv:1909.11942): shared_layer is True for ALBERT;
    #    embedding_size is None for BERT, wordpiece embedding size for ALBERT --
    shared_layer=False,
    embedding_size=None,
    num_heads=12,
    # any other Keras layer params (e.g. name="bert") could be added here
)
l_bert = BertModelLayer(**bert_params)

# Sample inputs: three int32 arrays of shape (2, 3).
input_ids = np.array([[31, 51, 99], [15, 5, 0]], dtype='int32')
input_mask = np.array([[1, 1, 1], [1, 1, 0]], dtype='int32')
token_type_ids = np.array([[0, 0, 1], [0, 1, 0]], dtype='int32')

# Functional Keras model with two inputs (token ids and token-type ids).
max_seq_len = 3
l_input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32')
l_token_type_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32')

# Layer output is [batch_size, max_seq_len, hidden_size].
output = l_bert([l_input_ids, l_token_type_ids])
net = keras.Model(inputs=[l_input_ids, l_token_type_ids], outputs=output)

# ---- Stage 1: Keras -> UFront -> TOSA IR (timed from t1_start to t1_stop) ----
t1_start = time.perf_counter()

# Build the UFront model from the Keras model and the two sample inputs.
# NOTE(review): batch_size=1 although the sample arrays have two rows — confirm
# what UFrontKeras expects here.
model = UFrontKeras(
    net,
    inputs=[input_ids, token_type_ids],
    batch_size=1,
    transformer=True,
    pass_weights=True,
)

model.compile(
    optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy', 'sparse_categorical_crossentropy'],
)

print("Compiling TOSA model...")
tosa_ir = model.dump_tosa_ir()
t1_stop = time.perf_counter()

# Size of the textual TOSA IR, in characters.
print(len(tosa_ir))

# ---- Stage 2: TOSA IR -> IREE binary, then load it (timed up to t2_stop) ----
print("Compiling Binary...")

# Pick the compiler target and the matching loader keyword once, then run a
# single compile/load path instead of two duplicated branches.
if GPU:
    target_backend, loader_kwargs = "cuda", {"driver": "cuda"}
else:
    target_backend, loader_kwargs = "llvm-cpu", {"backend": "llvm-cpu"}

binary = ireec.compile_str(
    tosa_ir,
    target_backends=[target_backend],
    input_type=ireec.InputType.TOSA,
)
module = runtime.load_vm_flatbuffer(binary, **loader_kwargs)

t2_stop = time.perf_counter()

# Performance summary.
print("Bert****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start))


2023-10-19 18:01:45.466756: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-19 18:01:46.222370: W external/org_tensorflow/tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.1/lib64
2023-10-19 18:01:46.241722: W external/org_tensorflow/tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.1/lib64
2023-10-19 18:01:46.243395: W external/org_tensorflow/tensorflow/stream_executor/platform/

Compiling TOSA model...
777888366
Compiling Binary...
Bert****Ufront->TOSA Time: 28.472s, TOSA->Binary Time: 4.853s, Total Time: 33.325s


In [5]:
# Time `module.forward` with the two inputs the Keras model was built from,
# 100 calls per timing run (-n 100).
%timeit -n 100 module.forward(input_ids, token_type_ids)


3.51 ms ± 26.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
