In [1]:
import pathlib
import time
import torch
from torchvision.models import resnet18, resnet50, squeezenet1_1, regnet_x_32gf, maxvit_t, shufflenet_v2_x1_5, inception_v3, mobilenet_v3_small, efficientnet_v2_s, densenet121, convnext_small
import torchvision.models as models
from ufront.pytorch.model import UFrontTorch
import argparse
import ctypes
# pip install iree-compiler iree-runtime iree-tools-tf -f https://openxla.github.io/iree/pip-release-links.html
from iree.compiler import tools
from iree import runtime
import iree.runtime as ireert
import iree.compiler as ireec
from typing import Optional

import torch
import iree.runtime as ireert
import iree.compiler as ireec

In [5]:
batch_size = 1
input = torch.ones((batch_size, 3, 224, 224), dtype=torch.float32)

model_list = {"MobileNetV3":mobilenet_v3_small(pretrained=False), "ShuffleNetV2":shufflenet_v2_x1_5(pretrained=False),
            "ResNet18":resnet18(pretrained=False), "ResNet50":resnet50(pretrained=False), "SqueezeNet":squeezenet1_1(pretrained=False),
            "DenseNet121":densenet121(pretrained=False), "InceptionV3":inception_v3(pretrained=False), "ViT_B16":models.vision_transformer.vit_b_16(weights=False, dropout=0.1)}

for modelname, net in model_list.items():
    # blockPrint()
    net.train(False) 

    t1_start = time.perf_counter()
    model = UFrontTorch(net, batch_size=batch_size) # convert torch model to ufront model
    #This will trigger Rust frontend for actual model conversion and graph building
    #operators can also be managed by python side (each operator here corresponding to an operator in the Rust computation graph)
    output_tensors = model(inputs = [input])

    #The output of the model (forward pass have not been triggered at the moment!)
    if model.model.__class__.__name__ not in ["MaxVit", "SwinTransformer", "VisionTransformer", "MultiHeadAttention"]:
        output = model.softmax(input=output_tensors[0], name="softmax_out")

    #This will trigger model compilation, i.e., convert Rust computation graph to a unified high-level IR and lower it to TOSA IR
    model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                        loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])
    tosa_ir= model.dump_tosa_ir()

    t1_stop = time.perf_counter()

    # print("Compiling TOSA model...")
    compiled_module = ireec.compile_str(tosa_ir,
                        target_backends=["llvm-cpu"],
                        input_type=ireec.InputType.TOSA)

    t2_stop = time.perf_counter()

    print(modelname + "****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator


MobileNetV3****Ufront->TOSA Time: 0.274s, TOSA->Binary Time: 8.913s, Total Time: 9.187s
ShuffleNetV2****Ufront->TOSA Time: 0.413s, TOSA->Binary Time: 5.769s, Total Time: 6.182s
ResNet18****Ufront->TOSA Time: 1.481s, TOSA->Binary Time: 3.935s, Total Time: 5.416s
ResNet50****Ufront->TOSA Time: 2.993s, TOSA->Binary Time: 7.321s, Total Time: 10.314s
SqueezeNet****Ufront->TOSA Time: 0.172s, TOSA->Binary Time: 3.415s, Total Time: 3.587s
DenseNet121****Ufront->TOSA Time: 1.051s, TOSA->Binary Time: 23.630s, Total Time: 24.681s
InceptionV3****Ufront->TOSA Time: 2.862s, TOSA->Binary Time: 10.515s, Total Time: 13.377s
ViT_B16****Ufront->TOSA Time: 6.293s, TOSA->Binary Time: 5.702s, Total Time: 11.996s


In [2]:
# import torch, torchtext
from ufront.pytorch.model import UFrontTorch 
import iree.compiler as ireec
from iree import runtime
from torch_bert import BertModel, BertConfig
import torch
import time
GPU = True
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

net = BertModel(config=config)
net.eval()

t1_start = time.perf_counter()
model = UFrontTorch(net, batch_size=1, pass_weights=True) # convert torch model to ufront model
#This will trigger Rust frontend for actual model conversion and graph building
#operators can also be managed by python side (each operator here corresponding to an operator in the Rust computation graph)
output_tensors = model(inputs = [input_ids, token_type_ids, input_mask])

#This will trigger model compilation, i.e., convert Rust computation graph to a unified high-level IR and lower it to TOSA IR
model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                    loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])

# modelir = model.dump_ir()

# print(modelir)

# import pathlib
# path = str(pathlib.Path(__file__).parent.resolve()) + "/output_ir/torch_" + model.model.__class__.__name__ + ".ir"
# path = str(pathlib.Path(__file__).parent.resolve()) + "/output_ir/torch_Resnet18.ir"

# f = open(path, "w")
# f.write(modelir)
# f.close()

# print("\r\n\r\nIR for ", model.model.__class__.__name__, " generated: ", path)

print("Compiling TOSA model...")
tosa_ir= model.dump_tosa_ir()
t1_stop = time.perf_counter()

print(len(tosa_ir))

print("Compiling Binary...")

if GPU:
    binary = ireec.compile_str(tosa_ir,
                    target_backends=["cuda"], 
                    input_type=ireec.InputType.TOSA)
    module = runtime.load_vm_flatbuffer(binary, driver="cuda")
else:
    binary = ireec.compile_str(tosa_ir,
                    target_backends=["llvm-cpu"], 
                    input_type=ireec.InputType.TOSA)
    module = runtime.load_vm_flatbuffer(binary,backend="llvm-cpu") 

t2_stop = time.perf_counter()

print("Bert****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator


Compiling TOSA model...
884101340
Compiling Binary...
Bert****Ufront->TOSA Time: 3.426s, TOSA->Binary Time: 95.340s, Total Time: 98.765s


In [3]:

%timeit -n 100 module.forward(input_ids, token_type_ids, input_mask)

8.22 ms ± 18.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [4]:
# import torch, torchtext
from ufront.pytorch.model import UFrontTorch 
import iree.compiler as ireec
from iree import runtime
from bert import BertModel, BertConfig
import torch
import time
from ufront.onnx.model import ONNXModel, ONNXModelKeras, UFrontONNX
from torch.onnx import TrainingMode
import onnx
import io
GPU = True
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

net = BertModel(config=config)
net.eval()
t1_start = time.perf_counter()

f = io.BytesIO()
model_name = net.__class__.__name__ 
torch.onnx.export(model=net, args=(input_ids, input_mask, token_type_ids), f=f, export_params=True, #do_constant_folding=True,
                    training=TrainingMode.EVAL if model_name=="Inception3" else TrainingMode.TRAINING, opset_version=17)
onnx_model = onnx.load_model_from_string(f.getvalue())

# transformer = True if model_name in ["MaxVit", "SwinTransformer", "VisionTransformer", "MultiHeadAttention"] else False
model = UFrontONNX(onnx_model=onnx_model, batch_size=1, simplify=True, pass_weights=True, transformer=True)


#operators can also be managed by python side (each operator here corresponding to an operator in the Rust computation graph)
output_tensors = model(inputs = [input_ids, token_type_ids, input_mask])

#This will trigger model compilation, i.e., convert Rust computation graph to a unified high-level IR and lower it to TOSA IR
model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                    loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])

modelir = model.dump_ir()


print("Compiling TOSA model...")
tosa_ir= model.dump_tosa_ir()
t1_stop = time.perf_counter()

print(len(tosa_ir))


print("Compiling Binary...")

if GPU:
    binary = ireec.compile_str(tosa_ir,
                    target_backends=["cuda"], 
                    input_type=ireec.InputType.TOSA)
    module = runtime.load_vm_flatbuffer(binary, driver="cuda")
else:
    binary = ireec.compile_str(tosa_ir,
                    target_backends=["llvm-cpu"], 
                    input_type=ireec.InputType.TOSA)
    module = runtime.load_vm_flatbuffer(binary,backend="llvm-cpu") 

t2_stop = time.perf_counter()

print("Bert****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator



Compiling TOSA model...
880918944
Compiling Binary...
Bert****Ufront->TOSA Time: 21.993s, TOSA->Binary Time: 5.739s, Total Time: 27.732s


In [5]:

%timeit -n 100 module.forward(input_ids, token_type_ids, input_mask)


8.22 ms ± 50.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# pip install bert-for-tf2 (need to fix some issue under TF 2.6+)

In [7]:
from bert import BertModelLayer
from tensorflow import keras
import numpy as np
from ufront.keras.model import UFrontKeras

l_bert = BertModelLayer(**BertModelLayer.Params(
  vocab_size               = 16000,        # embedding params
  use_token_type           = True,
  use_position_embeddings  = True,
  token_type_vocab_size    = 2,

  num_layers               = 12,           # transformer encoder params
  hidden_size              = 768,
  hidden_dropout           = 0.1,
  intermediate_size        = 4*768,
  intermediate_activation  = "gelu",

  adapter_size             = None,         # see arXiv:1902.00751 (adapter-BERT)

  shared_layer             = False,        # True for ALBERT (arXiv:1909.11942)
  embedding_size           = None,         # None for BERT, wordpiece embedding size for ALBERT
  num_heads = 12,
  # name                     = "bert"        # any other Keras layer params
))

input_ids = np.array([[31, 51, 99], [15, 5, 0]], dtype='int64')
input_mask = np.array([[1, 1, 1], [1, 1, 0]], dtype='int64')
token_type_ids = np.array([[0, 0, 1], [0, 1, 0]], dtype='int64')

max_seq_len = 128
l_input_ids      = keras.layers.Input(shape=(max_seq_len,), dtype='int64')
l_token_type_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int64')

output = l_bert([l_input_ids, l_token_type_ids])          # [batch_size, max_seq_len, hidden_size]
net = keras.Model(inputs=[l_input_ids, l_token_type_ids], outputs=output)

#build UFront model
ufront_model = UFrontKeras(net, inputs = [input_ids, token_type_ids], batch_size = 1, transformer=True, pass_weights=True)


ufront_model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                        loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])