In [9]:
!pip install iree-compiler==20230524.529 iree-runtime==20230524.529 -f https://github.com/iree-org/iree/releases/tag/candidate-20230512.517

Looking in links: https://github.com/iree-org/iree/releases/tag/candidate-20230512.517
Collecting iree-compiler==20230524.529
  Downloading iree_compiler-20230524.529-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (577 bytes)
Collecting iree-runtime==20230524.529
  Downloading iree_runtime-20230524.529-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting PyYAML (from iree-compiler==20230524.529)
  Downloading PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Downloading iree_compiler-20230524.529-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (55.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 MB[0m [31m827.9 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hDownloading iree_runtime-20230524.529-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m8

In [None]:
# Download and install torch-mlir from https://github.com/llvm/torch-mlir/releases/tag/snapshot-20230525.849

# !pip install torch-mlir==20230525.849 -f https://github.com/llvm/torch-mlir/releases/download/snapshot-20230525.849/torch_mlir-20230525.849-cp310-cp310-linux_x86_64.whl --no-dependencies

# PyTorch Model (UFront)

In [1]:
import pathlib
import time
import torch
from torchvision.models import resnet18, resnet50, squeezenet1_1, regnet_x_32gf, maxvit_t, shufflenet_v2_x1_5, inception_v3, mobilenet_v3_small, efficientnet_v2_s, densenet121, convnext_small
import torchvision.models as models
from ufront.pytorch.model import UFrontTorch
import argparse
import ctypes
from iree.compiler import tools
from iree import runtime
import iree.runtime as ireert
import iree.compiler as ireec
from typing import Optional
import numpy as np
import torch
import iree.runtime as ireert
import iree.compiler as ireec

  warn(


Some of the onnx models requires onnxsim library, please install onnxsim before usage!


In [7]:
# !pip install torchvision==0.16.0 --no-dependencies
# !pip install torch==2.1.0 --no-dependencies

In [7]:
!pip install /root/ufront-0.1.1-cp310-cp310-manylinux_2_28_x86_64.whl

Processing /root/ufront-0.1.1-cp310-cp310-manylinux_2_28_x86_64.whl
Installing collected packages: ufront
Successfully installed ufront-0.1.1
[0m

In [16]:
# Benchmark the UFront pipeline on a set of torchvision models: convert each model to
# TOSA IR, compile it with IREE for CUDA, then report compile times and forward latency.
batch_size = 1
import numpy as np
from iree.runtime.benchmark import benchmark_module

NUM_BENCHMARK_RUNS = 10
# Multipliers converting a reported benchmark time to milliseconds, keyed by unit suffix.
# The reported time is assumed to look like "<value> <unit>" (e.g. "1.35 ms").
MS_PER_UNIT = {"ns": 1e-6, "us": 1e-3, "ms": 1.0, "s": 1e3}
# Input spec handed to the benchmark tool; its leading dimension follows `batch_size`
# instead of being hard-coded to 1.
benchmark_input = "{}x3x224x224xf32=1".format(batch_size)

input_sample = np.random.uniform(low=0.0, high=1.0, size=(batch_size, 3, 224, 224)).astype(np.float32)
# NOTE: `input` shadows the builtin; the name is kept in case other cells read it.
input = torch.Tensor(input_sample)

# Randomly initialised models: `weights=None` replaces the deprecated `pretrained=False`
# (and the non-canonical `weights=False`) without changing which weights are loaded.
model_list = {"MobileNetV3":mobilenet_v3_small(weights=None), "ShuffleNetV2":shufflenet_v2_x1_5(weights=None),
            "ResNet18":resnet18(weights=None), "ResNet50":resnet50(weights=None), "SqueezeNet":squeezenet1_1(weights=None),
            "DenseNet121":densenet121(weights=None), "InceptionV3":inception_v3(weights=None), "ViT_B16":models.vision_transformer.vit_b_16(weights=None, dropout=0.1)}

for modelname, net in model_list.items():
    net.train(False)  # inference mode

    t1_start = time.perf_counter()
    model = UFrontTorch(net, batch_size=batch_size, pass_weights=True) # convert torch model to ufront model
    #This will trigger Rust frontend for actual model conversion and graph building
    #operators can also be managed by python side (each operator here corresponding to an operator in the Rust computation graph)
    output_tensors = model(inputs = [input])

    #This will trigger model compilation, i.e., convert Rust computation graph to a unified high-level IR and lower it to TOSA IR
    model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                        loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])
    tosa_ir= model.dump_tosa_ir()

    t1_stop = time.perf_counter()

    binary = ireec.compile_str(tosa_ir,
                    target_backends=["cuda"], 
                    input_type=ireec.InputType.TOSA)
    t2_stop = time.perf_counter()

    print(modelname + "****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator
    module = runtime.load_vm_flatbuffer(binary, driver="cuda")
    tms = []
    for _ in range(NUM_BENCHMARK_RUNS):
        # `entry_functiong` (sic) is the keyword spelling used by the run that produced the
        # recorded results — presumably this IREE release's parameter name; confirm before renaming.
        ret = benchmark_module(module.vm_module, entry_functiong="forward", inputs=[benchmark_input], device="cuda")
        # Split value and unit rather than slicing off a fixed-width suffix, so a time
        # reported in another unit is converted (or fails loudly) instead of being misread.
        value, unit = ret[0].time.split()
        tms.append(float(value) * MS_PER_UNIT[unit])
    print("{} - {:.3f} ± {:.3f} ms".format(modelname, np.mean(tms), np.std(tms)))


MobileNetV3****Ufront->TOSA Time: 0.240s, TOSA->Binary Time: 2.635s, Total Time: 2.875s
MobileNetV3 - 1.350 ± 0.000 ms
ShuffleNetV2****Ufront->TOSA Time: 0.345s, TOSA->Binary Time: 1.923s, Total Time: 2.268s
ShuffleNetV2 - 2.420 ± 0.000 ms
ResNet18****Ufront->TOSA Time: 0.362s, TOSA->Binary Time: 1.696s, Total Time: 2.059s
ResNet18 - 2.891 ± 0.008 ms
ResNet50****Ufront->TOSA Time: 0.777s, TOSA->Binary Time: 3.141s, Total Time: 3.918s
ResNet50 - 6.039 ± 0.028 ms
SqueezeNet****Ufront->TOSA Time: 0.064s, TOSA->Binary Time: 1.300s, Total Time: 1.364s
SqueezeNet - 1.108 ± 0.004 ms
DenseNet121****Ufront->TOSA Time: 1.203s, TOSA->Binary Time: 5.280s, Total Time: 6.483s
DenseNet121 - 7.640 ± 0.004 ms
InceptionV3****Ufront->TOSA Time: 0.957s, TOSA->Binary Time: 4.099s, Total Time: 5.056s
InceptionV3 - 12.080 ± 0.040 ms
ViT_B16****Ufront->TOSA Time: 2.399s, TOSA->Binary Time: 4.911s, Total Time: 7.311s
ViT_B16 - 29.400 ± 0.089 ms


In [17]:
# !pip install boto3

In [18]:
# !pip install tqdm

In [6]:
from ufront.pytorch.model import UFrontTorch 
import iree.compiler as ireec
from iree import runtime
from torch_bert import BertModel, BertConfig
import torch
import time
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

config = BertConfig(vocab_size_or_config_json_file=16000, hidden_size=768,
    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

net = BertModel(config=config)
net.eval()

t1_start = time.perf_counter()
model = UFrontTorch(net, batch_size=1, pass_weights=True) # convert torch model to ufront model
#This will trigger Rust frontend for actual model conversion and graph building
#operators can also be managed by python side (each operator here corresponding to an operator in the Rust computation graph)
output_tensors = model(inputs = [input_ids, token_type_ids, input_mask])

#This will trigger model compilation, i.e., convert Rust computation graph to a unified high-level IR and lower it to TOSA IR
model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                    loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])

print("Compiling TOSA model...")
tosa_ir= model.dump_tosa_ir()
t1_stop = time.perf_counter()
print("Compiling Binary...")
binary = ireec.compile_str(tosa_ir,
                target_backends=["cuda"], 
                input_type=ireec.InputType.TOSA)
t2_stop = time.perf_counter()
print("Bert****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator
module = runtime.load_vm_flatbuffer(binary, driver="cuda")

%timeit -n 100 module.forward(input_ids, token_type_ids, input_mask)


Compiling TOSA model...
Compiling Binary...
Bert****Ufront->TOSA Time: 2.848s, TOSA->Binary Time: 4.363s, Total Time: 7.212s
3.4 ms ± 20.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
!pip list | grep iree

iree-compiler            20230524.529
iree-runtime             20230524.529


In [10]:
!pip list | grep torch

torch                    2.1.0.dev20230522+cpu
torch-mlir               20230523.847
torchvision              0.16.0


## PyTorch Model (Torch-MLIR)

In [11]:
import torch
import io
import numpy as np
import time
import torch_mlir
from torchvision.models import resnet18, resnet50, squeezenet1_1, regnet_x_32gf, maxvit_t, shufflenet_v2_x1_5, inception_v3, mobilenet_v3_small, efficientnet_v2_s, densenet121, convnext_small
import torchvision.models as models
from iree import runtime
from typing import Optional
from torch.utils._pytree import tree_map
import iree.runtime as ireert
import iree.compiler as ireec

  warn(


In [12]:
class IREEInvoker:
    """A wrapper around an IREE module that provides a Pythonic interface.
    
    Specifically, this adapts `module.forward(...)` and similar calls into
    lower-level calls into the functions in the IREE module, and also converts
    between the IREE and Torch types.

    Any attribute that is not found by normal lookup is treated as the name of a
    function in the wrapped module, except for dunder names and the wrapper's own
    state attributes, which raise `AttributeError` as usual.
    """

    # Instance attributes set in __init__; never resolved as module function names.
    _OWN_ATTRIBUTES = ("_iree_module", "device")

    def __init__(self, iree_module):
        self._iree_module = iree_module
        self.device = iree_module._context.config.device

    def __getattr__(self, function_name: str):
        """Return a callable that invokes `function_name` in the wrapped module.

        Raises:
            AttributeError: for dunder names and for the wrapper's own state
                attributes when they are not set yet.
        """
        # __getattr__ only runs after normal lookup failed. Protocol probes such as
        # `hasattr(obj, "__setstate__")` (used by `copy`) must see a missing attribute,
        # otherwise they receive a bogus callable and fail when they call it.
        is_dunder = function_name.startswith("__") and function_name.endswith("__")
        if is_dunder or function_name in self._OWN_ATTRIBUTES:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {function_name!r}")

        def invoke(*args):
            def wrap(x):
                # Torch tensors are moved into IREE device arrays; other values pass through.
                if isinstance(x, torch.Tensor):
                    return ireert.asdevicearray(self.device, x)
                return x
            def unwrap(x):
                # IREE device arrays are copied back into Torch tensors; other values pass through.
                if isinstance(x, ireert.DeviceArray):
                    return torch.from_numpy(np.asarray(x).copy())
                return x
            iree_args = tree_map(wrap, args)
            result = self._iree_module[function_name](*iree_args)
            return tree_map(unwrap, result)
        return invoke
    
def _map_target_backend_to_driver(target_backend):
    if target_backend == "cuda":
        return "cuda"
    if target_backend == "vulkan":
        return "vulkan"
    if target_backend in ("llvm-cpu", "vmvx"):
        return "local-sync"
    raise ValueError(f"Unknown target backend: {target_backend}")

def load_vmfb(flatbuffer, backend="llvm-cpu"):
    """Load a compiled IREE flatbuffer and wrap it for use with `torch.Tensor` values.

    Args:
        flatbuffer: the compiled module bytes to load.
        backend: compiler target backend the flatbuffer was built for; it selects
            the runtime driver.

    Returns:
        An `IREEInvoker` around the loaded module.
    """
    driver_name = _map_target_backend_to_driver(backend)
    context = ireert.SystemContext(config=ireert.Config(driver_name=driver_name))
    # Register the module with the context so it is reachable via `context.modules`.
    loaded_module = ireert.VmModule.from_flatbuffer(context.instance, flatbuffer)
    context.add_vm_module(loaded_module)
    return IREEInvoker(context.modules.module)

def compile_to_vmfb(mlir_module, target_backend="llvm-cpu", 
                    cuda_llvm_target_arch: Optional[str] = None):
    """Compile an MLIR module into an IREE flatbuffer.

    The module is expected to be in the format produced by `torch_mlir.compile`
    with `OutputType.LINALG_ON_TENSORS`.
    TODO: Expose more compiler options.

    Args:
        mlir_module: module whose operation is serialized to bytecode and compiled.
        target_backend: IREE target backend to compile for.
        cuda_llvm_target_arch: when given, forwarded to the compiler as
            `--iree-hal-cuda-llvm-target-arch`.

    Returns:
        Whatever `ireec.compile_str` returns for the serialized module.
    """
    # Serialize the module to bytecode in memory.
    buffer = io.BytesIO()
    mlir_module.operation.write_bytecode(buffer)
    serialized = buffer.getvalue()

    # The architecture flag is only passed when the caller asked for one.
    extra_args = (
        []
        if cuda_llvm_target_arch is None
        else [f"--iree-hal-cuda-llvm-target-arch={cuda_llvm_target_arch}"]
    )

    return ireec.compile_str(
        serialized,
        target_backends=[target_backend],
        input_type=ireec.InputType.TM_TENSOR,
        extra_args=extra_args,
    )

In [15]:
# Benchmark the Torch-MLIR pipeline on the same torchvision models: script each model,
# lower it to linalg-on-tensors, compile it with IREE for CUDA, then report compile time
# and forward latency. Models that fail to lower are reported and skipped.
from iree.runtime.benchmark import benchmark_module

# Defined here as well so this section does not depend on a cell from another section.
batch_size = 1
NUM_BENCHMARK_RUNS = 10
# Multipliers converting a reported benchmark time to milliseconds, keyed by unit suffix.
# The reported time is assumed to look like "<value> <unit>" (e.g. "1.35 ms").
MS_PER_UNIT = {"ns": 1e-6, "us": 1e-3, "ms": 1.0, "s": 1e3}
# Input spec handed to the benchmark tool; its leading dimension follows `batch_size`
# instead of being hard-coded to 1.
benchmark_input = "{}x3x224x224xf32=1".format(batch_size)

input_sample = np.random.uniform(low=0.0, high=1.0, size=(batch_size, 3, 224, 224)).astype(np.float32)
# NOTE: `input` shadows the builtin; the name is kept in case other cells read it.
input = torch.Tensor(input_sample)
# Randomly initialised models: `weights=None` replaces the deprecated `pretrained=False`
# (and the non-canonical `weights=False`) without changing which weights are loaded.
model_list = {"MobileNetV3":mobilenet_v3_small(weights=None), "ShuffleNetV2":shufflenet_v2_x1_5(weights=None),
            "ResNet18":resnet18(weights=None), "ResNet50":resnet50(weights=None), "SqueezeNet":squeezenet1_1(weights=None),
            "DenseNet121":densenet121(weights=None), "InceptionV3":inception_v3(weights=None), "ViT_B16":models.vision_transformer.vit_b_16(weights=None)}

for modelname, model in model_list.items():
    print("\r\n**********Processing model " + modelname)
    try: 
        model.train(mode=False)  # inference mode
        t1_start = time.perf_counter()
        
        ts_graph = torch.jit.script(model)
        module_ir = torch_mlir.compile(ts_graph, input,
                                            output_type=torch_mlir.OutputType.LINALG_ON_TENSORS)

        binary = compile_to_vmfb(module_ir, target_backend="cuda")

        t2_stop = time.perf_counter()
        module = runtime.load_vm_flatbuffer(binary, driver="cuda")

        print(modelname + "****Compilation Time: {:.3f}s".format(t2_stop - t1_start)) # print performance indicator

        print("Calculating forward latency:\n  ", end="")
        tms = []
        for _ in range(NUM_BENCHMARK_RUNS):
            # `entry_functiong` (sic) is the keyword spelling used by the run that produced the
            # recorded results — presumably this IREE release's parameter name; confirm before renaming.
            ret = benchmark_module(module.vm_module, entry_functiong="forward", inputs=[benchmark_input], device="cuda")
            # Split value and unit rather than slicing off a fixed-width suffix, so a time
            # reported in another unit is converted (or fails loudly) instead of being misread.
            value, unit = ret[0].time.split()
            tms.append(float(value) * MS_PER_UNIT[unit])
        print("{} - {:.3f} ± {:.3f} ms".format(modelname, np.mean(tms), np.std(tms)))
    except Exception as e:
        # Deliberate best-effort: a model that fails to lower/compile must not stop the sweep.
        print(str(e)[:100]) #only print error head


**********Processing model MobileNetV3
MobileNetV3****Compilation Time: 3.508s
Calculating forward latency:
  MobileNetV3 - 1.628 ± 0.004 ms

**********Processing model ShuffleNetV2
Lowering TorchScript IR -> Torch Backend IR failed with the following diagnostics:


python exceptio

**********Processing model ResNet18
ResNet18****Compilation Time: 1.651s
Calculating forward latency:
  ResNet18 - 6.510 ± 0.021 ms

**********Processing model ResNet50
ResNet50****Compilation Time: 3.494s
Calculating forward latency:
  ResNet50 - 15.080 ± 0.040 ms

**********Processing model SqueezeNet
SqueezeNet****Compilation Time: 1.657s
Calculating forward latency:
  SqueezeNet - 1.650 ± 0.000 ms

**********Processing model DenseNet121
Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR failed with the following diagnostics:


p

**********Processing model InceptionV3
Lowering TorchScript IR -> Torch Backend IR failed with the following diagnostics:


python exceptio

**********Processing model 