In [1]:
# Print the GPUs visible on this machine together with the driver and CUDA versions.
!nvidia-smi

Wed May 22 14:49:51 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB          Off | 00000000:3D:00.0 Off |                    0 |
| N/A   30C    P0              35W / 250W |   1185MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCIE-40GB          Off | 00000000:3E:00.0 Off |  

# Install dependencies

#### Install IREE

In [2]:
# For CUDA 11
# !pip install iree-compiler==20230524.529 iree-runtime==20230524.529 
# !pip install iree-tools-tf==20230524.529  iree-tools-tflite==20230524.529

# For CUDA 12

!pip install iree-compiler==20230815.614 iree-runtime==20230815.614
!pip install iree-tools-tf==20230815.614  iree-tools-tflite==20230815.614
# fix issue of iree-benchmark-module for iree-compiler (v20230815.614), depend on the installation of IREE package
# ls /opt/conda/lib/python3.10/site-packages/iree/_runtime_libs/
# cp /opt/conda/lib/python3.10/site-packages/iree/_runtime_libs/iree-benchmark-module /opt/conda/lib/python3.10/site-packages/iree/runtime/


[0m

In [3]:
# List the installed IREE packages so their versions can be checked.
!pip list | grep iree

iree-compiler                20230815.614
iree-runtime                 20230815.614
iree-tools-tf                20230815.614
iree-tools-tflite            20230815.614


#### Install Torch-MLIR

In [None]:
# https://github.com/llvm/torch-mlir/releases/tag/snapshot-20230525.849 download and install torch-mlir and corresponding torch-cpu
# !pip install torch-mlir==20230525.849 -f https://github.com/llvm/torch-mlir/releases/download/snapshot-20230525.849/torch_mlir-20230525.849-cp310-cp310-linux_x86_64.whl --no-dependencies
# !pip install torch==2.1.0.dev20230523 -f https://github.com/llvm/torch-mlir/releases/download/snapshot-20230525.849/torch-2.1.0.dev20230523+cpu-cp310-cp310-linux_x86_64.whl --no-dependencies
# !pip install torchvision==0.16.0 --no-dependencies


In [4]:
# torch-mlir requires a matching torch (dev) build; list the installed torch
# packages so the versions can be checked.
!pip list | grep torch

torch                        2.1.0.dev20230523+cpu
torch-mlir                   20230525.849
torchaudio                   2.1.0
torchelastic                 0.2.2
torchvision                  0.16.0


# Torch-MLIR Test

In [5]:
import torch
import io
import numpy as np
import time
import torch_mlir
from torchvision.models import resnet18, resnet50, squeezenet1_1, regnet_x_32gf, maxvit_t, shufflenet_v2_x1_5, inception_v3, mobilenet_v3_small, efficientnet_v2_s, densenet121, convnext_small
import torchvision.models as models
from iree import runtime
from typing import Optional
from torch.utils._pytree import tree_map
import iree.runtime as ireert
import iree.compiler as ireec

  warn(


In [6]:
class IREEInvoker:
    """A wrapper around an IREE module that provides a Pythonic interface.

    Specifically, this adapts `module.forward(...)` and similar calls into
    lower-level calls into the functions in the IREE module, and also converts
    between the IREE and Torch types.

    Any attribute that normal lookup does not find is treated as the name of
    a function in the wrapped module, with one exception: dunder names
    (``__x__``) raise ``AttributeError`` so that Python protocols which probe
    for optional hooks (``copy``, ``pickle``, ``hasattr``) keep working.
    """

    def __init__(self, iree_module):
        """Store the module and the device taken from its context config."""
        self._iree_module = iree_module
        self.device = iree_module._context.config.device

    def __getattr__(self, function_name: str):
        """Return a callable that invokes `function_name` on the IREE module.

        Raises:
            AttributeError: if `function_name` is a dunder name.
        """
        # __getattr__ only runs after normal lookup has failed. Without this
        # guard every probe such as `hasattr(obj, "__setstate__")` would get a
        # callable back, and e.g. copy.copy() would then call it as if it were
        # a real hook.
        if function_name.startswith("__") and function_name.endswith("__"):
            raise AttributeError(function_name)

        def invoke(*args):
            def wrap(x):
                # Torch tensors are handed to IREE as device arrays; anything
                # else is passed through unchanged.
                if isinstance(x, torch.Tensor):
                    return ireert.asdevicearray(self.device, x)
                return x
            def unwrap(x):
                # Device arrays come back as torch tensors built from a copy
                # of the array data; anything else is passed through.
                if isinstance(x, ireert.DeviceArray):
                    return torch.from_numpy(np.asarray(x).copy())
                return x
            # tree_map applies the conversion to every leaf of the (possibly
            # nested) argument / result structure.
            iree_args = tree_map(wrap, args)
            result = self._iree_module[function_name](*iree_args)
            return tree_map(unwrap, result)
        return invoke
    
def _map_target_backend_to_driver(target_backend):
    if target_backend == "cuda":
        return "cuda"
    if target_backend == "vulkan":
        return "vulkan"
    if target_backend in ("llvm-cpu", "vmvx"):
        return "local-sync"
    raise ValueError(f"Unknown target backend: {target_backend}")

def load_vmfb(flatbuffer, backend="llvm-cpu"):
    """Load an IREE Flatbuffer and return it wrapped in an `IREEInvoker`.

    The returned wrapper accepts and returns `torch.Tensor` types.

    Args:
        flatbuffer: the compiled IREE Flatbuffer to load.
        backend: compiler target backend the Flatbuffer was built for; it is
            translated to a runtime driver name.
    """
    driver = _map_target_backend_to_driver(backend)
    context = ireert.SystemContext(config=ireert.Config(driver_name=driver))
    # Register the module with the context before looking it up by name.
    context.add_vm_module(
        ireert.VmModule.from_flatbuffer(context.instance, flatbuffer))
    return IREEInvoker(context.modules.module)

def compile_to_vmfb(mlir_module, target_backend="llvm-cpu", 
                    cuda_llvm_target_arch: Optional[str] = None):
    """Compile an MLIR module into an IREE Flatbuffer.

    The module is expected to be in the format produced by
    `torch_mlir.compile` with `OutputType.LINALG_ON_TENSORS`.

    Args:
        mlir_module: module whose bytecode is handed to the IREE compiler.
        target_backend: name of the single IREE target backend to compile for.
        cuda_llvm_target_arch: when not None, forwarded to the compiler as
            `--iree-hal-cuda-llvm-target-arch=<value>`.

    Returns:
        Whatever `ireec.compile_str` returns for the module.

    TODO: Expose more compiler options.
    """
    if cuda_llvm_target_arch is None:
        extra_args = []
    else:
        extra_args = [
            f"--iree-hal-cuda-llvm-target-arch={cuda_llvm_target_arch}"
        ]

    # Serialise the module to MLIR bytecode in memory.
    buffer = io.BytesIO()
    mlir_module.operation.write_bytecode(buffer)

    return ireec.compile_str(
        buffer.getvalue(),
        target_backends=[target_backend],
        input_type=ireec.InputType.TM_TENSOR,
        extra_args=extra_args,
    )

In [17]:
# !pip install boto3

In [18]:
# !pip install tqdm

#### Vision Models

In [7]:
# Benchmark torchvision models through the Torch-MLIR -> IREE (CUDA) path.
from benchmark import benchmark_module
batch_size = 1
input_sample = np.random.uniform(low=0.0, high=1.0, size=(batch_size, 3, 224, 224)).astype(np.float32)
input = torch.Tensor(input_sample)
# Input spec handed to the benchmark tool. It is built from batch_size so it
# cannot drift from the example input used for compilation.
benchmark_input = "{}x3x224x224xf32=1".format(batch_size)
# `weights=None` builds the models without pretrained weights. The legacy
# `pretrained=False` / `weights=False` spellings are deprecated in torchvision.
model_list = {"MobileNetV3":mobilenet_v3_small(weights=None), "ShuffleNetV2":shufflenet_v2_x1_5(weights=None),
            "ResNet18":resnet18(weights=None), "ResNet50":resnet50(weights=None), "SqueezeNet":squeezenet1_1(weights=None),
            "DenseNet121":densenet121(weights=None), "InceptionV3":inception_v3(weights=None), "ViT_B16":models.vision_transformer.vit_b_16(weights=None)}

for modelname, model in model_list.items():
    print("\r\n**********Processing model " + modelname)
    try: 
        model.train(mode=False)
        t1_start = time.perf_counter()
        
        # TorchScript -> linalg-on-tensors MLIR -> IREE binary. This span is
        # what is reported as compilation time.
        ts_graph = torch.jit.script(model)
        module_ir = torch_mlir.compile(ts_graph, input,
                                            output_type=torch_mlir.OutputType.LINALG_ON_TENSORS)

        binary = compile_to_vmfb(module_ir, target_backend="cuda")

        t2_stop = time.perf_counter()
        # Loading the binary is deliberately outside the timed span.
        module = runtime.load_vm_flatbuffer(binary, driver="cuda")

        print(modelname + "****Compilation Time: {:.3f}s".format(t2_stop - t1_start)) # print performance indicator

        print("Calculating forward latency:\n  ", end="")
        tms = []
        for i in range(10):
            ret = benchmark_module(module.vm_module, entry_function="forward", inputs=[benchmark_input], device="cuda")
            tm = ret[0].time
            # NOTE(review): assumes the time string ends in a 3-character unit
            # suffix such as " ms" - confirm against the local `benchmark` module.
            tms.append(float(tm[0:-3]))
        print("{} - {:.3f} ± {:.3f} ms".format(modelname, np.mean(tms), np.std(tms)))
    except Exception as e:
        # Best effort: a model that fails to lower must not stop the sweep.
        print(str(e)[:100]) #print error head




**********Processing model MobileNetV3
MobileNetV3****Compilation Time: 8.148s
Calculating forward latency:
  MobileNetV3 - 1.439 ± 0.005 ms

**********Processing model ShuffleNetV2
Lowering TorchScript IR -> Torch Backend IR failed with the following diagnostics:


python exceptio

**********Processing model ResNet18
ResNet18****Compilation Time: 3.575s
Calculating forward latency:
  ResNet18 - 3.691 ± 0.003 ms

**********Processing model ResNet50
ResNet50****Compilation Time: 6.779s
Calculating forward latency:
  ResNet50 - 11.130 ± 0.046 ms

**********Processing model SqueezeNet
SqueezeNet****Compilation Time: 3.450s
Calculating forward latency:
  SqueezeNet - 1.228 ± 0.004 ms

**********Processing model DenseNet121
Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR failed with the following diagnostics:


p

**********Processing model InceptionV3




Lowering TorchScript IR -> Torch Backend IR failed with the following diagnostics:


python exceptio

**********Processing model ViT_B16
Lowering TorchScript IR -> Torch Backend IR failed with the following diagnostics:


python exceptio


#### Language Model (Bert)

In [8]:
import iree.compiler as ireec
from iree import runtime
from torch_bert import BertModel, BertConfig
import torch
import time
modelname = "Bert"
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

config = BertConfig(vocab_size_or_config_json_file=16000, hidden_size=768,
    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

model = BertModel(config=config)
model.eval()

ts_graph = torch.jit.script(model)
module_ir = torch_mlir.compile(ts_graph, input_ids,
                                    output_type=torch_mlir.OutputType.LINALG_ON_TENSORS)

binary = compile_to_vmfb(module_ir, target_backend="llvm-cpu")
compiled_model = runtime.load_vm_flatbuffer(binary,backend="llvm-cpu")
        
print("Performing benchmark...")

t1_stop = time.perf_counter()
print("**** Model {} - Total Time: {:.3f}s".format(modelname, t1_stop - t1_start)) # print performance indicator

%timeit -n 100 compiled_model.forward(x_train)

RuntimeError: 
Arguments for call are not valid.
The following variants are available:
  
  aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor:
  Expected a value of type 'Optional[Device]' for argument 'device' but instead found type 'Tensor (inferred)'.
  Inferred the value for argument 'device' to be of type 'Tensor' because it was not annotated with an explicit type.
  
  aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor:
  Argument end not provided.
  
  aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor:
  Argument end not provided.
  
  aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!):
  Argument end not provided.
  
  aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!):
  Argument out not provided.

The original call is:
  File "/root/ufront-test/torch_bert.py", line 221
    def forward(self, x, dtype, device):
        return torch.arange(x, dtype=dtype, device=device)
               ~~~~~~~~~~~~ <--- HERE


#### RNN/LSTM

In [9]:
import time
from torch_def import *
modelname = "LSTM"
batch_size = 8
hidden_size = 128
seq_size = 32
input_size = 256
try: 
    t1_start = time.perf_counter()

    input = np.random.randn(batch_size, seq_size,hidden_size).astype(np.float32)
    h0 = np.zeros((batch_size, hidden_size), dtype=np.float32)
    c0 = np.zeros((batch_size, hidden_size), dtype=np.float32)

    input, h0, c0 = torch.Tensor(input), torch.Tensor(h0), torch.Tensor(c0)
    model = SimpleLSTM(input_size = 10, hidden_size = hidden_size, seq_size=seq_size)

    model.train(mode=False)
    ts_graph = torch.jit.script(model)
    module_ir = torch_mlir.compile(ts_graph, input,
                                        output_type=torch_mlir.OutputType.LINALG_ON_TENSORS)

    binary = compile_to_vmfb(module_ir, target_backend="llvm-cpu")
    compiled_model = runtime.load_vm_flatbuffer(binary,backend="llvm-cpu")
    print("Performing benchmark...")

    t1_stop = time.perf_counter()
    print("**** Model {} - Total Time: {:.3f}s".format(modelname, t1_stop - t1_start)) # print performance indicator

    %timeit -n 100 compiled_model.forward(input)
except Exception as e:
    print(e)

Arg annotations should have one entry per function parameter (including self).


# UFront Test

In [5]:
# !pip uninstall ufront -y
!pip install ./ufront-0.1.1-cp310-cp310-manylinux_2_28_x86_64.whl

[0mProcessing ./ufront-0.1.1-cp310-cp310-manylinux_2_28_x86_64.whl
Installing collected packages: ufront
Successfully installed ufront-0.1.1
[0m

In [10]:
import pathlib
import time
import torch
from torchvision.models import resnet18, resnet50, squeezenet1_1, regnet_x_32gf, maxvit_t, shufflenet_v2_x1_5, inception_v3, mobilenet_v3_small, efficientnet_v2_s, densenet121, convnext_small
import torchvision.models as models
from ufront.pytorch.model import UFrontTorch
import argparse
import ctypes
from iree.compiler import tools
from iree import runtime
import iree.runtime as ireert
import iree.compiler as ireec
from typing import Optional
import numpy as np
import torch
import iree.runtime as ireert
import iree.compiler as ireec

#### Vision models

In [11]:
# Benchmark torchvision models through the UFront -> TOSA -> IREE (CUDA) path.
import numpy as np
from benchmark import benchmark_module
batch_size = 1
input_sample = np.random.uniform(low=0.0, high=1.0, size=(batch_size, 3, 224, 224)).astype(np.float32)
input = torch.Tensor(input_sample)
# Input spec handed to the benchmark tool. It is built from batch_size so it
# cannot drift from the example input used for model conversion.
benchmark_input = "{}x3x224x224xf32=1".format(batch_size)

# `weights=None` builds the models without pretrained weights. The legacy
# `pretrained=False` / `weights=False` spellings are deprecated in torchvision.
model_list = {"MobileNetV3":mobilenet_v3_small(weights=None), "ShuffleNetV2":shufflenet_v2_x1_5(weights=None),
            "ResNet18":resnet18(weights=None), "ResNet50":resnet50(weights=None), "SqueezeNet":squeezenet1_1(weights=None),
            "DenseNet121":densenet121(weights=None), "InceptionV3":inception_v3(weights=None), "ViT_B16":models.vision_transformer.vit_b_16(weights=None, dropout=0.1)}

for modelname, net in model_list.items():
    net.train(False) 

    t1_start = time.perf_counter()
    model = UFrontTorch(net, batch_size=batch_size, pass_weights=True) # convert torch model to ufront model
    #This will trigger Rust frontend for actual model conversion and graph building
    #operators can also be managed by python side (each operator here corresponding to an operator in the Rust computation graph)
    output_tensors = model(inputs = [input])

    #This will trigger model compilation, i.e., convert Rust computation graph to a unified high-level IR and lower it to TOSA IR
    model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                        loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])
    tosa_ir= model.dump_tosa_ir()

    # t1_start..t1_stop covers UFront -> TOSA; t1_stop..t2_stop covers TOSA -> binary.
    t1_stop = time.perf_counter()

    binary = ireec.compile_str(tosa_ir,
                    target_backends=["cuda"], 
                    input_type=ireec.InputType.TOSA)
    t2_stop = time.perf_counter()

    print(modelname + "****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator
    module = runtime.load_vm_flatbuffer(binary, driver="cuda")
    tms = []
    for i in range(10):
        ret = benchmark_module(module.vm_module, entry_function="forward", inputs=[benchmark_input], device="cuda")
        tm = ret[0].time
        # NOTE(review): assumes the time string ends in a 3-character unit
        # suffix such as " ms" - confirm against the local `benchmark` module.
        tms.append(float(tm[0:-3]))
    print("{} - {:.3f} ± {:.3f} ms".format(modelname, np.mean(tms), np.std(tms)))




MobileNetV3****Ufront->TOSA Time: 0.435s, TOSA->Binary Time: 7.299s, Total Time: 7.734s
MobileNetV3 - 1.520 ± 0.014 ms
ShuffleNetV2****Ufront->TOSA Time: 0.639s, TOSA->Binary Time: 5.097s, Total Time: 5.736s
ShuffleNetV2 - 2.676 ± 0.008 ms
ResNet18****Ufront->TOSA Time: 0.948s, TOSA->Binary Time: 4.569s, Total Time: 5.516s
ResNet18 - 4.483 ± 0.006 ms
ResNet50****Ufront->TOSA Time: 1.725s, TOSA->Binary Time: 8.104s, Total Time: 9.829s
ResNet50 - 7.062 ± 0.006 ms
SqueezeNet****Ufront->TOSA Time: 0.130s, TOSA->Binary Time: 3.031s, Total Time: 3.162s
SqueezeNet - 1.085 ± 0.005 ms
DenseNet121****Ufront->TOSA Time: 1.867s, TOSA->Binary Time: 13.395s, Total Time: 15.261s
DenseNet121 - 12.030 ± 0.046 ms
InceptionV3****Ufront->TOSA Time: 1.869s, TOSA->Binary Time: 10.554s, Total Time: 12.423s
InceptionV3 - 15.420 ± 0.040 ms
ViT_B16****Ufront->TOSA Time: 6.395s, TOSA->Binary Time: 10.204s, Total Time: 16.599s
ViT_B16 - 12.510 ± 0.164 ms


#### Language Model (Bert)

In [12]:
from ufront.pytorch.model import UFrontTorch 
import iree.compiler as ireec
from iree import runtime
from torch_bert import BertModel, BertConfig
import torch
import time
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

config = BertConfig(vocab_size_or_config_json_file=16000, hidden_size=768,
    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

net = BertModel(config=config)
net.eval()

t1_start = time.perf_counter()
model = UFrontTorch(net, batch_size=1, pass_weights=True) # convert torch model to ufront model
output_tensors = model(inputs = [input_ids, token_type_ids, input_mask])

model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                    loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])

print("Compiling TOSA model...")
tosa_ir= model.dump_tosa_ir()
t1_stop = time.perf_counter()
print("Compiling Binary...")
binary = ireec.compile_str(tosa_ir,
                target_backends=["cuda"], 
                input_type=ireec.InputType.TOSA)
t2_stop = time.perf_counter()
print("Bert****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator
module = runtime.load_vm_flatbuffer(binary, driver="cuda")

%timeit -n 100 module.forward(input_ids, token_type_ids, input_mask)


Compiling TOSA model...
Compiling Binary...
Bert****Ufront->TOSA Time: 6.852s, TOSA->Binary Time: 10.501s, Total Time: 17.353s
7.21 ms ± 354 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### RNN/LSTM

In [13]:
import torch
from ufront.pytorch.model import UFrontTorch
import iree.compiler as ireec
from iree import runtime
import time
import torch
from torch_def import *
import numpy as np
batch_size = 8
hidden_size = 128
seq_size = 32
input_size = 256
input = np.random.randn(batch_size, seq_size,hidden_size).astype(np.float32)
h0 = np.zeros((batch_size, hidden_size), dtype=np.float32)
c0 = np.zeros((batch_size, hidden_size), dtype=np.float32)
t1_start = time.perf_counter()
input, h0, c0 = torch.Tensor(input), torch.Tensor(h0), torch.Tensor(c0)
lstm = SimpleLSTM(input_size = 10, hidden_size = hidden_size, seq_size=seq_size)
model = UFrontTorch(lstm, batch_size=batch_size, pass_weights=True)
output_tensors = model(inputs = [input, h0, c0])

model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                      loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])

tosa_ir = model.dump_tosa_ir()
t1_stop = time.perf_counter()

binary = ireec.compile_str(tosa_ir,
                target_backends=["cuda"], 
                input_type=ireec.InputType.TOSA)
t2_stop = time.perf_counter()

print("LSTM****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator

module = runtime.load_vm_flatbuffer(binary, driver="cuda")
%timeit -n 100 module.forward(input, h0, c0)

LSTM****Ufront->TOSA Time: 0.138s, TOSA->Binary Time: 3.668s, Total Time: 3.806s
2.03 ms ± 230 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
