# ImageNet inference with UFront

#### You may also run this jupyter notebook on Kaggle (anonymous repo): 
#### https://www.kaggle.com/code/anomyuser/ufront-test/

In [1]:
import torch,os,random
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple
from PIL import Image
from torchvision import transforms
from torch.utils import data
from functools import partial
from torchvision.models import resnet18, resnet50, squeezenet1_1, regnet_x_32gf, maxvit_t, shufflenet_v2_x1_5, inception_v3, mobilenet_v3_small, efficientnet_v2_s, densenet121, convnext_small
import tqdm

In [2]:
# check python first
!python --version

Python 3.10.13


### Prepare the ImageNet validation dataset

In [3]:
root = "/kaggle/input/" #set to the path of imagenet-1k validation set, which contains a folder named 'imagenet1kvalid'
working = "/kaggle/working/"
!wget https://raw.githubusercontent.com/raghakot/keras-vis/master/resources/imagenet_class_index.json

--2024-05-23 03:30:37--  https://raw.githubusercontent.com/raghakot/keras-vis/master/resources/imagenet_class_index.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 35363 (35K) [text/plain]
Saving to: 'imagenet_class_index.json'


2024-05-23 03:30:37 (3.38 MB/s) - 'imagenet_class_index.json' saved [35363/35363]



In [4]:
import os
from torch.utils.data import Dataset
from PIL import Image
import json
# Load the class-index file that was downloaded into `working`.
with open(os.path.join(working, "imagenet_class_index.json"), "rb") as f:
    json_file = json.load(f)
# Keys stay exactly as they appear in the JSON (strings); the value kept for
# each key is the second element of its entry (presumably the readable label).
syn_to_class = {class_id: v[1] for class_id, v in json_file.items()}
                
def get_class_name(entry):
    """Return the class name stored in ``syn_to_class`` for a class index.

    Args:
        entry: Class index given as an ``int`` or as a numeric string
            (for example ``7`` or ``"7"``).

    Returns:
        The value stored in ``syn_to_class`` for that index.

    Raises:
        ValueError: If ``entry`` cannot be converted to an integer.
        KeyError: If the index is not present in ``syn_to_class``.
    """
    # syn_to_class is filled from a JSON object, whose keys are always strings.
    # Normalize the index with int() and convert it back to a string for the
    # lookup; looking up the bare int would raise KeyError for every input.
    return syn_to_class[str(int(entry))]
        
class ImageNetKaggle(Dataset):
    """Image dataset laid out as ``<root>/imagenet1kvalid/<class_index>/<image>``.

    The name of every sub-directory is parsed with ``int()`` and used as the
    target of all files found inside it.

    Args:
        root: Directory that contains the ``imagenet1kvalid`` folder.
        transform: Optional callable applied to each loaded RGB image.

    Raises:
        ValueError: If a non-empty sub-directory name is not an integer.
    """

    def __init__(self, root, transform=None):
        self.samples = []  # image file paths
        self.targets = []  # class index of the image at the same position in self.samples
        self.transform = transform
        samples_dir = os.path.join(root, "imagenet1kvalid")
        # os.listdir() returns entries in arbitrary order; sort both levels so
        # that sample indices (and therefore batches) are the same on every run.
        for entry in sorted(os.listdir(samples_dir)):
            sample_path = os.path.join(samples_dir, entry)
            if not os.path.isdir(sample_path):
                # A stray file next to the class folders is not a class; skip it
                # instead of failing when trying to list it as a directory.
                continue
            for file in sorted(os.listdir(sample_path)):
                self.samples.append(os.path.join(sample_path, file))
                self.targets.append(int(entry))

    def __len__(self):
        """Return the number of indexed image files."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for position ``idx``; the image is transformed if a transform was given."""
        # Release the file handle as soon as the pixels are decoded: convert()
        # returns a new image that no longer depends on the open file.
        with Image.open(self.samples[idx]) as img:
            x = img.convert("RGB")
        if self.transform:
            x = self.transform(x)
        return x, self.targets[idx]

### Prepare Dataloader

In [5]:
from torch.utils.data import DataLoader
from torchvision import transforms
import torch
import torchvision
from tqdm import tqdm
# Per-channel statistics handed to transforms.Normalize below.
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)

# Preprocessing pipeline: resize to 256, center-crop to 224, convert to a tensor, normalize.
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

dataset = ImageNetKaggle(root, val_transform)

# Fixed order (no shuffling); the final, possibly smaller batch is kept by the loader.
dataloader = DataLoader(
    dataset,
    shuffle=False,
    drop_last=False,
    pin_memory=True,
    batch_size=64,  # may need to reduce this depending on your GPU
    num_workers=8,  # may need to reduce this depending on your num of CPUs and RAM
)



### 1) Download and install UFront package

In [6]:
# You may execute !python --version and install the following ufront package based on your python version
# You may download the latest UFront package or obtain them from the release folder

# For Python 3.7
# !wget https://anonymous.4open.science/r/anonyufront-2B3E/release/ufront-0.1.1-cp37-cp37m-manylinux_2_28_x86_64.whl

# For Python 3.8
# !wget https://anonymous.4open.science/r/anonyufront-2B3E/release/ufront-0.1.1-cp38-cp38-manylinux_2_28_x86_64.whl

# For Python 3.9
# !wget https://anonymous.4open.science/r/anonyufront-2B3E/release/ufront-0.1.1-cp39-cp39-manylinux_2_28_x86_64.whl

# For Python 3.10
!wget https://anonymous.4open.science/r/anonyufront-2B3E/release/ufront-0.1.1-cp310-cp310-manylinux_2_28_x86_64.whl

# For Python 3.11
# !wget https://anonymous.4open.science/r/anonyufront-2B3E/release/ufront-0.1.1-cp311-cp311-manylinux_2_28_x86_64.whl

--2024-05-23 03:30:55--  https://anonymous.4open.science/r/anonyufront-2B3E/release/ufront-0.1.1-cp310-cp310-manylinux_2_28_x86_64.whl
Resolving anonymous.4open.science (anonymous.4open.science)... 172.67.183.76, 104.21.18.195, 2606:4700:3035::ac43:b74c, ...
Connecting to anonymous.4open.science (anonymous.4open.science)|172.67.183.76|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /api/repo/anonyufront-2B3E/file/release/ufront-0.1.1-cp310-cp310-manylinux_2_28_x86_64.whl [following]
--2024-05-23 03:30:55--  https://anonymous.4open.science/api/repo/anonyufront-2B3E/file/release/ufront-0.1.1-cp310-cp310-manylinux_2_28_x86_64.whl
Reusing existing connection to anonymous.4open.science:443.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/octet-stream]
Saving to: 'ufront-0.1.1-cp310-cp310-manylinux_2_28_x86_64.whl'

ufront-0.1.1-cp310-     [      <=>           ]  57.79M  9.64MB/s    in 6.5s    

2024-05-23 03:31:02 (

In [7]:
# !pip install ufront-0.1.1-cp37-cp37m-manylinux_2_28_x86_64.whl #for python3.7

# !pip install ufront-0.1.1-cp38-cp38-manylinux_2_28_x86_64.whl #for python3.8
# !pip install ufront-0.1.1-cp39-cp39-manylinux_2_28_x86_64.whl #for python3.9
!pip install ufront-0.1.1-cp310-cp310-manylinux_2_28_x86_64.whl #for python3.10
# !pip install ufront-0.1.1-cp311-cp311-manylinux_2_28_x86_64.whl #for python3.11

Processing ./ufront-0.1.1-cp310-cp310-manylinux_2_28_x86_64.whl
Collecting tf2onnx (from ufront==0.1.1)
  Downloading tf2onnx-1.16.1-py3-none-any.whl.metadata (1.3 kB)
Collecting onnxsim==0.4.17 (from ufront==0.1.1)
  Downloading onnxsim-0.4.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading onnxsim-0.4.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tf2onnx-1.16.1-py3-none-any.whl (455 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m455.8/455.8 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tf2onnx, onnxsim, ufront
Successfully installed onnxsim-0.4.17 tf2onnx-1.16.1 ufront-0.1.1


In [8]:
!nvidia-smi

Thu May 23 03:31:20 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla T4                       Off | 00000000:00:0

### 2) Install compiler backend and runtime


In [9]:
# !pip install iree-compiler==20231004.665 iree-runtime==20231004.665
!pip install iree-compiler==20230815.614 iree-runtime==20230815.614
# !pip install iree-tools-tf==20230815.614  iree-tools-tflite==20230815.614

Collecting iree-compiler==20230815.614
  Downloading iree_compiler-20230815.614-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (577 bytes)
Collecting iree-runtime==20230815.614
  Downloading iree_runtime-20230815.614-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.5 kB)
Downloading iree_compiler-20230815.614-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (68.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.2/68.2 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading iree_runtime-20230815.614-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: iree-runtime, iree-compiler
Successfully installed iree-compiler-20230815.614 iree-runtime-20230815.614


### 3) Compile the models and run compiled models on GPU

In [10]:
from ufront.pytorch.model import UFrontTorch 
import iree
import iree.compiler as ireec
from iree.compiler import tools
from iree import runtime

def _build_ufront_model(net, dataloader):
    """Convert ``net`` into a compiled UFront model, using the first batch of ``dataloader`` as sample input."""
    # Only the first batch is needed; its batch dimension is passed to UFrontTorch as batch_size.
    try:
        x, _ = next(iter(dataloader))
    except StopIteration:
        raise ValueError("dataloader is empty: one batch is required as sample input") from None
    net.eval()
    indata = x.numpy()
    print("Compiling model...")
    model = UFrontTorch(net, batch_size=indata.shape[0], pass_weights=True) # convert torch model to ufront model
    #This will trigger Rust frontend for actual model conversion and graph building
    #operators can also be managed by python side (each operator here corresponding to an operator in the Rust computation graph)
    output_tensors = model(inputs = [indata])

    #Append a softmax operator to the model output (the forward pass has not been triggered at this point!)
    #The returned tensor handle is not needed afterwards, so it is not stored.
    model.softmax(input=output_tensors[0], name="softmax_out")

    #This will trigger model compilation, i.e., convert Rust computation graph to a unified high-level IR and lower it to TOSA IR
    model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                        loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])
    return model


def get_ufront_ir(net, dataloader):
    """Return the high-level UFront IR of ``net``.

    Args:
        net: PyTorch model to convert; it is switched to eval mode.
        dataloader: Iterable of ``(inputs, labels)`` batches; only the first
            batch is consumed and used as the sample input.

    Returns:
        The value returned by ``model.dump_ir()``.

    Raises:
        ValueError: If ``dataloader`` yields no batch.
    """
    model = _build_ufront_model(net, dataloader)
    return model.dump_ir()


def compile_with_ufront(net, GPU, dataloader):
    """Compile ``net`` with UFront and IREE and return the loaded executable module.

    Args:
        net: PyTorch model to convert; it is switched to eval mode.
        GPU: If truthy, compile for the ``cuda`` target and load with the
            ``cuda`` driver; otherwise compile for and load with ``llvm-cpu``.
        dataloader: Iterable of ``(inputs, labels)`` batches; only the first
            batch is consumed and used as the sample input.

    Returns:
        The module returned by ``runtime.load_vm_flatbuffer``.

    Raises:
        ValueError: If ``dataloader`` yields no batch.
    """
    model = _build_ufront_model(net, dataloader)

    # The high-level IR is dumped before lowering, exactly as before; the result
    # is not used here. NOTE(review): confirm whether dump_tosa_ir() depends on
    # this call before removing it.
    model.dump_ir()

    print("Compiling to TOSA...")
    tosa_ir = model.dump_tosa_ir()

    print("Compiling to binary...")
    # Pick the compile target and the matching loader argument in one place.
    if GPU:
        target, load_kwargs = "cuda", {"driver": "cuda"}
    else:
        target, load_kwargs = "llvm-cpu", {"backend": "llvm-cpu"}
    binary = ireec.compile_str(tosa_ir,
                    target_backends=[target],
                    input_type=ireec.InputType.TOSA)
    module = runtime.load_vm_flatbuffer(binary, **load_kwargs)
    print("binary executable produced!")
    return module

Collecting onnxruntime
  Downloading onnxruntime-1.18.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.18.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected package

[0m

Successfully installed coloredlogs-15.0.1 humanfriendly-10.0 onnxruntime-1.18.0


In [11]:
def get_ufront_accuracy(module, dataloader):
    """Run ``module`` over ``dataloader`` and return the fraction of correct predictions.

    The size of the first batch is taken as the expected batch size. Iteration
    stops at the first batch that is smaller than that, and the smaller batch
    is not evaluated.

    Args:
        module: Compiled module; ``module.forward(array).to_host()`` must
            return an array whose ``argmax(axis=1)`` gives the predicted labels.
        dataloader: Iterable of ``(inputs, labels)`` tensor batches.

    Returns:
        Number of correct predictions divided by the number of evaluated samples.
    """
    n_hits, n_seen = 0.0, 0.0
    expected_bs = 0  # 0 means "not known yet"; taken from the first batch
    print("Start inference...")
    for images, labels in tqdm(dataloader):
        current_bs = images.shape[0]
        if expected_bs == 0:
            expected_bs = current_bs
        elif current_bs < expected_bs:
            # dynamic batch size is currently not supported!
            print("Ignore last batch!")
            break
        predictions = module.forward(images.numpy()).to_host()
        matches = predictions.argmax(axis=1) == labels.numpy()
        n_hits += matches.sum().item()
        n_seen += len(labels)
    return n_hits / n_seen

In [12]:
#If you experience the following error, you need to upgrade the NVIDIA driver and CUDA, or downgrade IREE to an older version, e.g., 20230330.474.

#RuntimeError: Error creating vm context with modules: 
#main_checkout/runtime/src/iree/hal/drivers/cuda/native_executable.c:99: INTERNAL; 
#CUDA driver error 'CUDA_ERROR_UNSUPPORTED_PTX_VERSION' (222): 
#the provided PTX was compiled with an unsupported toolchain.; 
#while invoking native function hal.executable.create; while calling import; 

### Compile Vision models and perform inference on ImageNet validation set

In [13]:
net = mobilenet_v3_small(weights="DEFAULT", dropout=0.0)
module = compile_with_ufront(net, True, dataloader) # compile to binary
get_ufront_accuracy(module, dataloader) # inference

Downloading: "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v3_small-047dcff4.pth
100%|██████████| 9.83M/9.83M [00:00<00:00, 74.3MB/s]


Compiling model...


  torch.has_cuda,
  torch.has_cudnn,
  torch.has_mps,
  torch.has_mkldnn,


Compiling to TOSA...
Compiling to binary...
binary executable produced!
Start inference...


100%|█████████▉| 781/782 [02:19<00:00,  5.60it/s]

Ignore last batch!





0.6700544174135723

In [14]:
net = shufflenet_v2_x1_5(weights="DEFAULT")
module = compile_with_ufront(net, True, dataloader)
get_ufront_accuracy(module, dataloader)

Downloading: "https://download.pytorch.org/models/shufflenetv2_x1_5-3c479a10.pth" to /root/.cache/torch/hub/checkpoints/shufflenetv2_x1_5-3c479a10.pth
100%|██████████| 13.6M/13.6M [00:00<00:00, 68.7MB/s]


Compiling model...
Compiling to TOSA...
Compiling to binary...
binary executable produced!
Start inference...


100%|█████████▉| 781/782 [02:15<00:00,  5.74it/s]

Ignore last batch!





0.7154089308578745

In [15]:
net = squeezenet1_1(weights="DEFAULT")
module = compile_with_ufront(net, True, dataloader)
get_ufront_accuracy(module, dataloader)

Downloading: "https://download.pytorch.org/models/squeezenet1_1-b8a52dc0.pth" to /root/.cache/torch/hub/checkpoints/squeezenet1_1-b8a52dc0.pth
100%|██████████| 4.73M/4.73M [00:00<00:00, 46.9MB/s]


Compiling model...
Compiling to TOSA...
Compiling to binary...
binary executable produced!
Start inference...


100%|█████████▉| 781/782 [02:17<00:00,  5.68it/s]

Ignore last batch!





0.5783650768245838

In [16]:
net = resnet18(weights="DEFAULT")
module = compile_with_ufront(net, True, dataloader)
get_ufront_accuracy(module, dataloader)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 126MB/s]


Compiling model...
Compiling to TOSA...
Compiling to binary...
binary executable produced!
Start inference...


100%|█████████▉| 781/782 [04:37<00:00,  2.82it/s]

Ignore last batch!





0.6934018886043534

In [17]:
net = resnet50(weights="DEFAULT")
module = compile_with_ufront(net, True, dataloader)
get_ufront_accuracy(module, dataloader)

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 154MB/s]


Compiling model...
Compiling to TOSA...
Compiling to binary...
binary executable produced!
Start inference...


100%|█████████▉| 781/782 [09:24<00:00,  1.38it/s]

Ignore last batch!





0.7921334827144686

In [18]:
net = densenet121(weights="DEFAULT")
module = compile_with_ufront(net, True, dataloader)
get_ufront_accuracy(module, dataloader)

Downloading: "https://download.pytorch.org/models/densenet121-a639ec97.pth" to /root/.cache/torch/hub/checkpoints/densenet121-a639ec97.pth
100%|██████████| 30.8M/30.8M [00:00<00:00, 118MB/s]


Compiling model...
Compiling to TOSA...
Compiling to binary...
binary executable produced!
Start inference...


100%|█████████▉| 781/782 [06:09<00:00,  2.11it/s]

Ignore last batch!





0.7347751280409731

In [19]:
net = inception_v3(weights="DEFAULT") 
module = compile_with_ufront(net, True, dataloader)
get_ufront_accuracy(module, dataloader) # lower accuracy than officially reported; to fix this, the images need to be resized to 299 x 299 instead of the standard 224 x 224

Downloading: "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth" to /root/.cache/torch/hub/checkpoints/inception_v3_google-0cc3c7bd.pth
100%|██████████| 104M/104M [00:00<00:00, 144MB/s] 


Compiling model...
Compiling to TOSA...
Compiling to binary...
binary executable produced!
Start inference...


100%|█████████▉| 781/782 [06:30<00:00,  2.00it/s]

Ignore last batch!





0.6995038412291933

In [20]:
import torchvision.models as models
net = models.vision_transformer.vit_b_16(weights="DEFAULT")
module = compile_with_ufront(net, True, dataloader)
get_ufront_accuracy(module, dataloader)

Downloading: "https://download.pytorch.org/models/vit_b_16-c867db91.pth" to /root/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth
100%|██████████| 330M/330M [00:02<00:00, 164MB/s]


Compiling model...
Compiling to TOSA...
Compiling to binary...
binary executable produced!
Start inference...


100%|█████████▉| 781/782 [14:43<00:01,  1.13s/it]

Ignore last batch!





0.8054577464788732

In [21]:
# Fix the low accuracy of inception_v3 by feeding 299 x 299 inputs.
# NOTE: this cell rebinds the global val_transform, dataset and dataloader,
# so every later cell uses 299 x 299 inputs until they are rebound again.
val_transform = transforms.Compose(
            [
                transforms.Resize(299),
                transforms.CenterCrop(299), # 299 instead of 224, see: https://github.com/IntelLabs/distiller/issues/422
                transforms.ToTensor(),
                transforms.Normalize(mean, std),
            ]
        )
dataset = ImageNetKaggle(root, val_transform)
dataloader = DataLoader(
            dataset,
            batch_size=64, # may need to reduce this depending on your GPU 
            num_workers=8, # may need to reduce this depending on your num of CPUs and RAM
            shuffle=False,
            drop_last=False,
            pin_memory=True
        )
# Re-create the model and compile it again with the 299 x 299 sample batch.
net = inception_v3(weights="DEFAULT") 
module = compile_with_ufront(net, True, dataloader)
get_ufront_accuracy(module, dataloader)

Compiling model...
Compiling to TOSA...
Compiling to binary...
binary executable produced!
Start inference...


100%|█████████▉| 781/782 [16:39<00:01,  1.28s/it]

Ignore last batch!





0.7689660691421255

In [22]:
# Rebuild the global dataloader with 224 x 224 crops for the cells that follow
# (the 299 x 299 crop above was only needed for inception_v3).
val_transform = transforms.Compose(
            [
                transforms.Resize(299),
                transforms.CenterCrop(224), # crop back to 224; NOTE(review): Resize stays at 299 here, whereas the first dataloader cell used Resize(256) - confirm this is intended
                transforms.ToTensor(),
                transforms.Normalize(mean, std),
            ]
        )
dataset = ImageNetKaggle(root, val_transform)
dataloader = DataLoader(
            dataset,
            batch_size=64, # may need to reduce this depending on your GPU 
            num_workers=8, # may need to reduce this depending on your num of CPUs and RAM
            shuffle=False,
            drop_last=False,
            pin_memory=True
        )

### You may also print the high-level IR of the Vision Transformer model

In [23]:
import torchvision.models as models
net = models.vision_transformer.vit_b_16(weights="DEFAULT")
ir = get_ufront_ir(net, dataloader)
# print(ir) # enable this to see high-level IR

Compiling model...


# Bert Model Test

### Compile and run the Bert model

In [25]:
from ufront.pytorch.model import UFrontTorch 
import iree.compiler as ireec
from iree import runtime
from torch_bert import BertModel, BertConfig
import torch
import time
import numpy as np

# Select the IREE target: True -> "cuda", False -> "llvm-cpu".
GPU = True

# sample input defined in pytorch-pretrained-bert
# Three int32 arrays of shape (2, 3): token ids, attention mask and token type ids.
input_ids = torch.from_numpy(np.array([[31, 51, 99], [15, 5, 0]], dtype="int32"))
input_mask = torch.from_numpy(np.array([[1, 1, 1], [1, 1, 0]], dtype="int32"))
token_type_ids = torch.from_numpy(np.array([[0, 0, 1], [0, 1, 0]], dtype="int32"))

config = BertConfig(vocab_size_or_config_json_file=16000, hidden_size=768,
    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

# The model is built from the config only; no checkpoint is loaded in this cell
# (presumably the weights are randomly initialized - confirm in torch_bert).
net = BertModel(config=config)
net.eval()

# NOTE(review): batch_size=1 is passed although the sample inputs have 2 rows - confirm this is intended.
model = UFrontTorch(net, batch_size=1, pass_weights=True) # convert torch model to ufront model

# Input order here (ids, token types, mask) is the same order used for module.forward() below.
output_tensors = model(inputs = [input_ids, token_type_ids, input_mask])

#This will trigger model compilation, i.e., convert Rust computation graph to a unified high-level IR and lower it to TOSA IR
model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                    loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])

# Kept in a global because a later cell prints it.
modelir = model.dump_ir()


print("Compiling TOSA model...")
tosa_ir= model.dump_tosa_ir()

print("Compiling Binary...")

# Compile the TOSA IR for the selected target and load the resulting flatbuffer.
if GPU:
    binary = ireec.compile_str(tosa_ir,
                    target_backends=["cuda"], 
                    input_type=ireec.InputType.TOSA)
    module = runtime.load_vm_flatbuffer(binary, driver="cuda")
else:
    binary = ireec.compile_str(tosa_ir,
                    target_backends=["llvm-cpu"], 
                    input_type=ireec.InputType.TOSA)
    module = runtime.load_vm_flatbuffer(binary,backend="llvm-cpu") 

# Run the compiled module; each returned item is copied to the host with to_host() before printing.
ufront_ret = module.forward(input_ids, token_type_ids, input_mask)
for ret in ufront_ret:
    print(ret.to_host())

print(ufront_ret)


Compiling TOSA model...
Compiling Binary...
[[[-0.0126075   1.4139326  -0.91023034 ...  0.28366825  0.08602214
   -0.23005815]
  [-0.542039    3.2243483  -0.38521275 ...  0.6640264  -0.24791919
    0.73075116]
  [-0.72975683  0.75377995 -0.5804731  ... -0.6268702   0.17821047
    0.5281703 ]]

 [[-1.6963978   2.2013307  -2.1347208  ...  1.3283901   0.01012164
    0.93355817]
  [-0.13298714  1.5196626  -0.9118973  ...  0.44733378  0.096411
    0.09232111]
  [-1.90262     2.535816   -1.4880081  ...  1.140133   -0.42023683
   -0.19271582]]]
[[-0.23113337 -0.76582193 -0.87582827 ... -0.282242   -0.4502077
   0.14065129]
 [ 0.3177823  -0.7155069  -0.8724887  ... -0.59379053  0.13726538
  -0.04291175]]
(<IREE DeviceArray: shape=[2, 3, 768], dtype=float32>, <IREE DeviceArray: shape=[2, 768], dtype=float32>)


In [26]:
torch_ret = net(input_ids, token_type_ids, input_mask)
for ret in torch_ret:
    print(ret.detach().numpy())

[[[-0.01260887  1.4139341  -0.9102301  ...  0.28366798  0.08602166
   -0.23005718]
  [-0.54203975  3.2243488  -0.38521317 ...  0.6640283  -0.24792111
    0.73075086]
  [-0.72975695  0.7537807  -0.5804765  ... -0.6268691   0.17821254
    0.5281709 ]]

 [[-1.6963985   2.2013319  -2.13472    ...  1.3283904   0.01012149
    0.933557  ]
  [-0.13299099  1.5196633  -0.9118963  ...  0.4473327   0.09641235
    0.09232146]
  [-1.9026217   2.5358202  -1.4880085  ...  1.1401304  -0.42023662
   -0.1927179 ]]]
[[-0.23113278 -0.76582175 -0.875828   ... -0.28224295 -0.45020658
   0.14065078]
 [ 0.3177829  -0.7155074  -0.8724886  ... -0.59379     0.1372653
  -0.04291317]]


### Results comparison (UFront vs Pytorch output)

In [27]:
def mse(y_true, y_pred):
    """Mean squared error: the mean of the element-wise squared differences."""
    diff = y_true - y_pred
    return (diff ** 2).mean()

def rmse(y_true, y_pred):
    """Root mean squared error: the square root of ``mse(y_true, y_pred)``."""
    mean_sq_err = mse(y_true, y_pred)
    return mean_sq_err.sqrt()

def mae(y_true, y_pred):
    """Mean absolute error: the mean of the element-wise absolute differences."""
    diff = y_true - y_pred
    return diff.abs().mean()

# also known as cod (coefficient of determination)
def r_square(y_true, y_pred):
    """Return ``1 - ss_res / ss_tot``.

    ``ss_res`` is the sum of squared differences between ``y_true`` and
    ``y_pred``; ``ss_tot`` is the sum of squared differences between
    ``y_true`` and its own mean.
    """
    residual_ss = ((y_true - y_pred) ** 2).sum()
    total_ss = ((y_true - y_true.mean()) ** 2).sum()
    return 1 - residual_ss / total_ss

def mpe(y_true, y_pred):
    """Mean percentage error: the mean of ``(y_true - y_pred) / y_true``, multiplied by 100."""
    relative_err = (y_true - y_pred) / y_true
    return relative_err.mean() * 100

In [28]:
print("Results comparison (UFront vs Pytorch)")

# Compare the outputs pairwise; enumerate() supplies the output index.
for i, (a, b) in enumerate(zip(ufront_ret, torch_ret)):
    print("\nOutput index ", i)
    a = torch.Tensor(a.to_host()) # ufront model output
    b = torch.Tensor(b.detach().numpy()) # pytorch model output
    # Call the mae() helper instead of assigning to a variable named `mae`:
    # rebinding that name would replace the helper function for the rest of
    # the session, and all four metrics are now computed the same way.
    # NOTE(review): the UFront output is passed as y_true and the PyTorch
    # output as y_pred; confirm this is the intended reference for COD/MPE.
    print("MAE: ", mae(a, b).numpy())
    print("RMSE:", rmse(a, b).numpy())
    print("COD:", r_square(a, b).numpy())
    print("MPE:", mpe(a, b).numpy(), "%")

Results comparison (UFront vs Pytorch)

Output index  0
MAE:  1.1937577e-06
RMSE: 1.4978546e-06
COD: 1.0
MPE: -1.254536e-05 %

Output index  1
MAE:  5.525795e-07
RMSE: 7.10377e-07
COD: 1.0
MPE: -0.0002558513 %


### High-level IR of UFront-Compiled Bert Model

In [29]:
print(modelir)


func.func @forward(%input1: tensor<2x3xi32>, %input2: tensor<2x3xi32>, %input3: tensor<2x3xi32>) -> (tensor<2x3x768xf32>, tensor<2x768xf32>)  { 
	%1="ufront.reshape"(%input3){shape=[2, 1, 3]}:(tensor<2x3xi32>) -> tensor<2x1x3xi32>
	%2="ufront.reshape"(%1){shape=[2, 1, 1, 3]}:(tensor<2x1x3xi32>) -> tensor<2x1x1x3xi32>
	%3="ufront.cast"(%2){dtype="Float"}:(tensor<2x1x1x3xi32>) -> tensor<2x1x1x3xf32>
	%4="ufront.ssub"(%3){scalar=1.0, scalar_position="LEFT"}:(tensor<2x1x1x3xf32>) -> tensor<2x1x1x3xf32>
	%5="ufront.smultiply"(%4){scalar=-10000.0}:(tensor<2x1x1x3xf32>) -> tensor<2x1x1x3xf32>
	%6="ufront.parameter"(){dtype="Int64", initializer="0x5c24c33a40a0", requires_grad=false}:() -> tensor<3xi64>
	%7="ufront.reshape"(%6){shape=[1, 3]}:(tensor<3xi64>) -> tensor<1x3xi64>
	%8="ufront.expand"(%7){sizes=[2, 3]}:(tensor<1x3xi64>) -> tensor<2x3xi64>
	%9="ufront.parameter"(){dtype="Float", initializer="0x5c24dbb00e80", requires_grad=true}:() -> tensor<16000x768xf32>
	%10="ufront.embedding"(%inpu