## Install dependencies for Torch-MLIR

In [14]:
!pip install iree_compiler==20240129.785 iree_runtime==20240129.785

[0m

In [15]:
# List the installed packages whose name contains "iree" to confirm the pinned versions.
!pip list | grep iree

iree-compiler      20240129.785
iree-runtime       20240129.785


In [16]:
# Report the version of the `python` executable found on the shell PATH.
!python --version

Python 3.11.5


In [None]:
# https://github.com/llvm/torch-mlir/releases/tag/snapshot-20240127.1096 download and install torch-mlir and corresponding torch-cpu
# Please note: recent torch-mlir releases only support Python 3.8 and Python 3.9; for other Python versions, you may need to download an older version of torch-mlir
# !pip install https://github.com/llvm/torch-mlir/releases/download/snapshot-20240127.1096/torch_mlir-20240127.1096-cp311-cp311-linux_x86_64.whl --no-dependencies
# !pip install https://github.com/llvm/torch-mlir/releases/download/snapshot-20240127.1096/torch-2.3.0.dev20240122+cpu-cp311-cp311-linux_x86_64.whl --no-dependencies
# !pip install https://download.pytorch.org/whl/cpu/torchvision-0.18.0%2Bcpu-cp311-cp311-linux_x86_64.whl --no-dependencies


In [17]:
# List the installed packages whose name contains "torch".
!pip list | grep torch

torch              2.3.0.dev20240122+cpu
torch-mlir         20240127.1096
torchvision        0.18.0+cpu


In [12]:
# Older IREE version used for older torch-mlir
# !pip install iree-compiler==20230815.614 
# !pip install https://github.com/iree-org/iree/releases/download/candidate-20230816.615/iree_runtime-20230816.615-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --no-dependencies --force-reinstall

In [18]:
#dependencies for torch-vit and bert model
!pip install boto3
!pip install tqdm

[0m

# Torch-MLIR Test

In [6]:
import torch
import io
import numpy as np
import time
import torch_mlir
from torchvision.models import resnet18, resnet50, squeezenet1_1, regnet_x_32gf, maxvit_t, shufflenet_v2_x1_5, inception_v3, mobilenet_v3_small, efficientnet_v2_s, densenet121, convnext_small
import torchvision.models as models
from iree import runtime
from typing import Optional
from torch.utils._pytree import tree_map
import iree.runtime as ireert
import iree.compiler as ireec

In [7]:
class IREEInvoker:
    """A wrapper around an IREE module that provides a Pythonic interface.

    Specifically, this adapts `module.forward(...)` and similar calls into
    lower-level calls into the functions in the IREE module, and also converts
    between the IREE and Torch types.
    """

    def __init__(self, iree_module):
        """Remember the module and the device it is bound to.

        Args:
            iree_module: Loaded IREE module. It is indexed by function name on
                invocation, and `iree_module._context.config.device` is read
                as the device used when converting arguments.
        """
        self._iree_module = iree_module
        self.device = iree_module._context.config.device

    def __getattr__(self, function_name: str):
        """Return a callable that runs `function_name` from the IREE module.

        Python only calls `__getattr__` after normal attribute lookup has
        failed, so `_iree_module` and `device` do not reach this method once
        `__init__` has set them.

        Args:
            function_name: Name of the function to look up in the IREE module.

        Returns:
            A function taking positional arguments. `torch.Tensor` arguments
            (also when nested in containers handled by `tree_map`) are
            converted to IREE device arrays; `ireert.DeviceArray` results are
            copied back into `torch.Tensor` objects.

        Raises:
            AttributeError: If `function_name` is a dunder name.
        """
        # Protocol probes (e.g. `copy.deepcopy` looking for `__deepcopy__`, or
        # NumPy looking for `__array__`) test for optional dunder hooks with
        # getattr. Answering them with an IREE invoker would make the wrapper
        # look like it implements every protocol, so report them as missing.
        if function_name.startswith("__") and function_name.endswith("__"):
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {function_name!r}"
            )

        def invoke(*args):
            def wrap(x):
                # Torch tensors are moved onto the IREE device; anything else passes through.
                if isinstance(x, torch.Tensor):
                    return ireert.asdevicearray(self.device, x)
                return x

            def unwrap(x):
                # Copy the result out of the IREE device array before wrapping it as a tensor.
                if isinstance(x, ireert.DeviceArray):
                    return torch.from_numpy(np.asarray(x).copy())
                return x

            iree_args = tree_map(wrap, args)
            result = self._iree_module[function_name](*iree_args)
            return tree_map(unwrap, result)

        return invoke
    
def _map_target_backend_to_driver(target_backend):
    if target_backend == "cuda":
        return "cuda"
    if target_backend == "vulkan":
        return "vulkan"
    if target_backend in ("llvm-cpu", "vmvx"):
        return "local-sync"
    raise ValueError(f"Unknown target backend: {target_backend}")

def load_vmfb(flatbuffer, backend="llvm-cpu"):
    """Instantiate a compiled IREE flatbuffer inside the current process.

    Args:
        flatbuffer: Compiled module contents, passed to
            `ireert.VmModule.from_flatbuffer`.
        backend: Compiler target backend name; it is translated to a runtime
            driver name by `_map_target_backend_to_driver`.

    Returns:
        An `IREEInvoker` around the loaded module; the wrapper accepts and
        returns `torch.Tensor` types.
    """
    driver = _map_target_backend_to_driver(backend)
    system_ctx = ireert.SystemContext(config=ireert.Config(driver_name=driver))
    # Register the module with the context so it is reachable via `modules.module`.
    system_ctx.add_vm_module(
        ireert.VmModule.from_flatbuffer(system_ctx.instance, flatbuffer)
    )
    return IREEInvoker(system_ctx.modules.module)

def compile_to_vmfb(mlir_module, target_backend="llvm-cpu", 
                    cuda_llvm_target_arch: Optional[str] = None):
    """Turn an MLIR module into an IREE flatbuffer.

    The module is expected to be in the format produced by `torch_mlir.compile`
    with `OutputType.LINALG_ON_TENSORS`.

    Args:
        mlir_module: Module whose `operation.write_bytecode` is used to
            serialize it before compilation.
        target_backend: IREE target backend to compile for.
        cuda_llvm_target_arch: When not None, forwarded to the compiler as
            `--iree-hal-cuda-llvm-target-arch=<value>`.

    Returns:
        Whatever `ireec.compile_str` returns for the serialized module.

    TODO: Expose more compiler options.
    """
    compiler_flags = (
        [f"--iree-hal-cuda-llvm-target-arch={cuda_llvm_target_arch}"]
        if cuda_llvm_target_arch is not None
        else []
    )

    # Serialize the module to MLIR bytecode in memory and hand the bytes to the compiler.
    buffer = io.BytesIO()
    mlir_module.operation.write_bytecode(buffer)

    return ireec.compile_str(
        buffer.getvalue(),
        target_backends=[target_backend],
        input_type=ireec.InputType.TM_TENSOR,
        extra_args=compiler_flags,
    )

#### Vision Models

In [4]:
from benchmark import benchmark_module
import io
batch_size = 1
input_sample = np.random.uniform(low=0.0, high=1.0, size=(batch_size, 3, 224, 224)).astype(np.float32)
input = torch.Tensor(input_sample)

# Randomly initialised networks. `weights=None` is the current torchvision spelling;
# `pretrained=False` / `weights=False` are legacy forms that trigger deprecation warnings.
model_list = {
    "MobileNetV3": mobilenet_v3_small(weights=None),
    "ShuffleNetV2": shufflenet_v2_x1_5(weights=None),
    "ResNet18": resnet18(weights=None),
    "ResNet50": resnet50(weights=None),
    "SqueezeNet": squeezenet1_1(weights=None),
    "DenseNet121": densenet121(weights=None),
    "InceptionV3": inception_v3(weights=None),
    "ViT_B16": models.vision_transformer.vit_b_16(weights=None),
}

# Input spec handed to the benchmark tool, derived from batch_size so the two cannot drift apart.
benchmark_input = f"{batch_size}x3x224x224xf32=1"

# To make torch-mlir capable of compiling InceptionV3, remove the jit trace check from
# InceptionV3's forward function, i.e. reduce it to:
    # def forward(self, x: Tensor):
    #     x = self._transform_input(x)
    #     x, aux = self._forward(x)
    #     return x

for modelname, model in model_list.items():
    print("\r\n**********Processing model " + modelname)
    # A failure for one model is reported (truncated) and the loop moves on to the next one.
    try:
        model.train(mode=False)
        t1_start = time.perf_counter()

        # TorchScript -> Linalg-on-Tensors MLIR -> IREE CUDA flatbuffer.
        ts_graph = torch.jit.script(model)
        module_ir = torch_mlir.compile(ts_graph, input,
                                            output_type=torch_mlir.OutputType.LINALG_ON_TENSORS)

        binary = compile_to_vmfb(module_ir, target_backend="cuda")

        # The compilation time stops here; loading the module is not part of it.
        t2_stop = time.perf_counter()
        module = runtime.load_vm_flatbuffer(binary, driver="cuda")

        print(modelname + "****Compilation Time: {:.3f}s".format(t2_stop - t1_start)) # print performance indicator

        print("Calculating forward latency:\n  ", end="")
        tms = []
        for i in range(10):
            ret = benchmark_module(module.vm_module, entry_function="forward", inputs=[benchmark_input], device="cuda")
            tm = ret[0].time
            # Drop the last three characters of the reported time before parsing
            # (presumably a unit suffix such as " ms" -- verify against benchmark_module).
            tms.append(float(tm[0:-3]))
        print("{} - {:.3f} ± {:.3f} ms".format(modelname, np.mean(tms), np.std(tms)))
    except Exception as e:
        print(str(e)[:300]) #print error head


**********Processing model MobileNetV3
MobileNetV3****Compilation Time: 4.780s
Calculating forward latency:
  MobileNetV3 - 2.759 ± 0.003 ms

**********Processing model ShuffleNetV2
ShuffleNetV2****Compilation Time: 2.539s
Calculating forward latency:
  ShuffleNetV2 - 1.965 ± 0.005 ms

**********Processing model ResNet18
ResNet18****Compilation Time: 1.489s
Calculating forward latency:
  ResNet18 - 6.158 ± 0.021 ms

**********Processing model ResNet50
ResNet50****Compilation Time: 3.369s
Calculating forward latency:
  ResNet50 - 12.000 ± 0.000 ms

**********Processing model SqueezeNet
SqueezeNet****Compilation Time: 1.544s
Calculating forward latency:
  SqueezeNet - 1.241 ± 0.003 ms

**********Processing model DenseNet121
Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR failed with the following diagnostics:


python exception: Failure while executing pass pipeline:
error: callsite(callsite(callsite(callsite(callsite(callsite("aten::batch_norm"("/root/anaconda3/envs/torch-mli

#### Bert

In [9]:
import iree.compiler as ireec
from iree import runtime
from torch_bert import BertModel, BertConfig
import torch
import torch_mlir
import time
modelname = "Bert"
# Single source of truth for the backend: the compile target, the runtime load
# and the benchmark device below are all derived from it.
target_backend = "llvm-cpu"
benchmark_device = _map_target_backend_to_driver(target_backend)

# 2x3 integer tensors used as example inputs.
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

config = BertConfig(vocab_size_or_config_json_file=16000, hidden_size=768,
    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

model = BertModel(config=config)
model.eval()
try:
    # Start the clock in this cell; previously `t1_start` was only ever set by an
    # earlier cell, so the reported total time was measured from a stale timestamp.
    t1_start = time.perf_counter()

    ts_graph = torch.jit.script(model)
    module_ir = torch_mlir.compile(ts_graph, [input_ids, token_type_ids, input_mask], use_tracing=True,
                                        output_type=torch_mlir.OutputType.LINALG_ON_TENSORS)

    binary = compile_to_vmfb(module_ir, target_backend=target_backend)
    compiled_model = runtime.load_vm_flatbuffer(binary, backend=target_backend)

    print("Performing benchmark...")

    t1_stop = time.perf_counter()
    print("**** Model {} - Total Time: {:.3f}s".format(modelname, t1_stop - t1_start)) # print performance indicator

    tms = []
    for i in range(10):
        # Benchmark the module compiled above (not a `module` left over from another cell),
        # on the device that matches the backend it was compiled for.
        ret = benchmark_module(compiled_model.vm_module, entry_function="forward", inputs=[input_ids.numpy(), token_type_ids.numpy(), input_mask.numpy()], device=benchmark_device)
        tm = ret[0].time
        # Drop the last three characters of the reported time before parsing
        # (presumably a unit suffix such as " ms" -- verify against benchmark_module).
        tms.append(float(tm[0:-3]))
    print("{} - {:.3f} ± {:.3f} ms".format(modelname, np.mean(tms), np.std(tms)))
except Exception as e:
    print(e)


Module 'BertIntermediate' has no attribute 'intermediate_act_fn' (This attribute exists on the Python module, but we failed to convert Python type: 'functools.partial' to a TorchScript type. Only tensors and (possibly nested) tuples of tensors, lists, or dictsare supported as inputs or outputs of traced functions, but instead got value of type partial.. Its type was inferred; try adding a type annotation for the attribute.):
  File "/root/ufront/torch_bert.py", line 343
    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
                        ~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
        return hidden_states



#### RNN/LSTM

In [8]:
import time
from torch_def import *
import torch_mlir
import numpy as np
modelname = "LSTM"
batch_size = 8
hidden_size = 128
seq_size = 32
input_size = 256
# Single source of truth for the backend: the compile target, the runtime load
# and the benchmark device below are all derived from it.
target_backend = "llvm-cpu"
benchmark_device = _map_target_backend_to_driver(target_backend)
try:
    t1_start = time.perf_counter()

    input = np.ones((batch_size, seq_size,hidden_size)).astype(np.float32)
    h0 = np.zeros((batch_size, hidden_size), dtype=np.float32)
    c0 = np.zeros((batch_size, hidden_size), dtype=np.float32)

    input, h0, c0 = torch.Tensor(input), torch.Tensor(h0), torch.Tensor(c0)
    # NOTE(review): `input_size` (256) is not used; the model is built with input_size = 10
    # while the input's last dimension is hidden_size -- confirm this is intended.
    model = SimpleLSTM(input_size = 10, hidden_size = hidden_size, seq_size=seq_size)

    model.train(mode=False)
    ts_graph = torch.jit.script(model)
    module_ir = torch_mlir.compile(ts_graph, [input, h0, c0], use_tracing=True,
                                        output_type=torch_mlir.OutputType.LINALG_ON_TENSORS)

    binary = compile_to_vmfb(module_ir, target_backend=target_backend)
    compiled_model = runtime.load_vm_flatbuffer(binary, backend=target_backend)
    print("Performing benchmark...")

    t1_stop = time.perf_counter()
    print("**** Model {} - Total Time: {:.3f}s".format(modelname, t1_stop - t1_start)) # print performance indicator

    tms = []
    for i in range(10):
        # Benchmark the module compiled above (not a `module` left over from another cell),
        # on the device that matches the backend it was compiled for.
        ret = benchmark_module(compiled_model.vm_module, entry_function="forward", inputs=[input.numpy(), h0.numpy(), c0.numpy()], device=benchmark_device)
        tm = ret[0].time
        # Drop the last three characters of the reported time before parsing
        # (presumably a unit suffix such as " ms" -- verify against benchmark_module).
        tms.append(float(tm[0:-3]))
    print("{} - {:.3f} ± {:.3f} ms".format(modelname, np.mean(tms), np.std(tms)))
except Exception as e:
    print(e)

Lowering TorchScript IR -> Torch Backend IR failed with the following diagnostics:


python exception: Failure while executing pass pipeline:
error: "aten::mul"("/root/ufront/torch_def.py":171:18): unsupported by backend contract: tensor with unknown rank
note: "aten::mul"("/root/ufront/torch_def.py":171:18): see current operation: %59 = "torch.tensor_static_info_cast"(%58) : (!torch.vtensor<[8,128],f32>) -> !torch.vtensor
note: "aten::mul"("/root/ufront/torch_def.py":171:18): this is likely due to a missing transfer function in abstract_interp_lib_gen.py

For Torch-MLIR developers, the error can be reproduced with:
$ torch-mlir-opt -pass-pipeline='builtin.module(torchscript-module-to-torch-backend-pipeline{backend-legal-ops=aten.flatten.using_ints,aten.adaptive_avg_pool1d extra-library=})' /tmp/SimpleLSTM.mlir
Add '-mlir-print-ir-after-all -mlir-disable-threading' to get the IR dump for debugging purpose.



# UFront Test

In [24]:
!pip uninstall ufront -y
!pip install /root/ufront/ufront-0.1.1-cp311-cp311-manylinux_2_28_x86_64.whl

[0mProcessing ./ufront-0.1.1-cp311-cp311-manylinux_2_28_x86_64.whl
Collecting onnxsim==0.4.17 (from ufront==0.1.1)
  Obtaining dependency information for onnxsim==0.4.17 from https://files.pythonhosted.org/packages/a2/e8/eade1b53b5949af186826eb7cff35713cf157cc9b8056880e08a8cd75c48/onnxsim-0.4.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached onnxsim-0.4.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting rich (from onnxsim==0.4.17->ufront==0.1.1)
  Obtaining dependency information for rich from https://files.pythonhosted.org/packages/87/67/a37f6214d0e9fe57f6ae54b2956d550ca8365857f42a1ce0392bb21d9410/rich-13.7.1-py3-none-any.whl.metadata
  Using cached rich-13.7.1-py3-none-any.whl.metadata (18 kB)
Collecting protobuf>=3.20.2 (from onnx->ufront==0.1.1)
  Obtaining dependency information for protobuf>=3.20.2 from https://files.pythonhosted.org/packages/96/a2/dc4d601c8a5c85b8e3eadf158a7f66696f8129ea3342fb69da60e96b

In [25]:
import pathlib
import time
import torch
from torchvision.models import resnet18, resnet50, squeezenet1_1, regnet_x_32gf, maxvit_t, shufflenet_v2_x1_5, inception_v3, mobilenet_v3_small, efficientnet_v2_s, densenet121, convnext_small
import torchvision.models as models
from ufront.pytorch.model import UFrontTorch
import argparse
import ctypes
from iree.compiler import tools
from iree import runtime
import iree.runtime as ireert
import iree.compiler as ireec
from typing import Optional
import numpy as np
import torch
import iree.runtime as ireert
import iree.compiler as ireec

#### Vision models

In [3]:
FIX_TOSA_IR_FOR_NEW_IREE = True # for compatibility with recent IREE releases; set it to False if you use an older IREE (e.g., before 20230830)
def fix_tosa_for_new_iree(tosa_ir):
    """Patch TOSA IR text so that it is accepted by recent IREE releases.

    The IR is processed line by line:

    * a line mentioning `tosa.mul` has every `i32` replaced by `i8`
      (the shift in mul became an i8 type);
    * otherwise, a line carrying an `axis = ` attribute has every `i64`
      replaced by `i32` (the axis attribute became an i32 type).

    Only the first 300 characters of a line are searched for the markers,
    except that a `tosa.concat` line is searched in full for `axis = `.
    Note that the replacement applies to the whole matching line, not only
    to the attribute itself.

    Args:
        tosa_ir: TOSA IR as a single string with "\\n"-separated lines.

    Returns:
        The patched IR as a single string.
    """
    patched_lines = []
    for line in tosa_ir.split("\n"):
        head = line[:300]
        # Membership tests instead of `find(...) > 0`, which ignored a marker
        # sitting at the very start of the line (find returns 0 there).
        if "tosa.mul" in head:
            line = line.replace("i32", "i8")
        elif "axis = " in head or ("tosa.concat" in head and "axis = " in line):
            line = line.replace("i64", "i32")
        patched_lines.append(line)
    return "\n".join(patched_lines)

In [31]:
import numpy as np
from benchmark import benchmark_module
batch_size = 1
input_sample = np.random.uniform(low=0.0, high=1.0, size=(batch_size, 3, 224, 224)).astype(np.float32)
input = torch.Tensor(input_sample)

# Randomly initialised networks. `weights=None` is the current torchvision spelling;
# `pretrained=False` / `weights=False` are legacy forms that trigger deprecation warnings.
model_list = {
    "MobileNetV3": mobilenet_v3_small(weights=None),
    "ShuffleNetV2": shufflenet_v2_x1_5(weights=None),
    "ResNet18": resnet18(weights=None),
    "ResNet50": resnet50(weights=None),
    "SqueezeNet": squeezenet1_1(weights=None),
    "DenseNet121": densenet121(weights=None),
    "InceptionV3": inception_v3(weights=None),
    "ViT_B16": models.vision_transformer.vit_b_16(weights=None, dropout=0.1),
}

# Input spec handed to the benchmark tool, derived from batch_size so the two cannot drift apart.
benchmark_input = f"{batch_size}x3x224x224xf32=1"

for modelname, net in model_list.items():
    net.train(False)

    t1_start = time.perf_counter()
    model = UFrontTorch(net, batch_size=batch_size, pass_weights=True) # convert torch model to ufront model
    #This will trigger Rust frontend for actual model conversion and graph building
    #operators can also be managed by python side (each operator here corresponding to an operator in the Rust computation graph)
    output_tensors = model(inputs = [input])

    #This will trigger model compilation, i.e., convert Rust computation graph to a unified high-level IR and lower it to TOSA IR
    model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                        loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])
    tosa_ir = model.dump_tosa_ir()
    # t1_start..t1_stop: UFront conversion up to the dumped TOSA IR.
    t1_stop = time.perf_counter()

    if FIX_TOSA_IR_FOR_NEW_IREE:
        tosa_ir = fix_tosa_for_new_iree(tosa_ir)

    binary = ireec.compile_str(tosa_ir,
                    target_backends=["cuda"],
                    input_type=ireec.InputType.TOSA)
    # t1_stop..t2_stop: TOSA fix-up (when enabled) plus IREE compilation.
    t2_stop = time.perf_counter()

    print(modelname + "****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator
    module = runtime.load_vm_flatbuffer(binary, driver="cuda")
    tms = []
    for i in range(10):
        ret = benchmark_module(module.vm_module, entry_function="forward", inputs=[benchmark_input], device="cuda")
        tm = ret[0].time
        # Drop the last three characters of the reported time before parsing
        # (presumably a unit suffix such as " ms" -- verify against benchmark_module).
        tms.append(float(tm[0:-3]))
    print("{} - {:.3f} ± {:.3f} ms".format(modelname, np.mean(tms), np.std(tms)))






MobileNetV3****Ufront->TOSA Time: 0.232s, TOSA->Binary Time: 2.532s, Total Time: 2.764s
MobileNetV3 - 1.070 ± 0.000 ms
ShuffleNetV2****Ufront->TOSA Time: 0.300s, TOSA->Binary Time: 1.990s, Total Time: 2.289s
ShuffleNetV2 - 2.070 ± 0.008 ms
ResNet18****Ufront->TOSA Time: 0.339s, TOSA->Binary Time: 1.636s, Total Time: 1.975s
ResNet18 - 2.731 ± 0.003 ms
ResNet50****Ufront->TOSA Time: 0.763s, TOSA->Binary Time: 3.233s, Total Time: 3.996s
ResNet50 - 6.151 ± 0.010 ms
SqueezeNet****Ufront->TOSA Time: 0.064s, TOSA->Binary Time: 1.246s, Total Time: 1.310s
SqueezeNet - 1.170 ± 0.000 ms
DenseNet121****Ufront->TOSA Time: 1.121s, TOSA->Binary Time: 5.600s, Total Time: 6.720s
DenseNet121 - 8.169 ± 0.018 ms
InceptionV3****Ufront->TOSA Time: 0.789s, TOSA->Binary Time: 4.526s, Total Time: 5.315s
InceptionV3 - 12.420 ± 0.040 ms
ViT_B16****Ufront->TOSA Time: 2.344s, TOSA->Binary Time: 4.947s, Total Time: 7.291s
ViT_B16 - 35.150 ± 0.092 ms


#### Language Model (Bert)

In [37]:
from ufront.pytorch.model import UFrontTorch 
import iree.compiler as ireec
from iree import runtime
from benchmark import benchmark_module
from torch_bert import BertModel, BertConfig
import torch
import time
# Benchmark a BERT model through UFront -> TOSA IR -> IREE (CUDA target).
modelname = "Bert"
# 2x3 integer tensors used as example inputs.
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

config = BertConfig(vocab_size_or_config_json_file=16000, hidden_size=768,
    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

net = BertModel(config=config)
net.eval()

# t1_start..t1_stop covers the UFront conversion, model.compile and the TOSA IR dump.
t1_start = time.perf_counter()
model = UFrontTorch(net, batch_size=1, pass_weights=True) # convert torch model to ufront model
output_tensors = model(inputs = [input_ids, token_type_ids, input_mask])

model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                    loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])

print("Compiling TOSA model...")
tosa_ir= model.dump_tosa_ir()
t1_stop = time.perf_counter()
# The fix-up below runs after t1_stop, so it is counted in the "TOSA->Binary" time.
if FIX_TOSA_IR_FOR_NEW_IREE:
    tosa_ir = fix_tosa_for_new_iree(tosa_ir)
print("Compiling Binary...")
binary = ireec.compile_str(tosa_ir,
                target_backends=["cuda"], 
                input_type=ireec.InputType.TOSA)
t2_stop = time.perf_counter()
print("Bert****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator
# Loading the module is not included in any of the reported times.
module = runtime.load_vm_flatbuffer(binary, driver="cuda")

# %timeit -n 100 module.forward(input_ids, token_type_ids, input_mask)
# Ten benchmark runs; mean and standard deviation of the reported times are printed.
tms = []
for i in range(10):
    ret = benchmark_module(module.vm_module, entry_function="forward", inputs=[input_ids.numpy(), token_type_ids.numpy(), input_mask.numpy()], device="cuda")
    tm = ret[0].time
    # Drops the last three characters before parsing -- presumably a unit suffix
    # such as " ms"; verify against benchmark_module.
    tms.append(float(tm[0:-3]))
print("{} - {:.3f} ± {:.3f} ms".format(modelname, np.mean(tms), np.std(tms)))


Compiling TOSA model...
Compiling Binary...
Bert****Ufront->TOSA Time: 2.545s, TOSA->Binary Time: 5.312s, Total Time: 7.857s
Bert - 5.716 ± 0.018 ms


#### RNN/LSTM

In [7]:
import torch
from ufront.pytorch.model import UFrontTorch
import iree.compiler as ireec
from iree import runtime
from benchmark import benchmark_module
import time
import torch
from torch_def import *
import numpy as np
# Benchmark SimpleLSTM through UFront -> TOSA IR -> IREE (CUDA target).
batch_size = 8
hidden_size = 128
seq_size = 32
# NOTE(review): input_size is not referenced below; the model is built with
# input_size = 10 while the input's last dimension is hidden_size -- confirm this is intended.
input_size = 256
modelname = "LSTM"
# Inputs: all-ones (batch_size, seq_size, hidden_size) and zero (batch_size, hidden_size) h0/c0, float32.
input = np.ones((batch_size, seq_size,hidden_size)).astype(np.float32)
h0 = np.zeros((batch_size, hidden_size), dtype=np.float32)
c0 = np.zeros((batch_size, hidden_size), dtype=np.float32)
# t1_start..t1_stop covers tensor conversion, model construction, UFront conversion,
# model.compile and the TOSA IR dump.
t1_start = time.perf_counter()
input, h0, c0 = torch.Tensor(input), torch.Tensor(h0), torch.Tensor(c0)
lstm = SimpleLSTM(input_size = 10, hidden_size = hidden_size, seq_size=seq_size)
model = UFrontTorch(lstm, batch_size=batch_size, pass_weights=True)
output_tensors = model(inputs = [input, h0, c0])
model.compile(optimizer={"type":"sgd", "lr":"0.01", "momentum":"0", "nesterov":"False", "weight_decay":"0"},
                      loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])

tosa_ir = model.dump_tosa_ir()
t1_stop = time.perf_counter()

# The fix-up below runs after t1_stop, so it is counted in the "TOSA->Binary" time.
if FIX_TOSA_IR_FOR_NEW_IREE:
    tosa_ir = fix_tosa_for_new_iree(tosa_ir)

binary = ireec.compile_str(tosa_ir,
                target_backends=["cuda"], 
                input_type=ireec.InputType.TOSA)
t2_stop = time.perf_counter()

print("LSTM****Ufront->TOSA Time: {:.3f}s, TOSA->Binary Time: {:.3f}s, Total Time: {:.3f}s".format(t1_stop - t1_start, t2_stop - t1_stop, t2_stop - t1_start)) # print performance indicator

# Loading the module is not included in any of the reported times.
module = runtime.load_vm_flatbuffer(binary, driver="cuda")

# Ten benchmark runs; mean and standard deviation of the reported times are printed.
tms = []
for i in range(10):
    ret = benchmark_module(module.vm_module, entry_function="forward", inputs=[input.numpy(), h0.numpy(), c0.numpy()], device="cuda")
    tm = ret[0].time
    # Drops the last three characters before parsing -- presumably a unit suffix
    # such as " ms"; verify against benchmark_module.
    tms.append(float(tm[0:-3]))
print("{} - {:.3f} ± {:.3f} ms".format(modelname, np.mean(tms), np.std(tms)))

LSTM****Ufront->TOSA Time: 0.047s, TOSA->Binary Time: 1.097s, Total Time: 1.145s
LSTM - 0.432 ± 0.006 ms
