In [14]:
# Use the %pip magic so packages are installed into the environment of the
# running kernel rather than whichever `pip` the shell happens to resolve.
%pip install torch-tb-profiler
%pip install onnx
%pip install onnxruntime
%pip install onnxoptimizer


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
from onnx import __version__, IR_VERSION
from onnx.defs import onnx_opset_version
# Report the installed onnx package version together with the opset and IR
# versions it exposes.
_version_report = f"onnx.__version__={__version__!r}, opset={onnx_opset_version()}, IR_VERSION={IR_VERSION}"
print(_version_report)


onnx.__version__='1.13.1', opset=18, IR_VERSION=8


In [30]:
import os
import subprocess
import sys
from time import perf_counter

import numpy as np
import onnx
import onnxruntime
import torch
import torch.onnx
import torchvision.models as models


def to_numpy(tensor):
    """Return the contents of a torch tensor as a NumPy array on the host.

    The tensor is detached from the autograd graph (a no-op for tensors that do
    not require grad) and moved to CPU before conversion.
    """
    host_tensor = tensor.detach().cpu()
    return host_tensor.numpy()


def time_ort_model_evaluation(model_path, n_runs=10, input_shape=(1, 3, 224, 224)):
    """Measure the mean ONNX Runtime inference latency of a model, in milliseconds.

    A session is created with all graph optimizations enabled
    (``ORT_ENABLE_ALL``). Each run feeds a fresh standard-normal random tensor
    to the model's first input and times only the ``session.run`` call.

    Args:
        model_path: Path to the ``.onnx`` file to load.
        n_runs: Number of timed inference calls to average over.
        input_shape: Shape of the random input tensor generated for each run.

    Returns:
        Mean wall-clock time of one ``session.run`` call, in milliseconds.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    sess_options = onnxruntime.SessionOptions()
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    session = onnxruntime.InferenceSession(model_path, sess_options)

    # The input name does not change between runs, so look it up once.
    input_name = session.get_inputs()[0].name

    # No warm-up call is excluded: any first-call overhead is part of the mean.
    time_per_inference = []
    for _ in range(n_runs):
        dummy_input = torch.randn(*input_shape)
        ort_inputs = {input_name: to_numpy(dummy_input)}
        start = perf_counter()
        session.run(None, ort_inputs)
        # perf_counter() reports seconds; convert the elapsed time to milliseconds.
        time_per_inference.append(1000 * (perf_counter() - start))

    return np.mean(time_per_inference)


def quantize_onnx_model(onnx_model_path, quantized_model_path):
    """Dynamically quantize an ONNX model file and save the result.

    Weights are quantized to unsigned 8-bit integers (``QuantType.QUInt8``);
    ``QuantType.QInt8`` is the signed alternative.

    Args:
        onnx_model_path: Path of the ONNX model to quantize.
        quantized_model_path: Path the quantized model is written to.
    """
    # Imported here so the quantization tooling is only loaded when it is used.
    from onnxruntime.quantization import quantize_dynamic, QuantType

    # quantize_dynamic reads the model from disk itself, so the model is not
    # loaded separately beforehand.
    quantize_dynamic(onnx_model_path,
                     quantized_model_path,
                     weight_type=QuantType.QUInt8)

    print(f"quantized model saved to:{quantized_model_path}")


def perform_onnx_infer(model, model_name, dummy_input):
    """Export a PyTorch model to ONNX, optimize and quantize it, and print benchmarks.

    Three files are written, with paths derived from ``model_name``:
    ``<model_name>.onnx`` (raw export), ``<model_name>_opt.onnx`` (after
    onnxoptimizer) and ``<model_name>_opt_quant.onnx`` (after dynamic
    quantization). Average ONNX Runtime latencies and file sizes are printed.

    Args:
        model: The PyTorch module to export.
        model_name: Label used in the printed output and as the file-name stem.
        dummy_input: Example input used for the export and fed once to the
            exported model as a smoke test.

    Raises:
        subprocess.CalledProcessError: If the onnxoptimizer run exits with a
            non-zero status.
    """
    input_names = ["actual_input"]
    output_names = ["output"]

    print(
        f'\n\n************************\t{model_name}\t********************************\n\n')

    model_onnx = model_name + ".onnx"
    model_opt_onnx = model_name + "_opt.onnx"
    model_opt_quant_onnx = model_name + "_opt_quant.onnx"

    torch.onnx.export(model, dummy_input, model_onnx, verbose=False,
                      input_names=input_names, output_names=output_names, export_params=True,)

    # Smoke test: make sure ONNX Runtime can load and run the exported model.
    # Numerical parity with the PyTorch output is not asserted here.
    ort_session = onnxruntime.InferenceSession(model_onnx)
    ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(dummy_input)}
    ort_session.run(None, ort_inputs)

    # Run onnxoptimizer with the same interpreter as this kernel. Passing an
    # argument list avoids shell quoting problems with the file names, and
    # check=True surfaces a failed optimization immediately instead of letting
    # a missing or stale "_opt" file be benchmarked below.
    subprocess.run(
        [sys.executable, "-m", "onnxoptimizer", model_onnx, model_opt_onnx],
        check=True,
    )

    # NOTE(review): the "GPU"/"TPU" labels below are kept as-is, but the timing
    # sessions are created without explicit execution providers, so they
    # presumably run on ONNX Runtime's default provider - confirm the device.
    print(f'{model_name}\tAverage runtime of ONNX Model in GPU: ' +
          str(time_ort_model_evaluation(model_onnx)))
    print(f'{model_name}\tAverage runtime of ONNX Optimized Model in GPU: ' +
          str(time_ort_model_evaluation(model_opt_onnx)))

    quantize_onnx_model(model_opt_onnx, model_opt_quant_onnx)

    # File sizes are reported in MiB (bytes / 1024**2).
    print(f'{model_name}\tONNX full precision model size (MB):',
          os.path.getsize(model_opt_onnx)/(1024*1024))
    print(f'{model_name}\tONNX quantized model size (MB):', os.path.getsize(
        model_opt_quant_onnx)/(1024*1024))

    print(f'{model_name}\tAverage runtime of ONNX Model in TPU: ' +
          str(time_ort_model_evaluation(model_onnx)))
    print(f'{model_name}\tAverage runtime of ONNX Quantized Model in TPU: ' +
          str(time_ort_model_evaluation(model_opt_quant_onnx)))
    print()
    print('-'*80)


In [25]:
import torchvision.datasets
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Pretrained torchvision classifiers to benchmark. They are only used for
# inference/export, so each one is switched to evaluation mode: this keeps
# dropout disabled and batch-norm statistics frozen during forward passes.
selected_models = [
    torchvision.models.resnet34(pretrained=True).to(device).eval(),
    torchvision.models.densenet121(pretrained=True).to(device).eval(),
    torchvision.models.efficientnet_b0(pretrained=True).to(device).eval(),
    torchvision.models.convnext_tiny(pretrained=True).to(device).eval(),
]

# Display names, index-aligned with selected_models; also used as file-name stems.
model_names = [
    'ResNet-34', 'DenseNet-121', 'EfficientNet-B0', 'ConvNeXt-T'
]




In [31]:
# Export, optimize, quantize and benchmark each selected model in turn, using
# a fresh random example input per model.
for idx in range(len(selected_models)):
    example_input = torch.rand(1, 3, 224, 224).to(device)
    perform_onnx_infer(selected_models[idx], model_names[idx], example_input)




************************	ResNet-34	********************************


verbose: False, log level: Level.ERROR

ResNet-34	Average runtime of ONNX Model in GPU: 101.58939989996725
ResNet-34	Average runtime of ONNX Optimized Model in GPU: 99.03199939997194
quantized model saved to:ResNet-34_opt_quant.onnx
ResNet-34	ONNX full precision model size (MB): 83.13565731048584
ResNet-34	ONNX quantized model size (MB): 20.88004970550537
ResNet-34	Average runtime of ONNX Model in TPU: 147.2574743000223
ResNet-34	Average runtime of ONNX Quantized Model in TPU: 176.98937220000062

--------------------------------------------------------------------------------


************************	DenseNet-121	********************************


verbose: False, log level: Level.ERROR

DenseNet-121	Average runtime of ONNX Model in GPU: 85.70039960000031
DenseNet-121	Average runtime of ONNX Optimized Model in GPU: 83.50030320002588
quantized model saved to:DenseNet-121_opt_quant.onnx
DenseNet-121	ONNX full precis

In [6]:
# "-m" must be a single flag; "python - m ..." would read a script from stdin.
!python -m onnxoptimizer ResNet-34.onnx ResNet-34_opt.onnx


In [19]:
# "-m" must be a single flag; "python - m ..." would read a script from stdin.
!python -m onnxoptimizer DenseNet-121.onnx DenseNet-121_opt.onnx


In [22]:
# "-m" must be a single flag; "python - m ..." would read a script from stdin.
!python -m onnxoptimizer EfficientNet-B0.onnx EfficientNet-B0_opt.onnx


In [27]:
# "-m" must be a single flag; "python - m ..." would read a script from stdin.
!python -m onnxoptimizer ConvNeXt-T.onnx ConvNeXt-T_opt.onnx
