## Convert PyTorch Model to ONNX

In [1]:
import timm
# Build the pretrained checkpoint named below through timm and put it in
# evaluation mode in the same expression; nn.Module.eval() returns the module
# itself, so `model` is bound to the same object as with a separate call.
model = timm.create_model(
    'convnext_xxlarge.clip_laion2b_soup_ft_in1k',
    pretrained=True,
).eval()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# timm helper applied to the model before it is exported to ONNX.
# NOTE(review): presumably this fuses re-parameterizable branches into their
# inference-time form; confirm it actually changes anything for this architecture.
from timm.utils.model import reparameterize_model
# Rebinds `model`: re-running this cell feeds the already-processed model back in.
model = reparameterize_model(model)

In [3]:
import torch.onnx
# Example input used to trace the model: one random 3x224x224 image.
dummy_input = torch.rand(1, 3, 224, 224, requires_grad=True)

# Destination file for the exported graph.
onnx_path = "convnext_xxlarge.clip_laion2b_soup_ft_in1k.onnx"

# Only the leading (batch) axis of the input and output is declared dynamic;
# the remaining axes stay fixed at the shape of the example input.
dynamic_axes = {
    'input': {0: 'batch_size'},
    'output': {0: 'batch_size'},
}

torch.onnx.export(
    model,
    dummy_input,
    onnx_path,
    export_params=True,          # store the trained weights in the export
    opset_version=16,
    do_constant_folding=True,
    input_names=['input'],
    output_names=['output'],
    dynamic_axes=dynamic_axes,
)

verbose: False, log level: Level.ERROR



## Inference using ONNX Runtime on CPU

In [13]:
import numpy as np
import onnxruntime as ort
from PIL import Image
from urllib.request import urlopen

# Execution providers in priority order: ONNX Runtime tries them left to right,
# so accelerators come first and the CPU fallback goes last (a provider listed
# after the CPU provider would never be selected).
EP_list = ['CUDAExecutionProvider', 'OpenVINOExecutionProvider', 'CPUExecutionProvider']

# Load an image; the context manager closes the HTTP response once the pixel
# data has been decoded by convert().
with urlopen('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png') as response:
    img = Image.open(response).convert('RGB')
img = img.resize((224, 224))  # spatial size the model was exported with

# Scale pixels to [0, 1] and normalise per channel instead of feeding raw
# 0-255 values. The statistics are the OpenAI CLIP mean/std.
# NOTE(review): expected for this CLIP-pretrained checkpoint -- confirm against
# model.pretrained_cfg['mean'] / ['std'].
clip_mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
clip_std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)
# All operands are float32, so the result keeps the float32 dtype of the input.
img_np = (np.array(img).astype(np.float32) / 255.0 - clip_mean) / clip_std

# Load ONNX model once, on the CPU provider only. Creating the session with
# the full provider list and then calling set_providers() would build the
# session twice and initialise an accelerator that is immediately discarded.
session = ort.InferenceSession(
    "convnext_xxlarge.clip_laion2b_soup_ft_in1k.onnx",
    providers=['CPUExecutionProvider'],
)

# Convert data to the shape the ONNX model expects
input_data = np.transpose(img_np, (2, 0, 1))  # Convert to (C, H, W)
input_data = np.expand_dims(input_data, axis=0)  # Add a batch dimension

# Get input name from the model
input_name = session.get_inputs()[0].name



In [14]:
%%timeit
# Perform inference
output = session.run(None, {input_name: input_data})

957 ms ± 211 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# Run a single forward pass and keep the result for inspection.
input_feed = {input_name: input_data}
output = session.run(None, input_feed)

# session.run returns a list of outputs; take the first one
# (assuming the model has a single output).
output_data = output[0]
output_data.shape

(1, 1000)

## ONNX Runtime with CUDA Execution Provider

In [16]:
# Load ONNX model once, with CUDA first and the CPU provider as fallback for
# any operator CUDA cannot run (no throw-away session followed by set_providers()).
session = ort.InferenceSession(
    "convnext_xxlarge.clip_laion2b_soup_ft_in1k.onnx",
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
)

# If the CUDA provider is unavailable or fails to initialise, ONNX Runtime
# falls back to the remaining providers with at most a warning. Fail loudly so
# a CPU run is never reported as a CUDA benchmark.
active_providers = session.get_providers()
if 'CUDAExecutionProvider' not in active_providers:
    raise RuntimeError(
        f"CUDAExecutionProvider is not active (active providers: {active_providers}); "
        "install onnxruntime-gpu with a matching CUDA/cuDNN runtime."
    )

# Convert data to the shape the ONNX model expects
input_data = np.transpose(img_np, (2, 0, 1))  # Convert to (C, H, W)
input_data = np.expand_dims(input_data, axis=0)  # Add a batch dimension

# Get input name from the model
input_name = session.get_inputs()[0].name

In [17]:
%%timeit
# Perform inference
output = session.run(None, {input_name: input_data})

877 ms ± 230 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## ONNX Runtime with TensorRT Execution Provider

In [18]:
# Load ONNX model once, with TensorRT first. CUDA and CPU follow as fallbacks
# for operators TensorRT does not support (no throw-away session followed by
# set_providers()).
session = ort.InferenceSession(
    "convnext_xxlarge.clip_laion2b_soup_ft_in1k.onnx",
    providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'],
)

# If the TensorRT provider is unavailable or fails to initialise, ONNX Runtime
# falls back to the remaining providers with at most a warning. Fail loudly so
# a fallback run is never reported as a TensorRT benchmark.
active_providers = session.get_providers()
if 'TensorrtExecutionProvider' not in active_providers:
    raise RuntimeError(
        f"TensorrtExecutionProvider is not active (active providers: {active_providers}); "
        "it requires an onnxruntime-gpu build with TensorRT support and the TensorRT libraries."
    )

# Convert data to the shape the ONNX model expects
input_data = np.transpose(img_np, (2, 0, 1))  # Convert to (C, H, W)
input_data = np.expand_dims(input_data, axis=0)  # Add a batch dimension

# Get input name from the model
input_name = session.get_inputs()[0].name

In [19]:
%%timeit
# Perform inference
output = session.run(None, {input_name: input_data})

710 ms ± 109 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
