In [1]:
!pip install optimum[onnxruntime]

Collecting optimum[onnxruntime]
  Using cached optimum-1.24.0-py3-none-any.whl.metadata (21 kB)
Collecting transformers>=4.29 (from optimum[onnxruntime])
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub>=0.8.0 (from optimum[onnxruntime])
  Using cached huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting onnx (from optimum[onnxruntime])
  Using cached onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting onnxruntime>=1.11.0 (from optimum[onnxruntime])
  Using cached onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting datasets>=1.2.1 (from optimum[onnxruntime])
  Using cached datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate (from optimum[onnxruntime])
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting protobuf>=3.20.1 (from optimum[onnxruntime])
  Using cached protobuf-5.29.3-cp38-a

In [2]:
import torch
import torch.nn as nn
import time
import numpy as np
import onnxruntime

# Define a simple model: a two-layer MLP
class SimpleModel(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features in each input sample.
        hidden_size: Width of the hidden layer.
        output_size: Number of values produced per sample.
    """

    def __init__(self, input_size=10, hidden_size=20, output_size=5):
        super().__init__()
        # Attribute names double as state_dict prefixes, so they are kept stable.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Apply fc1, the ReLU activation and fc2 to ``x`` and return the result."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Seed PyTorch's RNG so the randomly initialised weights and the random dummy
# input (and therefore every printed output) are the same on each fresh run.
SEED = 0
torch.manual_seed(SEED)

# Create the model and set to evaluation mode
model = SimpleModel()
model.eval()

# Create a dummy input tensor: one sample with 10 features, matching the
# default input_size of SimpleModel.
dummy_input = torch.randn(1, 10)

# Run inference using PyTorch; no_grad() because gradients are not needed here.
with torch.no_grad():
    torch_output = model(dummy_input)
print("PyTorch output:", torch_output)

# Export the model to ONNX format. dummy_input is the example input handed to
# the exporter; input_names/output_names label the graph's input and output.
onnx_model_path = "simple_model.onnx"
torch.onnx.export(model,
                  dummy_input,
                  onnx_model_path,
                  input_names=["input"],
                  output_names=["output"],
                  opset_version=11)
print("Model exported to ONNX.")

# Load the ONNX model with ONNX Runtime. The execution provider is named
# explicitly: GPU-enabled onnxruntime builds require the `providers` argument,
# and pinning the CPU provider keeps the comparison with the CPU PyTorch model
# like-for-like.
ort_session = onnxruntime.InferenceSession(
    onnx_model_path,
    providers=["CPUExecutionProvider"],
)

# Helper function: convert PyTorch tensor to NumPy array
def to_numpy(tensor):
    """Return the contents of a PyTorch tensor as a NumPy array.

    Tensors that require grad are detached first, since ``.numpy()`` cannot be
    called on them directly; the tensor is then moved to the CPU and converted.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

# Run inference using ONNX Runtime. The feed dict maps the session's first
# input name to the dummy input converted to a NumPy array.
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(dummy_input)}
ort_outs = ort_session.run(None, ort_inputs)
print("ONNX Runtime output:", ort_outs[0])

# Verify that the exported graph reproduces the PyTorch result (within float
# tolerance) instead of relying on eyeballing the two printed arrays.
np.testing.assert_allclose(to_numpy(torch_output), ort_outs[0], rtol=1e-3, atol=1e-5)

# Compare inference speed
n_runs = 1000
n_warmup = 10  # untimed iterations so first-call overhead is not measured

# PyTorch inference timing. perf_counter() is the monotonic, high-resolution
# clock intended for measuring short durations.
with torch.no_grad():
    for _ in range(n_warmup):
        _ = model(dummy_input)
    start_time = time.perf_counter()
    for _ in range(n_runs):
        _ = model(dummy_input)
    torch_time = time.perf_counter() - start_time

# ONNX Runtime inference timing
for _ in range(n_warmup):
    _ = ort_session.run(None, ort_inputs)
start_time = time.perf_counter()
for _ in range(n_runs):
    _ = ort_session.run(None, ort_inputs)
onnx_time = time.perf_counter() - start_time

print("PyTorch inference time over {} runs: {:.6f} seconds".format(n_runs, torch_time))
print("ONNX Runtime inference time over {} runs: {:.6f} seconds".format(n_runs, onnx_time))

PyTorch output: tensor([[-0.0297, -0.4107, -0.1688, -0.3370,  0.0293]])


  from .autonotebook import tqdm as notebook_tqdm


Model exported to ONNX.
ONNX Runtime output: [[-0.02971356 -0.4107108  -0.16881287 -0.33697784  0.02931507]]
PyTorch inference time over 1000 runs: 0.029034 seconds
ONNX Runtime inference time over 1000 runs: 0.008659 seconds
