In [None]:
!pip install optimum[onnxruntime]

# Runtime Comparison (Torch vs. ONNX) on a Simple Neural Net

In [None]:
import torch
import torch.nn as nn
import time
import numpy as np
import onnxruntime

# Define a simple model: a two-layer MLP
class SimpleModel(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features in each input row.
        hidden_size: Width of the hidden layer.
        output_size: Number of values produced per input row.
    """

    def __init__(self, input_size=10, hidden_size=20, output_size=5):
        super().__init__()
        # Submodules are registered in this order: fc1, relu, fc2.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Apply fc1, then ReLU, then fc2 to ``x`` and return the result."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Seed PyTorch's RNG so the randomly initialised weights and the random dummy
# input (and therefore the printed outputs) are the same on every fresh run.
torch.manual_seed(0)

# Create the model and set to evaluation mode
model = SimpleModel()
model.eval()

# Create a dummy input tensor: one row of 10 features, matching the model's
# default input_size
dummy_input = torch.randn(1, 10)

# Run inference using PyTorch (no_grad: gradients are not needed for inference)
with torch.no_grad():
    torch_output = model(dummy_input)
print("PyTorch output:", torch_output)

# Export the model to ONNX format. dummy_input is the example input passed to
# the exporter; the graph's input/output tensors are named "input"/"output".
onnx_model_path = "simple_model.onnx"
torch.onnx.export(model,
                  dummy_input,
                  onnx_model_path,
                  input_names=["input"],
                  output_names=["output"],
                  opset_version=11)
print("Model exported to ONNX.")

# Load the ONNX model with ONNX Runtime.
# The execution provider is named explicitly: ONNX Runtime builds that ship
# more than the CPU provider require it, and pinning the session to CPU keeps
# the comparison with the CPU PyTorch model like-for-like.
ort_session = onnxruntime.InferenceSession(
    onnx_model_path, providers=["CPUExecutionProvider"]
)

# Helper function: convert PyTorch tensor to NumPy array
def to_numpy(tensor):
    """Return the contents of ``tensor`` as a NumPy array on the CPU.

    A tensor that requires grad is detached before the conversion; any other
    tensor is converted as-is.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

# Run inference using ONNX Runtime. The feed dict maps the session's first
# input name to the dummy input as a NumPy array; passing None as the first
# argument to run() requests all outputs.
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(dummy_input)}
ort_outs = ort_session.run(None, ort_inputs)
print("ONNX Runtime output:", ort_outs[0])

# Check that both runtimes agree before comparing their speed; a timing
# comparison is only meaningful if the exported model computes the same result.
np.testing.assert_allclose(to_numpy(torch_output), ort_outs[0], rtol=1e-03, atol=1e-05)

# Compare inference speed
n_runs = 1000
n_warmup = 10  # untimed calls made first so one-off start-up cost is not measured

# PyTorch inference timing.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring short durations.
with torch.no_grad():
    for _ in range(n_warmup):
        _ = model(dummy_input)
    start_time = time.perf_counter()
    for _ in range(n_runs):
        _ = model(dummy_input)
    torch_time = time.perf_counter() - start_time

# ONNX Runtime inference timing
for _ in range(n_warmup):
    _ = ort_session.run(None, ort_inputs)
start_time = time.perf_counter()
for _ in range(n_runs):
    _ = ort_session.run(None, ort_inputs)
onnx_time = time.perf_counter() - start_time

print("PyTorch inference time over {} runs: {:.6f} seconds".format(n_runs, torch_time))
print("ONNX Runtime inference time over {} runs: {:.6f} seconds".format(n_runs, onnx_time))

# Runtime Comparison (Torch vs. ONNX) on RoBERTa

In [None]:
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForQuestionAnswering

model_id = "deepset/roberta-base-squad2"
task = "question-answering"

# Load the vanilla transformers checkpoint and convert it to ONNX on the fly.
# `export=True` is the current spelling of the deprecated `from_transformers=True`.
model = ORTModelForQuestionAnswering.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Both pipelines share the same tokenizer, device (-1 = CPU) and answer-handling
# option, so the latency comparison differs only in the inference backend.
qa_pipeline_kwargs = dict(tokenizer=tokenizer, handle_impossible_answer=True, device=-1)
onnx_qa_pipeline = pipeline(task, model=model, **qa_pipeline_kwargs)
torch_qa_pipeline = pipeline(task, model=model_id, **qa_pipeline_kwargs)

In [None]:
from time import perf_counter
import numpy as np

# Passage handed to the QA pipelines, and the question asked about it.
# The passage is written as adjacent string literals, which Python joins into
# a single string.
context = (
    "Hello, my name is Philipp and I live in Nuremberg, Germany. "
    "Currently I am working as a Technical Lead at Hugging Face to democratize "
    "artificial intelligence through open source and open science. "
    "In the past I designed and implemented cloud-native machine learning "
    "architectures for fin-tech and insurance companies. "
    "I found my passion for cloud concepts and machine learning 5 years ago. "
    "Since then I never stopped learning. "
    "Currently, I am focusing myself in the area NLP and how to leverage models "
    "like BERT, Roberta, T5, ViT, and GPT2 to generate business value."
)
question = "As what is Philipp working?"

def measure_latency(pipe, n_warmup=10, n_runs=100, **pipe_kwargs):
    """Time repeated calls to ``pipe`` and summarise the per-call latency.

    Args:
        pipe: Callable invoked with keyword arguments only.
        n_warmup: Number of untimed calls made before measuring.
        n_runs: Number of timed calls; must be at least 1.
        **pipe_kwargs: Keyword arguments forwarded to ``pipe`` on every call.
            When none are given, the notebook-level ``question`` and
            ``context`` are passed as ``question=...`` and ``context=...``.

    Returns:
        A string of the form ``"Average latency (ms) - <mean> +/- <std>"``,
        with the mean and standard deviation of the timed calls in
        milliseconds, formatted to two decimals.

    Raises:
        ValueError: If ``n_runs`` is less than 1 (there would be nothing to
            average).
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")
    if not pipe_kwargs:
        # Default workload: the question/context pair defined at notebook level.
        pipe_kwargs = {"question": question, "context": context}

    # warm up
    for _ in range(n_warmup):
        pipe(**pipe_kwargs)

    # Timed run
    latencies = []
    for _ in range(n_runs):
        start_time = perf_counter()
        pipe(**pipe_kwargs)
        latencies.append(perf_counter() - start_time)

    # Compute run statistics (seconds -> milliseconds)
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    # Written with a forward slash: a backslash followed by "-" is not a valid
    # string escape sequence.
    return f"Average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f}"

# Report the measured latency of each backend, ONNX first and then Torch.
for backend_label, qa_pipe in (("ONNX", onnx_qa_pipeline), ("Torch", torch_qa_pipeline)):
    print(f"{backend_label} RoBERTa {measure_latency(qa_pipe)}")