# ONNX Comparison
In this notebook, we will compare the overall performance of the finetuned IndoBERT model after being converted and optimized in ONNX format especially in terms of the model's inference time.

In [26]:
# !pip install onnxruntime onnx transformers optimum

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer

from pathlib import Path
import timeit
import onnxruntime as ort
from onnxruntime import InferenceSession
from onnxruntime.transformers.optimizer import optimize_model
from optimum.onnxruntime import ORTModelForSequenceClassification

In [4]:
i2w = {0: 'positive', 1: 'neutral', 2: 'negative'}

In [5]:
# Create a PATH to save the model
indobert_path = Path("/content/drive/MyDrive/Models/indobert")
onnx_path = Path("/content/drive/MyDrive/Models/indobert-onnx/model.onnx")
optimized_onnx_path = Path("/content/drive/MyDrive/Models/indobert-onnx/optimized.onnx")

In [6]:
# Load data test
test_dataset_path = "/content/test_preprocess.tsv"
df_test = pd.read_table(test_dataset_path, header=None)
df_test.rename(columns={0: "text", 1: "label"}, inplace=True)
df_test.head()

Unnamed: 0,text,label
0,kemarin gue datang ke tempat makan baru yang a...,negative
1,kayak nya sih gue tidak akan mau balik lagi ke...,negative
2,"kalau dipikir-pikir , sebenarnya tidak ada yan...",negative
3,ini pertama kalinya gua ke bank buat ngurusin ...,negative
4,waktu sampai dengan gue pernah disuruh ibu lat...,negative


In [7]:
# Load Tokenizer
tokenizer = BertTokenizer.from_pretrained(indobert_path)

# Instantiate models
model_ib = BertForSequenceClassification.from_pretrained(indobert_path)
model_onnx = InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
model_onnx_opt = InferenceSession(optimized_onnx_path, providers=["CPUExecutionProvider"])

In [15]:
def infer(text):
  inputs = tokenizer.encode(text)
  inputs = torch.LongTensor(inputs).view(1, -1).to(model_ib.device)

  logits = model_ib(inputs)[0]
  label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
  return i2w[label]

def infer_onnx(text):
  inputs = tokenizer([text])
  inputs_onnx = dict(
      input_ids=np.array(inputs["input_ids"]).astype("int64"),
      attention_mask=np.array(inputs["attention_mask"]).astype("int64"),
      token_type_ids=np.array(inputs["token_type_ids"]).astype("int64")
  )

  logits = model_onnx.run(None, input_feed=inputs_onnx)[0]
  label = torch.topk(torch.from_numpy(logits), k=1, dim=-1)[1].squeeze().item()
  probability = F.softmax(torch.from_numpy(logits), dim=-1).squeeze()[label].item()
  return i2w[label]

def infer_onnx_opt(text):
  inputs = tokenizer([text])
  inputs_onnx = dict(
      input_ids=np.array(inputs["input_ids"]).astype("int64"),
      attention_mask=np.array(inputs["attention_mask"]).astype("int64"),
      token_type_ids=np.array(inputs["token_type_ids"]).astype("int64")
  )

  logits = model_onnx_opt.run(None, input_feed=inputs_onnx)[0]
  label = torch.topk(torch.from_numpy(logits), k=1, dim=-1)[1].squeeze().item()
  probability = F.softmax(torch.from_numpy(logits), dim=-1).squeeze()[label].item()
  return i2w[label]

In [16]:
df_test['pred_ib'] = df_test['text'].apply(infer)
df_test['pred_onnx'] = df_test['text'].apply(infer_onnx)
df_test['pred_onnx_opt'] = df_test['text'].apply(infer_onnx_opt)

In [17]:
df_test.head()

Unnamed: 0,text,label,pred_ib,pred_onnx,pred_onnx_opt
0,kemarin gue datang ke tempat makan baru yang a...,negative,negative,negative,negative
1,kayak nya sih gue tidak akan mau balik lagi ke...,negative,negative,negative,negative
2,"kalau dipikir-pikir , sebenarnya tidak ada yan...",negative,negative,negative,negative
3,ini pertama kalinya gua ke bank buat ngurusin ...,negative,negative,negative,negative
4,waktu sampai dengan gue pernah disuruh ibu lat...,negative,negative,negative,negative


In [18]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

d = {
    "Accuracy": [accuracy_score(df_test['label'], df_test['pred_ib']),
                 accuracy_score(df_test['label'], df_test['pred_onnx']),
                 accuracy_score(df_test['label'], df_test['pred_onnx_opt'])],
    "Precision":[precision_score(df_test['label'], df_test['pred_ib'], average="macro"),
                 precision_score(df_test['label'], df_test['pred_onnx'], average="macro"),
                 precision_score(df_test['label'], df_test['pred_onnx_opt'], average="macro")],
    "Recall":   [recall_score(df_test['label'], df_test['pred_ib'], average="macro"),
                 recall_score(df_test['label'], df_test['pred_onnx'], average="macro"),
                 recall_score(df_test['label'], df_test['pred_onnx_opt'], average="macro")],
    "F1":       [f1_score(df_test['label'], df_test['pred_ib'], average="macro"),
                 f1_score(df_test['label'], df_test['pred_onnx'], average="macro"),
                 f1_score(df_test['label'], df_test['pred_onnx_opt'], average="macro")]
}

df_comp = pd.DataFrame.from_dict(d)
df_comp = df_comp.rename(index={0: 'IndoBERT', 1: 'ONNX', 2: 'ONNX Optimized'})
df_comp

Unnamed: 0,Accuracy,Precision,Recall,F1
IndoBERT,0.916,0.91558,0.875811,0.890512
ONNX,0.916,0.91558,0.875811,0.890512
ONNX Optimized,0.916,0.91558,0.875811,0.890512


The result above shows us that the model shows no decline in terms of accuracy and f1-score after being converted to ONNX.

In [19]:
def benchmark(f, name=""):
    # warmup
    for _ in range(10):
        f()
    seconds_per_iter = timeit.timeit(f, number=100) / 100
    print(
        f"{name}:",
        f"{seconds_per_iter * 1000:.3f} ms",
    )

    return seconds_per_iter * 1000

In [22]:
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'

inputs = tokenizer.encode(text)
inputs = torch.LongTensor(inputs).view(1, -1).to(model_ib.device)

inputs_onnx = tokenizer([text])
inputs_onnx = dict(
    input_ids=np.array(inputs_onnx["input_ids"]).astype("int64"),
    attention_mask=np.array(inputs_onnx["attention_mask"]).astype("int64"),
    token_type_ids=np.array(inputs_onnx["token_type_ids"]).astype("int64")
)

In [23]:
speed_ib = benchmark(lambda: model_ib(inputs), "IndoBERT")
speed_onnx = benchmark(lambda: model_onnx.run(None, input_feed=inputs_onnx), "ONNX")
speed_onnx_opt = benchmark(lambda: model_onnx_opt.run(None, input_feed=inputs_onnx), "ONNX Opt")

IndoBERT: 208.851 ms
ONNX: 81.337 ms
ONNX Opt: 62.661 ms


In [24]:
df_comp['Inference Time'] = [speed_ib, speed_onnx, speed_onnx_opt]
df_comp

Unnamed: 0,Accuracy,Precision,Recall,F1,Inference Time
IndoBERT,0.916,0.91558,0.875811,0.890512,208.851274
ONNX,0.916,0.91558,0.875811,0.890512,81.337069
ONNX Optimized,0.916,0.91558,0.875811,0.890512,62.66086


In [27]:
speed_ib / speed_onnx

2.5677256028046336

In [28]:
speed_ib / speed_onnx_opt

3.333041941971335

The model's inference time is so much faster in the ONNX format. The model is 2.5x faster in the original ONNX form and 3.3x faster after it's being optimized.

Converting a model into ONNX format not only gives you flexibility to use it in various framesworks and platforms, but it speed up the inference speed. These two methods surely must be applied during the deployment process.

In [25]:
df_comp.to_csv("comparison_onnx.csv")