### Importing Necessary Libraries

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
import torch
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTModelForCustomTasks

from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- BERT sentiment classifier ---
# Directory of the downloaded original transformer model.
model_id_bert = "saved_models/bert-base-multilingual-uncased-sentiment"
# Directory where its ONNX export will be stored.
model_bert_path_onnx = "saved_models/bert-base-multilingual-uncased-sentiment-onnx"

# --- Sentence-transformer model ---
# Directory of the downloaded original transformer model.
model_id_sent_transformer = "saved_models/all-MiniLM-L6-v2"
# Directory where its ONNX export will be stored.
model_sent_transformer_path_onnx = "saved_models/all-MiniLM-L6-v2-onnx"

### ONNX Runtime

Before optimizing our models, we need to convert the transformer models to ONNX format. We will use `ORTModelForSequenceClassification` for the BERT model and `ORTModelForCustomTasks` for the sentence-transformer model. The tokenizers are loaded with the native `AutoTokenizer` from the 🤗 Transformers library.

Conversion of BERT Transformer Model to BERT ONNX Model

In [3]:
# Load the tokenizer and the BERT checkpoint; `from_transformers=True` asks
# optimum to convert the transformer checkpoint to ONNX while loading.
tokenizer = AutoTokenizer.from_pretrained(model_id_bert)
model = ORTModelForSequenceClassification.from_pretrained(
    model_id_bert,
    from_transformers=True,
)

# Store the converted model and its tokenizer in the same ONNX directory.
model.save_pretrained(model_bert_path_onnx)
tokenizer.save_pretrained(model_bert_path_onnx)

('saved_models/bert-base-multilingual-uncased-sentiment-onnx/tokenizer_config.json',
 'saved_models/bert-base-multilingual-uncased-sentiment-onnx/special_tokens_map.json',
 'saved_models/bert-base-multilingual-uncased-sentiment-onnx/vocab.txt',
 'saved_models/bert-base-multilingual-uncased-sentiment-onnx/added_tokens.json',
 'saved_models/bert-base-multilingual-uncased-sentiment-onnx/tokenizer.json')

Conversion of Sentence Transformer Model to Sentence Transformer ONNX Model

In [4]:
# Load the tokenizer and the sentence-transformer checkpoint; `from_transformers=True`
# asks optimum to convert the transformer checkpoint to ONNX while loading.
tokenizer = AutoTokenizer.from_pretrained(model_id_sent_transformer)
model = ORTModelForCustomTasks.from_pretrained(
    model_id_sent_transformer,
    from_transformers=True,
)

# Store the converted model and its tokenizer in the same ONNX directory.
model.save_pretrained(model_sent_transformer_path_onnx)
tokenizer.save_pretrained(model_sent_transformer_path_onnx)

('saved_models/all-MiniLM-L6-v2-onnx/tokenizer_config.json',
 'saved_models/all-MiniLM-L6-v2-onnx/special_tokens_map.json',
 'saved_models/all-MiniLM-L6-v2-onnx/vocab.txt',
 'saved_models/all-MiniLM-L6-v2-onnx/added_tokens.json',
 'saved_models/all-MiniLM-L6-v2-onnx/tokenizer.json')

Let's test these ONNX models. We will use our custom functions from the `src` folder rather than `pipeline` from the optimum library, to ensure that the ONNX models are also compatible with our existing code.

In [5]:
# Note: optimum's `pipeline` helper is intentionally not used here; the models
# are passed to the project's own functions instead.

# Original transformer model and tokenizer
tokenizer_bert = AutoTokenizer.from_pretrained(model_id_bert)
model_bert = AutoModelForSequenceClassification.from_pretrained(model_id_bert)

# ONNX model and tokenizer, read back from the export directory
tokenizer_bert_onnx = AutoTokenizer.from_pretrained(model_bert_path_onnx)
model_bert_onnx = ORTModelForSequenceClassification.from_pretrained(
    model_bert_path_onnx,
    file_name='model.onnx',
)

In [6]:
# Project-local helper; it is called below as sentiment_analyzer(sentence, model, tokenizer).
from src.bertBaseMultiClass import sentiment_analyzer

# Sample sentence used to compare the outputs of the different BERT model variants.
test_sentence = 'Whole Cake Island is the sweetest country to reside. But it will also spoils person due to comfort zones.'

In [7]:
# Score the same sentence with the original model first, then with the ONNX model.
for clf, tok in ((model_bert, tokenizer_bert), (model_bert_onnx, tokenizer_bert_onnx)):
    print(sentiment_analyzer(test_sentence, clf, tok))

[{'1 star': 0.0116}, {'2 star': 0.0524}, {'3 star': 0.2946}, {'4 star': 0.4845}, {'5 star': 0.1569}]
[{'1 star': 0.0116}, {'2 star': 0.0524}, {'3 star': 0.2946}, {'4 star': 0.4845}, {'5 star': 0.1569}]


In [8]:
# Original transformer model and tokenizer
tokenizer_sent_transformer = AutoTokenizer.from_pretrained(model_id_sent_transformer)
model_sent_transformer = AutoModel.from_pretrained(model_id_sent_transformer)

# ONNX model and tokenizer, read back from the export directory
tokenizer_sent_transformer_onnx = AutoTokenizer.from_pretrained(model_sent_transformer_path_onnx)
model_sent_transformer_onnx = ORTModelForCustomTasks.from_pretrained(
    model_sent_transformer_path_onnx,
    file_name='model.onnx',
)

In [9]:
# Project-local helper; it is called below as sentences_similarity(sentences, model, tokenizer).
from src.sentencesSmilarity import sentences_similarity

# Sample sentences used to compare the original and ONNX sentence-transformer models.
test_input = ["This is an example of sentence.", "This is example of another sentence.", "Where is my book?"]

In [10]:
# Run the same sentences through the original model first, then the ONNX model.
for encoder, tok in (
    (model_sent_transformer, tokenizer_sent_transformer),
    (model_sent_transformer_onnx, tokenizer_sent_transformer_onnx),
):
    print(sentences_similarity(test_input, encoder, tok))

[{'This is example of another sentence.': 0.929}, {'Where is my book?': 0.03}]
[{'This is example of another sentence.': 0.929}, {'Where is my book?': 0.03}]


We can see that the ONNX models work correctly: they give the same output as the original transformer models. Now, let's attempt to optimize these models further to reduce inference latency.

### ONNX Optimizer

The `ORTOptimizer` class is used for graph optimization, and `OptimizationConfig` holds its configuration.

In [11]:
# Load both exported ONNX models from their export directories.
model_bert_onnx = ORTModelForSequenceClassification.from_pretrained(
    model_bert_path_onnx,
    file_name="model.onnx",
)
model_sent_transformer_onnx = ORTModelForCustomTasks.from_pretrained(
    model_sent_transformer_path_onnx,
    file_name='model.onnx',
)

In [12]:
def save_onnx_optim_model(model, model_path, optimization_level=99):
    '''
    Apply ONNX Runtime graph optimization to the ONNX model stored in `model_path`.

    The optimizer is created from the directory `model_path`, and the optimized
    model is saved back into that same directory.

    Args:
        model: Loaded ONNX model. It is not used by this function (the optimizer
            is built from `model_path`); the parameter is kept so that existing
            calls keep working.
        model_path: Directory containing the exported ONNX model. It is also
            used as the save directory for the optimized model.
        optimization_level: Value forwarded to `OptimizationConfig`. Defaults to
            99, the level that was previously hard-coded (the original note
            listed 1, 2 or 99 as options).

    Returns:
        None. The optimized model is written to `model_path` as a side effect.
    '''
    optimizer = ORTOptimizer.from_pretrained(model_path)
    optim_config = OptimizationConfig(optimization_level=optimization_level)
    optimizer.optimize(optim_config, save_dir=model_path)

In [13]:
# Optimize the exported BERT model; the optimized model is saved into model_bert_path_onnx.
save_onnx_optim_model(model_bert_onnx, model_bert_path_onnx)

2023-01-04 08:33:42.367572830 [W:onnxruntime:, inference_session.cc:1458 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.


In [14]:
# Optimizing the sentence-transformer export is expected to fail with a ValueError
# (the model type of the exported directory is not recognized). Catch and show the
# error so the demonstration does not stop a "Restart & Run All".
try:
    save_onnx_optim_model(model_sent_transformer_onnx, model_sent_transformer_path_onnx)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Unrecognized model in saved_models/all-MiniLM-L6-v2-onnx. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, blenderbot, blenderbot-small, bloom, camembert, canine, clip, codegen, convbert, convnext, ctrl, cvt, data2vec-audio, data2vec-text, data2vec-vision, deberta, deberta-v2, decision_transformer, deit, detr, distilbert, donut-swin, dpr, dpt, electra, encoder-decoder, ernie, flaubert, flava, fnet, fsmt, funnel, glpn, gpt2, gpt_neo, gpt_neox, gptj, groupvit, hubert, ibert, imagegpt, layoutlm, layoutlmv2, layoutlmv3, led, levit, longformer, longt5, luke, lxmert, m2m_100, marian, maskformer, mbart, mctct, megatron-bert, mobilebert, mobilevit, mpnet, mt5, mvp, nezha, nystromformer, openai-gpt, opt, owlvit, pegasus, pegasus_x, perceiver, plbart, poolformer, prophetnet, qdqbert, rag, realm, reformer, regnet, rembert, resnet, retribert, roberta, roformer, segformer, sew, sew-d, speech-encoder-decoder, speech_to_text, speech_to_text_2, splinter, squeezebert, swin, swinv2, t5, tapas, trajectory_transformer, transfo-xl, trocr, unispeech, unispeech-sat, van, videomae, vilt, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_mae, wav2vec2, wav2vec2-conformer, wavlm, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, yolos, yoso, onnx_model

We can see that the sentence-transformer model is not yet supported for optimization: the model type of the exported directory is not recognized. So, we will first compare the scores of the ONNX BERT model before and after optimization, and then continue on to quantize the BERT model.

In [15]:
# Load the graph-optimized model file and the tokenizer from the ONNX directory.
tokenizer_bert_onnx = AutoTokenizer.from_pretrained(model_bert_path_onnx)
model_bert_onnx_opt = ORTModelForSequenceClassification.from_pretrained(
    model_bert_path_onnx,
    file_name="model_optimized.onnx",
)

In [16]:
# Score the sample sentence with the optimized ONNX model.
test_sentence = 'Whole Cake Island is the sweetest country to reside. But it will also spoils person due to comfort zones.'
optimized_scores = sentiment_analyzer(test_sentence, model_bert_onnx_opt, tokenizer_bert_onnx)
print(optimized_scores)

[{'1 star': 0.0116}, {'2 star': 0.0524}, {'3 star': 0.2946}, {'4 star': 0.4845}, {'5 star': 0.1569}]


We can see that the scores are the same before and after optimization. Let's compare the sizes of the original ONNX model and the optimized ONNX model.

In [17]:
import os
from pathlib import Path

onnx_path = Path(model_bert_path_onnx)
bytes_per_mb = 1024*1024

# File sizes in MB: original export first, then the optimized model.
size = (onnx_path / "model.onnx").stat().st_size/bytes_per_mb
print(f"Original Onnx Model file size: {size:.2f} MB")

size = (onnx_path / "model_optimized.onnx").stat().st_size/bytes_per_mb
print(f"Optimized Onnx Model file size: {size:.2f} MB")

Original Onnx Model file size: 638.68 MB
Optimized Onnx Model file size: 638.46 MB


There is not much difference in model size after optimization.

Quantization can further speed up inference. Let's see whether the size of the model also decreases.

### ONNX Quantization

In [18]:
def save_onnx_quantize_model(model, model_path, quantization_config=None):
    '''
    Quantize an ONNX model and save the quantized model into `model_path`.

    Args:
        model: Loaded ONNX model that the quantizer is created from.
        model_path: Directory in which the quantized model is saved.
        quantization_config: Optional quantization configuration passed to the
            quantizer. When omitted, the previously hard-coded configuration is
            used: `AutoQuantizationConfig.avx512_vnni(is_static=False,
            per_channel=True)`.

    Returns:
        None. The quantized model is written to `model_path` as a side effect.
    '''
    quantizer = ORTQuantizer.from_pretrained(model)
    if quantization_config is None:
        # Default kept identical to the original behavior of this function.
        quantization_config = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=True)
    quantizer.quantize(save_dir=model_path, quantization_config=quantization_config)

In [19]:
# Quantize the ONNX BERT model held in model_bert_onnx; the result is saved into model_bert_path_onnx.
save_onnx_quantize_model(model_bert_onnx, model_bert_path_onnx)

Note that we are quantizing the original ONNX model, not the optimized one.

In [20]:
# Load the quantized model file from the ONNX directory.
model_quantized = ORTModelForSequenceClassification.from_pretrained(model_bert_path_onnx, file_name="model_quantized.onnx")

In [21]:
# Score the sample sentence with the quantized ONNX model.
test_sentence = 'Whole Cake Island is the sweetest country to reside. But it will also spoils person due to comfort zones.'
quantized_scores = sentiment_analyzer(test_sentence, model_quantized, tokenizer_bert_onnx)
print(quantized_scores)

[{'1 star': 0.249}, {'2 star': 0.1131}, {'3 star': 0.166}, {'4 star': 0.2143}, {'5 star': 0.2576}]


In [22]:
import os
from pathlib import Path

onnx_path = Path(model_bert_path_onnx)
bytes_per_mb = 1024*1024

# File sizes in MB: original export, optimized model, then quantized model.
size = (onnx_path / "model.onnx").stat().st_size/bytes_per_mb
print(f"Original Onnx Model file size: {size:.2f} MB")

size = (onnx_path / "model_optimized.onnx").stat().st_size/bytes_per_mb
print(f"Optimized Onnx Model file size: {size:.2f} MB")

size = (onnx_path / "model_quantized.onnx").stat().st_size/bytes_per_mb
print(f"Quantized Onnx Model file size: {size:.2f} MB")

Original Onnx Model file size: 638.68 MB
Optimized Onnx Model file size: 638.46 MB
Quantized Onnx Model file size: 394.59 MB


- There is a significant reduction in size after quantization.
- But the results are affected: the quantized model's scores differ substantially from those of the original model. Therefore, we cannot use it in production.
- We can use the optimized model as our final model for inference in the production environment.

The above results may vary from model to model.