In [1]:
# Print the version of the `python` executable found on the shell's PATH.
!python -V

Python 3.11.4


In [2]:
!pip install transformers datasets evaluate tqdm scikit-learn torch --user



In [3]:
!pip install --upgrade-strategy eager optimum[onnxruntime] --user



In [4]:
from optimum.pipelines import pipeline
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSequenceClassification
import evaluate
from datasets import load_dataset
import tqdm as notebook_tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

In [7]:
# Evaluation sample: 500 rows drawn at random from the test split.
# The shuffle is seeded so the same rows are selected on every run; without a
# seed each kernel restart evaluates a different sample and the reported
# numbers cannot be reproduced.
dataset = (
    load_dataset("yahoo_answers_topics", split="test")
    .shuffle(seed=42)
    .select(range(500))
)

In [8]:
# Export the Hugging Face checkpoint to ONNX and save the model and its
# tokenizer into `save_directory`.
task_type = "zero-shot-classification"

# NOTE(review): `metric` is not used by any cell visible in this notebook;
# accuracy is computed with scikit-learn instead. Confirm before removing.
metric = evaluate.load("accuracy")
save_directory = "tmp/onnx"
model_id = "facebook/bart-large-mnli"
# `export=True` alone requests the ONNX conversion. The former
# `from_transformers=True` argument is deprecated and only produced a warning.
model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

The argument `from_transformers` is deprecated, and will be removed in optimum 2.0.  Use `export` instead
Framework not specified. Using pt to export to ONNX.
Using framework PyTorch: 2.0.1+cpu
Overriding 1 configuration item(s)
	- use_cache -> False
  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attention_mask.size() != (bsz, 1, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
  if input_shape[-1] > 1:
  if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:


verbose: False, log level: Level.ERROR



('tmp/onnx\\tokenizer_config.json',
 'tmp/onnx\\special_tokens_map.json',
 'tmp/onnx\\vocab.json',
 'tmp/onnx\\merges.txt',
 'tmp/onnx\\added_tokens.json',
 'tmp/onnx\\tokenizer.json')

In [9]:
# Baseline pipeline: zero-shot classification with the exported ONNX model,
# before any graph optimization or quantization.
classifier = pipeline(
    task_type,
    model=model,
    tokenizer=tokenizer,
)

In [10]:
# Run the ONNX Runtime optimizer on the exported model and write the result
# into `save_directory`. The cell's value is whatever `optimize` returns.
optimization_config = OptimizationConfig(optimization_level=99)
optimizer = ORTOptimizer.from_pretrained(model)
optimizer.optimize(optimization_config, save_dir=save_directory)

Optimizing model...
Configuration saved in tmp\onnx\ort_config.json
Optimized model saved at: tmp\onnx (external data format: False; saved all tensor to one file: True)


WindowsPath('tmp/onnx')

In [11]:
# Load `model_optimized.onnx` from the save directory
# (presumably the file written by the optimization step; verify the name).
model_optimized = ORTModelForSequenceClassification.from_pretrained(
    save_directory,
    file_name="model_optimized.onnx",
)

In [12]:
# Zero-shot pipeline backed by the optimized ONNX model.
classifier_optimized = pipeline(
    task_type,
    model=model_optimized,
    tokenizer=tokenizer,
    accelerator="ort",
)

In [14]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

In [15]:
# Quantize the optimized model with a dynamic (is_static=False) AVX2
# configuration, then load the quantized ONNX file from the save directory.
# NOTE(review): in the recorded run the quantized pipeline scored 0.162
# accuracy versus 0.456 for the other two pipelines. The quantization settings
# below may need revisiting; the cause is not established here.
quantizer = ORTQuantizer.from_pretrained(model_optimized)
qconfig = AutoQuantizationConfig.avx2(
    is_static=False,
    per_channel=True,
    reduce_range=True,
)
quantizer.quantize(
    save_dir=save_directory,
    quantization_config=qconfig,
)
model_quantized = ORTModelForSequenceClassification.from_pretrained(
    save_directory,
    file_name="model_quantized.onnx",
)

Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/u8, channel-wise: True)
Quantizing model...
Saving quantized model at: tmp\onnx (external data format: False)
Configuration saved in tmp\onnx\ort_config.json


In [16]:
classifier_quantized = pipeline(task_type, model=model_quantized, tokenizer=tokenizer, accelerator='ort')

In [17]:
import time
from sklearn.metrics import accuracy_score
import numpy as np

def evaluate_pipeline(pipeline):
    """Print the mean latency and accuracy of a zero-shot pipeline.

    Runs ``pipeline`` once per row of the notebook-level ``dataset`` and
    compares the top-scoring label with the notebook-level ``y_true``.
    ``dataset``, ``candidate_labels`` and ``y_true`` are read as globals and
    must be defined before this function is called.

    Args:
        pipeline: Callable invoked as ``pipeline(text, candidate_labels)``.
            Its result must support ``result['labels']`` and
            ``result['scores']`` as parallel lists.

    Prints:
        The mean per-call latency in milliseconds, followed by the accuracy
        of the predictions against ``y_true``.
    """
    y_pred = []
    speed = []
    for row in dataset:
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() follows the wall clock and can be
        # coarse or jump if the system clock is adjusted.
        start_time = time.perf_counter()
        res = pipeline(row["question_title"], candidate_labels)
        latency = (time.perf_counter() - start_time) * 1000  # milliseconds
        # Pick the label whose score is highest (first one wins on ties).
        scores = res['scores']
        pred = res['labels'][scores.index(max(scores))]
        y_pred.append(pred)
        speed.append(latency)
    print(np.mean(speed))
    print(accuracy_score(y_true, y_pred))

In [18]:
# Candidate labels for zero-shot classification: the names of the distinct
# topic ids that occur in the evaluation sample.
topic_feature = dataset.features["topic"]
candidate_labels = topic_feature.int2str(set(dataset["topic"]))

In [19]:
# Reference labels: the topic id of every row converted to its name.
topic_ids = dataset["topic"]
y_true = dataset.features["topic"].int2str(topic_ids)

In [20]:
# Baseline: pipeline built on the exported ONNX model.
evaluate_pipeline(classifier)

1167.1261821289063
0.456


In [21]:
# Pipeline built on the optimized ONNX model.
evaluate_pipeline(classifier_optimized)

1088.7857856445312
0.456


In [22]:
# Pipeline built on the quantized ONNX model.
evaluate_pipeline(classifier_quantized)

849.9055986328125
0.162
