In [1]:
# !pip install transformers datasets evaluate tqdm scikit-learn

In [2]:
# !pip install --upgrade-strategy eager optimum[onnxruntime]

In [1]:
from optimum.pipelines import pipeline
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSequenceClassification
import evaluate
from datasets import load_dataset
import tqdm as notebook_tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

In [4]:
dataset = load_dataset("yahoo_answers_topics", split="test").shuffle().select(range(500))

In [5]:
task_type = "zero-shot-classification"

metric = evaluate.load("accuracy")
# evaluator = evaluate.evaluator(task_type)
save_directory = "tmp/distilled/onnx"
model_id = "valhalla/distilbart-mnli-12-9"
model = ORTModelForSequenceClassification.from_pretrained(model_id, from_transformers=True, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

The argument `from_transformers` is deprecated, and will be removed in optimum 2.0.  Use `export` instead
Downloading (…)lve/main/config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.39k/1.39k [00:00<?, ?B/s]
Framework not specified. Using pt to export to ONNX.
Downloading pytorch_model.bin: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.43G/1.43G [01:10<00:00, 20.3MB/s]
Downloading (…)okenizer_config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26.0/26.0 [00:00<?, ?B/s]
Downloading (…)olve/main/vocab.json: 100%|█████████████

verbose: False, log level: Level.ERROR



('tmp/distilled/onnx\\tokenizer_config.json',
 'tmp/distilled/onnx\\special_tokens_map.json',
 'tmp/distilled/onnx\\vocab.json',
 'tmp/distilled/onnx\\merges.txt',
 'tmp/distilled/onnx\\added_tokens.json',
 'tmp/distilled/onnx\\tokenizer.json')

In [6]:
classifier = pipeline(task_type, model=model, tokenizer=tokenizer)

In [7]:
optimizer = ORTOptimizer.from_pretrained(model)
optimizer.optimize(OptimizationConfig(optimization_level=99), save_dir=save_directory)

Optimizing model...
Configuration saved in tmp\distilled\onnx\ort_config.json
Optimized model saved at: tmp\distilled\onnx (external data format: False; saved all tensor to one file: True)


WindowsPath('tmp/distilled/onnx')

In [8]:
model_optimized = ORTModelForSequenceClassification.from_pretrained(save_directory, file_name="model_optimized.onnx")

In [9]:
classifier_optimized = pipeline(task_type, model=model_optimized, tokenizer=tokenizer)

In [10]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

In [11]:
quantizer = ORTQuantizer.from_pretrained(model_optimized)
qconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=True, reduce_range=True)
quantizer.quantize(save_dir=save_directory, quantization_config=qconfig)
model_quantized = ORTModelForSequenceClassification.from_pretrained(save_directory, file_name="model_quantized.onnx")
classifier_quantized = pipeline(task_type, model=model_quantized, tokenizer=tokenizer)

Creating dynamic quantizer: QOperator (mode: IntegerOps, schema: u8/u8, channel-wise: True)
Quantizing model...
Saving quantized model at: tmp\distilled\onnx (external data format: False)
Configuration saved in tmp\distilled\onnx\ort_config.json


In [12]:
import time
from sklearn.metrics import accuracy_score
import numpy as np

def evaluate_pipeline(pipeline):
    y_pred = []
    speed = []
    for i, row in enumerate(dataset):
        start_time = time.time()*1000
        res = pipeline(row["question_title"], candidate_labels)
        end_time = time.time()*1000
        pred = res['labels'][res['scores'].index(max(res['scores']))]
        latency = end_time - start_time
        # print('Inference ', i, '\n', row["question_title"], '\nPrediction: ', pred, '\nLabel: ', dataset.features["topic"].int2str(i), '\nLatency: ', latency)
        y_pred.append(pred)
        speed.append(latency)
    print(np.mean(speed))
    print(accuracy_score(y_true, y_pred))

In [13]:
candidate_labels = dataset.features["topic"].int2str(set(dataset["topic"]))

In [14]:
y_true = dataset.features["topic"].int2str(dataset["topic"])

In [15]:
evaluate_pipeline(classifier)

1081.77134375
0.472


In [16]:
evaluate_pipeline(classifier_optimized)

1000.9123334960938
0.472


In [17]:
evaluate_pipeline(classifier_quantized)

527.645947265625
0.274
