<a href="https://colab.research.google.com/github/devansh20/intern/blob/main/OnnxOptimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install optimum[onnxruntime]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optimum[onnxruntime]
  Downloading optimum-1.2.3.tar.gz (75 kB)
[K     |████████████████████████████████| 75 kB 4.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting transformers[sentencepiece]>=4.15.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 39.2 MB/s 
Collecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.7 MB/s 
Collecting coloredlogs
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[K     |████████████████████████████████| 46 kB 3.5 MB/s 
[?25hCollecting protobuf==3.20.1
  Downloading protobuf-3.20.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
[K

In [2]:
from pathlib import Path
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForQuestionAnswering

# Pipeline task, Hub checkpoint and local output directory for the export.
task = "question-answering"
model_id = "deepset/roberta-base-squad2"
onnx_path = Path("onnx")

# Load the plain transformers checkpoint and convert it to ONNX while loading.
model = ORTModelForQuestionAnswering.from_pretrained(
    model_id,
    from_transformers=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Write the ONNX checkpoint and the tokenizer files into the same directory.
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

# Smoke-test the ONNX model through a transformers pipeline.
# handle_impossible_answer=True is passed because this is a SQuAD v2 checkpoint.
optimum_qa = pipeline(
    task,
    model=model,
    tokenizer=tokenizer,
    handle_impossible_answer=True,
)
prediction = optimum_qa(
    question="What's my name?",
    context="My name is Philipp and I live in Nuremberg.",
)

print(prediction)

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/473M [00:00<?, ?B/s]

{'score': 0.9041661620140076, 'start': 11, 'end': 18, 'answer': 'Philipp'}


In [9]:
# Show the class of the object returned by the ORT loader above.
type(model)

optimum.onnxruntime.modeling_ort.ORTModelForQuestionAnswering

In [6]:
# Load the same checkpoint (model_id) with the plain transformers auto class,
# so its type can be compared with the ONNX Runtime wrapper held in `model`.
from transformers import AutoModelForQuestionAnswering
modelq = AutoModelForQuestionAnswering.from_pretrained(model_id)


In [10]:
# Show the class of the plain transformers model loaded into `modelq`.
type(modelq)

transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering

In [28]:
# Directory that receives the exported ONNX files (same value as in the first cell).
onnx_path = Path("onnx")

In [15]:
# Save the current `model` into onnx_path under an explicit ONNX file name.
# NOTE(review): when the notebook is run top to bottom, `model` is still the
# RoBERTa question-answering model at this point, while the file name suggests
# the BERT NER model loaded in a later cell. Confirm which model this was meant
# to save, or drop this cell.
model.save_pretrained(onnx_path,file_name="bertbasener.onnx")

In [47]:
from pathlib import Path
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForTokenClassification

# Pipeline task, Hub checkpoint and local output directory for the export.
# The directory is the same one the question-answering cells use, so files
# saved here replace same-named files written there.
task = "ner"
model_id = "dslim/bert-base-NER"
onnx_path = Path("onnx")

# Load the plain transformers checkpoint and convert it to ONNX while loading.
model = ORTModelForTokenClassification.from_pretrained(
    model_id,
    from_transformers=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Write the ONNX checkpoint and the tokenizer files into the same directory.
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

# Smoke-test the ONNX model through a transformers token-classification pipeline.
optimum_qa = pipeline(
    task,
    model=model,
    tokenizer=tokenizer,
)
prediction = optimum_qa("My name is Phillip and I live in Nuremberg")

print(prediction)

Downloading:   0%|          | 0.00/829 [00:00<?, ?B/s]

[{'entity': 'B-PER', 'score': 0.99736637, 'index': 4, 'word': 'Phillip', 'start': 11, 'end': 18}, {'entity': 'B-LOC', 'score': 0.9989875, 'index': 9, 'word': 'Nuremberg', 'start': 33, 'end': 42}]


In [48]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

In [49]:
# Create the quantizer for the checkpoint named by model_id (expected to still
# be the NER checkpoint set in the export cell), then build the quantization
# settings: AVX512-VNNI preset, non-static (is_static=False), per-channel.
quantizer = ORTQuantizer.from_pretrained(
    model_id,
    feature="token-classification",
)
qconfig = AutoQuantizationConfig.avx512_vnni(
    is_static=False,
    per_channel=True,
)


In [50]:
# Quantize the exported graph with the settings in qconfig. The call returns
# the path of the quantized file, which is displayed as the cell output.
quantizer.export(
    quantization_config=qconfig,
    onnx_model_path=onnx_path / "model.onnx",
    onnx_quantized_model_output_path=onnx_path / "model-quantized.onnx",
)

PosixPath('onnx/model-quantized.onnx')

In [51]:
import os

# Report the on-disk size of both ONNX files in MB (bytes / 2**20):
# the exported graph first, then the quantized one.
size = os.stat(onnx_path / "model.onnx").st_size / 2**20
print(f"Vanilla Onnx Model file size: {size:.2f} MB")

size = os.stat(onnx_path / "model-quantized.onnx").st_size / 2**20
print(f"Quantized Onnx Model file size: {size:.2f} MB")

Vanilla Onnx Model file size: 411.05 MB
Quantized Onnx Model file size: 168.52 MB


In [52]:
# Load the quantized graph written by the export cell, selecting it inside
# onnx_path by its file name.
quantized_model = ORTModelForTokenClassification.from_pretrained(onnx_path, file_name="model-quantized.onnx")

In [53]:
# Run the quantized NER model on the same sentence as the unquantized model.
# NOTE(review): the recorded output of this cell tags every token as I-MISC
# with scores below 0.2, whereas the unquantized model finds B-PER / B-LOC with
# scores above 0.99. The quantized model looks broken for this checkpoint.
# Possibly an int8 saturation issue on a CPU without VNNI support; TODO confirm
# before relying on this quantization config.
optimum_qa = pipeline(
    task,
    model=quantized_model,
    tokenizer=tokenizer,
)
prediction = optimum_qa("My name is Phillip and I live in Nuremberg")

print(prediction)

[{'entity': 'I-MISC', 'score': 0.17141421, 'index': 1, 'word': 'My', 'start': 0, 'end': 2}, {'entity': 'I-MISC', 'score': 0.16337714, 'index': 2, 'word': 'name', 'start': 3, 'end': 7}, {'entity': 'I-MISC', 'score': 0.17809716, 'index': 3, 'word': 'is', 'start': 8, 'end': 10}, {'entity': 'I-MISC', 'score': 0.16824414, 'index': 4, 'word': 'Phillip', 'start': 11, 'end': 18}, {'entity': 'I-MISC', 'score': 0.19437209, 'index': 5, 'word': 'and', 'start': 19, 'end': 22}, {'entity': 'I-MISC', 'score': 0.17633548, 'index': 6, 'word': 'I', 'start': 23, 'end': 24}, {'entity': 'I-MISC', 'score': 0.17522964, 'index': 7, 'word': 'live', 'start': 25, 'end': 29}, {'entity': 'I-MISC', 'score': 0.19189443, 'index': 8, 'word': 'in', 'start': 30, 'end': 32}, {'entity': 'I-MISC', 'score': 0.1728881, 'index': 9, 'word': 'Nuremberg', 'start': 33, 'end': 42}]


In [54]:
# Re-run the unquantized ONNX model on the same sentence, as a baseline to
# compare against the quantized model's prediction.
optimum_qa = pipeline(
    task,
    model=model,
    tokenizer=tokenizer,
)
prediction = optimum_qa("My name is Phillip and I live in Nuremberg")

print(prediction)

[{'entity': 'B-PER', 'score': 0.99736637, 'index': 4, 'word': 'Phillip', 'start': 11, 'end': 18}, {'entity': 'B-LOC', 'score': 0.9989875, 'index': 9, 'word': 'Nuremberg', 'start': 33, 'end': 42}]


In [42]:
from pathlib import Path
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForQuestionAnswering

# Pipeline task, Hub checkpoint and local output directory for the export.
# This repeats the first cell's setup, resetting model_id / task / model /
# tokenizer from the NER values back to the question-answering ones.
task = "question-answering"
model_id = "deepset/roberta-base-squad2"
onnx_path = Path("onnx")

# Load the plain transformers checkpoint and convert it to ONNX while loading.
model = ORTModelForQuestionAnswering.from_pretrained(
    model_id,
    from_transformers=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Write the ONNX checkpoint and the tokenizer files into the same directory.
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

# Smoke-test the ONNX model through a transformers pipeline.
# handle_impossible_answer=True is passed because this is a SQuAD v2 checkpoint.
optimum_qa = pipeline(
    task,
    model=model,
    tokenizer=tokenizer,
    handle_impossible_answer=True,
)
prediction = optimum_qa(
    question="What's my name?",
    context="My name is Philipp and I live in Nuremberg.",
)

print(prediction)

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

{'score': 0.9041661620140076, 'start': 11, 'end': 18, 'answer': 'Philipp'}


In [43]:
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# Build the quantization settings (AVX512-VNNI preset, non-static, per-channel)
# and the quantizer for the checkpoint / task selected above.
qconfig = AutoQuantizationConfig.avx512_vnni(
    is_static=False,
    per_channel=True,
)
quantizer = ORTQuantizer.from_pretrained(
    model_id,
    feature=task,
)

# The configuration is applied to the model by quantizer.export in the next cell.


PosixPath('onnx/model-quantized.onnx')

In [44]:
# Quantize the exported graph with the settings in qconfig. The call returns
# the path of the quantized file, which is displayed as the cell output.
quantizer.export(
    quantization_config=qconfig,
    onnx_model_path=onnx_path / "model.onnx",
    onnx_quantized_model_output_path=onnx_path / "model-quantized.onnx",
)

PosixPath('onnx/model-quantized.onnx')

In [45]:
import os

# Report the on-disk size of both ONNX files in MB (bytes / 2**20):
# the exported graph first, then the quantized one.
size = os.stat(onnx_path / "model.onnx").st_size / 2**20
print(f"Vanilla Onnx Model file size: {size:.2f} MB")

size = os.stat(onnx_path / "model-quantized.onnx").st_size / 2**20
print(f"Quantized Onnx Model file size: {size:.2f} MB")

Vanilla Onnx Model file size: 473.34 MB
Quantized Onnx Model file size: 230.83 MB


In [46]:
# Load the quantized graph from onnx_path, selecting it by file name.
quantized_model = ORTModelForQuestionAnswering.from_pretrained(
    onnx_path,
    file_name="model-quantized.onnx",
)

# Ask the quantized model the same question as the unquantized one, so the
# two predictions can be compared.
quantized_optimum_qa = pipeline(
    task,
    model=quantized_model,
    tokenizer=tokenizer,
    handle_impossible_answer=True,
)
prediction = quantized_optimum_qa(
    question="What's my name?",
    context="My name is Philipp and I live in Nuremberg.",
)
print(prediction)

{'score': 0.9206579923629761, 'start': 11, 'end': 18, 'answer': 'Philipp'}
