# OpenVINO (50 баллов)

## Установим Зависимости

In [2]:
!pip install -U openvino nncf
# # или пре-релизная версия:
# !pip install --pre -U openvino --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly git+https://github.com/openvinotoolkit/nncf.git

!pip install transformers[torch] datasets evaluate

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


## Скачиваем Предобученную Модель

Выберите классификационную модель из [Huggingface Hub](https://huggingface.co/models?pipeline_tag=text-classification&sort=trending&search=sst2), либо возьмите модель по умолчанию. Этот ноутбук сделан с рассчётом модель, натренированную на [sst2](https://huggingface.co/datasets/nyu-mll/glue/viewer/sst2) датасете. Если выберете другую модель и датасет, перепешите соответствующие блоки проверки accuracy. Единственное ограничение - модель должна быть трансформер энкодером.

In [37]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_id = "philschmid/MiniLM-L6-H384-uncased-sst2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
hf_model = AutoModelForSequenceClassification.from_pretrained(model_id)

In [4]:
hf_model.save_pretrained("base_model")
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [5]:
tokenizer

BertTokenizerFast(name_or_path='philschmid/MiniLM-L6-H384-uncased-sst2', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Конвертируем Модель в OpenVINO (5 баллов)



In [38]:
import openvino as ov

# готовим пример входа
text_input = ["I love accelerating networks"]
hf_input = tokenizer(text_input, return_tensors="pt")
ov_model = ov.convert_model(hf_model, example_input=hf_input.data)
ov.save_model(ov_model, "model.xml")

compiled_model = ov.compile_model(ov_model)
print(compiled_model(hf_input.data))



{<ConstOutput: names[logits] shape[?,2] type: f32>: array([[-3.0815837,  2.9208455]], dtype=float32)}


Провалидируем предсказания сконвертированной модели:

In [39]:
import torch
import numpy as np
from datasets import load_dataset
import evaluate


val_dataset = load_dataset("glue", "sst2", split="validation")
accuracy = evaluate.load("accuracy")

@torch.no_grad
def accuracy_evaluate(model, dataset=val_dataset, accuracy=accuracy):   
    for sample in dataset:
        tokenized = {**tokenizer(sample["sentence"], return_tensors="pt")}
        logits = model(tokenized)
        pred = np.argmax(logits, axis=1)
        accuracy.add(references=sample["label"], predictions=pred)

    return accuracy.compute()

print(f"PyTorch:  {accuracy_evaluate(lambda x: hf_model(**x).logits)}")
print(f"OpenVINO: {accuracy_evaluate(lambda x: compiled_model(x)[compiled_model.output()])}")

PyTorch:  {'accuracy': 0.9013761467889908}
OpenVINO: {'accuracy': 0.9013761467889908}


## Benchmark (5 баллов)

Добавьте несколько инференсов в бенчмарк для того, чтобы получить более точные результаты.

In [8]:
from time import perf_counter
from statistics import median

@torch.no_grad
def benchmark(model, dataset, num_warmup=10):
    tokenized_dataset = [{**tokenizer(sample["sentence"], return_tensors="pt")} for sample in dataset]

    # add warmup step
    for i in range(num_warmup):
        warmup_data = tokenized_dataset[i % len(tokenized_dataset)]
        _ = model(warmup_data)
    
    times = []
    for data in tokenized_dataset:
        start = perf_counter()
        model(data)
        end = perf_counter()
        times.append(end - start)

    return (
        f"{sum(times):.5f}s, FPS={(len(dataset) / sum(times)):.3f}, "
        f"latency: {min(times):.5f}s, {median(times):.5f}s, {max(times):.5f}s"
    )

print("Pytorch:  ", benchmark(lambda x: hf_model(**x), val_dataset))
print("Openvino: ", benchmark(lambda x: compiled_model(x), val_dataset))

Pytorch:   3.68378s, FPS=236.714, latency: 0.00256s, 0.00412s, 0.01453s
Openvino:  2.62611s, FPS=332.051, latency: 0.00178s, 0.00291s, 0.00764s


## Inference Hints (3 балла)

Скомпилируйте модель с разными инференс хинтами и сравните результаты бенчмарка. Не забудьте указать "CPU" в качестве таргета.

In [9]:
import openvino.properties as props
import openvino.properties.hint as hints

core = ov.Core()
print(core.available_devices)
#core.set_property("CPU", {hints.execution_mode: hints.ExecutionMode.PERFORMANCE})
config = {hints.performance_mode: hints.PerformanceMode.THROUGHPUT}
compiled_througput = core.compile_model(ov_model, "AUTO", config)
print("Openvino: ", benchmark(lambda x: compiled_througput(x), val_dataset))

['CPU', 'GPU']
Openvino:  4.42930s, FPS=196.871, latency: 0.00228s, 0.00441s, 0.01701s


In [10]:
config = {hints.performance_mode: hints.PerformanceMode.THROUGHPUT}
compiled_througput = core.compile_model(ov_model, "CPU", config)
print("Openvino: ", benchmark(lambda x: compiled_througput(x), val_dataset))

Openvino:  2.78592s, FPS=313.002, latency: 0.00178s, 0.00312s, 0.00608s


LATENCY HINT

In [11]:
config = {hints.performance_mode: hints.PerformanceMode.LATENCY}
compiled_latency = core.compile_model(ov_model, "CPU", config)
print("Openvino: ", benchmark(lambda x: compiled_latency(x), val_dataset))

Openvino:  2.14571s, FPS=406.393, latency: 0.00131s, 0.00242s, 0.00428s


## Async Inference

Переписать бенчмарк под асинхронный инференс. Он должен принимать на вход асинхронную очередь и датасет.

### Простой Бенчмарк (5 баллов)
Простая версия бенчмарка должна замерить FPS:

In [12]:
from typing import Dict, Any, Optional


def completion_callback(
    infer_request: ov.InferRequest, user_data: Optional[Dict[str, Any]] = None
) -> None:
    if user_data is not None:
        user_data['results'].append(infer_request.results)

infer_queue = ov.AsyncInferQueue(compiled_througput)
infer_queue.set_callback(completion_callback)

def simple_benchmark_async(queue, dataset):
    tokenized_dataset = [{**tokenizer(sample["sentence"], return_tensors="np")} for sample in dataset]
    user_data = {'results': []}

    # Start timing
    start_time = perf_counter()

    # Start async inference
    for data in tokenized_dataset: 
        inputs = {k: v for k, v in data.items()} 
        queue.start_async(inputs, user_data)

    # Wait for all requests to complete
    queue.wait_all()

    # Compute elapsed time
    elapsed = perf_counter() - start_time

    return f"{elapsed:.5f}s, FPS={(len(dataset) / elapsed):.3f}"

print("Openvino: ", simple_benchmark_async(infer_queue, val_dataset))

Openvino:  1.11363s, FPS=783.022


### Добавить Измерение latency В Асинхронный Бенчмарк (14 баллов)

Используйте `completion_callback` для подсчёта latency.

In [13]:
def completion_callback(
    infer_request: ov.InferRequest,
    user_data: Dict[str, Any],
) -> None:
    if user_data is not None:
        user_data['results'].append(infer_request.results)


infer_queue.set_callback(completion_callback)

def benchmark_async(queue, dataset):
    tokenized_dataset = [{**tokenizer(sample["sentence"], return_tensors="np")} for sample in dataset]
    times = [0 for _ in range(len(dataset))]

    user_data = {'results': []}

    start_time = perf_counter()

    # Start async inference
    for i, data in enumerate(tokenized_dataset):
        start = perf_counter()
        queue.start_async({k: v for k, v in data.items()}, user_data)
        end = perf_counter()
        times[i] = (end - start)

    # Ensure all requests are completed
    queue.wait_all()

    elapsed = perf_counter() - start_time

    return (
        f"{elapsed:.5f}s, FPS={(len(dataset) / elapsed):.3f}, "
        f"latency: {min(times):.5f}s, {median(times):.5f}s, {max(times):.5f}s"
    )

print("Openvino: ", benchmark_async(infer_queue, val_dataset))

Openvino:  1.11316s, FPS=783.357, latency: 0.00006s, 0.00080s, 0.00592s


## Benchmark App

### Измерьте Производительность Модели с Помощью CLI benchmark_app (1 балл)

Чтобы не ждать по минуте можете использовать флаг `-t 30`. Выполнение какого слоя занимает больше всего времени?

In [14]:
! benchmark_app -m model.xml -t 30 -shape [1,128] -pc -pcsort "sort"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2024.0.0-14509-34caeefd078-releases/2024/0
[ INFO ] 
[ INFO ] Device info:
[ INFO ] CPU
[ INFO ] Build ................................. 2024.0.0-14509-34caeefd078-releases/2024/0
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 8.04 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [?,?]
[ INFO ]     attention_mask (node: attention_mask) : i64 / [...] / [?,?]
[ INFO ]     token_type_ids (node: token_type_ids) : i64 / [...] / [?,?]
[ INFO ] Model outputs:
[ INFO ]     logits (node: __module.classifier/aten::linear/Add) : f32 / [...] / [?,2]
[Step 5/11] Resizing model to match image sizes and given batch
[ INFO ] Model batch size: 1

[ INFO ] Performance counts sorted for 0-th infer request
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 3.726      cpuTime (ms): 3.726     proportion: 5.29%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 3.647      cpuTime (ms): 3.647     proportion: 5.18%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 3.642      cpuTime (ms): 3.642     proportion: 5.18%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 3.639      cpuTime (ms): 3.639     proportion: 5.17%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 3.628      cpuTime (ms): 3.628     proportion: 5.16%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 3.626      cpuTime (ms): 3.626     proportion: 5.15%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 3.561      cpuTime (ms): 3.561     proportion: 5.06%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 3.528      cpuTime (ms): 3.528     proportion: 5.01%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 3.503      cpuTime (ms): 3.503     proportion: 4.98%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 3.502      cpuTime (ms): 3.502     proportion: 4.98%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 3.499      cpuTime (ms): 3.499     proportion: 4.97%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 3.476      cpuTime (ms): 3.476     proportion: 4.94%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 0.814      cpuTime (ms): 0.814     proportion: 1.16%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 0.814      cpuTime (ms): 0.814     proportion: 1.16%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 0.812      cpuTime (ms): 0.812     proportion: 1.15%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 0.811      cpuTime (ms): 0.811     proportion: 1.15%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 0.811      cpuTime (ms): 0.811     proportion: 1.15%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 0.811      cpuTime (ms): 0.811     proportion: 1.15%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 0.811      cpuTime (ms): 0.811     proportion: 1.15%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 0.809      cpuTime (ms): 0.809     proportion: 1.15%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 0.807      cpuTime (ms): 0.807     proportion: 1.15%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 0.807      cpuTime (ms): 0.807     proportion: 1.15%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 0.806      cpuTime (ms): 0.806     proportion: 1.15%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 0.805      cpuTime (ms): 0.805     proportion: 1.14%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 0.804      cpuTime (ms): 0.804     proportion: 1.14%   
__module.bert.en...  Status.EXECUTED      layerType: FullyConnected       execType: brgemm_avx2_f32      realTime (ms): 0.803      cpuTime (ms): 0.803     proportion: 1.14%   


Answer: Multi-Head Attention - FC layers

### (Optional) Попробуйте подобрать параметры, чтобы увеличить FPS относительно

## NNCF

### Дефолтная Квантизация (5 баллов)

Квантизуйте модель с дефолтными параметрами. Замерьте accuracy.

In [40]:
calibration_dataset = load_dataset("glue", "sst2", split="train[:10%]")


In [41]:
import nncf

def transform_fn(sample):
    return {**tokenizer(sample["sentence"], return_tensors="pt")}

calibration_dataset = nncf.Dataset(calibration_dataset, transform_fn)
validation_dataset = nncf.Dataset(val_dataset, transform_fn)

In [42]:
#quntized_model = nncf.quantize(ov_model, calibration_dataset=calibration_dataset)
quantized_model = nncf.quantize(ov_model, calibration_dataset)
ov.save_model(quantized_model, "qbert.xml")

Output()

Output()

In [43]:
compiled_model = ov.compile_model(quantized_model)
print(f"OpenVINO: {accuracy_evaluate(lambda x: compiled_model(x)[compiled_model.output()])}")

OpenVINO: {'accuracy': 0.8509174311926605}


Замерьте FPS квантизованной модели с помощью benchmark функции или benchmark_app:

In [44]:
! benchmark_app -m qbert.xml -t 30 -shape [1,128] -pc -pcsort "sort"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2024.0.0-14509-34caeefd078-releases/2024/0
[ INFO ] 
[ INFO ] Device info:
[ INFO ] CPU
[ INFO ] Build ................................. 2024.0.0-14509-34caeefd078-releases/2024/0
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 16.75 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [?,?]
[ INFO ]     attention_mask (node: attention_mask) : i64 / [...] / [?,?]
[ INFO ]     token_type_ids (node: token_type_ids) : i64 / [...] / [?,?]
[ INFO ] Model outputs:
[ INFO ]     logits (node: __module.classifier/aten::linear/Add) : f32 / [...] / [?,2]
[Step 5/11] Resizing model to match image sizes and given batch
[ INFO ] Model batch size: 

### Accuracy Control (10 баллов)

Квантизуйте модель так, чтобы потеря accuracy была в пределах 1%. Замерьте FPS получившейся модели.

In [51]:
def validate(model, dataset):
    accuracy = evaluate.load("accuracy") 
    for i, sample in enumerate(dataset):
        if i >=499:
            break
        tokenized = {**tokenizer(sample["sentence"], return_tensors="pt")}
        logits = model(tokenized)[model.output()]
        pred = np.argmax(logits, axis=1)
        accuracy.add(references=sample["label"], predictions=pred)

    return accuracy.compute()['accuracy']



quntized_model = nncf.quantize_with_accuracy_control(ov_model, 
                                                     calibration_dataset=calibration_dataset, 
                                                     validation_dataset=validation_dataset, 
                                                     validation_fn=validate)

ov.save_model(quntized_model, "qbert.xml")

Output()

Output()

INFO:nncf:Validation of initial model was started
INFO:nncf:Elapsed Time: 00:00:00
INFO:nncf:Elapsed Time: 00:00:05
INFO:nncf:Metric of initial model: 0.9078156312625251
INFO:nncf:Collecting values for each data item using the initial model
INFO:nncf:Elapsed Time: 00:19:56
INFO:nncf:Validation of quantized model was started
INFO:nncf:Elapsed Time: 00:00:00
INFO:nncf:Elapsed Time: 00:00:03
INFO:nncf:Metric of quantized model: 0.843687374749499
INFO:nncf:Collecting values for each data item using the quantized model
INFO:nncf:Elapsed Time: 00:19:53
INFO:nncf:Accuracy drop: 0.06412825651302612 (absolute)
INFO:nncf:Accuracy drop: 0.06412825651302612 (absolute)
INFO:nncf:Total number of quantized operations in the model: 85
INFO:nncf:Number of parallel workers to rank quantized operations: 1


KeyError: 499

In [None]:
! benchmark_app -m qbert.xml -t 30 -shape [1,128] -pc -pcsort "sort"

# Дополнительно (2 балла)

Добавьте в модель:
1. Токенизационный препроцессинг с помощью `openvino-tokenizers`
2. (Hard) Добавьте постпроцессинг в модель, чтобы она сразу отдавала результат `np.argmax(logits, axis=1)`

In [None]:
# !pip install --pre -U openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly git+https://github.com/openvinotoolkit/nncf.git