## Установим зависимости

In [None]:
# !pip install pip -U
# !pip install torch==2.2.* torchvision==0.17
# !pip install polygraphy==0.49.9
# !pip install tensorrt==8.6.* --extra-index-url https://pypi.nvidia.com
# !pip install onnx

## Подготовим полезные функции

In [1]:
from pathlib import Path

import torch

from torch.utils.data.dataloader import DataLoader 
from torchvision import transforms
from torchvision.datasets.imagenette import Imagenette

import os

CLASSES_MAPPING = {
    0: 0,
    1: 217,
    2: 848,
    3: 491,
    4: 497,
    5: 566,
    6: 569,
    7: 571,
    8: 574,
    9: 701,
}

root_dir = "/home/gvasserm/dev/ml_acceleration/imagenette/"
print(os.listdir(root_dir))
def imagenette_dataloader(batch_size, height, width, split="val"):    
    dataset = Imagenette(
        root=root_dir, 
        split=split, 
        download=False,
        transform=transforms.Compose([
            transforms.Resize((height, width)), 
            transforms.ToTensor(), 
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225], inplace=True),
        ])
    )

    return DataLoader(dataset, batch_size=batch_size)


def validate(model, batch_size, height, width):
    val_dataloder = imagenette_dataloader(batch_size, height, width)

    with torch.no_grad():
        acc = []
        for images, labels in val_dataloder:
            output = model(images.cuda())
            _, predicted_labels = torch.max(output, dim=1)
            predicted_labels = predicted_labels.cpu().tolist()
            
            acc += [predicted_label == CLASSES_MAPPING[label] for predicted_label, label in zip(predicted_labels, labels.tolist())]
            
        print(f"acc = {sum(acc) * 100 / len(acc):.2f}%")

from torchvision.models import mobilenet_v2
from torchvision.models import MobileNetV2


model = mobilenet_v2(weights=MobileNetV2).eval().cuda()
validate(model, 16, 224, 224)

['imagenette2']




acc = 79.21%


## Воспроизвидите функцию для замера latency из лекции (10 баллов)

In [2]:
import numpy as np
import time
def latency_benchmark(model, test_input, warmup_n=10, benchmark_n=100):
    # model - модель для замеров
    # test_input - тестовый пример
    # warmup_n - кол-во шагов для warmup
    # benchmark_n - кол-во шагов для замера латенси

    # Warm-up phase: run the model several times to stabilize performance
    for _ in range(warmup_n):
        model(test_input)
        torch.cuda.synchronize()  # Wait for CUDA to finish

    bsz = np.float64(test_input.shape[0])

    # Benchmark phase: collect execution times
    timings = []
    for _ in range(benchmark_n):
        start_time = time.time()
        model(test_input)
        torch.cuda.synchronize()  # Ensure model has finished processing
        end_time = time.time()
        elapsed_time = (end_time - start_time) * 1000  # Convert to milliseconds
        timings.append(elapsed_time/bsz)

    # Calculate mean and standard deviation of the timings
    mean_ms = np.mean(timings)
    std_ms = np.std(timings)
    
    print(f"{mean_ms:.3f}ms +- {std_ms:.3f}ms")

    assert (std_ms / mean_ms) < 0.1, "слишком большое отклонение в измерения (> 10%), проверте код, возможно стоит поднять кол-во запусков"

## Проверяем как работает функция замера latency

In [3]:
# запускаем под no_grad, чтобы минимизировать потребление памяти (исключает выделение памяти под градиенты)
with torch.no_grad():
    latency_benchmark(
        model, 
        torch.ones(1, 3, 640, 480, device="cuda"), 
        warmup_n=10, 
        benchmark_n=100,
    )

3.321ms +- 0.216ms


## Напишите функцию для записи CUDA graph (10 баллов)

Функция должна вернуть объект CUDAGraph с записанным графом, входной тензор для передачи данных и выходной тензор для копирования результатов.

In [4]:
def record_CUDA_graph(model, batch_size, height, width, warmup_n=10):
    # model - модель для записи cuda Graph
    # batch_size - размер батча входных данных
    # height - высота картинки
    # width - ширина картинки
    # warmup_n - кол-во шагов для warmup

    # Move the model to GPU and set it to evaluation mode
    model.to('cuda').eval()

    # Create a random input tensor with the specified dimensions on CUDA
    input_tensor = torch.randn(batch_size, 3, height, width, device='cuda')

    # Create a new CUDA stream for capturing the graph
    stream = torch.cuda.Stream()

    # Warm-up phase: run the model several times on the non-default stream to stabilize performance
    with torch.cuda.stream(stream):
        for _ in range(warmup_n):
            model(input_tensor)
        # Synchronize the stream to ensure all operations are completed
        torch.cuda.synchronize()

    # Instantiate a CUDA graph and start capturing on the specified stream
    with torch.cuda.stream(stream):
        graph = torch.cuda.CUDAGraph()
        graph.capture_begin()
        # Perform the computation you want to capture
        output_tensor = model(input_tensor)
        graph.capture_end()
        torch.cuda.synchronize()

    return graph, input_tensor, output_tensor

## Проверяем как работает функция записи CUDA graph

In [5]:
bsz, ch, height, width = 1, 3, 224, 224

graph, input_placeholder, output_placeholder = record_CUDA_graph(model, bsz, height, width, warmup_n=10)

test_data = torch.ones(bsz, ch, height, width, device="cuda")
# запускаем под no_grad, чтобы минимизировать потребление памяти (исключает выделение памяти под градиенты)
with torch.no_grad():
    # запускаем исходную модель
    model_output = model(test_data)
    
    # запускаем graph
    input_placeholder.copy_(test_data)
    graph.replay()
    graph_output = output_placeholder.clone()
    
    # сравниваем выходы
    assert torch.all(model_output == graph_output), "выход оригинальной модели и CUDA graph не совпадают"

    print("Success cuda graph")

Success cuda graph


## Сравниваем latency оригинальной модели и CUDA graph

In [6]:
def graph_runner(input_data):
    input_placeholder.copy_(input_data)
    graph.replay()
    return output_placeholder

bsz, ch, height, width = 32, 3, 224, 224
test_data = torch.ones(bsz, ch, height, width, device="cuda")

# запускаем под no_grad, чтобы минимизировать потребление памяти (исключает выделение памяти под градиенты)
with torch.no_grad():
    latency_benchmark(
        model, 
        test_data, 
        warmup_n=20, 
        benchmark_n=500,
    )

    bsz, ch, height, width =  1, 3, 224, 224
    test_data = torch.ones(bsz, ch, height, width, device="cuda")
    latency_benchmark(
        graph_runner, 
        test_data, 
        warmup_n=20, 
        benchmark_n=500,
    )

0.523ms +- 0.019ms
0.852ms +- 0.073ms


## Экспортируйте torchvision модель в onnx файл (10 баллов)

Для тестов нам потребуется 2 варианта onnx:
1. все входные оси фиксированны (1, 3, 224, 224). Файл назовите "my-model-ssss.onnx".
1. размер батча, высота и ширина динамические, количество входных каналов фиксированное (1, 3, height, width). Файл назовите "my-model-dsdd.onnx".

Имя входа должно быть "x", имя выхода "output".

Докуметацию по экспорту в onnx можно почитать [тут](https://pytorch.org/docs/stable/onnx_torchscript.html#torch.onnx.export).

In [7]:
model.cpu()

sample_input = torch.randn(1, 3, 224, 224, device='cpu')
input_names = ["input"]
output_names = ["output"]
output_file = "my-model-ssss.onnx"

torch.onnx.export(model,               # model being run
                sample_input,        # model input (or a tuple for multiple inputs)
                output_file,         # where to save the model (can be a file or file-like object)
                export_params=True,  # store the trained parameter weights inside the model file
                opset_version=12,    # the ONNX version to export the model to
                do_constant_folding=True,  # whether to execute constant folding for optimization
                input_names=input_names,   # the model's input names
                output_names=output_names)

output_file = "my-model-dsdd.onnx"

# Define dynamic axes
dynamic_axes = {
    'input': {0: 'batch_size', 2: 'height', 3: 'width'},  # Dynamically adjust batch size, height, and width
    'output': {0: 'batch_size'}  # Assume output has dynamic batch size; adjust depending on model architecture
}

torch.onnx.export(model,               # model being run
                sample_input,        # model input (or a tuple for multiple inputs)
                output_file,         # where to save the model (can be a file or file-like object)
                export_params=True,  # store the trained parameter weights inside the model file
                opset_version=12,    # the ONNX version to export the model to
                do_constant_folding=True,  # whether to execute constant folding for optimization
                input_names=input_names,   # the model's input names
                output_names=output_names,
                dynamic_axes=dynamic_axes)


## Скомпилируйте простейший вариант модели без динамических осей

In [8]:
from polygraphy.backend.trt import CreateConfig, Profile, Calibrator
from polygraphy.comparator import DataLoader as DL
from polygraphy.backend.trt import engine_from_network
from polygraphy.backend.trt import NetworkFromOnnxPath
from polygraphy.backend.trt import save_engine
from polygraphy.backend.trt import TrtRunner, EngineFromBytes


model_ssss = NetworkFromOnnxPath("my-model-ssss.onnx")
config = CreateConfig()

engine = engine_from_network(model_ssss, config=config)
save_engine(engine, path="my-model-ssss.engine")

[W] 'colored' module is not installed, will not use colors when logging. To enable colors, please install the 'colored' module: python3 -m pip install colored
[I] Configuring with profiles:[
        Profile 0:
            {input [min=[1, 3, 224, 224], opt=[1, 3, 224, 224], max=[1, 3, 224, 224]]}
    ]
[I] Building engine with configuration:
    Flags                  | []
    Engine Capability      | EngineCapability.DEFAULT
    Memory Pools           | [WORKSPACE: 16116.69 MiB, TACTIC_DRAM: 16116.69 MiB]
    Tactic Sources         | [CUBLAS, CUBLAS_LT, CUDNN, EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]
    Profiling Verbosity    | ProfilingVerbosity.DETAILED
    Preview Features       | [FASTER_DYNAMIC_SHAPES_0805, DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]
[I] Finished engine building in 19.461 seconds
[I] Saving engine to my-model-ssss.engine


<tensorrt_bindings.tensorrt.ICudaEngine at 0x791b3df837b0>

## Проверти что точность не просела

**ВАЖНО:**<br>
Опция ``copy_outputs_to_host=False`` позволяет пропустить копирование данных с GPU на CPU.<br>
Вместо numpy array мы получим, torch.Tensor, что бывает очень полезно и экономит время на копировании.

In [12]:
from polygraphy.backend.trt import TrtRunner
with open("my-model-ssss.engine", "rb") as f:
    engine_bytes = f.read()
        
engine = EngineFromBytes(engine_bytes)
with TrtRunner(engine) as trt_runner:
    def validation_trt_runner(input_data):
        # пропустим копирование на CPU copy_outputs_to_host=False
        output = trt_runner.infer(feed_dict={"input": input_data}, copy_outputs_to_host=False)
        return output['output']

    validate(validation_trt_runner, batch_size=1, height=224, width=224)

acc = 79.24%


## Сделайте замер latency с помощью ранее написанной функции latency_benchmark

In [13]:
with TrtRunner(engine) as trt_runner:
    def validation_trt_runner(input_data):
        # пропустим копирование на CPU copy_outputs_to_host=False
        output = trt_runner.infer(feed_dict={"input": input_data}, copy_outputs_to_host=False)
        return output['output']
    latency_benchmark(validation_trt_runner, test_input=torch.ones(1, 3, 224, 224), warmup_n=10, benchmark_n=100)

1.610ms +- 0.291ms


AssertionError: слишком большое отклонение в измерения (> 10%), проверте код, возможно стоит поднять кол-во запусков

## Теперь на основе примера выше скомпилируйте модель с динамическим batch size в диапазоне [1, 64] (5 баллов)
**ВАЖНО:**<br>
Как задать конфиг для динамических осей? Читаем доку [тут](https://docs.nvidia.com/deeplearning/tensorrt/polygraphy/docs/backend/trt/profile.html#optimization-profile) и добавляем профиль в config.

In [14]:
model_dsdd = NetworkFromOnnxPath("my-model-dsdd.onnx")
profiles=[
    Profile().add('input', min=(1, 3, 224, 224), opt=(32, 3, 224, 224), max=(64, 3, 640, 640))
]
config = CreateConfig(profiles=profiles)
engine = engine_from_network(model_dsdd, config=config)
save_engine(engine, path="my-model-dsdd.engine")

[I] Configuring with profiles:[
        Profile 0:
            {input [min=(1, 3, 224, 224), opt=(32, 3, 224, 224), max=(64, 3, 640, 640)]}
    ]
[I] Building engine with configuration:
    Flags                  | []
    Engine Capability      | EngineCapability.DEFAULT
    Memory Pools           | [WORKSPACE: 16116.69 MiB, TACTIC_DRAM: 16116.69 MiB]
    Tactic Sources         | [CUBLAS, CUBLAS_LT, CUDNN, EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]
    Profiling Verbosity    | ProfilingVerbosity.DETAILED
    Preview Features       | [FASTER_DYNAMIC_SHAPES_0805, DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]
[I] Finished engine building in 32.129 seconds
[I] Saving engine to my-model-dsdd.engine


<tensorrt_bindings.tensorrt.ICudaEngine at 0x791b3d787330>

## Проверте что точность не просела и замерте latency для batch size 1 и 64

In [15]:
with open("my-model-dsdd.engine", "rb") as f:
    engine_bytes = f.read()
    
engine = EngineFromBytes(engine_bytes)

with TrtRunner(engine) as trt_runner:
    def validation_trt_runner(input_data):
        # пропустим копирование на CPU copy_outputs_to_host=False
        output = trt_runner.infer(feed_dict={"input": input_data}, copy_outputs_to_host=False)
        return output['output']

    validate(validation_trt_runner, batch_size=1, height=224, width=224)
    validate(validation_trt_runner, batch_size=64, height=224, width=224)

acc = 79.24%
acc = 79.24%


## Скомпилируйте квантованный вариант engine c динамическим batch size [1, 64] (15 балов)

Для этого вам потребуется на основе кода ``imagenette_val_dataloader`` сделать свой калибратор. Документацию на калибратор можно найти [тут](https://docs.nvidia.com/deeplearning/tensorrt/polygraphy/docs/backend/trt/calibrator.html#polygraphy.backend.trt.calibrator.Calibrator).

In [16]:
def polygraphy_compatible_loader(dataloader):
    for images, _ in dataloader:
        yield  {"input": images.numpy()}  # Assuming the model only requires the image data, not labels.

model_int8 = NetworkFromOnnxPath("my-model-dsdd.onnx")
profiles=[
    Profile().add('input', min=(1, 3, 224, 224), opt=(32, 3, 224, 224), max=(64, 3, 640, 640))
]

data_loader = imagenette_dataloader(32, 224, 224, split="train")

calibrator = Calibrator(
    data_loader=polygraphy_compatible_loader(data_loader),
    cache='calibration.cache',
)

config = CreateConfig(
    int8=True,
    calibrator=calibrator,
    profiles=profiles
)

engine = engine_from_network(model_int8, config=config)
save_engine(engine, path="my-model-int8.engine")

[I] Configuring with profiles:[
        Profile 0:
            {input [min=(1, 3, 224, 224), opt=(32, 3, 224, 224), max=(64, 3, 640, 640)]}
    ]
[I] Using calibration profile: {input [min=(1, 3, 224, 224), opt=(32, 3, 224, 224), max=(64, 3, 640, 640)]}
[W] TensorRT does not currently support using dynamic shapes during calibration. The `OPT` shapes from the calibration profile will be used for tensors with dynamic shapes. Calibration data is expected to conform to those shapes. 
[I] Building engine with configuration:
    Flags                  | [INT8]
    Engine Capability      | EngineCapability.DEFAULT
    Memory Pools           | [WORKSPACE: 16116.69 MiB, TACTIC_DRAM: 16116.69 MiB]
    Tactic Sources         | [CUBLAS, CUBLAS_LT, CUDNN, EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]
    Profiling Verbosity    | ProfilingVerbosity.DETAILED
    Preview Features       | [FASTER_DYNAMIC_SHAPES_0805, DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]
    Calibrator             | Calibrator(<ge

<tensorrt_bindings.tensorrt.ICudaEngine at 0x791b3d736d70>

## Проверте что точность не просела и замерте latency для batch size 1 и 64

In [17]:
with open("my-model-int8.engine", "rb") as f:
    engine_bytes = f.read()
    
engine = EngineFromBytes(engine_bytes)

bsz, ch, height, width = 64, 3, 224, 224
test_data = torch.ones(bsz, ch, height, width, device="cuda")

with TrtRunner(engine) as trt_runner:
    def validation_trt_runner(input_data):
        # пропустим копирование на CPU copy_outputs_to_host=False
        output = trt_runner.infer(feed_dict={"input": input_data}, copy_outputs_to_host=False)
        return output['output']

    validate(validation_trt_runner, batch_size=64, height=224, width=224)

    latency_benchmark(
        validation_trt_runner, 
        test_data, 
        warmup_n=10, 
        benchmark_n=100,
    )

acc = 77.38%
0.065ms +- 0.003ms
