# DL/DLOps (2023) Lab Assignment 6

In [None]:
!pip install torch-tb-profiler
!pip install onnx
!pip install onnxruntime
!pip install onnxoptimizer


In [None]:
import torch
import torch.nn
import torch.optim
import torch.profiler
import torch.utils.data
import torchvision.datasets
import torchvision.models
import torchvision.transforms as T


In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')


In [None]:
# Show which device was selected as the cell output.
device


**Q1.**

a. Load and preprocess the CIFAR100 dataset using standard augmentation and
normalization techniques [10 Marks]

In [None]:
# Training-time preprocessing for CIFAR-100 (32x32 RGB images):
#   * RandomCrop with 4-pixel padding + RandomHorizontalFlip are the standard
#     CIFAR augmentations;
#   * Resize(224) upsamples to the input size the ImageNet-pretrained
#     backbones below were trained with;
#   * Normalize maps each channel from [0, 1] to [-1, 1].
transform = T.Compose(
    [T.RandomCrop(32, padding=4),
     T.RandomHorizontalFlip(),
     T.Resize(224),
     T.ToTensor(),
     T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
# The assignment asks for CIFAR-100 (the previous code loaded CIFAR-10).
train_set = torchvision.datasets.CIFAR100(
    root='./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=32, shuffle=True)


b. Train the following models for 50 epochs and, at the same time, profile each model using TensorBoard during the training step [5*4 = 20 Marks]

○ ResNet-34

○ DenseNet-121

○ EfficientNet-B0

○ ConvNeXt-T

In [None]:
# Display name -> torchvision constructor, in the order the assignment lists them.
model_builders = {
    'ResNet-34': torchvision.models.resnet34,
    'DenseNet-121': torchvision.models.densenet121,
    'EfficientNet-B0': torchvision.models.efficientnet_b0,
    'ConvNeXt-T': torchvision.models.convnext_tiny,
}

# Instantiate every network with pretrained weights and move it to `device`.
selected_models = [
    build(pretrained=True).to(device) for build in model_builders.values()
]

# Names are index-aligned with `selected_models`.
model_names = list(model_builders)


In [None]:
def train(data):
    """Run a single optimisation step on one batch.

    Uses the notebook-level globals ``model``, ``criterion``, ``optimizer``
    and ``device``, which must be set before this is called.

    Args:
        data: indexable batch where ``data[0]`` holds the inputs and
            ``data[1]`` the labels.
    """
    inputs = data[0].to(device=device)
    labels = data[1].to(device=device)
    loss = criterion(model(inputs), labels)
    # Clear gradients left over from the previous step before back-propagating.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [None]:
!rm - rf ./logs/


In [None]:

# Fine-tune every model in turn while recording a TensorBoard profiler trace.
# `train()` reads the globals `model`, `optimizer` and `criterion`, so those
# names are (re)bound here for each model.
epoch = 50
for i, model in enumerate(selected_models):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    # `.to(device)` is device-agnostic, unlike `.cuda(device)`.
    criterion = torch.nn.CrossEntropyLoss().to(device)
    model.train()
    with torch.profiler.profile(
            # Per cycle: skip 1 step, warm up for 1, record 3; run 2 cycles.
            schedule=torch.profiler.schedule(
                wait=1, warmup=1, active=3, repeat=2),
            # One trace sub-directory per model under ./logs/.
            on_trace_ready=torch.profiler.tensorboard_trace_handler(
                './logs/'+model_names[i]),
            record_shapes=True,
            profile_memory=True,
            with_stack=True
    ) as prof:
        for e in range(epoch):
            for step, batch_data in enumerate(train_loader):
                # Uncomment to stop as soon as the profiler schedule is done:
                # if step >= (1 + 1 + 3) * 2:
                #    break
                train(batch_data)
                # Tell the profiler that one training step has finished.
                prof.step()


In [None]:
# Load the TensorBoard notebook extension (provides the %tensorboard magic).
%load_ext tensorboard


In [None]:
# %reload_ext tensorboard


In [None]:
#!kill 1044


In [None]:
%tensorboard - -logdir = ./logs


**Q1.c.** Then apply the following model inference techniques to the models listed above
[10*2 = 20 Marks]

○ Torchscript

In [None]:
import torch
import torch.nn.functional as F
import torchvision.models as models
import numpy as np
from time import perf_counter
from copy import deepcopy


def timer(f, *args):
    """Call ``f(*args)`` once and return the elapsed wall-clock time.

    Args:
        f: callable to time.
        *args: positional arguments forwarded to ``f``.

    Returns:
        Elapsed time in milliseconds as a float. The return value of ``f``
        is discarded.
    """
    began = perf_counter()
    f(*args)
    elapsed_seconds = perf_counter() - began
    return elapsed_seconds * 1000


In [None]:
import os


def get_size(file):
    """Return the size of ``file`` on disk in mebibytes (bytes / 1024 / 1024).

    Args:
        file: path to an existing file.

    Returns:
        File size as a float number of MiB.
    """
    bytes_per_mib = 1024 * 1024
    size_in_bytes = os.stat(file).st_size
    return size_in_bytes / bytes_per_mib


In [None]:


# Compare three variants of every model -- eager PyTorch, TorchScript, and
# TorchScript passed through `optimize_for_inference` -- on top-5 predictions,
# on-disk size (MiB) and average latency (ms) for a single random input.

# torch.save / ScriptModule.save do not create missing parent directories.
os.makedirs('models', exist_ok=True)

torch_script_results = []
for i, model in enumerate(selected_models):
    mod_name = model_names[i]

    dummy_input = torch.rand(1, 3, 224, 224).to(device)

    # Switch to inference mode *before* scripting so that the eager model and
    # both scripted variants are all compared in eval mode.
    model.eval()

    # *** This is the TorchScript export
    model_scripted = torch.jit.script(model)
    frozen_mod = torch.jit.optimize_for_inference(model_scripted)

    # Pure inference: no autograd graph is needed for any of the forward passes.
    with torch.no_grad():
        # Get the unscripted model's prediction...
        unscripted_output = model(dummy_input)
        # ...and do the same for the scripted version
        scripted_output = model_scripted(dummy_input)
        # ...and for the optimized-for-inference scripted version
        scripted_frozen_mod_output = frozen_mod(dummy_input)

    unscripted_top5 = F.softmax(unscripted_output, dim=1).topk(
        5).indices.cpu().numpy().squeeze()
    scripted_top5 = F.softmax(scripted_output, dim=1).topk(
        5).indices.cpu().numpy().squeeze()
    frozen_scripted_top5 = F.softmax(scripted_frozen_mod_output, dim=1).topk(
        5).indices.cpu().numpy().squeeze()
    print()
    print(f'{mod_name} Python model top 5 results:\t  {unscripted_top5}')
    print(f'{mod_name} TorchScript model top 5 results:\t  {scripted_top5}')
    print(f'{mod_name} TorchScript Frozen model top 5 results:\t  {frozen_scripted_top5}')

    # Persist all three variants so their on-disk sizes can be compared.
    torch.save(model.state_dict(), f'models/{mod_name}_pytorch.pt')
    model_scripted.save(f'models/{mod_name}_scripted.pt')
    frozen_mod.save(f'models/{mod_name}_scripted_frzn.pt')

    model_size = get_size(f'models/{mod_name}_pytorch.pt')
    model_scripted_size = get_size(f'models/{mod_name}_scripted.pt')
    frozen_mod_size = get_size(f'models/{mod_name}_scripted_frzn.pt')

    # Average latency (ms) over 10 forward passes, again without autograd.
    with torch.no_grad():
        avgRuntimePytorch = np.mean(
            [timer(model, dummy_input) for _ in range(10)])
        avgRuntimeTorchScript = np.mean(
            [timer(model_scripted, dummy_input) for _ in range(10)])
        avgRuntimeTorchScriptFrozen = np.mean(
            [timer(frozen_mod, dummy_input) for _ in range(10)])
    print()
    print(f"{mod_name} Average runtime of Pytorch Model in {device}: \t" +
          str(avgRuntimePytorch))
    print(f"{mod_name} Average runtime of TorchScript Model in {device} : \t" +
          str(avgRuntimeTorchScript))
    print(f"{mod_name} Average runtime of TorchScript Frozen Model in {device} : \t" +
          str(avgRuntimeTorchScriptFrozen))
    print()
    print(f"{mod_name} Size of Pytorch Model in {device}: \t" + str(model_size))
    print(f"{mod_name} Size of TorchScript Model in {device} : \t" +
          str(model_scripted_size))
    print(f"{mod_name} Size of TorchScript Frozen Model in {device} : \t" +
          str(frozen_mod_size))

    # One record per model; turned into a DataFrame in a later cell.
    torch_script_results.append({
        "Model": mod_name,
        "model_size": model_size,
        "model_scripted_size": model_scripted_size,
        "frozen_mod_size": frozen_mod_size,
        "avgRuntimePytorch": avgRuntimePytorch,
        "avgRuntimeTorchScript": avgRuntimeTorchScript,
        "avgRuntimeTorchScriptFrozen": avgRuntimeTorchScriptFrozen,
        "unscripted_top5": unscripted_top5,
        "scripted_top5": scripted_top5,
        "frozen_scripted_top5": frozen_scripted_top5,
    })


In [None]:
import pandas as pd

# One row per model, one column per key of the result dictionaries.
df = pd.DataFrame(torch_script_results)


In [None]:
# Render the TorchScript comparison table as the cell output.
df


**Q1.c.** Then apply the following model inference techniques to the models listed above
[10*2 = 20 Marks]

○ ONNX 

In [None]:
!pip install torch-tb-profiler
!pip install onnx
!pip install onnxruntime
!pip install onnxoptimizer


In [None]:
from onnx import __version__, IR_VERSION
from onnx.defs import onnx_opset_version
# Report the installed onnx package version, its default opset and IR version.
onnx_env_summary = (
    f"onnx.__version__={__version__!r}, opset={onnx_opset_version()}, IR_VERSION={IR_VERSION}"
)
print(onnx_env_summary)


In [None]:
import numpy as np
from time import perf_counter

import torch.onnx
import onnx
import torch
import torchvision.models as models
import onnxruntime
import os


def to_numpy(tensor):
    """Convert a torch tensor to a NumPy array on the CPU.

    Tensors that require gradients are detached from the autograd graph first,
    because ``.numpy()`` cannot be called on them directly.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()


def time_ort_model_evaluation(model_path, num_runs=10):
    """Measure the average ONNX Runtime latency of the model at ``model_path``.

    A session is created with all graph optimizations enabled, then the model
    is run ``num_runs`` times, each time on a fresh random input of shape
    ``(1, 3, 224, 224)``. Only the ``session.run`` call is timed.

    Args:
        model_path: path to an ``.onnx`` file.
        num_runs: number of timed inferences (default 10, as before).

    Returns:
        Mean time per inference in milliseconds.

    Raises:
        ValueError: if ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    sess_options = onnxruntime.SessionOptions()
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    session = onnxruntime.InferenceSession(model_path, sess_options)
    # The input name does not change between runs, so look it up only once.
    input_name = session.get_inputs()[0].name

    time_per_inference = []
    for _ in range(num_runs):
        dummy_input = torch.randn(1, 3, 224, 224)
        ort_inputs = {input_name: to_numpy(dummy_input)}
        # compute ONNX Runtime output prediction (timed section)
        start = perf_counter()
        session.run(None, ort_inputs)
        time_per_inference.append(1000 * (perf_counter() - start))

    return np.mean(time_per_inference)


def quantize_onnx_model(onnx_model_path, quantized_model_path):
    """Apply ONNX Runtime dynamic quantization to a model file.

    Args:
        onnx_model_path: path of the ``.onnx`` model to quantize.
        quantized_model_path: path the quantized model is written to.
    """
    from onnxruntime.quantization import quantize_dynamic, QuantType

    # quantize_dynamic reads the model from disk itself, so there is no need
    # to load it with onnx beforehand.
    quantize_dynamic(onnx_model_path,
                     quantized_model_path,
                     weight_type=QuantType.QUInt8)  # QInt8

    print(f"quantized model saved to:{quantized_model_path}")


def perform_onnx_infer(model, model_name, dummy_input):
    """Export a model to ONNX, optimize and quantize it, and print a report.

    Three files are written to the current working directory:
    ``<model_name>.onnx``, ``<model_name>_opt.onnx`` (after ``onnxoptimizer``)
    and ``<model_name>_opt_quant.onnx`` (after dynamic quantization). Average
    ONNX Runtime latencies and file sizes are printed for comparison.

    Args:
        model: the torch module to export.
        model_name: display name; also used as the stem of the output files.
        dummy_input: example input tensor used for tracing and the sanity run.

    Raises:
        subprocess.CalledProcessError: if the ``onnxoptimizer`` command fails.
    """
    import subprocess
    import sys

    input_names = ["actual_input"]
    output_names = ["output"]

    print(
        f'\n\n************************\t{model_name}\t********************************\n\n')

    model_onnx = model_name + ".onnx"
    model_opt_onnx = model_name + "_opt.onnx"
    model_opt_quant_onnx = model_name + "_opt_quant.onnx"

    torch.onnx.export(model, dummy_input, model_onnx, verbose=False,
                      input_names=input_names, output_names=output_names, export_params=True,)

    ort_session = onnxruntime.InferenceSession(model_onnx)

    # compute ONNX Runtime output prediction
    ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(dummy_input)}
    ort_outs = ort_session.run(None, ort_inputs)

    # compare ONNX Runtime and PyTorch results (informational only: a strict
    # np.testing.assert_allclose was deliberately left disabled)
    with torch.no_grad():
        torch_out = model(dummy_input)
    max_abs_diff = np.max(np.abs(to_numpy(torch_out) - ort_outs[0]))
    print(f'{model_name}\tMax abs difference between PyTorch and ONNX Runtime outputs: '
          + str(max_abs_diff))

    # Run the onnxoptimizer command-line tool with the kernel's own interpreter.
    # (A leading `!` is IPython syntax and is not understood by a plain shell,
    # so the previous os.system('!python ...') call never produced the file.)
    subprocess.run(
        [sys.executable, '-m', 'onnxoptimizer', model_onnx, model_opt_onnx],
        check=True)

    # NOTE(review): the "GPU"/"TPU" labels below are historical; every timing
    # goes through time_ort_model_evaluation with a default session -- confirm
    # which execution provider is actually in use before quoting these numbers.
    print(f'{model_name}\tAverage runtime of ONNX Model in GPU: ' +
          str(time_ort_model_evaluation(model_onnx)))
    print(f'{model_name}\tAverage runtime of ONNX Optimized Model in GPU: ' +
          str(time_ort_model_evaluation(model_opt_onnx)))

    quantize_onnx_model(model_opt_onnx, model_opt_quant_onnx)

    print(f'{model_name}\tONNX full precision model size (MB):',
          os.path.getsize(model_opt_onnx)/(1024*1024))
    print(f'{model_name}\tONNX quantized model size (MB):', os.path.getsize(
        model_opt_quant_onnx)/(1024*1024))

    print(f'{model_name}\tAverage runtime of ONNX Model in TPU: ' +
          str(time_ort_model_evaluation(model_onnx)))
    print(f'{model_name}\tAverage runtime of ONNX Quantized Model in TPU: ' +
          str(time_ort_model_evaluation(model_opt_quant_onnx)))
    print()
    print('-'*80)


In [None]:
# Run the ONNX export / optimize / quantize report for every model, pairing
# each network with its display name.
for i, (model, onnx_model_name) in enumerate(zip(selected_models, model_names)):
    # Fresh random image-shaped input on the same device as the model.
    dummy_input = torch.rand(1, 3, 224, 224).to(device)
    perform_onnx_infer(model, onnx_model_name, dummy_input)
