In [1]:
%matplotlib inline

import torch
import matplotlib.pyplot as plt
import numpy as np
import torchvision
from torchvision import transforms
import torch.nn.functional as F
import torch.optim as optim
from torchsummary import summary
import time
from torchvision.models import resnet50, ResNet50_Weights, ResNet152_Weights, EfficientNet_B7_Weights

  warn(f"Failed to load image Python extension: {e}")


### Measure inference time for ResNet152

Measure inference time for some pretrained models for example ResNet152

In [2]:
#Select device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [10]:
#Network model
model = torchvision.models.resnet152(weights=ResNet152_Weights.IMAGENET1K_V2).to(device)

#Change last fully conected layer to a one with 1 outputs. This layer is trainable.
model.fc = torch.nn.Sequential(
               torch.nn.Linear(2048, 128),
               torch.nn.ReLU(inplace=True),
               torch.nn.Linear(128, 1),
               torch.nn.Sigmoid()).to(device)

In [11]:
optimal_batch_size = 1
dummy_input = torch.randn(optimal_batch_size, 3,224,224, dtype=torch.float).to(device)

In [12]:
# INIT LOGGERS
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
repetitions = 300
timings=np.zeros((repetitions,1))

model.eval()

#GPU-WARM-UP
for _ in range(10):
    #Forward passto model
    _ = model(dummy_input)

# MEASURE PERFORMANCE
with torch.no_grad():
    for rep in range(repetitions):
        starter.record()
        _ = model(dummy_input)
        ender.record()
        # WAIT FOR GPU SYNC
        torch.cuda.synchronize()
        curr_time = starter.elapsed_time(ender)
        timings[rep] = curr_time
mean_syn = np.sum(timings) / repetitions
std_syn = np.std(timings)
print(f'{mean_syn} ms')

12.026528333028157 ms


### Measure inferece time for EfficientNet b7

In [13]:
#Network model
model = torchvision.models.efficientnet_b7(weights=EfficientNet_B7_Weights.IMAGENET1K_V1).to(device)

#Change last fully conected layer to a one with 1 outputs. This layer is trainable.
model.classifier = torch.nn.Sequential(
               torch.nn.Linear(2560, 128),
               torch.nn.ReLU(inplace=True),
               torch.nn.Linear(128, 1),
               torch.nn.Sigmoid()).to(device)

In [14]:
optimal_batch_size = 1
dummy_input = torch.randn(optimal_batch_size, 3, 600, 600, dtype=torch.float).to(device)

In [15]:
# INIT LOGGERS
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
repetitions = 300
timings=np.zeros((repetitions,1))

model.eval()

#GPU-WARM-UP
for _ in range(10):
    #Forward passto model
    _ = model(dummy_input)

# MEASURE PERFORMANCE
with torch.no_grad():
    for rep in range(repetitions):
        starter.record()
        _ = model(dummy_input)
        ender.record()
        # WAIT FOR GPU SYNC
        torch.cuda.synchronize()
        curr_time = starter.elapsed_time(ender)
        timings[rep] = curr_time
mean_syn = np.sum(timings) / repetitions
std_syn = np.std(timings)
print(f'{mean_syn} ms')

32.1107331720988 ms


### Model quantization

In [19]:
#Network model
modelE7 = torchvision.models.efficientnet_b7(weights=EfficientNet_B7_Weights.IMAGENET1K_V1).to(device)

#Change last fully conected layer to a one with 1 outputs. This layer is trainable.
modelE7.classifier = torch.nn.Sequential(
               torch.nn.Linear(2560, 128),
               torch.nn.ReLU(inplace=True),
               torch.nn.Linear(128, 1),
               torch.nn.Sigmoid()).to(device)

In [24]:
import copy
from torch.ao.quantization import get_default_qconfig
from torch.ao.quantization.quantize_fx import convert_fx, prepare_fx

#fp32_model = resnet50().eval()
modelE7.eval()
model = copy.deepcopy(modelE7)
# `qconfig` means quantization configuration, it specifies how should we
# observe the activation and weight of an operator
# `qconfig_dict`, specifies the `qconfig` for each operator in the model
# we can specify `qconfig` for certain types of modules
# we can specify `qconfig` for a specific submodule in the model
# we can specify `qconfig` for some functioanl calls in the model
# we can also set `qconfig` to None to skip quantization for some operators
qconfig = get_default_qconfig("fbgemm")
qconfig_dict = {"": qconfig}# `prepare_fx` inserts observers in the model based on the configuration in `qconfig_dict`
model_prepared = prepare_fx(model, qconfig_dict)# calibration runs the model with some sample data, which allows observers to record the statistics of
# the activation and weigths of the operators
calibration_data = [torch.randn(1, 3, 224, 224).to(device) for _ in range(100)]
for i in range(len(calibration_data)):
   model_prepared(calibration_data[i])# `convert_fx` converts a calibrated model to a quantized model, this includes inserting
# quantize, dequantize operators to the model and swap floating point operators with quantized operators
model_quantized = convert_fx(copy.deepcopy(model_prepared))# benchmark

  torch.tensor(weight_qparams["scale"], dtype=torch.float, device=device))
  torch.tensor(weight_qparams["zero_point"], dtype=torch.int, device=device))
  torch.tensor(weight_qparams["scale"], dtype=torch.float, device=device))
  torch.tensor(


In [26]:

#Dummy data
optimal_batch_size = 1
dummy_input = torch.randn(optimal_batch_size, 3, 600, 600, dtype=torch.float).to(device)

# INIT LOGGERS
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
repetitions = 300
timings=np.zeros((repetitions,1))

model_quantized.to(device)
model_quantized.eval()

#GPU-WARM-UP
for _ in range(10):
    #Forward passto model
    _ = model_quantized(dummy_input)

# MEASURE PERFORMANCE
with torch.no_grad():
    for rep in range(repetitions):
        starter.record()
        _ = model_quantized(dummy_input)
        ender.record()
        # WAIT FOR GPU SYNC
        torch.cuda.synchronize()
        curr_time = starter.elapsed_time(ender)
        timings[rep] = curr_time
mean_syn = np.sum(timings) / repetitions
std_syn = np.std(timings)
print(f'{mean_syn} ms')

NotImplementedError: Could not run 'quantized::conv2d.new' with arguments from the 'QuantizedCUDA' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'quantized::conv2d.new' is only available for these backends: [QuantizedCPU, BackendSelect, Python, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradLazy, AutogradXPU, AutogradMLC, AutogradHPU, Tracer, AutocastCPU, Autocast, Batched, VmapMode, Functionalize].

QuantizedCPU: registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/native/quantized/cpu/qconv.cpp:939 [kernel]
BackendSelect: fallthrough registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/core/PythonFallbackKernel.cpp:47 [backend fallback]
Named: registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/ConjugateFallback.cpp:18 [backend fallback]
Negative: registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/native/NegateFallback.cpp:18 [backend fallback]
ZeroTensor: registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/ZeroTensorFallback.cpp:86 [backend fallback]
ADInplaceOrView: fallthrough registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/core/VariableFallbackKernel.cpp:64 [backend fallback]
AutogradOther: fallthrough registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/core/VariableFallbackKernel.cpp:35 [backend fallback]
AutogradCPU: fallthrough registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/core/VariableFallbackKernel.cpp:39 [backend fallback]
AutogradCUDA: fallthrough registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/core/VariableFallbackKernel.cpp:47 [backend fallback]
AutogradXLA: fallthrough registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/core/VariableFallbackKernel.cpp:51 [backend fallback]
AutogradLazy: fallthrough registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/core/VariableFallbackKernel.cpp:55 [backend fallback]
AutogradXPU: fallthrough registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/core/VariableFallbackKernel.cpp:43 [backend fallback]
AutogradMLC: fallthrough registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/core/VariableFallbackKernel.cpp:59 [backend fallback]
AutogradHPU: fallthrough registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/core/VariableFallbackKernel.cpp:68 [backend fallback]
Tracer: registered at /opt/conda/conda-bld/pytorch_1646755903507/work/torch/csrc/autograd/TraceTypeManual.cpp:293 [backend fallback]
AutocastCPU: fallthrough registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/autocast_mode.cpp:461 [backend fallback]
Autocast: fallthrough registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/autocast_mode.cpp:305 [backend fallback]
Batched: registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/BatchingRegistrations.cpp:1059 [backend fallback]
VmapMode: fallthrough registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
Functionalize: registered at /opt/conda/conda-bld/pytorch_1646755903507/work/aten/src/ATen/FunctionalizeFallbackKernel.cpp:52 [backend fallback]


### ToDo

- Check if quantized model can be fixed in cuda
- Test tensorRT

### References

- https://deci.ai/blog/measure-inference-time-deep-neural-networks/