# Static Quantization

## 1. Import packages

In [1]:
import torch
import resnet
import copy
import torch.nn as nn
import helper

## 2. Create a model instance

In [2]:
model_fp32 = resnet.resnet50(pretrained=True)
fused_model_fp32 = copy.deepcopy(model_fp32)

fused_model_fp32.eval()
print(fused_model_fp32)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

## 3. Fusion layer

In [3]:
# Fuse the activations to preceding layers, where applicable.
# This needs to be done manually depending on the model architecture.
# Common fusions include `conv + relu` and `conv + batchnorm + relu`
fused_model_fp32 = torch.quantization.fuse_modules(fused_model_fp32, [["conv1", "bn1", "relu"]], inplace=True)
for module_name, module in fused_model_fp32.named_children():
    if "layer" in module_name:
        for basic_block_name, basic_block in module.named_children():
            torch.quantization.fuse_modules(basic_block, [["conv1", "bn1", "relu"], ["conv2", "bn2", "relu2"]], inplace=True)
            for sub_block_name, sub_block in basic_block.named_children():
                if sub_block_name == "downsample":
                    torch.quantization.fuse_modules(sub_block, [["0", "1"]], inplace=True)

## 4. Static Quantization

In [4]:
class QuantizedResNet18(nn.Module):
    def __init__(self, model_fp32):
        super(QuantizedResNet18, self).__init__()
        
        self.quant = torch.quantization.QuantStub()
        self.model_fp32 = model_fp32
        self.dequant = torch.quantization.DeQuantStub()
        
    def forward(self, x):
        x = self.quant(x)
        x = self.model_fp32(x)
        x = self.dequant(x)
        return x

In [5]:
quantized_model = QuantizedResNet18(model_fp32=fused_model_fp32)

quantized_model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(quantized_model, inplace=True)

quantized_model = torch.quantization.convert(quantized_model, inplace=True)
quantized_model.eval()
print(quantized_model)



QuantizedResNet18(
  (quant): Quantize(scale=tensor([1.]), zero_point=tensor([0]), dtype=torch.quint8)
  (model_fp32): ResNet(
    (conv1): QuantizedConvReLU2d(3, 64, kernel_size=(7, 7), stride=(2, 2), scale=1.0, zero_point=0, padding=(3, 3))
    (bn1): Identity()
    (relu): Identity()
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): QuantizedConvReLU2d(64, 64, kernel_size=(1, 1), stride=(1, 1), scale=1.0, zero_point=0)
        (bn1): Identity()
        (conv2): QuantizedConvReLU2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=1.0, zero_point=0, padding=(1, 1))
        (bn2): Identity()
        (conv3): QuantizedConv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), scale=1.0, zero_point=0, bias=False)
        (bn3): QuantizedBatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): Identity()
        (downsample): Sequential(
          (0): 

## 5. Test

In [6]:
cpu_device = torch.device("cpu:0")
int8_cpu_inference_latency = helper.measure_inference_latency(model=quantized_model, device=cpu_device, input_size=(1,3,224,224), num_samples=100)
print(f"int8_cpu_inference_latency : {int8_cpu_inference_latency}")

fp32_cpu_inference_latency = helper.measure_inference_latency(model=model_fp32, device=cpu_device, input_size=(1,3,224,224), num_samples=100)
print(f"fp32_cpu_inference_latency : {fp32_cpu_inference_latency}")

int8_cpu_inference_latency : 0.015300188064575195
fp32_cpu_inference_latency : 0.05555289506912232


# Test

In [7]:
import urllib
url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg")
try: urllib.URLopener().retrieve(url, filename)
except: urllib.request.urlretrieve(url, filename)

In [8]:
import time
from PIL import Image
from torchvision import transforms
input_image = Image.open("cat.jpg")
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model

print(f"Input shape : {input_batch.shape}")
print(f"Input : {input_batch[0][0][0][:14]}")

Input shape : torch.Size([1, 3, 224, 224])
Input : tensor([2.2489, 2.2489, 2.2489, 2.2489, 2.2489, 2.2489, 2.2489, 2.2489, 2.2489,
        2.2489, 2.2489, 2.2489, 2.2489, 2.2489])


In [10]:
# output = model_fp32(input_batch)
output = quantized_model(input_batch)

In [11]:
# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes
print(output[0][:7])
# The output has unnormalized scores. To get probabilities, you can run a softmax on it.
probabilities = torch.nn.functional.softmax(output[0], dim=0)

tensor([0., 8., 0., 0., 2., 0., 0.])


In [12]:
# Download ImageNet labels
!wget -N https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt

--2022-03-12 21:07:25--  https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt
raw.githubusercontent.com (raw.githubusercontent.com)을(를) 해석하는 중... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
접속 raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... 접속됨.
HTTP 요청을 전송했습니다. 응답을 기다리는 중입니다... 200 OK
길이: 10472 (10K) [text/plain]
다음 위치에 저장: `imagenet_classes.txt'


Last-modified 헤더가 누락되었습니다 -- time-stamps를 끕니다.
2022-03-12 21:07:25 (36.4 MB/s) - `imagenet_classes.txt' 저장됨 [10472/10472]



In [13]:
# Read the categories
with open("imagenet_classes.txt", "r") as f:
    categories = [s.strip() for s in f.readlines()]
# Show top categories per image
top5_prob, top5_catid = torch.topk(probabilities, 5)
for i in range(top5_prob.size(0)):
    print(categories[top5_catid[i]], top5_prob[i].item())

torch 0.9253488183021545
spotlight 0.0460704043507576
whistle 0.016948355361819267
candle 0.006234951317310333
ping-pong ball 0.002293710596859455


In [14]:
torch.save(quantized_model, 'quantized_model.pth')
torch.save(fused_model_fp32, 'fused_model_fp32.pth')