# Static Quantization

## 1. Import packages

In [1]:
import torch
import resnet
import copy
import torch.nn as nn
import helper
import torchvision
import torchvision.transforms as transforms
import os

In [2]:
cuda_device = torch.device("cuda:0")
cpu_device = torch.device("cpu:0")

## 2. Create a model instance

In [3]:
model_fp32 = resnet.resnet18(pretrained=False, num_classes=10)

In [4]:
state_dict = torch.load('resnet18.pt')
model_fp32.load_state_dict(state_dict)

<All keys matched successfully>

# CIFAR10 DataSet

In [5]:
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    # transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

In [6]:
num_workers = 8
    
train_set = torchvision.datasets.CIFAR10(root="data", train=True, download=True, transform=transform) 
test_set = torchvision.datasets.CIFAR10(root="data", train=False, download=True, transform=transform)
    
train_sampler = torch.utils.data.RandomSampler(train_set)
test_sampler = torch.utils.data.SequentialSampler(test_set)
    
train_loader = torch.utils.data.DataLoader(
    dataset=train_set, batch_size=32,
    sampler=train_sampler, num_workers=num_workers)

test_loader = torch.utils.data.DataLoader(
    dataset=test_set, batch_size=32,
    sampler=test_sampler, num_workers=num_workers)

Files already downloaded and verified
Files already downloaded and verified


evaluate_model `fp32_eval_loss` and `fp32_eval_accurac`

In [7]:
fp32_eval_loss, fp32_eval_accuracy = helper.evaluate_model(model=model_fp32, test_loader=test_loader, device=cuda_device, criterion=None)
print(fp32_eval_loss, fp32_eval_accuracy)

100%|██████████| 313/313 [00:01<00:00, 251.67it/s]

0.0 tensor(0.9259, device='cuda:0')





## 3. Fusion layer

In [8]:
fused_model_fp32 = copy.deepcopy(model_fp32)
fused_model_fp32.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (skip_add): FloatFunctional(
        (activation_post_process): Identity()
      )
      (relu2): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
     

In [9]:
# Fuse the activations to preceding layers, where applicable.
# This needs to be done manually depending on the model architecture.
# Common fusions include `conv + relu` and `conv + batchnorm + relu`
fused_model_fp32 = torch.quantization.fuse_modules(fused_model_fp32, [["conv1", "bn1", "relu"]], inplace=True)
for module_name, module in fused_model_fp32.named_children():
    if "layer" in module_name:
        for basic_block_name, basic_block in module.named_children():
            torch.quantization.fuse_modules(basic_block, [["conv1", "bn1", "relu"], ["conv2", "bn2"]], inplace=True)
            for sub_block_name, sub_block in basic_block.named_children():
                if sub_block_name == "downsample":
                    torch.quantization.fuse_modules(sub_block, [["0", "1"]], inplace=True)

## fused model equivalence

In [10]:
model_fp32.eval()
fused_model_fp32.eval()

assert helper.model_equivalence(model_1=model_fp32, model_2=fused_model_fp32, device=cpu_device, rtol=1e-03, atol=1e-03, num_tests=100, input_size=(1,3,224,224)), "Fused model is not equivalent to the original model!"

## 4. Static Quantization

In [11]:
class QuantizedResNet18(nn.Module):
    def __init__(self, model_fp32):
        super(QuantizedResNet18, self).__init__()
        
        self.quant = torch.quantization.QuantStub()
        self.model_fp32 = model_fp32
        self.dequant = torch.quantization.DeQuantStub()
        
    def forward(self, x):
        x = self.quant(x)
        x = self.model_fp32(x)
        x = self.dequant(x)
        return x

In [12]:
def calibrate_model(model, loader, device=torch.device("cpu:0")):

    model.to(device)
    model.eval()

    for inputs, labels in loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        _ = model(inputs)

In [13]:
quantized_model = QuantizedResNet18(model_fp32=fused_model_fp32)

quantized_model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(quantized_model, inplace=True)

# Calibration
calibrate_model(model=quantized_model, loader=test_loader, device=cuda_device)
quantized_model = quantized_model.to(cpu_device)

quantized_model_int8 = torch.quantization.convert(quantized_model, inplace=True)
quantized_model_int8.eval()
print(quantized_model_int8)

  reduce_range will be deprecated in a future release of PyTorch."


QuantizedResNet18(
  (quant): Quantize(scale=tensor([0.0408]), zero_point=tensor([60]), dtype=torch.quint8)
  (model_fp32): ResNet(
    (conv1): QuantizedConvReLU2d(3, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.005240552127361298, zero_point=0, padding=(1, 1))
    (bn1): Identity()
    (relu): Identity()
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): QuantizedConvReLU2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.0032231544610112906, zero_point=0, padding=(1, 1))
        (bn1): Identity()
        (relu): Identity()
        (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.007309970445930958, zero_point=79, padding=(1, 1))
        (bn2): Identity()
        (skip_add): QFunctional(
          scale=0.009357315488159657, zero_point=56
          (activation_post_process): Identity()
        )
        (relu2): QuantizedReLU(inplace=True)
      )
  

# Test

In [14]:
import urllib
url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg")
try: urllib.URLopener().retrieve(url, filename)
except: urllib.request.urlretrieve(url, filename)

In [15]:
import time
from PIL import Image
from torchvision import transforms
input_image = Image.open("cat.jpg")
preprocess = transforms.Compose([
    transforms.Resize(32),
    transforms.CenterCrop(32),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]),
])

input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model

print(f"Input shape : {input_batch.shape}")
print(f"Input : {input_batch[0][0][0][:14]}")

Input shape : torch.Size([1, 3, 32, 32])
Input : tensor([2.5141, 2.5141, 2.5141, 2.5141, 2.5141, 2.5141, 2.5141, 2.5141, 2.5141,
        2.5141, 2.5141, 2.5141, 2.5141, 2.5141])


In [16]:
# output = model_fp32(input_batch)
# output = fused_model_fp32(input_batch)
output = quantized_model_int8(input_batch)

In [17]:
# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes
print(output[0])
# The output has unnormalized scores. To get probabilities, you can run a softmax on it.
probabilities = torch.nn.functional.softmax(output[0], dim=0)

tensor([-0.8179, -1.0223, -0.5453,  5.7252, -0.5453,  0.3408, -0.4771, -0.6816,
        -0.9542, -0.9542])


In [18]:
categories = ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"]

# Show top categories per image
top5_prob, top5_catid = torch.topk(probabilities, 5)
for i in range(top5_prob.size(0)):
    print(categories[top5_catid[i]], top5_prob[i].item())

cat 0.9831167459487915
dog 0.004510259255766869
frog 0.0019906775560230017
deer 0.001859520678408444
bird 0.001859520678408444
