# Homework 3

## Student : Jad El Karchi - AI

# Inroduction

Useful links:
- Intorduction to Quantization on PyTorch https://pytorch.org/blog/introduction-to-quantization-on-pytorch/
- PyTorch modules that provide quantization classes and functions https://pytorch.org/docs/stable/quantization.html#modules-that-provide-quantization-functions-and-classes

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import defaultdict
import copy
import os

# Task 1 [2 points]

For X = DD + MM + YY + YY, where DD/MM/YYYY is your date of birth  

1. Represent the number X in the following formats:

  a) int8,  big-endian

  b) int8, little-endian

  b) int16, big-endian

  c) int16, little-endian

  d) float32, big-endian

  e) float32, little-endian

2. Write representations in the same formats a)-e) for -X


Use the following style when writing/printing:

- For all cases: 8 bits - space - 8 bits - space - ...

- Additionally, for float big-endian format print the result the following ways: 

  - sign bit - space - exponent bits - space - fraction bits

  - sign multiplier - space - exponent multiplier - space - faction multiplier (all multipliers in float format, multiply them to check whether the result is close to the initial number)  



In [None]:
import numpy as np

DD, MM, YY_1, YY_2 = 18, 12, 20, 1

X = DD + MM + YY_1 + YY_2

neg_X = -X


def represent_in_formats(value, data_type, endian):
    try :
        if data_type == 'int8':
            packed_value = value.to_bytes(1, byteorder=endian, signed=True)
        elif data_type == 'int16':
            packed_value = value.to_bytes(2, byteorder=endian, signed=True)
        elif data_type == 'float32':
            packed_value = value.to_bytes(4, byteorder=endian, signed=True)
        else:
            return "Unsupported data type"
        
        binary_representation = ' '.join(format(byte, '08b') for byte in packed_value)
        return binary_representation
    except :
        return "value unsupported (too big)"

    
formats = [('int8', 'big'), ('int8', 'little'), ('int16', 'big'), ('int16', 'little'), ('float32', 'big'), ('float32', 'little')]
for data_type, endian in formats:
    print(f"{data_type}, {endian}-endian:")
    representation_X = represent_in_formats(X, data_type, endian)
    representation_neg_X = represent_in_formats(neg_X, data_type, endian)

    print(f"  X: {representation_X}")
    print(f" -X: {representation_neg_X}\n")

# Task 2 [4 point]

Given tensor X and using PyTorch tools


1. Implement per-tensor affine quantization :

  a) int8 symmetric 

  b) uint8 symmetric 

  c) int8 assymmetric

  You'll need to do that for several given input tensors.
  
  - What can you say by comparing approximation errors of a)-c) representations? 

  - Explain why some quantization schemes suit better for some inputs.

2. Implement per-tensor and per-channel (along axis = 0) affine quantization using int8 symmetric quantization.

  You'll need to do that for several given input tensors.

  - What can you say  by comparing approximation errors for per-tensor and per-channel quantization  for different inputs? 

  - Explain why some quantization schemes suit better for some inputs.


Useful links:

- Quantized Tensors in PyTorch: https://pytorch.org/docs/stable/quantization.html#quantized-tensors


## Demostrative Example

Pay attention to types of tensors when performing quantization / dequantization using PyTorch tools


In [3]:
# Generate a float tensor
t = torch.arange(512).reshape(32, 16) + torch.randn((32, 16))

print(f'After generation: \n\t type: {t.dtype}, \n\n\t tesor: {t[0, :]}')

# Quantize the tensor
qt = torch.quantize_per_tensor(t, scale=1.5, zero_point=0, dtype=torch.qint8)

print(f'\n\nAfter quantization: \n\t type: {qt.dtype},'+
      f'\n\n\t quantized tensor in float representation: {qt[0, :]},'+
      f' \n\n\t quantized tensor in int representation: {qt.int_repr()[0, :]}')

# Dequantize the tensor
dqt = qt.dequantize()

print(f'\n\nAfter dequantization: \n\t type: {dqt.dtype}, \n\n\t dequantized tesor: {dqt[0, :]}')


After generation: 
	 type: torch.float32, 

	 tesor: tensor([ 1.1300,  0.6253,  1.2075,  2.3684,  4.7084,  5.0779,  7.0216,  7.8060,
         9.1087,  9.4054, 10.6627, 11.2264, 12.2299, 14.5136, 14.7325, 13.5463])


After quantization: 
	 type: torch.qint8,

	 quantized tensor in float representation: tensor([ 1.5000,  0.0000,  1.5000,  3.0000,  4.5000,  4.5000,  7.5000,  7.5000,
         9.0000,  9.0000, 10.5000, 10.5000, 12.0000, 15.0000, 15.0000, 13.5000],
       size=(16,), dtype=torch.qint8,
       quantization_scheme=torch.per_tensor_affine, scale=1.5, zero_point=0), 

	 quantized tensor in int representation: tensor([ 1,  0,  1,  2,  3,  3,  5,  5,  6,  6,  7,  7,  8, 10, 10,  9],
       dtype=torch.int8)


After dequantization: 
	 type: torch.float32, 

	 dequantized tesor: tensor([ 1.5000,  0.0000,  1.5000,  3.0000,  4.5000,  4.5000,  7.5000,  7.5000,
         9.0000,  9.0000, 10.5000, 10.5000, 12.0000, 15.0000, 15.0000, 13.5000])


In [4]:
# Print scale and zero_point
try:
  print(qt.q_scale(), qt.q_zero_point())
except:
  print(qt.q_per_channel_scales(), qt.q_per_channel_zero_points())

1.5 0


## Task 2.0 Implement Quantization / Dequantization [2 point]

In [5]:
def quantize_tensor(x, torch_dtype, is_symmetric=True, is_per_channel=False, axis = 0):
  if is_per_channel:
    x_quantized = quantize_tensor_per_channel(x, torch_dtype, is_symmetric, axis)

  else:
    x_quantized = quantize_tensor_per_tensor(x, torch_dtype, is_symmetric)

  return x_quantized

### Per Tensor Quantization. Fill the blanks in the code.

In [6]:
def quantize_tensor_per_tensor(x, torch_dtype, is_symmetric=True):

    print(f"\nQuantization type: {torch_dtype}, symmetric: {is_symmetric}, per_channel: {False} ")

    bits = torch.iinfo(torch_dtype).bits

    # Minimum  and maximum quantization values
    if torch_dtype == torch.qint8: # torch.iinfo(torch_dtype).min != 0 
      quant_min = -2**(bits - 1) 
      # ANSWER
      quant_max =  2**(bits-1)-1

    elif torch_dtype == torch.quint8: #
      quant_min = 0 
      # ANSWER
      quant_max =  2**(bits)-1

    x_min = x.min()
    x_max = x.max()
    

    if is_symmetric:
      scale = 2 * torch.where(x_min.abs() > x_max, x_min.abs(), x_max) / (quant_max - quant_min)
      zero_point = 2**(bits - 1) if torch.iinfo(torch_dtype).min == 0 else 0
    else:
      x_max = max(x_max, 0)
      x_min = min(x_min, 0)
      # ANSWER
      scale =  (x_max - x_min) / (quant_max - quant_min)
      zero_point = quant_min-round(int(np.array(x_min)/scale.item()))


    # Use PyTorch build-in function
    x_quantized = torch.quantize_per_tensor(x, scale, zero_point=zero_point, dtype=torch_dtype)

    return x_quantized                                          


In [7]:
# Unit test
t = torch.ones(512).reshape(32, 16)
qt = quantize_tensor_per_tensor(t, torch.qint8, is_symmetric=False)
assert torch.max(t - qt.dequantize()) < torch.finfo(t.dtype).eps

t = torch.ones(512).reshape(32, 16) + 1e-8
qt = quantize_tensor_per_tensor(t, torch.qint8, is_symmetric=False)
assert torch.max(t - qt.dequantize()) < torch.finfo(t.dtype).eps

t = torch.arange(256).reshape(8, 32).to(torch.float32)
qt = quantize_tensor_per_tensor(t, torch.qint8, is_symmetric=False)
assert torch.max(t - qt.dequantize()) < torch.finfo(t.dtype).eps


Quantization type: torch.qint8, symmetric: False, per_channel: False 


NameError: name 'np' is not defined

### Per Channel Quantization. Fill the blanks in the code.

In [None]:
import tensorly as tl
tl.set_backend('pytorch')

In [None]:
def quantize_tensor_per_channel(x, torch_dtype, is_symmetric=True, axis = 0):
  '''
  Takes float PyTorch tensor as input.
  
  Returns
  -------
  x_quantized: PyTorch tensor of PyTorch quantization types (e.g., torch.qint8, torch.quint8)
    This format contains integer representation, scale, zero_point.
    If you want to extract integer representation use 'x_int = x_quantized.int_repr()'.
  '''
  
  print(f"\nQuantization type: {torch_dtype}, symmetric: {is_symmetric}, per_channel: {True} ")

  bits = torch.iinfo(torch_dtype).bits


  if torch_dtype==torch.qint8:
    quant_min = -2**(bits - 1)
    quant_max =  2**(bits - 1) - 1

  elif torch_dtype==torch.quint8:
    quant_min = 0
    quant_max =  2*(2**(bits-1))-1

  unfolded_t = tl.base.unfold(x, mode=0)
  x_max =  unfolded_t.max(dim=-1)[0]
  x_min =  unfolded_t.min(dim=-1)[0]
  


  if is_symmetric:
    scale = 2 * torch.where(x_min.abs() > x_max, x_min.abs(), x_max) / (quant_max - quant_min) 
    
    zero_point = torch.repeat_interleave(
        torch.tensor(2**(bits - 1) if torch.iinfo(torch_dtype).min == 0 else 0),len(scale))

  else:
    x_max = torch.where(x_max < torch.zeros_like(x_max), torch.zeros_like(x_max), x_max)
    x_min = torch.where(x_min > torch.zeros_like(x_min), torch.zeros_like(x_min), x_min)

    scale = ((x_max - x_min) / (quant_max - quant_min))
    zero_point = torch.floor(quant_min-x_min/scale)

    

  # Use PyTorch build-in function
  x_quantized =  torch.quantize_per_channel(x, scales=scale, zero_points=zero_point, dtype=torch_dtype, axis=axis)
  return x_quantized

In [None]:
# Unit test
t = torch.ones(512).reshape(32, 16).to(torch.float32)
qt = quantize_tensor_per_channel(t, torch.qint8, is_symmetric=False)
assert torch.max(t - qt.dequantize()) < torch.finfo(t.dtype).eps

t = torch.ones(512).reshape(32, 16) + 1e-2
qt = quantize_tensor_per_channel(t, torch.qint8, is_symmetric=False)
assert torch.max(t - qt.dequantize()) < torch.finfo(t.dtype).eps

t = torch.arange(256).reshape(1, 256).to(torch.float32)
qt = quantize_tensor_per_channel(t, torch.qint8, is_symmetric=False)
assert torch.max(t - qt.dequantize()) < torch.finfo(t.dtype).eps


Quantization type: torch.qint8, symmetric: False, per_channel: True 

Quantization type: torch.qint8, symmetric: False, per_channel: True 

Quantization type: torch.qint8, symmetric: False, per_channel: True 


### Tensor Dequantization. Fill blanks in the code. [1 point]

In [None]:
def dequantize_tensor(x_quantized):
    # Implement using scale, zero_point, integer representation
    if x_quantized.qscheme() == torch.per_tensor_affine:
        # Get scale, zero_point from x_quantized
        # ANSWER
        scale =  x_quantized.q_scale()
        zero_point =  x_quantized.q_zero_point()

    else:
        axis = x_quantized.q_per_channel_axis() 

        # ANSWER 
        scale = x_quantized.q_per_channel_scales()
        zero_point = x_quantized.q_per_channel_zero_points()

        ## Broadcasting along axis:
        for i, shp in enumerate(x_quantized.shape):
            if i != axis:
                scale.unsqueeze(i)
                zero_point.unsqueeze(i)

    # ANSWER
    x_int =  x_quantized.int_repr()
    x_dequantized =  scale * (x_int - zero_point).to(torch.long) # Use x_int in torch.long type to avoid errors

    return x_dequantized

In [None]:
# Unit test
t = torch.arange(512).reshape(32, 16) + torch.randn((32, 16))

# Unit test1 
qt = torch.quantize_per_tensor(t, scale=1.5, zero_point=0, dtype=torch.qint8)
dqt = qt.dequantize()

dqt_custom = dequantize_tensor(qt)

rel_err = torch.norm(dqt - dqt_custom) / torch.norm(dqt)
assert rel_err < torch.finfo(t.dtype).eps, rel_err


# Unit test2
qt = torch.quantize_per_channel(t, scales=torch.arange(16).to(torch.float), zero_points=torch.zeros(16, dtype=torch.int), dtype=torch.qint8, axis=1)
dqt = qt.dequantize()

dqt_custom = dequantize_tensor(qt)

rel_err = torch.norm(dqt - dqt_custom) / torch.norm(dqt)
assert rel_err < torch.finfo(t.dtype).eps, rel_err

## Task 2.1. Perform Quantization. Analyze. [1 point]

For three given tensors perform per-tensor affine quantization :
a) int8 symmetric
b) uint8 symmetric
c) int8 assymmetric


- What can you say by comparing approximation errors of a)-c) representations?
- Explain why some quantization schemes suit better for some inputs.

In [None]:
shape = (512, 3, 3)
x1 = torch.rand(shape) * 100      # uniform in the range [0, 100] 
x2 = torch.rand(shape) * 100 - 50 # uniform in the range [-50, 50] 
x3 = torch.rand(shape) * 100 - 20 # uniform in the range [-20, 80] 


results = defaultdict(dict)


# Experiments 1
for torch_dtype, is_symmetric, is_per_channel in [[torch.qint8, True, False],
                                                  [torch.quint8, True, False],
                                                  [torch.qint8, False, False]]:

  tmp_dict = defaultdict()

  for i, x in enumerate([x1, x2, x3]):
    x_quantized = quantize_tensor(x,
                                  torch_dtype = torch_dtype,
                                  is_symmetric = is_symmetric,
                                  is_per_channel = is_per_channel)
    
    approx_error = torch.norm(x_quantized.dequantize() - x)/torch.norm(x)

    tmp_dict[i] = (x_quantized, approx_error)

  key = (torch_dtype,
         'symmetric' if is_symmetric else 'asymmetric',
         'per_channel' if is_per_channel else 'per_tensor')
  results[key] = tmp_dict
  

# Compute difference in approximations
for i, x in enumerate([x1, x2, x3]):
  print(f"\ntensor number {i}, tensor range: {(x.min(), x.max())}")
  for key in results.keys():
    print(f'\tquantization scheme {key}, approx_error: {results[key][i][1]}')

# TODO : COMMENT



Quantization type: torch.qint8, symmetric: True, per_channel: False 

Quantization type: torch.qint8, symmetric: True, per_channel: False 

Quantization type: torch.qint8, symmetric: True, per_channel: False 

Quantization type: torch.quint8, symmetric: True, per_channel: False 

Quantization type: torch.quint8, symmetric: True, per_channel: False 

Quantization type: torch.quint8, symmetric: True, per_channel: False 

Quantization type: torch.qint8, symmetric: False, per_channel: False 

Quantization type: torch.qint8, symmetric: False, per_channel: False 

Quantization type: torch.qint8, symmetric: False, per_channel: False 

tensor number 0, tensor range: (tensor(0.0045), tensor(99.9478))
	quantization scheme (torch.qint8, 'symmetric', 'per_tensor'), approx_error: 0.003955337218940258
	quantization scheme (torch.quint8, 'symmetric', 'per_tensor'), approx_error: 0.003955337218940258
	quantization scheme (torch.qint8, 'asymmetric', 'per_tensor'), approx_error: 0.0019940254278481007




a) int8 symmetric
- Uses signed integers within a symmetric range.
- Ideal for values centered around zero.

b) uint8 symmetric
- Uses unsigned integers within a symmetric range.
- Like int8 symmetric but lacks negative values.

c) int8 asymmetric
- Uses signed integers with an asymmetric range.
- Woeks best for values with a specific bias or offset from zero.


The scheme depends on the value characteristics, Symmetric for zero-centered data, asymmetric for biased data. In this scenario, all schemes perform well, with int8 asymmetric showing an advantage for tensors with a mean different than 0.

## Task 2.2. Perform Quantization. Analyze. [1 point]

For two given tensors perform per-tensor and per-channel (along axis = 0) affine quantization using int8 symmetric quantization.

- What can you say by comparing approximation errors for per-tensor and per-channel quantization for different inputs?
- Explain why some quantization schemes suit better for some inputs.

In [None]:
shape = (1024, 32, 64)
x1 = torch.rand(shape) * 100    # uniform in the range [0, 100] 
x1 = x1 * torch.arange(1, 1 + shape[0])[:, None, None] # then multiple each channel by (channel_index + 1)

x2 = torch.rand(shape) * 100 - 50 # uniform in the range [-50, 50] 

results = defaultdict(dict)


# Experiments 2
for torch_dtype, is_symmetric, is_per_channel in [[torch.qint8, True, False],
                                                  [torch.qint8, True, True]]:

  tmp_dict = defaultdict()

  for i, x in enumerate([x1, x2]):
    x_quantized = quantize_tensor(x,
                                  torch_dtype = torch_dtype,
                                  is_symmetric = is_symmetric,
                                  is_per_channel = is_per_channel,
                                  axis = 0)
    
    approx_error = torch.norm(x_quantized.dequantize() - x)/torch.norm(x)

    tmp_dict[i] = (x_quantized, approx_error)

  key = (torch_dtype,
         'symmetric' if is_symmetric else 'asymmetric',
         'per_channel' if is_per_channel else 'per_tensor')
  results[key] = tmp_dict
  

# Compute difference in approximations
for i, x in enumerate([x1, x2]):
  print(f"\ntensor number {i}, tensor range: {(x.min(), x.max())}")
  for key in results.keys():
    print(f'\tquantization scheme {key}, approx_error: {results[key][i][1]}')



Quantization type: torch.qint8, symmetric: True, per_channel: False 

Quantization type: torch.qint8, symmetric: True, per_channel: False 

Quantization type: torch.qint8, symmetric: True, per_channel: True 

Quantization type: torch.qint8, symmetric: True, per_channel: True 

tensor number 0, tensor range: (tensor(0.0063), tensor(102393.9922))
	quantization scheme (torch.qint8, 'symmetric', 'per_tensor'), approx_error: 0.006783206947147846
	quantization scheme (torch.qint8, 'symmetric', 'per_channel'), approx_error: 0.003921858966350555

tensor number 1, tensor range: (tensor(-50.0000), tensor(50.0000))
	quantization scheme (torch.qint8, 'symmetric', 'per_tensor'), approx_error: 0.00392519123852253
	quantization scheme (torch.qint8, 'symmetric', 'per_channel'), approx_error: 0.003924562595784664


# Task 3 [4 points]

Consider a simple PyTorch model and modify it using QuantStub() and DeQuantStub() to imitate

1) Quantization of all layers

2) Quantization only of convolutionsl layers

Useful links:
- PyTorch model preparation for Quantization https://pytorch.org/docs/stable/quantization.html#model-preparation-for-quantization


Given a class for a neural network prepare it to be quantizable and quantize.

Namely, you should quantize

1. whole model using default PyTorch static quantization qconfig

2. part of the model

In [None]:
def print_size_of_model(model):
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p") / 1024**2)
    os.remove('temp.p')

### Fill blanks in `quantize_model` function that performs quantization [1 point]

In [None]:
def quantize_model(model, qconfig_dict={}, input_shape=None):

    print(f'QConfig \n {qconfig_dict}')

    # Layers with qconfig=None will not be quantized
    model.qconfig = None

    for mname, m in model.named_modules():
        if mname in qconfig_dict:
            m.qconfig = qconfig_dict[mname]
        
    print(f'\n Model before preparation \n{model}')

    # Prepare the model for quantization by propagating qconfig
    
    # ANSWER
    model = torch.quantization.prepare(model)
    
    
    print(f'\n Model after preparation & before calibration (activation stats computation) \n{model}')

    # Collect statistics for quantization

    # ANSWER
    if input_shape is not None:
        model(torch.randn(input_shape))
    
    print(f'\n Model after calibration & before conversion \n{model}')

    # Quantize the model
    
    # ANSWER
    model = torch.quantization.convert(model)

    print(f'\n Model after conversion \n{model}')    
    return model

In [None]:
from torchvision.models import vgg16
model = vgg16(pretrained=True)

model.eval()
for p in model.parameters():
  p.requires_grad=False



In [None]:
model.features

Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace=True)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace=True)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace=True)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace=True)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (17): Conv2d(256, 512, kernel_si

### Fuse all possible layers [1 point]

In [None]:
fmodel = copy.deepcopy(model)

# Fuse all possible layers
# ANSWER
fused_model = torch.quantization.fuse_modules(
    fmodel,
    [
        ['features.0', 'features.1'],  
        ['features.2', 'features.3'],  
        ['features.5', 'features.6'],  
        ['features.7', 'features.8'],  
        ['features.10', 'features.11'],
        ['features.12', 'features.13'],
        ['features.15', 'features.16'],
        ['features.19', 'features.20'],
        ['features.22', 'features.23'],
        ['features.26', 'features.27'],
    ])

: 

In [40]:
fused_model

VGG(
  (features): Sequential(
    (0): ConvReLU2d(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (1): Identity()
    (2): ConvReLU2d(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (3): Identity()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): ConvReLU2d(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (6): Identity()
    (7): ConvReLU2d(
      (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (8): Identity()
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): ConvReLU2d(
      (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (11): Identity()
    (12): Conv2d(256, 256

### Quantize the whole model [1 point]


1. whole model  using default PyTorch static quantization qconfig

# Modify model to be quantizable and set qconfig_dict


In [41]:
qmodel = copy.deepcopy(fused_model)
# ANSWER
qconfig_dict = torch.quantization.get_default_qconfig('fbgemm')
qmodel = quantize_model(qmodel, qconfig_dict, input_shape=(1, 3, 224, 224))

QConfig 
 QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})

 Model before preparation 
VGG(
  (features): Sequential(
    (0): ConvReLU2d(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (1): Identity()
    (2): ConvReLU2d(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (3): Identity()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): ConvReLU2d(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (6): Identity()
    (7): ConvReLU2d(
      (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU




 Model after conversion 
VGG(
  (features): Sequential(
    (0): ConvReLU2d(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (1): Identity()
    (2): ConvReLU2d(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (3): Identity()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): ConvReLU2d(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (6): Identity()
    (7): ConvReLU2d(
      (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (8): Identity()
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): ConvReLU2d(
      (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (11): Identity()

In [42]:
print_size_of_model(model)
print_size_of_model(qmodel)

Size (MB): 527.8007297515869
Size (MB): 527.8010959625244


In [None]:
# Check inference time
shape = (2, 3, 228, 224)
x = torch.randn(shape)

In [None]:
%timeit _ = model(x)

In [None]:
%timeit _ = qmodel(x)

### Quantize a  part of the model [1 point]

Namely
  - all convolutional layers using Min-Max observers 
  - first fully-connected layer using default PyTorch static quantization qconfig 

In [None]:
qmodel = copy.deepcopy(fmodel)

# Modify model to be quantizable and set qconfig_dict
# ANSWER
for layer in qmodel.features.children():
    if isinstance(layer, nn.Conv2d):
        qmodel.features[layer].qconfig = torch.quantization.default_observer
        qmodel.features[layer] = torch.quantization.ObserverQuantize(layer)

qmodel.classifier[0].qconfig = torch.quantization.default_qconfig
qconfig_dict =  {
    '': torch.quantization.default_qconfig,
    'classifier.0': torch.quantization.default_qconfig,
}

In [None]:
# Quantize the model
qmodel =  quantize_model(qmodel, qconfig_dict, input_shape=(1, 3, 224, 224))

In [None]:
print_size_of_model(model)
print_size_of_model(qmodel)

In [None]:
# Check inference time
shape = (2, 3, 228, 224)
x = torch.randn(shape)

In [None]:
%timeit _ = model(x)

In [None]:
%timeit _ = qmodel(x)