<a href="https://colab.research.google.com/github/felixsimard/comp551-p4/blob/main/Felix_Extreme_Compression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## COMP 551: Applied Machine Learning
### **P4 - Reproducibility in ML**

Charles Bourbeau (260868653) <br>
Mathis Renier () <br>
Felix Simard (260865674) <br>

Dec 10th, 2021



## Setup

In [1]:
!pip install regex requests hydra-core omegaconf bitarray



In [2]:
!pip install fairseq



In [3]:
import torch
import torch.nn as nn
from torchvision import models
from torchsummary import summary
import torch.nn.functional as F

from fairseq.modules.quantization.pq import quantize_model_, SizeTracker

from operator import attrgetter, itemgetter
import re

In [4]:
# roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')
# roberta.eval()

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [6]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, (3,3))
        self.conv2 = nn.Conv2d(6, 16, (3,3))
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


model = Net()
print(model)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [7]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

In [8]:
# efficient_net_b3 = models.efficientnet_b3().to(device)
# summary(efficient_net_b3, (3, 256, 256))

# vgg11 = models.vgg11().to(device)
# summary(vgg11, (3, 256, 256))

## Reproducing Paper Results

In [9]:
config = {
    'n_centroids': {
              'Conv2d': ('kernel_size', {'*': 256}),
              'Linear': ('in_features', {'*': 256})
          },
    'block_sizes': {
              'Conv2d': ('kernel_size', {'(3, 3)': 9, '(1, 1)': 4}), # '(3, 3)': 9
              'Linear': ('in_features', {'*': 4})
          },
    'layers_to_quantize': [".*?"]
}

In [10]:
# get configuration parameters
n_centroids_config = config['n_centroids']
block_sizes_config = config['block_sizes']
layers_to_quantize = config['layers_to_quantize']

In [11]:
size_tracker = SizeTracker(model)

In [12]:
# import logging

# logger = logging.getLogger()

# for step in range(len(layers_to_quantize)):
#     quantized_layers = quantize_model_(
#         model,
#         size_tracker,
#         layers_to_quantize,
#         block_sizes_config,
#         n_centroids_config,
#         step=step
#     )
#     logger.info(layers_to_quantize[step])
#     logger.info(f"Finetuning stage {step}, quantized layers: {quantized_layers}")
#     logger.info(f"{size_tracker}")

    # Don't forget to re-create/update trainer/optimizer since model parameters have changed
    # optimizer = 

    # Finetune the centroids with your usual training loop for a few epochs
    # trainer.train_epoch()

In [13]:
# eval_model(model, test_loader)

## Quantization using PyTorch's modules

https://spell.ml/blog/pytorch-quantization-X8e7wBAAACIAHPhT

In [14]:
import torch
import torch.quantization
import torch.nn as nn
from torchvision import models
from torchsummary import summary
import torch.nn.functional as F
from fairseq.modules.quantization.pq import quantize_model_, SizeTracker
import numpy as np

In [15]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [16]:
from torchvision import datasets
import torchvision.transforms as transforms

# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 20

# convert data to torch.FloatTensor
transform = transforms.ToTensor()

# choose the training and test datasets
train_data = datasets.MNIST(root='data', train=True,
                                   download=True, transform=transform)
test_data = datasets.MNIST(root='data', train=False,
                                  download=True, transform=transform)

# prepare data loaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
    num_workers=num_workers)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, 
    num_workers=num_workers)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw



In [17]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(1, 4, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(4, 8, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(8, 8, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
        )
        self.dense = nn.Sequential(
            nn.Linear(8 * 784, 100),
            nn.ReLU(),
            # nn.BatchNorm1d(100),
            nn.Linear(100, 100),
            nn.ReLU(),
            # nn.BatchNorm1d(100),
            nn.Linear(100, 10),
            nn.ReLU(),
        )
    
    def forward(self, x):
        x = self.conv(x)
        x = x.view(x.size()[0], -1)
        x = self.dense(x)
        return x

In [18]:
class MLP2(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv2d_1 = nn.Conv2d(1, 4, kernel_size=(3, 3), padding='same')
        self.relu_1 = nn.ReLU()
        self.conv2d_2 = nn.Conv2d(4, 8, kernel_size=(3, 3), padding='same')
        self.relu_2 = nn.ReLU()
        self.conv2d_3 = nn.Conv2d(8, 8, kernel_size=(3, 3), padding='same')
        self.relu_3 = nn.ReLU()
        self.linear_1 = nn.Linear(8 * 784, 100)
        self.relu_4 = nn.ReLU()
        self.linear_2 = nn.Linear(100, 100)
        self.relu_5 = nn.ReLU()
        self.linear_3 = nn.Linear(100, 10)
        self.relu_6 = nn.ReLU()

        self.quant = torch.quantization.QuantStub()
        self.dequant = torch.quantization.DeQuantStub()
    
    def forward(self, x):
        # x = x.contiguous(memory_format=torch.channels_last)
        x = self.quant(x)
        x = self.conv2d_1(x)
        x = self.relu_1(x)
        x = self.conv2d_2(x)
        x = self.relu_2(x)
        x = self.conv2d_3(x)
        x = self.relu_3(x)
        x = x.view(x.size()[0], -1)
        x = self.linear_1(x)
        x = self.relu_4(x)
        x = self.linear_2(x)
        x = self.relu_5(x)
        x = self.linear_3(x)
        x = self.relu_6(x)
        x = self.dequant(x)

        return x

In [19]:
# Instantiate model
model = MLP2()

In [20]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

In [21]:
def train(model, train_loader, n_epochs=5):
    # number of epochs to train the model

    model.train() # prep model for training

    for epoch in range(n_epochs):
        train_loss = 0.0

        for data, target in train_loader:
            data = data.to(device)
            target = target.to(device)
            
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()*data.size(0)

        train_loss = train_loss/len(train_loader.dataset)

        print('Epoch: {} \tTraining Loss: {:.6f}'.format(
            epoch+1, 
            train_loss
            ))

In [22]:
model = model.to(device)
st = SizeTracker(model)
print("Model size: {} MB".format(round(st.compute_size(), 4)))

Model size: 2.4389 MB


In [None]:
train(model, train_loader)

Epoch: 1 	Training Loss: 1.011996
Epoch: 2 	Training Loss: 0.155383
Epoch: 3 	Training Loss: 0.109444
Epoch: 4 	Training Loss: 0.085776
Epoch: 5 	Training Loss: 0.069296


In [23]:
def eval_model(model, test_loader):
    # initialize lists to monitor test loss and accuracy
    test_loss = 0.0
    class_correct = list(0. for i in range(10))
    class_total = list(0. for i in range(10))

    model.eval() # prep model for *evaluation*

    for data, target in test_loader:
        data = data.to(device)
        target = target.to(device)
        
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update test loss 
        test_loss += loss.item()*data.size(0)
        # convert output probabilities to predicted class
        _, pred = torch.max(output, 1)
        # compare predictions to true label
        correct = np.squeeze(pred.eq(target.data.view_as(pred)))
        # calculate test accuracy for each object class
        for i in range(batch_size):
            label = target.data[i]
            class_correct[label] += correct[i].item()
            class_total[label] += 1

    # calculate and print avg test loss
    test_loss = test_loss/len(test_loader.dataset)
    print('Test Loss: {:.6f}\n'.format(test_loss))

    for i in range(10):
        if class_total[i] > 0:
            print('Test Accuracy of %5s: %2d%% (%2d/%2d)' % (
                str(i), 100 * class_correct[i] / class_total[i],
                np.sum(class_correct[i]), np.sum(class_total[i])))
        else:
            print('Test Accuracy of %5s: N/A (no training examples)' % (classes[i]))

    print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
        100. * np.sum(class_correct) / np.sum(class_total),
        np.sum(class_correct), np.sum(class_total)))

In [None]:
eval_model(model, test_loader)

Test Loss: 0.094191

Test Accuracy of     0: 99% (973/980)
Test Accuracy of     1: 99% (1128/1135)
Test Accuracy of     2: 96% (994/1032)
Test Accuracy of     3: 97% (982/1010)
Test Accuracy of     4: 98% (965/982)
Test Accuracy of     5: 98% (876/892)
Test Accuracy of     6: 96% (923/958)
Test Accuracy of     7: 95% (980/1028)
Test Accuracy of     8: 94% (920/974)
Test Accuracy of     9: 95% (967/1009)

Test Accuracy (Overall): 97% (9708/10000)


### Dynamic Quantization

In [None]:
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear, torch.nn.Conv2d, torch.nn.ReLU}, dtype=torch.qint8
)
st_dq = SizeTracker(quantized_model)
print("Using Dynamic Quantization: {} MB".format(round(st_dq.compute_size(), 4)))

Using Dynamic Quantization: 0.0035 MB


In [None]:
# Evaluate dynamically quantized model
eval_model(quantized_model, test_loader)

Test Loss: 0.094938

Test Accuracy of     0: 99% (973/980)
Test Accuracy of     1: 99% (1128/1135)
Test Accuracy of     2: 96% (995/1032)
Test Accuracy of     3: 97% (982/1010)
Test Accuracy of     4: 98% (966/982)
Test Accuracy of     5: 98% (876/892)
Test Accuracy of     6: 96% (921/958)
Test Accuracy of     7: 95% (980/1028)
Test Accuracy of     8: 94% (919/974)
Test Accuracy of     9: 95% (967/1009)

Test Accuracy (Overall): 97% (9707/10000)


### Quantization Aware Training

In [None]:
m = MLP2()
layers_to_quantize = []
for l in list(m.named_modules())[1:]:
    n = l[0]
    if 'conv' in n or 'relu' in n or 'linear' in n:
        layers_to_quantize.append(n)
layers_to_quantize

['conv2d_1',
 'relu_1',
 'conv2d_2',
 'relu_2',
 'conv2d_3',
 'relu_3',
 'linear_1',
 'relu_4',
 'linear_2',
 'relu_5',
 'linear_3',
 'relu_6']

In [None]:
model_fp32 = MLP2()
model_fp32.train()
model_fp32.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')

model_fp32_fused = torch.quantization.fuse_modules(
    model_fp32, ['conv2d_1', 'relu_1']
)
model_fp32_prepared = torch.quantization.prepare_qat(model_fp32_fused)


# calibration
train(model_fp32_prepared, train_loader, n_epochs=5)

  reduce_range will be deprecated in a future release of PyTorch."


Epoch: 1 	Training Loss: 2.303031
Epoch: 2 	Training Loss: 2.303031
Epoch: 3 	Training Loss: 2.303031
Epoch: 4 	Training Loss: 2.303031
Epoch: 5 	Training Loss: 2.303031


In [None]:
model_fp32_prepared.eval()
model_int8 = torch.quantization.convert(model_fp32_prepared)
model_int8

RuntimeError: ignored

### Trying quantize_model_ from paper

In [31]:
model = MLP().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

In [32]:
train(model, train_loader, n_epochs=5)

Epoch: 1 	Training Loss: 0.976792
Epoch: 2 	Training Loss: 0.156519
Epoch: 3 	Training Loss: 0.109381
Epoch: 4 	Training Loss: 0.084417
Epoch: 5 	Training Loss: 0.068244


In [33]:
eval_model(model, test_loader)

Test Loss: 0.087788

Test Accuracy of     0: 99% (974/980)
Test Accuracy of     1: 99% (1130/1135)
Test Accuracy of     2: 97% (1004/1032)
Test Accuracy of     3: 98% (992/1010)
Test Accuracy of     4: 97% (960/982)
Test Accuracy of     5: 97% (874/892)
Test Accuracy of     6: 94% (908/958)
Test Accuracy of     7: 95% (981/1028)
Test Accuracy of     8: 96% (941/974)
Test Accuracy of     9: 96% (974/1009)

Test Accuracy (Overall): 97% (9738/10000)


In [35]:
import logging

logger = logging.getLogger()

for step in range(len(layers_to_quantize)):
    quantized_layers = quantize_model_(
        model,
        size_tracker,
        layers_to_quantize,
        block_sizes_config,
        n_centroids_config,
        step=step
    )
    logger.info(layers_to_quantize[step])
    print(f"Finetuning stage {step}, quantized layers: {quantized_layers}")
    print(f"{size_tracker}")

    # Don't forget to re-create/update trainer/optimizer since model parameters have changed
    # optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

    # Finetune the centroids with your usual training loop for a few epochs
    # train(model, train_loader, n_epochs=3)

KeyError: ignored