<a href="https://colab.research.google.com/github/felixsimard/comp551-p4/blob/main/MNIST_MLP_Quant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORTS AND STUFF


In [1]:
!pip install regex requests hydra-core omegaconf bitarray



In [2]:
!pip install fairseq

Collecting fairseq
  Downloading fairseq-0.10.2-cp37-cp37m-manylinux1_x86_64.whl (1.7 MB)
[?25l[K     |▏                               | 10 kB 23.9 MB/s eta 0:00:01[K     |▍                               | 20 kB 28.9 MB/s eta 0:00:01[K     |▋                               | 30 kB 20.0 MB/s eta 0:00:01[K     |▊                               | 40 kB 17.6 MB/s eta 0:00:01[K     |█                               | 51 kB 14.2 MB/s eta 0:00:01[K     |█▏                              | 61 kB 10.3 MB/s eta 0:00:01[K     |█▍                              | 71 kB 11.4 MB/s eta 0:00:01[K     |█▌                              | 81 kB 12.5 MB/s eta 0:00:01[K     |█▊                              | 92 kB 11.6 MB/s eta 0:00:01[K     |██                              | 102 kB 12.3 MB/s eta 0:00:01[K     |██▏                             | 112 kB 12.3 MB/s eta 0:00:01[K     |██▎                             | 122 kB 12.3 MB/s eta 0:00:01[K     |██▌                             | 133 k

In [3]:
import torch
import torch.nn as nn
from torchvision import models
from torchsummary import summary
import torch.nn.functional as F
import numpy as np


from fairseq.modules.quantization.pq import quantize_model_, SizeTracker
from fairseq.modules.quant_noise import quant_noise

from operator import attrgetter, itemgetter
import re

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Work

In [4]:
from torchvision import datasets
import torchvision.transforms as transforms

# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 20

# convert data to torch.FloatTensor
transform = transforms.ToTensor()

# choose the training and test datasets
train_data = datasets.MNIST(root='data', train=True,
                                   download=True, transform=transform)
test_data = datasets.MNIST(root='data', train=False,
                                  download=True, transform=transform)

# prepare data loaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
    num_workers=num_workers)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, 
    num_workers=num_workers)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw



In [6]:
for ims, labels in train_loader:
    im = ims[0:10]
    break
im.size()
im.flatten(1).size()

torch.Size([10, 784])

# Model without quantization

In [19]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(1, 4, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(4, 8, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
            nn.Conv2d(8, 8, kernel_size=(3, 3), padding='same'),
            nn.ReLU(),
        )
        self.dense = nn.Sequential(
            nn.Linear(8 * 784, 100),
            nn.ReLU(),
            # nn.BatchNorm1d(100),
            nn.Linear(100, 100),
            nn.ReLU(),
            # nn.BatchNorm1d(100),
            nn.Linear(100, 10),
            nn.ReLU(),
        )
    
    def forward(self, x):
        x = self.conv(x)
        x = x.view(x.size()[0], -1)
        x = self.dense(x)
        return x

In [27]:
class MLPQuant(nn.Module):
    def __init__(self):
        super().__init__()
        p = 0.01
        block_size = 4
        self.conv = nn.Sequential(
            quant_noise(nn.Conv2d(1, 4, kernel_size=(3, 3), padding='same'), p, 9),
            nn.ReLU(),
            quant_noise(nn.Conv2d(4, 8, kernel_size=(3, 3), padding='same'), p, 9),
            nn.ReLU(),
            quant_noise(nn.Conv2d(8, 8, kernel_size=(3, 3), padding='same'), p, 9),
            nn.ReLU(),
        )
        self.dense = nn.Sequential(
            quant_noise(nn.Linear(8 * 784, 100), p, block_size),
            nn.ReLU(),
            # nn.BatchNorm1d(100),
            quant_noise(nn.Linear(100, 100), p, block_size),
            nn.ReLU(),
            # nn.BatchNorm1d(100),
            quant_noise(nn.Linear(100, 10), p, block_size),
            nn.ReLU(),
        )
    
    def forward(self, x):
        x = self.conv(x)
        x = x.view(x.size()[0], -1)
        x = self.dense(x)
        return x
    

In [8]:
for data, labels in train_loader:
    bz, c, h , w = data.size()
    data = data.resize(bz, c, h * w, 1)
    break
data.size()



torch.Size([20, 1, 784, 1])

In [21]:
model = MLP()

In [22]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

In [23]:
def train(model, train_loader, n_epochs=5):
    # number of epochs to train the model

    model.train() # prep model for training

    for epoch in range(n_epochs):
        train_loss = 0.0

        for data, target in train_loader:
            data = data.to(device)
            target = target.to(device)
            
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()*data.size(0)

        train_loss = train_loss/len(train_loader.dataset)

        print('Epoch: {} \tTraining Loss: {:.6f}'.format(
            epoch+1, 
            train_loss
            ))

In [24]:
model = model.to(device)
train(model, train_loader)

Epoch: 1 	Training Loss: 1.953932
Epoch: 2 	Training Loss: 0.442669
Epoch: 3 	Training Loss: 0.351463
Epoch: 4 	Training Loss: 0.107045
Epoch: 5 	Training Loss: 0.073592


In [25]:
def eval_model(model, test_loader):
    # initialize lists to monitor test loss and accuracy
    test_loss = 0.0
    class_correct = list(0. for i in range(10))
    class_total = list(0. for i in range(10))

    model.eval() # prep model for *evaluation*

    for data, target in test_loader:
        data = data.to(device)
        target = target.to(device)
        
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update test loss 
        test_loss += loss.item()*data.size(0)
        # convert output probabilities to predicted class
        _, pred = torch.max(output, 1)
        # compare predictions to true label
        correct = np.squeeze(pred.eq(target.data.view_as(pred)))
        # calculate test accuracy for each object class
        for i in range(batch_size):
            label = target.data[i]
            class_correct[label] += correct[i].item()
            class_total[label] += 1

    # calculate and print avg test loss
    test_loss = test_loss/len(test_loader.dataset)
    print('Test Loss: {:.6f}\n'.format(test_loss))

    for i in range(10):
        if class_total[i] > 0:
            print('Test Accuracy of %5s: %2d%% (%2d/%2d)' % (
                str(i), 100 * class_correct[i] / class_total[i],
                np.sum(class_correct[i]), np.sum(class_total[i])))
        else:
            print('Test Accuracy of %5s: N/A (no training examples)' % (classes[i]))

    print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
        100. * np.sum(class_correct) / np.sum(class_total),
        np.sum(class_correct), np.sum(class_total)))

In [26]:
eval_model(model, test_loader)

Test Loss: 0.093046

Test Accuracy of     0: 99% (975/980)
Test Accuracy of     1: 99% (1128/1135)
Test Accuracy of     2: 95% (988/1032)
Test Accuracy of     3: 97% (987/1010)
Test Accuracy of     4: 98% (966/982)
Test Accuracy of     5: 98% (877/892)
Test Accuracy of     6: 96% (926/958)
Test Accuracy of     7: 93% (959/1028)
Test Accuracy of     8: 95% (933/974)
Test Accuracy of     9: 95% (968/1009)

Test Accuracy (Overall): 97% (9707/10000)


# Train with Quant-Noise

In [28]:
config = {
    'n_centroids': {
              'Conv2d': ('kernel_size', {'*': 256}),
              'Linear': ('in_features', {'*': 256})
          },
    'block_sizes': {
              'Conv2d': ('kernel_size', {'(3, 3)': 9, '(1, 1)': 4}), # '(3, 3)': 9
              'Linear': ('in_features', {'*': 4})
          },
    'layers_to_quantize': [".*?"]
}

In [29]:
n_centroids_config = config['n_centroids']
block_sizes_config = config['block_sizes']
layers_to_quantize = config['layers_to_quantize']

In [30]:
quant_model = MLPQuant()
quant_model = quant_model.to(device)

In [31]:
train(quant_model, train_loader)

Epoch: 1 	Training Loss: 2.311579
Epoch: 2 	Training Loss: 2.302808
Epoch: 3 	Training Loss: 2.302808
Epoch: 4 	Training Loss: 2.302808
Epoch: 5 	Training Loss: 2.302808


In [32]:
eval_model(quant_model, test_loader)

Test Loss: 2.302931

Test Accuracy of     0:  0% ( 0/980)
Test Accuracy of     1:  0% ( 0/1135)
Test Accuracy of     2:  0% ( 0/1032)
Test Accuracy of     3: 100% (1010/1010)
Test Accuracy of     4:  0% ( 0/982)
Test Accuracy of     5:  0% ( 0/892)
Test Accuracy of     6:  0% ( 0/958)
Test Accuracy of     7:  0% ( 0/1028)
Test Accuracy of     8:  0% ( 0/974)
Test Accuracy of     9:  0% ( 0/1009)

Test Accuracy (Overall): 10% (1010/10000)


In [33]:
size_tracker = SizeTracker(model)
import logging

logger = logging.getLogger()

# Quantize model by stages
for step in range(len(layers_to_quantize)):

    # quantize model in-place
    quantized_layers = quantize_model_(
        quant_model,
        size_tracker,
        layers_to_quantize,
        block_sizes_config,
        n_centroids_config,
        step=step,
    )
    logger.info(f"Finetuning stage {step}, quantized layers: {quantized_layers}")
    logger.info(f"{size_tracker}")

    # Don't forget to re-create/update trainer/optimizer since model parameters have changed
    optimizer = torch.optim.SGD(quant_model.parameters(), lr=1e-2)

    # Finetune the centroids with your usual training loop for a few epochs
    train(quant_model, train_loader, n_epochs=5)

TypeError: ignored

In [18]:
eval_model(model, test_loader)

TypeError: ignored