In [5]:
import torch

import torch.nn as nn
from torchvision import datasets, transforms
import torch.optim as optim
import torch.nn.functional as F
import copy
from torch.ao.quantization import QuantStub, DeQuantStub

### Original Network

In [6]:
class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.nnn =nn.Sequential(
        nn.Conv2d(1,10,kernel_size=3,padding=1),
        nn.BatchNorm2d(10),
        nn.ReLU(),
        nn.MaxPool2d(2, 2),
        nn.Conv2d(10,20,kernel_size=3,padding=1),
        nn.BatchNorm2d(20),
        nn.ReLU(),
        nn.MaxPool2d(2, 2),
        nn.Conv2d(20,20,kernel_size=3,padding=1),
        nn.ReLU(),
        nn.MaxPool2d(2, 2),
        nn.Flatten(),
        nn.Linear(20*3*3,50),
        nn.ReLU(),
        nn.Linear(50,10),
        nn.LogSoftmax(dim=1))

        #self.conv1 = nn.Conv2d(1,10,kernel_size=5)
        #self.conv2 = nn.Conv2d(10,20,kernel_size=5)
        #self.conv2_drop = nn.Dropout2d()
        #self.fc1 = nn.Linear(320,50)
        #self.fc2 = nn.Linear(50,10)

        self.quant = QuantStub()
        self.dequant = DeQuantStub()

    def forward(self,x):
        """x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x,2,2)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x,2,2)
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        #x = F.dropout(x, training=self.training)
        x = self.fc2(x)"""
        print("input shape ",x.shape)
        #x = self.quant(x)
        out = self.nnn(x)
        #out = self.dequant(out)
        return out

In [8]:
def train(model,device,train_loader,optimizer,epoch):
    model.train()
    for batch_idx,(data,target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output,target)
        loss.backward()
        optimizer.step()
        if batch_idx % 1000 == 0:
            print('Train Epoch : {} \t Loss: {:.6f}'.format(epoch,loss.item()))
        
def test(model,device,test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data,target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output,target,reduction='sum') #sum up batch loss
            pred = output.max(1,keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:,.0f}%)\n'.format(test_loss,correct,len(test_loader.dataset),100.*correct / len(test_loader.dataset)))

In [7]:
train_loader = torch.utils.data.DataLoader(datasets.MNIST('../data',train=True,download=True,transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,),(0.3081,))])),batch_size=64,shuffle=True)
test_loader = torch.utils.data.DataLoader(datasets.MNIST('../data',train=False,transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,),(0.3081,))])),batch_size=1000,shuffle=True) 

In [6]:
torch.manual_seed(1)
device = torch.device("cuda")
model = Net().to(device)
optimizer = optim.SGD(model.parameters(),lr=0.01,momentum=0.5)

for epoch in range(5):
    train(model,device,train_loader,optimizer,epoch)
    test(model,device,test_loader)

input shape  torch.Size([64, 1, 28, 28])
Train Epoch : 0 	 Loss: 2.311686
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size([64, 1, 28, 28])
input shape  torch.Size(

In [1]:
import os
os.getcwd()

'/home/udayanga/Udaya_Research_stuff/2024_GAP8_work'

In [2]:
torch.save(model.state_dict(),"nmnist_bnorm_model.pth")

NameError: name 'torch' is not defined

In [9]:
model2 = Net().to('cpu')

In [10]:
model2.load_state_dict(torch.load("nmnist_bnorm_model.pth"))

  model2.load_state_dict(torch.load("nmnist_bnorm_model.pth"))


<All keys matched successfully>

In [11]:
model2.eval()
for epoch in range(1):
    test(model2,'cpu',test_loader)

input shape  torch.Size([1000, 1, 28, 28])
input shape  torch.Size([1000, 1, 28, 28])
input shape  torch.Size([1000, 1, 28, 28])
input shape  torch.Size([1000, 1, 28, 28])
input shape  torch.Size([1000, 1, 28, 28])
input shape  torch.Size([1000, 1, 28, 28])
input shape  torch.Size([1000, 1, 28, 28])
input shape  torch.Size([1000, 1, 28, 28])
input shape  torch.Size([1000, 1, 28, 28])
input shape  torch.Size([1000, 1, 28, 28])

Test set: Average loss: 0.0393, Accuracy: 9873/10000 (99%)



In [10]:
#model_fp32 = copy.deepcopy(model2)

In [12]:
model2.cpu()

Net(
  (nnn): Sequential(
    (0): Conv2d(1, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(20, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Flatten(start_dim=1, end_dim=-1)
    (12): Linear(in_features=180, out_features=50, bias=True)
    (13): ReLU()
    (14): Linear(in_features=50, out_features=10, bias=True)
    (15): LogSoftmax(dim=1)
  )
  (quant): QuantStub()
  (dequant): DeQuantStub()
)

In [13]:
model2.eval()

# attach a global qconfig, which contains information about what kind
# of observers to attach. Use 'x86' for server inference and 'qnnpack'
# for mobile inference. Other quantization configurations such as selecting
# symmetric or asymmetric quantization and MinMax or L2Norm calibration techniques
# can be specified here.
# Note: the old 'fbgemm' is still available but 'x86' is the recommended default
# for server inference.
# model_fp32.qconfig = torch.ao.quantization.get_default_qconfig('fbgemm')
model2.qconfig = torch.ao.quantization.get_default_qconfig('qnnpack')

In [14]:
# Fuse the activations to preceding layers, where applicable.
# This needs to be done manually depending on the model architecture.
# Common fusions include `conv + relu` and `conv + batchnorm + relu`
fused_model_fp32 = torch.ao.quantization.fuse_modules(model2.nnn, [['0','1','2'],['4','5','6'],['8','9']])

In [15]:
print(fused_model_fp32)

Sequential(
  (0): ConvReLU2d(
    (0): Conv2d(1, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
  )
  (1): Identity()
  (2): Identity()
  (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (4): ConvReLU2d(
    (0): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
  )
  (5): Identity()
  (6): Identity()
  (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (8): ConvReLU2d(
    (0): Conv2d(20, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
  )
  (9): Identity()
  (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (11): Flatten(start_dim=1, end_dim=-1)
  (12): Linear(in_features=180, out_features=50, bias=True)
  (13): ReLU()
  (14): Linear(in_features=50, out_features=10, bias=True)
  (15): LogSoftmax(dim=1)
)


In [16]:
# Prepare the model for static quantization. This inserts observers in
# the model that will observe activation tensors during calibration.
model_fp32_prepared = torch.ao.quantization.prepare(fused_model_fp32)



In [17]:
with torch.no_grad():
    for images, _ in train_loader:
        # Pass a batch of images through the model
        model_fp32_prepared(images)

In [18]:
# Convert the observed model to a quantized model. This does several things:
# quantizes the weights, computes and stores the scale and bias value to be
# used with each activation tensor, and replaces key operators with quantized
# implementations.
model_int8 = torch.ao.quantization.convert(model_fp32_prepared)

In [14]:
model_int8[0][0].weight

Parameter containing:
tensor([[[[ 0.9006, -0.2407, -0.1252],
          [ 0.4204, -0.9503,  0.1520],
          [ 0.0657,  0.5205,  0.2810]]],


        [[[ 0.1207,  0.5317,  0.7760],
          [ 0.1942, -0.2327, -0.4553],
          [-0.2382, -0.3260, -0.3472]]],


        [[[ 0.6747,  0.2265, -0.3288],
          [-0.3590, -0.4090, -0.4766],
          [-0.2587, -0.1818,  0.3934]]],


        [[[ 0.4866, -0.0913,  0.2343],
          [ 0.1138,  0.6543,  0.3543],
          [-0.5688, -0.5572, -0.1460]]],


        [[[ 0.2467,  0.0797,  0.2356],
          [-0.0712,  0.5287, -0.2476],
          [ 0.4762,  0.0151, -0.3083]]],


        [[[ 0.1219, -0.3622,  0.2847],
          [-0.2434, -0.1396, -0.6042],
          [-0.2922,  0.2701, -0.1721]]],


        [[[ 0.5390,  0.4231, -0.1339],
          [-0.3449,  0.2951,  0.0676],
          [-0.3420,  0.2230,  0.2748]]],


        [[[ 0.7885, -0.2149, -0.2521],
          [ 0.2788, -0.0479, -0.5006],
          [-0.0481,  0.1247, -0.3771]]],


        [[

In [19]:
for epoch in range(1):
    test(model_int8,'cpu',test_loader)


Test set: Average loss: 0.0393, Accuracy: 9873/10000 (99%)



In [20]:
model_int8

Sequential(
  (0): ConvReLU2d(
    (0): Conv2d(1, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
  )
  (1): Identity()
  (2): Identity()
  (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (4): ConvReLU2d(
    (0): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
  )
  (5): Identity()
  (6): Identity()
  (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (8): ConvReLU2d(
    (0): Conv2d(20, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
  )
  (9): Identity()
  (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (11): Flatten(start_dim=1, end_dim=-1)
  (12): Linear(in_features=180, out_features=50, bias=True)
  (13): ReLU()
  (14): Linear(in_features=50, out_features=10, bias=True)
  (15): LogSoftmax(dim=1)
)

##Quanitization aware training

In [1]:
import torch
import torch.nn as nn
from torch.ao.quantization import QuantStub, DeQuantStub
import torch

import torch.nn as nn
from torchvision import datasets, transforms
import torch.optim as optim
import torch.nn.functional as F
import copy

In [None]:
def train(model,device,train_loader,optimizer,epoch):
    model.train()
    for batch_idx,(data,target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output,target)
        loss.backward()
        optimizer.step()
        if batch_idx % 1000 == 0:
            print('Train Epoch : {} \t Loss: {:.6f}'.format(epoch,loss.item()))
        
def test(model,device,test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data,target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output,target,reduction='sum') #sum up batch loss
            pred = output.max(1,keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:,.0f}%)\n'.format(test_loss,correct,len(test_loader.dataset),100.*correct / len(test_loader.dataset)))

In [3]:
class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.nnn =nn.Sequential(
        nn.Conv2d(1,10,kernel_size=3,padding=1),
        nn.BatchNorm2d(10),
        nn.ReLU(),
        nn.MaxPool2d(2, 2),
        nn.Conv2d(10,20,kernel_size=3,padding=1),
        nn.BatchNorm2d(20),
        nn.ReLU(),
        nn.MaxPool2d(2, 2),
        nn.Conv2d(20,20,kernel_size=3,padding=1),
        nn.ReLU(),
        nn.MaxPool2d(2, 2),
        nn.Flatten(),
        nn.Linear(20*3*3,50),
        nn.ReLU(),
        nn.Linear(50,10),
        nn.LogSoftmax(dim=1))

        #self.conv1 = nn.Conv2d(1,10,kernel_size=5)
        #self.conv2 = nn.Conv2d(10,20,kernel_size=5)
        #self.conv2_drop = nn.Dropout2d()
        #self.fc1 = nn.Linear(320,50)
        #self.fc2 = nn.Linear(50,10)

        self.quant = QuantStub()
        self.dequant = DeQuantStub()

    def forward(self,x):
        """x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x,2,2)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x,2,2)
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        #x = F.dropout(x, training=self.training)
        x = self.fc2(x)"""
        print("input shape ",x.shape)
        #x = self.quant(x)
        out = self.nnn(x)
        #out = self.dequant(out)
        return out


In [4]:
train_loader = torch.utils.data.DataLoader(datasets.MNIST('../data',train=True,download=True,transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,),(0.3081,))])),batch_size=64,shuffle=True)
test_loader = torch.utils.data.DataLoader(datasets.MNIST('../data',train=False,transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.1307,),(0.3081,))])),batch_size=1000,shuffle=False) 

In [5]:
qat_model = Net()
#qat_model.load_state_dict(torch.load("nmnist_bnorm_model.pth"))
#qat_model.eval()
qat_model.qconfig = torch.ao.quantization.get_default_qat_qconfig('x86') #x86 or 'qnnpack'
#qat_model.fuse_model(is_qat=True) #is_qat=True only for quantization aware training.
qat_model = torch.ao.quantization.fuse_modules_qat(qat_model.nnn, [['0','1','2'],['4','5','6'],['8','9']])
#optimizer = optim.SGD(qat_model.parameters(),lr=0.01,momentum=0.5)
qat_model = torch.ao.quantization.prepare_qat(qat_model)



In [6]:
optimizer = optim.SGD(qat_model.parameters(),lr=0.01,momentum=0.5)
for epoch in range(5):
    train(qat_model.to(torch.device('cuda:0')),device=torch.device('cuda:0'),train_loader=train_loader,optimizer=optimizer,epoch=epoch)
    #test(model,device,test_loader)
    qat_model.to(torch.device('cpu'))
    quantized_model = torch.quantization.convert(qat_model)
    quantized_model.eval()
    test(quantized_model,torch.device('cpu'),test_loader)


Train Epoch : 0 	 Loss: 2.330222

Test set: Average loss: 0.1195, Accuracy: 9648/10000 (96%)

Train Epoch : 1 	 Loss: 0.185938


In [7]:
print(quantized_model)

Sequential(
  (0): ConvBnReLU2d(
    (0): Conv2d(1, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (1): Identity()
  (2): Identity()
  (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (4): ConvBnReLU2d(
    (0): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (5): Identity()
  (6): Identity()
  (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (8): ConvReLU2d(
    (0): Conv2d(20, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
  )
  (9): Identity()
  (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (11): Flatten(start_dim=1, end_dim=-1)
  (12): Linear(in_features=180, out_features=50, bias=True)
  (13): ReLU()
  (14): Line

In [8]:
import io

In [9]:
from torch.ao.quantization.quantize_fx import prepare_fx,convert_fx

In [13]:
torch.jit.save(torch.jit.script(quantized_model), "aaa")

In [15]:
mmm = torch.jit.load("aaa", map_location="cpu")

In [None]:
mmm

In [16]:
test(mmm,torch.device('cpu'),test_loader)


Test set: Average loss: 0.0363, Accuracy: 9879/10000 (99%)



In [None]:
m = quantized_model.eval()
example_inputs = torch.rand(1, 1, 28, 28)
prepare_orig = prepare_fx(m, torch.ao.quantization.get_default_qat_qconfig('x86'), example_inputs)

In [None]:
m = quantized_model.eval()
example_inputs = torch.rand(1, 1, 28, 28)
prepare_orig = prepare_fx(m, torch.ao.quantization.get_default_qat_qconfig('x86'), example_inputs)
#prepare_orig = prepare_fx(m, {'' : torch.ao.quantization.get_default_qat_qconfig('x86')})
prepare_orig(torch.rand(1, 1,28,28))
quantized_orig = convert_fx(prepare_orig)

# Save/load using state_dict
b = io.BytesIO()
torch.save(quantized_orig.state_dict(), b)

m2 = quantized_model.eval()
prepared = prepare_fx(m2, {'' : torch.ao.quantization.get_default_qat_qconfig('x86')})
quantized = convert_fx(prepared)
b.seek(0)
quantized.load_state_dict(torch.load(b))

In [None]:
prepare_orig = torch.ao.quantization.prepare_fx(m, {'' : torch.ao.quantization.get_default_qconfig('x86')})
prepare_orig(torch.rand(5, 5))
quantized_orig = torch.ao.quantization.convert_fx(prepare_orig)

In [None]:
from torch.ao.quantization import quantize_fx

In [None]:
model_filepath = "./"

torch.jit.save(torch.jit.script(model), model_filepath)