In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.quantization import fuse_modules
from torch.nn.quantized import FloatFunctional
from torch import Tensor
import torchvision
from torchvision import datasets, transforms

import os
import random
import time
import copy
import numpy as np
from resnet import resnet18

# Set up warnings
import warnings
warnings.filterwarnings(
    action='ignore',
    category=DeprecationWarning,
    module=r'.*'
)
warnings.filterwarnings(
    action='default',
    module=r'torch.ao.quantization'
)

In [2]:
def set_random_seeds(random_seed=0):
    torch.manual_seed(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)

In [3]:
def prepare_dataloader(num_workers=8, train_batch_size=128, eval_batch_size=256):

    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        # transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    ])

    test_transform = transforms.Compose([
        transforms.ToTensor(),
        # transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    ])

    train_set = torchvision.datasets.CIFAR10(root="data", train=True, download=True, transform=train_transform) 
    # We will use test set for validation and test in this project.
    # Do not use test set for validation in practice!
    test_set = torchvision.datasets.CIFAR10(root="data", train=False, download=True, transform=test_transform)

    train_sampler = torch.utils.data.RandomSampler(train_set)
    test_sampler = torch.utils.data.SequentialSampler(test_set)

    train_loader = torch.utils.data.DataLoader(
        dataset=train_set, batch_size=train_batch_size,
        sampler=train_sampler, num_workers=num_workers)

    test_loader = torch.utils.data.DataLoader(
        dataset=test_set, batch_size=eval_batch_size,
        sampler=test_sampler, num_workers=num_workers)

    return train_loader, test_loader

In [4]:
def evaluate_model(model, test_loader, device, criterion=None):

    model.eval()
    model.to(device)

    running_loss = 0
    running_corrects = 0

    for inputs, labels in test_loader:

        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)

        if criterion is not None:
            loss = criterion(outputs, labels).item()
        else:
            loss = 0

        # statistics
        running_loss += loss * inputs.size(0)
        running_corrects += torch.sum(preds == labels.data)

    eval_loss = running_loss / len(test_loader.dataset)
    eval_accuracy = running_corrects / len(test_loader.dataset)

    return eval_loss, eval_accuracy

In [5]:
def train_model(model, train_loader, test_loader, device, learning_rate=1e-1, num_epochs=200):

    # The training configurations were not carefully selected.

    criterion = nn.CrossEntropyLoss()

    model.to(device)

    # It seems that SGD optimizer is better than Adam optimizer for ResNet18 training on CIFAR10.
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], gamma=0.1, last_epoch=-1)
    
    # Evaluation
    model.eval()
    eval_loss, eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=device, criterion=criterion)
    print("Epoch: {:02d} Eval Loss: {:.3f} Eval Acc: {:.3f}".format(-1, eval_loss, eval_accuracy))

    for epoch in range(num_epochs):

        # Training
        model.train()

        running_loss = 0
        running_corrects = 0

        for inputs, labels in train_loader:

            inputs = inputs.to(device)
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # statistics
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)

        train_loss = running_loss / len(train_loader.dataset)
        train_accuracy = running_corrects / len(train_loader.dataset)

        # Evaluation
        model.eval()
        eval_loss, eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=device, criterion=criterion)

        # Set learning rate scheduler
        scheduler.step()

        print("Epoch: {:03d} Train Loss: {:.3f} Train Acc: {:.3f} Eval Loss: {:.3f} Eval Acc: {:.3f}".format(epoch, train_loss, train_accuracy, eval_loss, eval_accuracy))

    return model

In [6]:
def calibrate_model(model, loader, device=torch.device("cpu:0")):

    model.to(device)
    model.eval()

    for inputs, labels in loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        _ = model(inputs)

def measure_inference_latency(model,
                              device,
                              input_size=(1, 3, 32, 32),
                              num_samples=100,
                              num_warmups=10):

    model.to(device)
    model.eval()

    x = torch.rand(size=input_size).to(device)

    with torch.no_grad():
        for _ in range(num_warmups):
            _ = model(x)
    torch.cuda.synchronize()

    with torch.no_grad():
        start_time = time.time()
        for _ in range(num_samples):
            _ = model(x)
            torch.cuda.synchronize()
        end_time = time.time()
    elapsed_time = end_time - start_time
    elapsed_time_ave = elapsed_time / num_samples

    return elapsed_time_ave

In [7]:
def save_model(model, model_dir, model_filename):

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_filepath = os.path.join(model_dir, model_filename)
    torch.save(model.state_dict(), model_filepath)

def load_model(model, model_filepath, device):

    model.load_state_dict(torch.load(model_filepath, map_location=device))

    return model

In [8]:
def save_torchscript_model(model, model_dir, model_filename):

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_filepath = os.path.join(model_dir, model_filename)
    torch.jit.save(torch.jit.script(model), model_filepath)
    
def load_torchscript_model(model_filepath, device):

    model = torch.jit.load(model_filepath, map_location=device)

    return model

In [9]:
def create_model(num_classes=10):

    model = resnet18(num_classes=num_classes, pretrained=False)
    return model

In [10]:
class QuantizedResNet18(nn.Module):
    def __init__(self, model_fp32):
        super(QuantizedResNet18, self).__init__()
        
        self.quant = torch.quantization.QuantStub()
        self.dequant = torch.quantization.DeQuantStub()
        self.model_fp32 = model_fp32

    def forward(self, x):
        
        x = self.quant(x)
        x = self.model_fp32(x)
        x = self.dequant(x)
        return x

In [11]:
def model_equivalence(model_1, model_2, device, rtol=1e-05, atol=1e-08, num_tests=100, input_size=(1,3,32,32)):

    model_1.to(device)
    model_2.to(device)

    for _ in range(num_tests):
        x = torch.rand(size=input_size).to(device)
        y1 = model_1(x).detach().cpu().numpy()
        y2 = model_2(x).detach().cpu().numpy()
        if np.allclose(a=y1, b=y2, rtol=rtol, atol=atol, equal_nan=False) == False:
            print("Model equivalence test sample failed: ")
            print(y1)
            print(y2)
            return False

    return True

In [12]:
random_seed = 0
num_classes = 10
cuda_device = torch.device("cuda:0")
cpu_device = torch.device("cpu:0")
model_dir = "saved_models"
model_filename = "resnet18_cifar10.pt"
quantized_model_filename = "resnet18_quantized_cifar10.pt"
model_filepath = os.path.join(model_dir, model_filename)
quantized_model_filepath = os.path.join(model_dir, quantized_model_filename)
set_random_seeds(random_seed=random_seed)

In [13]:
train_loader, test_loader = prepare_dataloader(num_workers=8, train_batch_size=128, eval_batch_size=256)

Files already downloaded and verified
Files already downloaded and verified


In [14]:
# Create an untrained model.
model = create_model(num_classes=num_classes)

print("Training Model...")
model = train_model(model=model, train_loader=train_loader, test_loader=test_loader, device=cuda_device, learning_rate=1e-1, num_epochs=200)

# Save model.
save_model(model=model, model_dir=model_dir, model_filename=model_filename)

Training Model...
Epoch: -1 Eval Loss: 2.325 Eval Acc: 0.098
Epoch: 000 Train Loss: 2.127 Train Acc: 0.275 Eval Loss: 1.653 Eval Acc: 0.397
Epoch: 001 Train Loss: 1.553 Train Acc: 0.423 Eval Loss: 1.374 Eval Acc: 0.494
Epoch: 002 Train Loss: 1.385 Train Acc: 0.496 Eval Loss: 1.245 Eval Acc: 0.543
Epoch: 003 Train Loss: 1.232 Train Acc: 0.552 Eval Loss: 1.144 Eval Acc: 0.594
Epoch: 004 Train Loss: 1.121 Train Acc: 0.600 Eval Loss: 1.030 Eval Acc: 0.634
Epoch: 005 Train Loss: 1.027 Train Acc: 0.635 Eval Loss: 0.998 Eval Acc: 0.660
Epoch: 006 Train Loss: 0.968 Train Acc: 0.658 Eval Loss: 0.958 Eval Acc: 0.662
Epoch: 007 Train Loss: 0.910 Train Acc: 0.679 Eval Loss: 0.875 Eval Acc: 0.700
Epoch: 008 Train Loss: 0.869 Train Acc: 0.696 Eval Loss: 0.885 Eval Acc: 0.696
Epoch: 009 Train Loss: 0.821 Train Acc: 0.713 Eval Loss: 0.829 Eval Acc: 0.715
Epoch: 010 Train Loss: 0.783 Train Acc: 0.726 Eval Loss: 0.839 Eval Acc: 0.720
Epoch: 011 Train Loss: 0.756 Train Acc: 0.738 Eval Loss: 0.788 Eval Ac

In [15]:
# 1. floating point 타입으로 모델을 학습하거나 pre-trained 모델 불러오기
model = load_model(model=create_model(num_classes=num_classes), model_filepath=model_filepath, device=cuda_device)

# 2. 모델을 CPU 상태로 두고 학습 모드로 변환
model.to(cpu_device)
model.train()


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu1): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (skip_add): FloatFunctional(
        (activation_post_process): Identity()
      )
      (relu2): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    

In [16]:
# Make a copy of the model for layer fusion
fused_model = copy.deepcopy(model)

# The model has to be switched to training mode before any layer fusion.
# Otherwise the quantization aware training will not work correctly.
fused_model.train()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu1): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (skip_add): FloatFunctional(
        (activation_post_process): Identity()
      )
      (relu2): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    

In [17]:
# 3. layer fusion 적용
# Fuse the model in place rather manually.

fused_model = torch.quantization.fuse_modules(fused_model, [["conv1", "bn1", "relu"]], inplace=True)
for module_name, module in fused_model.named_children():
    if "layer" in module_name:
        for basic_block_name, basic_block in module.named_children():
            torch.quantization.fuse_modules(basic_block, [["conv1", "bn1", "relu1"], ["conv2", "bn2"]], inplace=True)
            for sub_block_name, sub_block in basic_block.named_children():
                if sub_block_name == "downsample":
                    torch.quantization.fuse_modules(sub_block, [["0", "1"]], inplace=True)

In [18]:
# 4. 모델을 평가 모드로 변환 후 layer fusion이 잘 적용되었는지 확인

model.eval()
fused_model.eval()
assert model_equivalence(model_1=model, model_2=fused_model, device=cpu_device, rtol=1e-03, atol=1e-06, num_tests=100, input_size=(1,3,32,32)), "Fused model is not equivalent to the original model!"

In [19]:
# 5. input에는 torch.quantization.QuantStub() 적용
#    output에는 torch.quantization.DeQuantStub() 적용

quantized_model = QuantizedResNet18(model_fp32=fused_model)

# 6. quantization configuration 지정
quantized_model.qconfig = torch.quantization.get_default_qconfig("fbgemm")

torch.quantization.prepare_qat(quantized_model, inplace=True)



QuantizedResNet18(
  (quant): QuantStub(
    (activation_post_process): HistogramObserver()
  )
  (dequant): DeQuantStub()
  (model_fp32): ResNet(
    (conv1): ConvBnReLU2d(
      3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (weight_fake_quant): PerChannelMinMaxObserver(min_val=tensor([]), max_val=tensor([]))
      (activation_post_process): HistogramObserver()
    )
    (bn1): Identity()
    (relu): Identity()
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): ConvBnReLU2d(
          64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False
          (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (weight_fake_quant): PerChannelMinMaxObserver(min_val=tensor([]), max_val=tensor([]))
          (activation

In [20]:
# 7. QAT를 하기 위한 quantization 모델
# 8. CUDA 상태로 적용 후, QAT 모델 학습
print("Training QAT Model...")
quantized_model.train()
train_model(model=quantized_model, train_loader=train_loader, test_loader=test_loader, device=cuda_device, learning_rate=1e-3, num_epochs=10)

Training QAT Model...
Epoch: -1 Eval Loss: 0.636 Eval Acc: 0.874
Epoch: 000 Train Loss: 0.025 Train Acc: 0.992 Eval Loss: 0.636 Eval Acc: 0.874
Epoch: 001 Train Loss: 0.025 Train Acc: 0.991 Eval Loss: 0.642 Eval Acc: 0.873
Epoch: 002 Train Loss: 0.024 Train Acc: 0.992 Eval Loss: 0.638 Eval Acc: 0.873
Epoch: 003 Train Loss: 0.024 Train Acc: 0.992 Eval Loss: 0.642 Eval Acc: 0.874
Epoch: 004 Train Loss: 0.024 Train Acc: 0.992 Eval Loss: 0.647 Eval Acc: 0.872
Epoch: 005 Train Loss: 0.024 Train Acc: 0.992 Eval Loss: 0.643 Eval Acc: 0.874
Epoch: 006 Train Loss: 0.024 Train Acc: 0.992 Eval Loss: 0.646 Eval Acc: 0.874
Epoch: 007 Train Loss: 0.023 Train Acc: 0.992 Eval Loss: 0.647 Eval Acc: 0.874
Epoch: 008 Train Loss: 0.022 Train Acc: 0.993 Eval Loss: 0.645 Eval Acc: 0.875
Epoch: 009 Train Loss: 0.023 Train Acc: 0.992 Eval Loss: 0.648 Eval Acc: 0.875


QuantizedResNet18(
  (quant): QuantStub(
    (activation_post_process): HistogramObserver()
  )
  (dequant): DeQuantStub()
  (model_fp32): ResNet(
    (conv1): ConvBnReLU2d(
      3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (weight_fake_quant): PerChannelMinMaxObserver(
        min_val=tensor([-2.4343e-02, -5.5356e-02, -1.6924e-01, -2.2380e-02, -1.0226e-02,
                -2.5949e-01, -1.4813e-02, -3.3876e-01, -1.4831e-01, -2.5777e-02,
                -1.2778e-01, -1.2449e-01, -6.8747e-03, -2.0834e-02, -3.2141e-01,
                -6.0829e-02, -1.1857e-01, -3.3610e-01, -3.3183e-02, -7.0651e-02,
                -1.4683e-05, -1.4628e-01, -3.6514e-01, -8.4780e-02, -2.1904e-01,
                -2.5619e-01, -7.9354e-02, -2.3710e-01, -5.7769e-02, -1.0227e-03,
                -5.1797e-02, -3.8092e-02, -1.1359e-01, -8.0964e-02, -1.6891e-02,
                -1.3513e-01, -2

In [21]:
# 9. 다시 CPU 상태로 두고 
#    QAT가 적용된 floating point 모델을 quantized integer model로 변환    
quantized_model.to(cpu_device)
quantized_model = torch.quantization.convert(quantized_model, inplace=True)

In [22]:
quantized_model.eval()
save_torchscript_model(model=quantized_model, model_dir=model_dir, model_filename=quantized_model_filename)

In [23]:
# Load quantized model.
quantized_jit_model = load_torchscript_model(model_filepath=quantized_model_filepath, device=cpu_device)

In [24]:
_, fp32_eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=cpu_device, criterion=None)
_, int8_eval_accuracy = evaluate_model(model=quantized_jit_model, test_loader=test_loader, device=cpu_device, criterion=None)

In [25]:
# Evaluate

print("FP32 evaluation accuracy: {:.3f}".format(fp32_eval_accuracy))
print("INT8 evaluation accuracy: {:.3f}".format(int8_eval_accuracy))

fp32_cpu_inference_latency = measure_inference_latency(model=model, device=cpu_device, input_size=(1,3,32,32), num_samples=100)
int8_cpu_inference_latency = measure_inference_latency(model=quantized_model, device=cpu_device, input_size=(1,3,32,32), num_samples=100)
int8_jit_cpu_inference_latency = measure_inference_latency(model=quantized_jit_model, device=cpu_device, input_size=(1,3,32,32), num_samples=100)
fp32_gpu_inference_latency = measure_inference_latency(model=model, device=cuda_device, input_size=(1,3,32,32), num_samples=100)

print("FP32 CPU Inference Latency: {:.2f} ms / sample".format(fp32_cpu_inference_latency * 1000))
print("FP32 CUDA Inference Latency: {:.2f} ms / sample".format(fp32_gpu_inference_latency * 1000))
print("INT8 CPU Inference Latency: {:.2f} ms / sample".format(int8_cpu_inference_latency * 1000))
print("INT8 JIT CPU Inference Latency: {:.2f} ms / sample".format(int8_jit_cpu_inference_latency * 1000))

FP32 evaluation accuracy: 0.874
INT8 evaluation accuracy: 0.873
FP32 CPU Inference Latency: 4.00 ms / sample
FP32 CUDA Inference Latency: 3.48 ms / sample
INT8 CPU Inference Latency: 3.11 ms / sample
INT8 JIT CPU Inference Latency: 1.62 ms / sample
