## 4.1 环境配置

首先，我们导入所需的环境。数据集和模型沿用前几章的 MNIST 数据集和 LeNet 网络。


In [62]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.quantization
import random
import numpy as np
import torch.nn as nn
import copy
from torch.optim import *
from torch.optim.lr_scheduler import *
from torch.utils.data import DataLoader
from torchprofile import profile_macs
from torchvision.datasets import *
from torchvision.transforms import *
import torch.nn.functional as F
from torchvision import datasets
from utils import LeNet,train,evaluate,get_model_size,get_model_flops,MiB

# Fix the seed of Python's, NumPy's and PyTorch's random number generators to 0.
random.seed(0)
np.random.seed(0)
# Last expression of the cell: its return value (a torch Generator) is what the cell displays.
torch.manual_seed(0)

<torch._C.Generator at 0x7f14fe352af0>

## 4.2 构建模型和数据集

In [63]:
class LeNet(nn.Module):
    """LeNet-style CNN wrapped with quantization stubs.

    Two conv -> ReLU -> max-pool stages feed three fully connected layers.
    The input goes through a ``QuantStub`` first and the logits through a
    ``DeQuantStub`` last.

    NOTE(review): this definition rebinds the ``LeNet`` name that the import
    cell also pulls in from ``utils``.
    """

    def __init__(self, num_classes=10):
        """Create the layers; ``num_classes`` is the width of the last layer."""
        super().__init__()

        # Feature extractor: stage 1.
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5)
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(2, stride=2)

        # Feature extractor: stage 2.
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool2d(2, stride=2)

        # Classifier head. 16 * 4 * 4 = 16 channels of 4x4 features, which is
        # what the two stages above produce for a 1x28x28 input.
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(120, 84)
        self.relu4 = nn.ReLU()
        self.fc3 = nn.Linear(84, num_classes)

        # Entry/exit points of the quantized region.
        self.quant = torch.quantization.QuantStub()
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, x):
        """Return the logits for a batch of images ``x``."""
        x = self.quant(x)

        x = self.conv1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)

        x = self.conv2(x)
        x = self.relu2(x)
        x = self.maxpool2(x)

        # Flatten everything except the batch dimension.
        batch = x.shape[0]
        x = x.contiguous().view(batch, -1)

        x = self.relu3(self.fc1(x))
        x = self.relu4(self.fc2(x))
        x = self.fc3(x)

        return self.dequant(x)

In [64]:
# The model is never moved to another device in this notebook, so it stays on the CPU.
model = LeNet()
print(model)

# Load the trained weights (state dict) from the local checkpoint.
# map_location='cpu' remaps tensors that were saved from a GPU, so the
# checkpoint also loads on machines without CUDA.
# (An alternative checkpoint location used earlier was '../ch02/model.pt'.)
checkpoint = torch.load('./model.pt', map_location='cpu')

# Copy the weights into the model.
model.load_state_dict(checkpoint)

# Independent copy of the float model, used later as the FP32 baseline.
fp32_model = copy.deepcopy(model)

LeNet(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
  (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=256, out_features=120, bias=True)
  (relu3): ReLU()
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (relu4): ReLU()
  (fc3): Linear(in_features=84, out_features=10, bias=True)
  (quant): QuantStub()
  (dequant): DeQuantStub()
)


In [65]:
# Preprocessing: convert images to tensors, then normalize with mean 0.1307 / std 0.3081.
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
]
transform = transforms.Compose(preprocessing_steps)

# Fetch MNIST (downloaded on first use). train=True selects the training
# split, train=False the test split.
# (An alternative data root used earlier was '../ch02/data/mnist'.)
mnist_kwargs = dict(root='./data/mnist', download=True, transform=transform)
train_dataset = datasets.MNIST(train=True, **mnist_kwargs)
test_dataset = datasets.MNIST(train=False, **mnist_kwargs)

# Wrap the datasets in loaders; only the training data is shuffled.
batch_size = 64
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

## 4.3 验证 FP32 模型的精度以及模型大小

In [66]:
# Baseline numbers for the float model: accuracy from utils.evaluate on the
# test loader and size from utils.get_model_size.
fp32_model_accuracy = evaluate(fp32_model, test_loader)
fp32_model_size = get_model_size(fp32_model)
print(f"fp32 model has accuracy={fp32_model_accuracy:.5f}%")
# The size is divided by the MiB constant imported from utils before printing.
print(f"fp32 model has size={fp32_model_size/MiB:.2f} MiB")

HBox(children=(FloatProgress(value=0.0, description='eval', max=157.0, style=ProgressStyle(description_width='…

fp32 model has accuracy=97.99000%
fp32 model has size=0.17 MiB


## 4.4 将模型转化为量化模型

In [67]:
# Query the quantized engines reported by this PyTorch installation.
backend = torch.backends.quantized.supported_engines # run this to check which backends your own environment supports
print(backend)

['qnnpack', 'none', 'fbgemm']


In [75]:
# Turn a copy of the float model into a QAT (quantization-aware training) model.
quant_model = copy.deepcopy(model)

# Pick the quantized engine by name. The order of supported_engines differs
# between platforms (and the list can contain 'none'), so indexing it by
# position is not reliable.
supported_engines = torch.backends.quantized.supported_engines
if 'fbgemm' in supported_engines:
    qat_backend = 'fbgemm'
elif 'qnnpack' in supported_engines:
    qat_backend = 'qnnpack'
else:
    raise RuntimeError(
        f"No usable quantized engine found in {supported_engines}"
    )

# Keep the runtime engine consistent with the qconfig chosen below.
torch.backends.quantized.engine = qat_backend
quant_model.qconfig = torch.quantization.get_default_qat_qconfig(qat_backend)

# prepare_qat expects a model in training mode; make that explicit instead of
# relying on the mode the model happens to be in.
quant_model.train()
torch.quantization.prepare_qat(quant_model, inplace=True)
print(quant_model)

LeNet(
  (conv1): Conv2d(
    1, 6, kernel_size=(5, 5), stride=(1, 1)
    (weight_fake_quant): FakeQuantize(
      fake_quant_enabled=tensor([1], dtype=torch.uint8), observer_enabled=tensor([1], dtype=torch.uint8), quant_min=-128, quant_max=127, dtype=torch.qint8, qscheme=torch.per_channel_symmetric, ch_axis=0, scale=tensor([1.]), zero_point=tensor([0])
      (activation_post_process): MovingAveragePerChannelMinMaxObserver(min_val=tensor([]), max_val=tensor([]))
    )
    (activation_post_process): FakeQuantize(
      fake_quant_enabled=tensor([1], dtype=torch.uint8), observer_enabled=tensor([1], dtype=torch.uint8), quant_min=0, quant_max=255, dtype=torch.quint8, qscheme=torch.per_tensor_affine, ch_axis=-1, scale=tensor([1.]), zero_point=tensor([0])
      (activation_post_process): MovingAverageMinMaxObserver(min_val=inf, max_val=-inf)
    )
  )
  (relu1): ReLU()
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(
    6, 16, kerne

可以看到，上面的模型中插入了 FakeQuantize 模块，这个模块就是用来进行模拟量化（伪量化）的。

## 4.5 验证量化模型的精度

In [82]:
# Accuracy of quant_model on the test loader, measured with utils.evaluate.
# NOTE(review): this cell's recorded execution count (In [82]) is later than
# the fine-tuning (In [77]) and convert (In [79]) cells, and the stored
# accuracy equals the final int8 result, so the saved output may come from the
# converted model rather than the freshly prepared QAT model. Re-run the
# notebook top to bottom to confirm.
quant_model_accuracy = evaluate(quant_model, test_loader)
print(f"quant model has accuracy={quant_model_accuracy:.5f}%")


HBox(children=(FloatProgress(value=0.0, description='eval', max=157.0, style=ProgressStyle(description_width='…

quant model has accuracy=98.37000%


## 4.6 对量化后的模型进行训练

In [77]:
# Fine-tune the QAT model for a few epochs.
num_finetune_epochs = 3  # fine-tuning typically uses 5-10 epochs
# Use a lower learning rate for fine-tuning.
optimizer = torch.optim.SGD(quant_model.parameters(), lr=0.001, momentum=0.9)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_finetune_epochs)
criterion = nn.CrossEntropyLoss()

# Highest test accuracy seen so far (only reported, no checkpoint is kept).
best_accuracy = 0
for epoch in range(num_finetune_epochs):
    # One pass of utils.train over the training data, then measure test accuracy.
    train(quant_model, train_loader, criterion, optimizer, scheduler)
    model_accuracy = evaluate(quant_model, test_loader)
    best_accuracy = max(model_accuracy, best_accuracy)
    print(f'        Epoch {epoch} Accuracy {model_accuracy:.2f}% / Best Accuracy: {best_accuracy:.2f}%')

HBox(children=(FloatProgress(value=0.0, description='train', max=938.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='eval', max=157.0, style=ProgressStyle(description_width='…

        Epoch 0 Accuracy 98.34% / Best Accuracy: 98.34%


HBox(children=(FloatProgress(value=0.0, description='train', max=938.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='eval', max=157.0, style=ProgressStyle(description_width='…

        Epoch 1 Accuracy 98.32% / Best Accuracy: 98.34%


HBox(children=(FloatProgress(value=0.0, description='train', max=938.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='eval', max=157.0, style=ProgressStyle(description_width='…

        Epoch 2 Accuracy 98.28% / Best Accuracy: 98.34%


## 4.7 将量化模型转化为量化部署模型并验证其精度

这里重新定义一个验证函数。因为模型量化后，输入需要先进行量化，输出需要反量化回浮点数。相比之前的验证函数，具体添加的内容如下：
    outputs = dequant(outputs)
    inputs = quant(inputs)

In [78]:
from tqdm.auto import tqdm
# Standalone quant/dequant stubs used by evaluate_quant below.
# NOTE(review): these two stubs are never passed through prepare/convert in
# this notebook, so they presumably act as pass-throughs; the converted
# model's own `quant`/`dequant` submodules do the actual conversion — confirm.
quant = torch.quantization.QuantStub() 
dequant = torch.quantization.DeQuantStub()

def evaluate_quant(
    model: nn.Module,
    dataloader: DataLoader,
    extra_preprocess = None
) -> float:
    """Return the top-1 accuracy (in percent) of ``model`` over ``dataloader``.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        dataloader: Iterable yielding ``(inputs, targets)`` batches.
        extra_preprocess: Optional iterable of callables applied to every
            input batch, in order, before the forward pass.

    Returns:
        Percentage of samples whose highest-scoring class equals the target.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    model.eval()

    num_samples = 0
    num_correct = 0

    # Evaluation needs no autograd graph, so disable gradient tracking.
    with torch.no_grad():
        for inputs, targets in tqdm(dataloader, desc="eval", leave=False):
            if extra_preprocess is not None:
                for preprocess in extra_preprocess:
                    inputs = preprocess(inputs)

            # Pass every batch through the module-level input stub exactly
            # once, whether or not preprocessing hooks were supplied
            # (previously this only happened inside the hook loop).
            inputs = quant(inputs)

            # Inference, followed by the module-level output stub.
            outputs = dequant(model(inputs))

            # Convert logits to class indices on the CPU.
            predictions = torch.max(outputs.to("cpu"), 1)[1]

            # Update metrics.
            num_samples += targets.size(0)
            num_correct += (predictions == targets).sum()

    return (num_correct / num_samples * 100).item()

将量化训练后的模型中的 FakeQuantize 节点去掉，这样可以得到一个真正的 int8 模型。可以看到，下面模型中的 Conv2d 和 Linear 算子都变成了对应的 Quantized 算子。

In [79]:
# Convert the fine-tuned QAT model into the deployable quantized model.
# Switch to eval mode explicitly before converting instead of relying on
# whatever mode an earlier helper left the model in.
quant_model.eval()
# In-place conversion: quant_model itself now holds the quantized modules.
torch.quantization.convert(quant_model, inplace=True)
print(quant_model)



LeNet(
  (conv1): QuantizedConv2d(1, 6, kernel_size=(5, 5), stride=(1, 1), scale=0.16204923391342163, zero_point=36)
  (relu1): ReLU()
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): QuantizedConv2d(6, 16, kernel_size=(5, 5), stride=(1, 1), scale=0.3756241202354431, zero_point=54)
  (relu2): ReLU()
  (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): QuantizedLinear(in_features=256, out_features=120, scale=0.28476396203041077, zero_point=53, qscheme=torch.per_channel_affine)
  (relu3): ReLU()
  (fc2): QuantizedLinear(in_features=120, out_features=84, scale=0.2204333394765854, zero_point=55, qscheme=torch.per_channel_affine)
  (relu4): ReLU()
  (fc3): QuantizedLinear(in_features=84, out_features=10, scale=0.28054389357566833, zero_point=60, qscheme=torch.per_channel_affine)
  (quant): Quantize(scale=tensor([0.0256]), zero_point=tensor([17]), dtype=torch.quint8)
  (dequant): DeQuantize()
)


## 4.8 验证全int8模型的精度

可以看到全int8模型的精度和量化训练时的最后一个epoch的精度几乎一致。

In [81]:
# Accuracy of the converted model on the test loader, measured with the
# evaluate_quant helper defined in this notebook (no extra preprocessing).
final_quant_model_accuracy = evaluate_quant(quant_model, test_loader)
print(f"final_quant model has accuracy={final_quant_model_accuracy:.5f}%")

HBox(children=(FloatProgress(value=0.0, description='eval', max=157.0, style=ProgressStyle(description_width='…

final_quant model has accuracy=98.37000%
