In [10]:
import torch
from torch import nn
from torch.ao import quantization
from torchvision import models
from torchvision import transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder


class QuantizedVGG16(nn.Module):
    """Wrap a floating-point model between a quantize and a dequantize stub.

    The stubs mark where tensors enter and leave the wrapped network. Until
    the module goes through the eager-mode quantization flow they pass their
    input through unchanged, so the wrapper behaves like the wrapped model.
    """

    def __init__(self, model_fp32):
        """Register the entry stub, the exit stub and the wrapped model.

        Args:
            model_fp32: the floating-point ``nn.Module`` to wrap.
        """
        super().__init__()
        self.quant = quantization.QuantStub()
        self.dequant = quantization.DeQuantStub()
        self.model_fp32 = model_fp32

    def forward(self, x):
        """Apply ``quant``, then the wrapped model, then ``dequant`` to ``x``."""
        return self.dequant(self.model_fp32(self.quant(x)))

# Shared settings. The visible code reads only "batch_size" and "transform"
# (for the calibration loader); "learning_rate" and "epochs" are not
# referenced in this cell.
hyperparams = dict(
    batch_size=4,
    learning_rate=1e-4,
    epochs=5,
    transform=transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        # Normalize computes (x - mean) / std per channel, so a std of 1/255
        # multiplies the mean-centred values by 255.
        transforms.Normalize(
            mean=[0.48235, 0.45882, 0.40784],
            std=[1.0 / 255.0] * 3,
        ),
    ]),
)

# Post-training static quantization of the fine-tuned VGG16:
# load float weights -> attach qconfig -> insert observers -> calibrate ->
# convert to int8 -> save as TorchScript.
#
# The fbgemm qconfig targets CPU kernels and the converted model runs on CPU,
# so the whole flow stays on the CPU instead of bouncing through CUDA.
# NOTE(review): the recorded "Unsupported qscheme: per_channel_affine" error
# is consistent with per-channel quantized weights reaching a non-CPU packing
# path; keeping every tensor on the CPU rules that out. Confirm on re-run.
device = "cpu"

model = models.vgg16(num_classes=2)
# map_location lets a checkpoint saved from a GPU run load on a CPU-only host.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict contains.
model.load_state_dict(
    torch.load("./models/VGG16.pt", map_location=device, weights_only=True)
)

quantized_model = QuantizedVGG16(model).to(device)
# Eager-mode static quantization expects the model in eval mode. Without it,
# the Dropout layers in torchvision's VGG classifier stay active during
# calibration and distort the statistics the observers record.
quantized_model.eval()

quantization_backend = "fbgemm"
quantized_model.qconfig = quantization.get_default_qconfig(quantization_backend)
model_static_quantized = quantization.prepare(quantized_model)

calibartion_dataset = ImageFolder(
    "./datasets/pet/test",
    transform=hyperparams["transform"]
)
calibartion_dataloader = DataLoader(
    calibartion_dataset,
    batch_size=hyperparams["batch_size"]
)

# Calibration: run a few batches through the prepared model so the observers
# can record activation ranges. No gradients are needed for this.
num_calibration_batches = 10
with torch.no_grad():
    for i, (images, target) in enumerate(calibartion_dataloader):
        if i >= num_calibration_batches:
            break
        model_static_quantized(images.to(device))

# Replace the observed float modules with their quantized counterparts.
model_static_quantized = quantization.convert(model_static_quantized)

torch.jit.save(torch.jit.script(model_static_quantized), "./models/PTSQ_VGG16.pt")

  model.load_state_dict(torch.load("./models/VGG16.pt"))


RuntimeError: Unsupported qscheme: per_channel_affine

In [9]:
import os
import time
import torch
from PIL import Image
from torchvision import models
from torchvision import transforms


# Preprocessing applied to the single test image: resize, centre-crop to
# 224x224, convert to a tensor, then normalise per channel.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    # Normalize computes (x - mean) / std, so std = 1/255 multiplies the
    # mean-centred values by 255.
    transforms.Normalize(
        mean=[0.48235, 0.45882, 0.40784],
        std=[1.0 / 255.0] * 3,
    ),
])

# Compare the float VGG16 with its statically quantized TorchScript version
# on one image: raw outputs, inference time and checkpoint size on disk.

# convert("RGB") guarantees three channels for the 3-channel Normalize step,
# even if the file is grayscale or carries an alpha channel. The context
# manager closes the underlying file handle once the tensor is built.
with Image.open("./datasets/images/cat.jpg") as image:
    inputs = transform(image.convert("RGB")).unsqueeze(0)

model = models.vgg16(num_classes=2)
# Load on CPU first (works for checkpoints saved from a GPU run); the model is
# moved to the target device below. weights_only=True restricts unpickling to
# tensors and plain containers, which is all a state_dict contains.
model.load_state_dict(
    torch.load("./models/VGG16.pt", map_location="cpu", weights_only=True)
)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

model_static_quantized = torch.jit.load("./models/PTSQ_VGG16.pt")

# NOTE: the float model runs on `device` (GPU when available) while the
# quantized model always receives the CPU tensor, so the two timings are only
# directly comparable when `device` is "cpu".
with torch.no_grad():
    start_time = time.perf_counter()
    outputs = model(inputs.to(device))
    if device == "cuda":
        # CUDA kernels launch asynchronously; wait for them to finish so the
        # clock measures the actual forward pass.
        torch.cuda.synchronize()
    inference_time = time.perf_counter() - start_time

# Stop the clock before the file-size lookup and printing so neither is
# counted as inference time.
file_size = os.path.getsize("./models/VGG16.pt") / 1e6
print("양자화 적용 전:")
print(f"출력 결과: {outputs}")
print(f"추론 시간: {inference_time:.4f}s")
print(f"파일 크기: {file_size:.2f} MB")
print("\n")

with torch.no_grad():
    start_time = time.perf_counter()
    outputs = model_static_quantized(inputs)
    inference_time = time.perf_counter() - start_time

file_size = os.path.getsize("./models/PTSQ_VGG16.pt") / 1e6
print("양자화 적용 후:")
print(f"출력 결과: {outputs}")
print(f"추론 시간: {inference_time:.4f}s")
print(f"파일 크기: {file_size:.2f} MB")

  model.load_state_dict(torch.load("./models/VGG16.pt"))


양자화 적용 전:
출력 결과: tensor([[ 25.6511, -22.3752]], device='cuda:0')
추론 시간: 0.0027s
파일 크기: 537.08 MB


양자화 적용 후:
출력 결과: tensor([[ 26.0821, -22.6373]])
추론 시간: 0.0415s
파일 크기: 134.55 MB
