### Квантует DeepLabV3 MobilenetV3

Стартуем с трейнлупа, который нам выдали pytorch

Датасет COCO, https://cocodataset.org/#download 
Качаем train2017 и val2017

Можно использовать [сабсет](https://drive.google.com/file/d/1qdtAbK-iOsgJZxjbBva0pw2Vi5penjPc/view?usp=sharing) трейна на 20000, но тогда заранее залезте в класс датасета, и добавте работу с пропущенными картинками

Баллы: 20 баллов Static Quantization + 20 баллов Quantization Aware Training + 10 баллов Distillation

In [1]:
import datetime
import os
import pickle
import time
from copy import deepcopy
from pathlib import Path

import torch
import torch.utils.data
from torch import nn
from torch.ao.quantization.quantize_fx import convert_fx
from torch.ao.quantization.quantize_fx import fuse_fx
from torch.optim.lr_scheduler import PolynomialLR
from torchvision.models.segmentation import DeepLabV3_MobileNet_V3_Large_Weights, deeplabv3_mobilenet_v3_large
from tqdm import tqdm

import utils
from quantization_utils.fake_quantization import fake_quantization
from quantization_utils.static_quantization import quantize_static
from train import evaluate
from train import get_dataset
from train import train_one_epoch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Вытащил дефолтные аргументы, чтобы не упражняться с argparse в ноутбуке
with Path('./torch_default_args.pickle').open('rb') as file:
    args = pickle.load(file)

In [3]:
# Подобирайте под ваше железо
args.data_path = '/home/gvasserm/data/coco2017/'
args.epochs = 1
args.batch_size = 32
args.workers = 8

In [4]:
args

Namespace(data_path='/home/gvasserm/data/coco2017/', dataset='coco', model='deeplabv3_mobilenet_v3_large', aux_loss=False, device='cuda', batch_size=32, epochs=1, workers=8, lr=0.01, momentum=0.9, weight_decay=0.0001, lr_warmup_epochs=0, lr_warmup_method='linear', lr_warmup_decay=0.01, print_freq=10, output_dir='.', resume='', start_epoch=0, test_only=False, use_deterministic_algorithms=False, world_size=1, dist_url='env://', weights=None, weights_backbone=None, amp=False, backend='pil', use_v2=False)

### Сначала просто валидация обычной сетки, прям на гпу

In [5]:
model = deeplabv3_mobilenet_v3_large(weights=DeepLabV3_MobileNet_V3_Large_Weights.DEFAULT)
model.eval();

In [6]:
if args.output_dir:
    utils.mkdir(args.output_dir)

utils.init_distributed_mode(args)

device = torch.device(args.device)

dataset_test, num_classes = get_dataset(args, is_train=False)

test_sampler = torch.utils.data.SequentialSampler(dataset_test)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=16, sampler=test_sampler, num_workers=args.workers, collate_fn=utils.collate_fn
)

Not using distributed mode
loading annotations into memory...
Done (t=0.27s)
creating index...
index created!


In [7]:
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
model.cuda()
confmat = evaluate(model, data_loader_test, device=device, num_classes=num_classes)
print(confmat)

Test:  [  0/209]  eta: 0:08:53    time: 2.5547  data: 1.3405  max mem: 3971
Test:  [100/209]  eta: 0:01:02    time: 0.4956  data: 0.0267  max mem: 11393
Test:  [200/209]  eta: 0:00:05    time: 0.5780  data: 0.0276  max mem: 11393
Test: Total time: 0:01:57
global correct: 91.2
average row correct: ['94.8', '84.0', '69.2', '70.9', '58.8', '45.3', '74.2', '59.5', '92.3', '31.4', '78.1', '54.0', '80.2', '78.6', '78.9', '87.7', '47.4', '87.2', '51.7', '84.3', '61.4']
IoU: ['90.3', '67.7', '56.0', '53.1', '42.7', '34.8', '67.1', '48.2', '74.7', '26.7', '63.3', '32.7', '61.8', '65.8', '66.8', '76.9', '24.9', '68.4', '42.4', '68.9', '50.8']
mean IoU: 56.4


  return torch.tensor(val)


### Заквантуем статические сетку, посмотрим на точность и скорость

In [7]:
# Квантуем
# Делаем fuse, делаем quantize_static и quantize_utils (посмотрите что там с кодом)
# Можно покрутить параметр num_batches, чтобы посмотреть сколько нужно данных на калибровку
model_fused = fuse_fx(model)
dataset_train, num_classes = get_dataset(args, is_train=True)
train_sampler = torch.utils.data.SequentialSampler(dataset_train)
data_loader_train = torch.utils.data.DataLoader(
        dataset_train, batch_size=24, sampler=train_sampler, num_workers=args.workers, collate_fn=utils.collate_fn
    )
q_model = quantize_static(model_fused, data_loader_train, num_batches=24, device='cuda:0')


loading annotations into memory...
Done (t=7.48s)
creating index...
index created!




In [9]:
# Замерим скорость квантованной модели на CPU
# Не забываем, от размера батча будет зависить буст!
def profile(model, bs=24, device='cpu'):
    # Ensure the model is in evaluation mode to disable layers like dropout and batchnorm during inference
    model.eval()
    model.to(device)

    input_tensor = torch.rand(bs, 3, 520, 520, dtype=torch.float)
    input_tensor = input_tensor.to(device)

    # Warm-up (optional, but recommended for more accurate timing, especially on GPU)
    with torch.no_grad():
        for _ in range(10):
            _ = model(input_tensor)

    # Timing starts here
    start_time = time.time()

    # Forward pass
    n = 10
    with torch.no_grad():
        for _ in range(10):
            output = model(input_tensor)

    # Timing ends here
    end_time = time.time()

    # Calculate and print the elapsed time in milliseconds
    elapsed_time_ms = (end_time - start_time) * 1000/n/bs
    print(f"Elapsed time for the forward pass: {elapsed_time_ms:.2f}ms, batch size: {bs}")

In [10]:
# Замеряем с бачтом 1, буста нет
# Замеряем с батчом 32, буст есть
# Мораль, latency != throughput. В сетке всегда есть накладные расходы, кроме перемалывания матричек
profile(q_model, bs=24, device='cpu')

Elapsed time for the forward pass: 127.67ms, batch size: 24


In [11]:
profile(q_model, bs=1, device='cpu')

Elapsed time for the forward pass: 147.68ms, batch size: 1


In [12]:
# Замерим скорость оригинальной модели на CPU
# У меня на intel i9 при батчсайзе 32 получился x2 буст
profile(model, bs=24, device='cpu')

Elapsed time for the forward pass: 98.06ms, batch size: 24


### Here I've received faster runtime for fp32 then for quantized model. Currently no idea how to improve it.

In [8]:
# Посчитаем метрики квантованной модели
# У меня была просадка где-то до 56 IoU
q_model.cpu()
confmat = evaluate(q_model, data_loader_test, device='cpu', num_classes=num_classes)
print(confmat)

'''
global correct: 89.8
average row correct: ['94.9', '77.5', '66.5', '67.9', '44.0', '25.9', '61.0', '45.2', '83.1', '20.9', '68.2', '57.0', '63.6', '62.1', '69.6', '80.9', '37.2', '75.0', '32.1', '67.7', '49.2']
IoU: ['88.9', '64.5', '52.6', '32.2', '34.5', '22.7', '57.8', '38.7', '62.0', '19.0', '55.6', '31.9', '51.9', '55.3', '61.5', '71.3', '22.3', '60.6', '29.0', '61.0', '44.0']
mean IoU: 48.4
'''

Test:  [  0/313]  eta: 0:42:53    time: 8.2228  data: 1.6694  max mem: 1000
Test:  [100/313]  eta: 0:19:24    time: 5.0432  data: 0.0012  max mem: 1000
Test:  [200/313]  eta: 0:10:16    time: 5.4540  data: 0.0011  max mem: 1000
Test:  [300/313]  eta: 0:01:11    time: 5.3760  data: 0.0013  max mem: 1000
Test: Total time: 0:28:29
global correct: 89.1
average row correct: ['94.5', '64.8', '60.4', '60.6', '32.8', '36.4', '62.4', '43.6', '79.0', '25.1', '44.8', '56.6', '52.6', '45.9', '62.4', '79.6', '32.2', '66.9', '26.5', '70.5', '53.2']
IoU: ['88.2', '58.0', '50.5', '33.7', '29.4', '27.5', '58.8', '37.7', '59.4', '21.8', '37.9', '31.4', '41.4', '42.8', '56.6', '69.4', '21.8', '54.7', '23.4', '61.7', '28.6']
mean IoU: 44.5


  return torch.tensor(val)


"\nglobal correct: 89.8\naverage row correct: ['94.9', '77.5', '66.5', '67.9', '44.0', '25.9', '61.0', '45.2', '83.1', '20.9', '68.2', '57.0', '63.6', '62.1', '69.6', '80.9', '37.2', '75.0', '32.1', '67.7', '49.2']\nIoU: ['88.9', '64.5', '52.6', '32.2', '34.5', '22.7', '57.8', '38.7', '62.0', '19.0', '55.6', '31.9', '51.9', '55.3', '61.5', '71.3', '22.3', '60.6', '29.0', '61.0', '44.0']\nmean IoU: 48.4\n"

In [None]:

'''
average row correct: ['94.0', '83.5', '71.0', '63.7', '56.1', '46.3', '77.0', '57.3', '89.2', '38.3', '76.4', '68.4', '77.8', '77.9', '78.5', '86.5', '46.9', '82.4', '66.6', '84.0', '64.9']
IoU: ['90.0', '67.6', '58.2', '53.5', '43.8', '34.8', '67.9', '45.9', '74.0', '28.9', '63.0', '37.2', '62.2', '63.9', '69.0', '75.5', '31.0', '63.4', '48.1', '67.5', '53.7']
mean IoU: 57.1
'''
'''
global correct: 90.9
average row correct: ['94.8', '78.2', '65.8', '60.7', '49.3', '44.9', '72.8', '56.1', '89.9', '31.3', '73.4', '59.6', '74.3', '74.0', '76.1', '84.9', '40.3', '77.6', '58.4', '79.2', '65.9']
IoU: ['90.0', '63.1', '54.7', '52.3', '40.8', '32.4', '65.4', '45.8', '67.7', '25.5', '61.7', '35.4', '55.4', '62.7', '65.7', '75.0', '24.7', '67.0', '44.1', '66.2', '50.4']
mean IoU: 54.6
'''