In [1]:
import cv2
import logging
import functools
import numpy as np
from PIL import Image
from typing import (
    Tuple, 
    Optional,
    Callable
)

from pathlib import Path
import albumentations as albu
import matplotlib.pyplot as plt
from albumentations.pytorch.transforms import ToTensorV2
from pytorch_lightning.callbacks import QuantizationAwareTraining

import torch

from nncf import NNCFConfig  # Important - should be imported directly after torch
from nncf.common.utils.logger import set_log_level

set_log_level(logging.ERROR)  # Disables all NNCF info and warning messages
from nncf.torch import create_compressed_model, register_default_init_args
from openvino.inference_engine import IECore
from torch.jit import TracerWarning

import tensorboard
import torch.nn as nn
from torchmetrics import IoU
from torch.nn.utils import prune
import segmentation_models_pytorch as smp
from torch.utils.data import Dataset, DataLoader
from torchmetrics.detection.map import MeanAveragePrecision

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint

import warnings
warnings.filterwarnings(action="ignore")

%load_ext tensorboard

2022-02-22 13:07:49.834150: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib/python3.7/site-packages/cv2/../../lib64:/usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2022-02-22 13:07:49.834192: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
  curr=torch.__version__


# Functions

In [2]:
def get_preprocessing(preprocessing_fn):
    """Build the final per-sample preprocessing pipeline.

    Args:
        preprocessing_fn (callable): data normalization function applied to the
            image only (can be specific to each pretrained neural network).

    Returns:
        albumentations.Compose: ``preprocessing_fn`` wrapped in ``albu.Lambda``
        followed by ``ToTensorV2(transpose_mask=True)``.
    """
    normalize = albu.Lambda(image=preprocessing_fn)
    to_tensor = ToTensorV2(transpose_mask=True)
    return albu.Compose([normalize, to_tensor])


class Transforms:
    """Albumentations augmentation pipeline chosen by dataset segment.

    ``segment="train"`` selects the randomized training pipeline; every other
    value selects the deterministic resize-and-pad pipeline.
    """

    def __init__(self, segment="train"):
        build_steps = self._train_steps if segment == "train" else self._eval_steps
        self.transforms = albu.Compose(build_steps())

    @staticmethod
    def _train_steps():
        """Return the training augmentations (resize, jitter, pad, crop, flip)."""
        colour_jitter = albu.OneOf(
            [
                albu.ColorJitter(hue=0.01, saturation=0.01),
                albu.RandomBrightnessContrast(brightness_limit=0.05, contrast_limit=0.05),
            ],
            p=0.3,
        )
        # Either crop a 224x224 window or squash the padded image to 224x224.
        to_network_size = albu.OneOf(
            [
                albu.RandomCrop(width=224, height=224),
                albu.Resize(width=224, height=224),
            ],
            p=1,
        )
        return [
            albu.LongestMaxSize(max_size=230, always_apply=True, p=1),
            colour_jitter,
            albu.ShiftScaleRotate(border_mode=1, rotate_limit=30, p=0.3),
            albu.PadIfNeeded(
                min_height=230,
                min_width=230,
                always_apply=True,
                border_mode=0,
                value=(255, 255, 255),
            ),
            to_network_size,
            albu.HorizontalFlip(p=0.5),
        ]

    @staticmethod
    def _eval_steps():
        """Return the evaluation steps: longest side to 224, then pad to 224x224."""
        return [
            albu.LongestMaxSize(max_size=224, always_apply=True, p=1),
            albu.PadIfNeeded(
                min_height=224,
                min_width=224,
                always_apply=True,
                border_mode=0,
                value=(255, 255, 255),
            ),
        ]

    def __call__(self, img, msk, *args, **kwargs):
        """Apply the pipeline to one image/mask pair; extra arguments are ignored."""
        image_array = np.array(img)
        mask_array = np.array(msk)
        return self.transforms(image=image_array, mask=mask_array)


class SegmentationDataSet(Dataset):
    """Dataset yielding ``(image, mask)`` pairs read from two path lists.

    Both lists are sorted independently, so image ``i`` is paired with mask
    ``i`` only if the two sorted lists line up (the caller is responsible for
    that).

    Args:
        images: paths of the input images.
        masks: paths of the segmentation masks.
        transform: optional callable invoked as ``transform(img=..., msk=...)``
            and returning a mapping with ``"image"`` and ``"mask"`` keys.
        preprocessing: optional callable invoked as
            ``preprocessing(image=..., mask=...)`` and returning a mapping with
            ``"image"`` and ``"mask"`` keys. ``__getitem__`` calls ``.float()``
            on the resulting image, so it must return a tensor-like image.
    """

    def __init__(
        self,
        images: list,
        masks: list,
        transform: Optional[Callable] = None,
        preprocessing: Optional[Callable] = None,
    ):
        super().__init__()

        # determine path lists
        self.images = sorted(images)
        self.masks = sorted(masks)

        # transformation
        self.transform = transform

        # preprocessing
        self.preprocessing = preprocessing

        # getting len info
        self.len_ = len(self.images)

    def _read_pair(self, index):
        """Read image ``index`` as an RGB array and mask ``index`` as a 2-D array."""
        # ``with`` closes the file handle once the pixels are copied to numpy.
        with Image.open(str(self.images[index])) as image_file:
            image = np.array(image_file)
        if image.ndim == 2:
            image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif image.shape[2] == 4:
            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)

        with Image.open(str(self.masks[index])) as mask_file:
            mask = np.array(mask_file.convert('L'))
        return image, mask

    def _describe_sample(self, index, image, mask):
        """Return a one-line description of the sample for error reports."""
        return (
            f"image={self.images[index]} shape={getattr(image, 'shape', None)}; "
            f"mask={self.masks[index]} shape={getattr(mask, 'shape', None)}"
        )

    def __getitem__(self, index):
        """Return ``(image.float(), mask)`` for sample ``index``.

        Raises:
            AssertionError: if ``transform`` fails (the original error is
                chained as ``__cause__``).
            Exception: any error raised by ``preprocessing`` is re-raised
                unchanged after the sample description is printed.
        """
        # read data
        image, mask = self._read_pair(index)

        # augmentation
        if self.transform is not None:
            try:
                sample = self.transform(img=image, msk=mask)
            except Exception as exc:
                # Keep the AssertionError type callers already see, but say
                # which sample failed and why.
                raise AssertionError(
                    f"transform failed for {self._describe_sample(index, image, mask)}"
                ) from exc
            image, mask = sample["image"], sample["mask"]

        # apply preprocessing
        if self.preprocessing:
            try:
                # add a trailing channel axis to the mask before preprocessing
                sample = self.preprocessing(image=image, mask=mask[..., np.newaxis])
            except Exception:
                # Swallowing the error here would only resurface later as an
                # unrelated AttributeError on ``image.float()``.
                print(self._describe_sample(index, image, mask))
                raise
            image, mask = sample["image"], sample["mask"]

        return image.float(), mask

    def __len__(self):
        return self.len_


In [3]:
# Mini-batch size shared by the DataLoaders and by Runner's ``self.log(batch_size=...)`` calls.
BATCH_SIZE = 16


def BarcodeSegmentation(
    classes=None,
    model="FPN",
    encoder_name: str = "efficientnet-b3",
    encoder_weights: str = "imagenet",
    activation: str = "sigmoid",
    decoder_attention_type: str = "scse",
    model_weights_path=None,
):
    """Create a ``segmentation_models_pytorch`` model and its preprocessing function.

    Args:
        classes: list of class names; only its length is used (number of
            output channels). Defaults to ``["barcode"]``.
        model: name of a model class exposed by ``smp`` (e.g. ``"FPN"``).
        encoder_name: encoder backbone passed to the model class.
        encoder_weights: pretrained weights identifier for the encoder.
        activation: output activation passed to the model class.
        decoder_attention_type: forwarded only when ``model == "UnetPlusPlus"``.
        model_weights_path: optional path to a state dict loaded into the model.

    Returns:
        tuple: ``(model, preprocessing_fn)`` where ``preprocessing_fn`` comes
        from ``smp.encoders.get_preprocessing_fn``.

    Raises:
        ValueError: if ``model`` does not name a class in ``smp``.
    """
    if classes is None:
        classes = ["barcode"]

    model_class = getattr(smp, model, None)
    # ``hasattr`` alone also accepts non-class attributes such as
    # ``smp.encoders``; those would leave ``model`` as a plain string.
    if not isinstance(model_class, type):
        raise ValueError(f"Unsupported model name {model}")

    kwargs = {}
    if model == "UnetPlusPlus":
        kwargs = {"decoder_attention_type": decoder_attention_type}
    model = model_class(
        encoder_name=encoder_name,
        encoder_weights=encoder_weights,
        classes=len(classes),
        activation=activation,
        **kwargs,
    )

    if model_weights_path is not None:
        # Load onto the CPU first so GPU-saved weights also work on CPU-only
        # machines; ``load_state_dict`` copies into the model's own tensors.
        model.load_state_dict(torch.load(model_weights_path, map_location="cpu"))

    preprocessing_fn = smp.encoders.get_preprocessing_fn(encoder_name, encoder_weights)

    return model, preprocessing_fn

In [4]:
def process_cnts(cnts, min_area=50):
    """Sort contours by area (largest first) and drop the small ones.

    Args:
        cnts: sequence of contours as returned by ``cv2.findContours``.
        min_area: contours whose ``cv2.contourArea`` is not strictly greater
            than this value are discarded. Defaults to 50, the value that was
            previously hard-coded.

    Returns:
        A list of the kept contours in descending area order, or ``cnts``
        itself when it is empty.
    """
    if not cnts:
        return cnts

    # Compute each area once; the sort is stable, so ties keep their input order.
    scored = sorted(
        ((cv2.contourArea(cnt), cnt) for cnt in cnts),
        key=lambda pair: pair[0],
        reverse=True,
    )
    return [cnt for area, cnt in scored if area > min_area]

In [5]:
def detect_barcode(masks, target, threshold=None):
    """Convert a batch of segmentation masks into per-image bounding boxes.

    Args:
        masks: tensor of masks laid out as ``(B, 1, H, W)``, ``(B, H, W)`` or
            a single ``(H, W)`` mask.
        target: when truthy, build ground-truth entries (``boxes``, ``labels``);
            otherwise build prediction entries that also carry ``scores``.
        threshold: optional cut-off applied to the mask values before contour
            extraction. ``None`` (default) keeps the previous behaviour, where
            ``np.uint8(mask * 255)`` truncates and every remaining non-zero
            pixel counts as foreground.

    Returns:
        list[dict]: one dict per mask with ``boxes`` in ``[x1, y1, x2, y2]``
        order, ``labels`` (all zeros) and, for predictions, ``scores`` (all
        0.9). Masks without any kept contour give empty tensors.

    Raises:
        ValueError: if ``masks`` has an unsupported shape.
    """
    masks = masks.detach().cpu().numpy()

    # Normalise to (B, H, W) explicitly. A blanket ``squeeze()`` would also
    # drop the batch axis when B == 1 and the loop would then iterate rows.
    if masks.ndim == 4:
        if masks.shape[1] != 1:
            raise ValueError(
                f"detect_barcode supports single-channel masks only, got shape {masks.shape}"
            )
        masks = masks[:, 0]
    elif masks.ndim == 2:
        masks = masks[np.newaxis]
    elif masks.ndim != 3:
        raise ValueError(f"Unsupported mask shape {masks.shape}")

    detections = []

    for mask in masks:
        if threshold is not None:
            mask = mask > threshold
        mask = np.uint8(mask * 255)
        (cnts, _) = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        cnts = process_cnts(cnts)

        # find all bboxes; boundingRect gives (x, y, w, h), converted to xyxy
        barcodes = []
        for cnt in cnts:
            x, y, w, h = cv2.boundingRect(cnt)
            barcodes.append([x, y, x + w, y + h])

        # An empty ``barcodes`` list gives the same empty tensors that the
        # former dedicated "no contours" branch built.
        if target:
            dict_ = dict(
                boxes=torch.Tensor(barcodes),
                labels=torch.zeros(len(barcodes))
            )
        else:
            dict_ = dict(
                boxes=torch.Tensor(barcodes),
                scores=torch.ones(len(barcodes)) * 0.9,
                labels=torch.zeros(len(barcodes))
            )

        detections.append(dict_)

    return detections

In [6]:
class Runner(pl.LightningModule):
    """LightningModule wrapping a segmentation model with Dice loss and metrics.

    Validation and test steps update two metrics, ``IOUScore`` (mask level) and
    ``mAP`` (boxes extracted from the masks by ``detect_barcode``). They are
    computed, logged and reset at the end of each validation/test epoch.

    Args:
        model: the segmentation network to run.
        classes: list of class names (stored, not otherwise used here).
        lr: learning rate for Adam.
        scheduler_T: ``T_max`` for the cosine annealing scheduler.
        is_quant: when true, ``test_step`` passes inputs through ``self.quant``
            and outputs through ``self.dequant``.
    """

    def __init__(self, model, classes, lr: float = 1e-3, scheduler_T=1000, is_quant=False) -> None:

        super().__init__()
        self.is_quant = is_quant
        self.model = model
        self.classes = classes
        self.lr = lr
        self.scheduler_T = scheduler_T
        self.criterion = smp.utils.losses.DiceLoss()

        if is_quant:
            # test_step reads these attributes; create them here so the module
            # does not depend on a later cell assigning them from outside.
            self.quant = torch.quantization.QuantStub()
            self.dequant = torch.quantization.DeQuantStub()

        # define metric
        self.metrics = torch.nn.ModuleDict(
            {
                "IOUScore": IoU(num_classes=2),
                "mAP": MeanAveragePrecision(box_format="xyxy")
            }
        )

    def _update_metrics(self, predicted_masks: torch.Tensor, target_masks: torch.Tensor) -> None:
        """Feed one evaluation batch to both metrics (looked up by name)."""
        binary_targets = target_masks // 255
        self.metrics["IOUScore"].update(predicted_masks, binary_targets)
        self.metrics["mAP"].update(
            detect_barcode(predicted_masks, target=False),
            detect_barcode(binary_targets, target=True),
        )

    def _log_epoch_metrics(self, stage: str) -> None:
        """Compute, log under ``{stage}/<name>`` and reset every metric."""
        for name, metric in self.metrics.items():
            metric_val = metric.compute()
            self.log(f"{stage}/{name}", metric_val, on_step=False, on_epoch=True)
            metric.reset()

    def training_step(
        self,
        batch: Tuple[torch.Tensor, torch.Tensor],
        batch_idx
    ) -> torch.Tensor:
        """Run one optimisation step and log the loss and learning rate.

        The metrics are deliberately not updated here. They are only computed
        and reset in the validation/test epoch-end hooks, so training batches
        would leak into the validation scores and accumulate without bound.
        """
        images, target_masks = batch
        predicted_masks = self.model(images.float())
        loss = self.criterion(predicted_masks, target_masks // 255)

        # Use the real batch size: the last batch may be smaller than BATCH_SIZE.
        batch_size = images.shape[0]
        self.log("Train/Loss", loss.item(), on_step=True, batch_size=batch_size)
        self.log(
            "Train/LR",
            self.lr_schedulers().get_last_lr()[0],
            on_step=True,
            batch_size=batch_size,
        )

        return loss

    def validation_step(
        self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx
    ) -> torch.Tensor:
        """Compute the validation loss for one batch and update the metrics."""
        images, target_masks = batch

        predicted_masks = self.model(images.float())
        loss = self.criterion(predicted_masks, target_masks // 255)

        self._update_metrics(predicted_masks, target_masks)

        self.log(
            "Validation/Classification Loss",
            loss.item(),
            on_step=True,
            batch_size=images.shape[0],
        )

        return loss

    def validation_epoch_end(self, something) -> None:
        """Log and reset the metrics accumulated during validation."""
        print("\nValidation End \n", "******"*10,)
        self._log_epoch_metrics("Validation")

    def test_step(
        self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx
    ) -> torch.Tensor:
        """Compute the test loss for one batch and update the metrics.

        With ``is_quant`` set, inputs and outputs go through the quant/dequant
        stubs around the model call.
        """
        images, target_masks = batch
        images = images.float()
        if self.is_quant:
            images = self.quant(images)
        predicted_masks = self.model(images)
        if self.is_quant:
            predicted_masks = self.dequant(predicted_masks)
        loss = self.criterion(predicted_masks, target_masks // 255)

        self._update_metrics(predicted_masks, target_masks)

        self.log(
            "Test/Classification Loss",
            loss.item(),
            on_step=True,
            batch_size=images.shape[0],
        )

        return loss

    def test_epoch_end(self, something) -> None:
        """Log and reset the metrics accumulated during testing."""
        self._log_epoch_metrics("Test")

    def configure_optimizers(self):
        """Adam over the trainable parameters with cosine annealing of the LR."""
        params = list(filter(lambda p: p.requires_grad, self.model.parameters()))
        optimizer = torch.optim.Adam(params, lr=self.lr)

        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer=optimizer, T_max=self.scheduler_T, eta_min=1e-8
        )

        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler},
        }


# Dataset

In [7]:
# Build the UnetPlusPlus segmentation model together with the normalization function of
# its encoder, then wrap that function into the albumentations preprocessing pipeline.
model_unet, preprocessing_fn_unet = BarcodeSegmentation(model="UnetPlusPlus")
preprocessing_unet = get_preprocessing(preprocessing_fn_unet)

In [8]:
# gather paths to images
dataset_folder_to_use = Path("./Barcodes_v1.1_merged")

images_train = sorted((dataset_folder_to_use / "train" / "Images").glob("*"))
images_val = sorted((dataset_folder_to_use / "val" / "Images").glob("*"))
images_test = sorted((dataset_folder_to_use / "test" / "Images").glob("*"))
print("images_train: ", images_train[:1])

masks_train = sorted((dataset_folder_to_use / "train" / "Mask").glob("*"))
masks_val = sorted((dataset_folder_to_use / "val" / "Mask").glob("*"))
masks_test = sorted((dataset_folder_to_use / "test" / "Mask").glob("*"))
print("masks_train: ", masks_train[:1])

check_trian = [x.stem for x in masks_train]
check_val = [x.stem for x in masks_val]
check_test = [x.stem for x in masks_test]

# keep only the images that have a mask; sets make each lookup O(1)
stems_train, stems_val, stems_test = set(check_trian), set(check_val), set(check_test)
images_train = [x for x in images_train if x.stem in stems_train]
images_val = [x for x in images_val if x.stem in stems_val]
images_test = [x for x in images_test if x.stem in stems_test]


# each split is compared against its own masks (val/test used to re-check train)
print(f"is len of train images == len of train masks: {len(images_train) == len(masks_train)}")
print(f"is len of val images == len of val masks: {len(images_val) == len(masks_val)}")
print(f"is len of test images == len of test masks: {len(images_test) == len(masks_test)}")

# verify the image/mask pairing of every split, not only of train
for split_name, split_images, split_masks in (
    ("train", images_train, masks_train),
    ("val", images_val, masks_val),
    ("test", images_test, masks_test),
):
    if len(split_images) != len(split_masks):
        print("Error!")
        raise AssertionError(
            f"{split_name}: {len(split_images)} images but {len(split_masks)} masks"
        )
    for img, msk in zip(split_images, split_masks):
        if Path(img).stem != Path(msk).stem:
            print("Error!")
            raise AssertionError(f"{split_name}: image {img} does not match mask {msk}")

images_train:  [PosixPath('Barcodes_v1.1_merged/train/Images/0011210009585_1.jpg')]
masks_train:  [PosixPath('Barcodes_v1.1_merged/train/Mask/0_0_0.png')]
is len of train images == len of train masks: True
is len of val images == len of val masks: True
is len of test images == len of test masks: True


In [9]:
# Datasets: random augmentations for train, deterministic resize + pad for val/test.
train_dataset = SegmentationDataSet(
    images=images_train,
    masks=masks_train,
    transform=Transforms(),
    preprocessing=preprocessing_unet,
)
val_dataset = SegmentationDataSet(
    images=images_val,
    masks=masks_val,
    transform=Transforms(segment="val"),
    preprocessing=preprocessing_unet,
)
test_dataset = SegmentationDataSet(
    images=images_test,
    masks=masks_test,
    transform=Transforms(segment="val"),
    preprocessing=preprocessing_unet
)

# The dataset keeps its paths sorted, so without shuffling every epoch would
# visit the samples in the same file-name order.
train_dl_unet = DataLoader(
    train_dataset,
    BATCH_SIZE,
    pin_memory=False,
    shuffle=True,
    num_workers=4,
    drop_last=True,
)

val_dl_unet = DataLoader(
    val_dataset,
    BATCH_SIZE,
    pin_memory=False,
    shuffle=False,
    num_workers=4,
    drop_last=False,
)

test_dl_unet = DataLoader(
    test_dataset,
    BATCH_SIZE,
    pin_memory=False,
    shuffle=False,
    num_workers=4,
    drop_last=False,
)

# Model

In [10]:
# Load on the CPU so a checkpoint saved from a GPU run also opens on CPU-only machines.
chkpt = torch.load("./checkpoint-UnetPlusPlus-epoch=09.ckpt", map_location="cpu")

# The checkpoint stores Lightning keys prefixed with "model."; strip the prefix
# so the keys match the bare segmentation model.
prefix = 'model.'
n_clip = len(prefix)
adapted_chkpt = {k[n_clip:]: v for k, v in chkpt["state_dict"].items() if k.startswith(prefix)}
model_unet.load_state_dict(adapted_chkpt)

<All keys matched successfully>

# Model Metrics Before Optimization

In [11]:
# Number of epochs used to size the cosine scheduler (scheduler_T = max_epochs * batches per epoch).
max_epochs = 1

In [42]:
# Wrap the loaded model in the Lightning Runner. runner_unet holds a reference to
# model_unet (no copy), so anything that later modifies the runner's layers
# modifies model_unet as well.
runner_unet = Runner(
    model=model_unet,
    classes=["barcode"],
    lr=1e-3,
    scheduler_T=max_epochs * len(train_dl_unet)
)


In [43]:
# Baseline metrics before any optimization. Note that trainer.test is run on the
# validation loader (val_dl_unet), not on test_dl_unet.
trainer = pl.Trainer(gpus=-1)
score = trainer.test(runner_unet, dataloaders=val_dl_unet)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test/Classification Loss': 0.053776539862155914,
 'Test/Classification Loss_epoch': 0.053776539862155914,
 'Test/IOUScore': 0.9542956948280334,
 'Test/mAP': {'map': tensor(0.5806, device='cuda:0'),
              'map_50': tensor(0.8760, device='cuda:0'),
              'map_75': tensor(0.6039, device='cuda:0'),
              'map_large': tensor(0.9222, device='cuda:0'),
              'map_medium': tensor(0.7587, device='cuda:0'),
              'map_per_class': tensor(-1., device='cuda:0'),
              'map_small': tensor(0.3731, device='cuda:0'),
              'mar_1': tensor(0.6283, device='cuda:0'),
              'mar_10': tensor(0.7152, device='cuda:0'),
              'mar_100': tensor(0.7152, device='cuda:0'),
              'mar_100_per_class': tensor(-1., device='cuda:0'),
              'mar_large': tensor(0.9500, device='cuda:0'),
              'mar_medium': tensor(0.8123

# Global pruning

"The point of PyTorch pruning, at the moment, is not necessarily to guarantee inference time speedups or memory savings.  
It’s more of an experimental feature to enable pruning research."

In [23]:
def prune_model_global_unstructured(model, layer_type, proportion):
    """Apply global L1 unstructured pruning to the weights of matching layers.

    Args:
        model: module whose sub-modules are searched.
        layer_type: class (or tuple of classes) of the layers to prune.
        proportion: ``amount`` forwarded to ``prune.global_unstructured``.

    Returns:
        The same ``model`` object, pruned in place.
    """
    targets = [
        (layer, 'weight')
        for layer in model.modules()
        if isinstance(layer, layer_type)
    ]

    prune.global_unstructured(
        parameters=targets,
        pruning_method=prune.L1Unstructured,
        amount=proportion,
    )

    return model


def remove_params(model, layer_type):
    """Make the pruning of matching layers permanent.

    Calls ``prune.remove(module, 'weight')`` on every sub-module of
    ``layer_type`` that is currently pruned. Layers that were never pruned, or
    whose pruning was already removed, are skipped, so the function can be
    called repeatedly.

    Args:
        model: module whose sub-modules are searched.
        layer_type: class (or tuple of classes) of the layers to process.

    Returns:
        The same ``model`` object.
    """
    # Pruning stores the original tensor as ``weight_orig``; its presence tells
    # whether ``prune.remove`` has anything to undo for this module.
    pruned_layers = [
        module
        for module in model.modules()
        if isinstance(module, layer_type) and hasattr(module, 'weight_orig')
    ]

    for module in pruned_layers:
        prune.remove(module, 'weight')
    return model

In [24]:
def measure_module_sparsity(module, weight=True, bias=False, use_mask=False):
    """Count zero entries in a module's weights and/or biases.

    Args:
        module: module to inspect (its sub-modules are included, because
            ``named_buffers``/``named_parameters`` recurse).
        weight: include tensors whose name contains ``"weight"``
            (``"weight_mask"`` when ``use_mask`` is set).
        bias: include tensors whose name contains ``"bias"``
            (``"bias_mask"`` when ``use_mask`` is set).
        use_mask: count zeros in the pruning mask buffers instead of in the
            parameters.

    Returns:
        tuple: ``(num_zeros, num_elements, sparsity)``. ``sparsity`` is ``0.0``
        when no tensor was counted (e.g. ``use_mask=True`` on a module that is
        not pruned) instead of raising ``ZeroDivisionError``.

    Note:
        While a module is pruned its parameter is named ``weight_orig``, which
        also contains ``"weight"``. With ``use_mask=False`` the zeros are then
        counted on the original, unmasked tensor.
    """
    if use_mask:
        named_tensors = module.named_buffers()
        weight_key, bias_key = "weight_mask", "bias_mask"
    else:
        named_tensors = module.named_parameters()
        weight_key, bias_key = "weight", "bias"

    num_zeros = 0
    num_elements = 0

    for tensor_name, tensor in named_tensors:
        # The two checks are independent, as in the original pair of ``if``s.
        for enabled, key in ((weight, weight_key), (bias, bias_key)):
            if enabled and key in tensor_name:
                num_zeros += torch.sum(tensor == 0).item()
                num_elements += tensor.nelement()

    sparsity = num_zeros / num_elements if num_elements else 0.0

    return num_zeros, num_elements, sparsity

In [25]:
def measure_global_sparsity(model,
                            weight=True,
                            bias=False,
                            conv2d_use_mask=False,
                            linear_use_mask=False):
    """Aggregate sparsity over every ``Conv2d`` and ``Linear`` layer of a model.

    Args:
        model: module whose sub-modules are inspected.
        weight: forwarded to ``measure_module_sparsity``.
        bias: forwarded to ``measure_module_sparsity``.
        conv2d_use_mask: ``use_mask`` value for ``Conv2d`` layers.
        linear_use_mask: ``use_mask`` value for ``Linear`` layers.

    Returns:
        tuple: ``(num_zeros, num_elements, sparsity)``; ``sparsity`` is ``0.0``
        when nothing was counted instead of raising ``ZeroDivisionError``.
    """
    num_zeros = 0
    num_elements = 0

    for _, module in model.named_modules():
        # Pick the mask setting for the layer type; other layers are ignored.
        if isinstance(module, torch.nn.Conv2d):
            use_mask = conv2d_use_mask
        elif isinstance(module, torch.nn.Linear):
            use_mask = linear_use_mask
        else:
            continue

        module_num_zeros, module_num_elements, _ = measure_module_sparsity(
            module, weight=weight, bias=bias, use_mask=use_mask)
        num_zeros += module_num_zeros
        num_elements += module_num_elements

    sparsity = num_zeros / num_elements if num_elements else 0.0

    return num_zeros, num_elements, sparsity

In [46]:
def iterative_pruning_finetuning(
    model=runner_unet,
    train_loader=train_dl_unet,
    test_loader=val_dl_unet,
    conv2d_prune_proportion=0.1,
    num_iterations=2,
    num_epochs_per_iteration=2,
    fntn=False
):
    """Repeatedly prune the Conv2d weights of ``model`` and evaluate it.

    Each iteration prunes ``conv2d_prune_proportion`` of the Conv2d weights
    globally (L1, unstructured), prints the mask sparsity and runs
    ``trainer.test`` on ``test_loader``. With ``fntn=True`` the model is then
    fine-tuned for ``num_epochs_per_iteration`` epochs and evaluated again.
    After the last iteration the pruning is made permanent.

    Note:
        The default arguments are bound when this cell is executed, so the
        cells defining ``runner_unet`` and the loaders must run first.

    Args:
        model: Lightning module to prune (modified in place).
        train_loader: loader used for fine-tuning (only when ``fntn``).
        test_loader: loader used for evaluation and as validation data while
            fine-tuning.
        conv2d_prune_proportion: ``amount`` pruned in each iteration.
        num_iterations: number of prune(/fine-tune) rounds.
        num_epochs_per_iteration: fine-tuning epochs per round.
        fntn: whether to fine-tune after each pruning step.

    Returns:
        The pruned ``model``.
    """
    def report_quality():
        """Print the Conv2d mask sparsity and evaluate on ``test_loader``."""
        _, _, sparsity = measure_global_sparsity(
            model,
            weight=True,
            bias=False,
            conv2d_use_mask=True,
            linear_use_mask=False)
        print(f"Global Sparsity: {sparsity:0.02}")
        trainer.test(model, dataloaders=test_loader)

    trainer = pl.Trainer(
        gpus=-1,
        max_epochs=num_epochs_per_iteration
    )

    for i in range(num_iterations):
        model = model.cuda()
        print("\n\n", "========="*5)
        print("Pruning and Finetuning {}/{}".format(i + 1, num_iterations))

        print("Pruning...")
        model = prune_model_global_unstructured(model=model,
                                                layer_type=nn.Conv2d,
                                                proportion=conv2d_prune_proportion)

        print("Measure the quality after Pruning...")
        report_quality()

        if fntn:
            print("Fine-tuning the model...")
            # The fit call used to be commented out, so this branch only
            # re-ran the evaluation. A fresh Trainer is created per round
            # because one that already reached max_epochs would not train again.
            finetune_trainer = pl.Trainer(
                gpus=-1,
                max_epochs=num_epochs_per_iteration
            )
            finetune_trainer.fit(model, train_loader, test_loader)

            print("Measure the quality after Fine-tuning...")
            report_quality()

    # remove params
    model = remove_params(model, layer_type=nn.Conv2d)

    return model

In [47]:
# Five pruning rounds with amount=0.3 each; the printed sparsity grows 0.3 -> 0.51 -> 0.66 -> ...
# fntn keeps its default (False), so no fine-tuning happens between rounds and
# num_epochs_per_iteration has no effect on this run.
# NOTE(review): the function prunes its argument in place, so model_pruned looks like the same
# object as runner_unet (and model_unet inside it) - confirm before reusing those later.
model_pruned = iterative_pruning_finetuning(
    model=runner_unet,
    train_loader=train_dl_unet,
    test_loader=val_dl_unet,
    conv2d_prune_proportion=0.3,
    num_iterations=5,
    num_epochs_per_iteration=3
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs




Pruning and Finetuning 1/5
Pruning...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Measure the quality after Pruning...
Global Sparsity: 0.3


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test/Classification Loss': 0.05466263368725777,
 'Test/Classification Loss_epoch': 0.05466263368725777,
 'Test/IOUScore': 0.9534895420074463,
 'Test/mAP': {'map': tensor(0.5792, device='cuda:0'),
              'map_50': tensor(0.9027, device='cuda:0'),
              'map_75': tensor(0.6047, device='cuda:0'),
              'map_large': tensor(0.9045, device='cuda:0'),
              'map_medium': tensor(0.7604, device='cuda:0'),
              'map_per_class': tensor(-1., device='cuda:0'),
              'map_small': tensor(0.3491, device='cuda:0'),
              'mar_1': tensor(0.6299, device='cuda:0'),
              'mar_10': tensor(0.7114, device='cuda:0'),
              'mar_100': tensor(0.7114, device='cuda:0'),
              'mar_100_per_class': tensor(-1., device='cuda:0'),
              'mar_large': tensor(0.9500, device='cuda:0'),
              'mar_medium': tensor(0.8132, 

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]




Pruning and Finetuning 2/5
Pruning...
Measure the quality after Pruning...
Global Sparsity: 0.51


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test/Classification Loss': 0.06966167688369751,
 'Test/Classification Loss_epoch': 0.06966167688369751,
 'Test/IOUScore': 0.9441142678260803,
 'Test/mAP': {'map': tensor(0.4131, device='cuda:0'),
              'map_50': tensor(0.6625, device='cuda:0'),
              'map_75': tensor(0.4191, device='cuda:0'),
              'map_large': tensor(0.7236, device='cuda:0'),
              'map_medium': tensor(0.6279, device='cuda:0'),
              'map_per_class': tensor(-1., device='cuda:0'),
              'map_small': tensor(0.2684, device='cuda:0'),
              'mar_1': tensor(0.5913, device='cuda:0'),
              'mar_10': tensor(0.6989, device='cuda:0'),
              'mar_100': tensor(0.6989, device='cuda:0'),
              'mar_100_per_class': tensor(-1., device='cuda:0'),
              'mar_large': tensor(0.9667, device='cuda:0'),
              'mar_medium': tensor(0.8000, 

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]




Pruning and Finetuning 3/5
Pruning...
Measure the quality after Pruning...
Global Sparsity: 0.66


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test/Classification Loss': 0.6694976687431335,
 'Test/Classification Loss_epoch': 0.6694976687431335,
 'Test/IOUScore': 0.562198281288147,
 'Test/mAP': {'map': tensor(0.1759, device='cuda:0'),
              'map_50': tensor(0.3553, device='cuda:0'),
              'map_75': tensor(0.1496, device='cuda:0'),
              'map_large': tensor(0.1366, device='cuda:0'),
              'map_medium': tensor(0.3010, device='cuda:0'),
              'map_per_class': tensor(-1., device='cuda:0'),
              'map_small': tensor(0.1539, device='cuda:0'),
              'mar_1': tensor(0.3515, device='cuda:0'),
              'mar_10': tensor(0.3770, device='cuda:0'),
              'mar_100': tensor(0.3770, device='cuda:0'),
              'mar_100_per_class': tensor(-1., device='cuda:0'),
              'mar_large': tensor(0.1333, device='cuda:0'),
              'mar_medium': tensor(0.4170, dev

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]




Pruning and Finetuning 4/5
Pruning...
Measure the quality after Pruning...
Global Sparsity: 0.76


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test/Classification Loss': 0.9994770288467407,
 'Test/Classification Loss_epoch': 0.9994770288467407,
 'Test/IOUScore': 0.46478089690208435,
 'Test/mAP': {'map': tensor(0., device='cuda:0'),
              'map_50': tensor(0., device='cuda:0'),
              'map_75': tensor(0., device='cuda:0'),
              'map_large': tensor(-1., device='cuda:0'),
              'map_medium': tensor(0., device='cuda:0'),
              'map_per_class': tensor(-1., device='cuda:0'),
              'map_small': tensor(-1., device='cuda:0'),
              'mar_1': tensor(0., device='cuda:0'),
              'mar_10': tensor(0., device='cuda:0'),
              'mar_100': tensor(0., device='cuda:0'),
              'mar_100_per_class': tensor(-1., device='cuda:0'),
              'mar_large': tensor(-1., device='cuda:0'),
              'mar_medium': tensor(0., device='cuda:0'),
              'mar_small

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]




Pruning and Finetuning 5/5
Pruning...
Measure the quality after Pruning...
Global Sparsity: 0.83


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test/Classification Loss': 0.9992843866348267,
 'Test/Classification Loss_epoch': 0.9992843866348267,
 'Test/IOUScore': 0.46478089690208435,
 'Test/mAP': {'map': tensor(-1., device='cuda:0'),
              'map_50': tensor(-1., device='cuda:0'),
              'map_75': tensor(-1., device='cuda:0'),
              'map_large': tensor(-1., device='cuda:0'),
              'map_medium': tensor(-1., device='cuda:0'),
              'map_per_class': tensor(-1., device='cuda:0'),
              'map_small': tensor(-1., device='cuda:0'),
              'mar_1': tensor(-1., device='cuda:0'),
              'mar_10': tensor(-1., device='cuda:0'),
              'mar_100': tensor(-1., device='cuda:0'),
              'mar_100_per_class': tensor(-1., device='cuda:0'),
              'mar_large': tensor(-1., device='cuda:0'),
              'mar_medium': tensor(-1., device='cuda:0'),
              'm

### Results:

| Sparsity level      | IoU Score |
| :-----------: | :-----------: |
| 0.3      | 0.95       |
| 0.51   | 0.944        |
| 0.66   | 0.56        |
| 0.76   | 0.46        |
| 0.83   | 0.46        |

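As can be seen from the table, we can zero out about 50% of the weights in the convolutional layers of the model without a significant loss of IoU (0.954 → 0.944). Note, however, that the box-level mAP is more sensitive: it drops from 0.58 to 0.41 at the same sparsity level, and stays close to the baseline only at 30% sparsity.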

# Quantization

In [98]:
# Rebuild an unpruned FP32 copy of the network for the quantization experiments.
model_unet_fp32, preprocessing_fn_unet = BarcodeSegmentation(model="UnetPlusPlus")
preprocessing_unet = get_preprocessing(preprocessing_fn_unet)

# Load on the CPU so a checkpoint saved from a GPU run also opens on CPU-only machines.
chkpt = torch.load("./checkpoint-UnetPlusPlus-epoch=09.ckpt", map_location="cpu")

# Strip the Lightning "model." prefix so the keys match the bare segmentation model.
prefix = 'model.'
n_clip = len(prefix)
adapted_chkpt = {k[n_clip:]: v for k, v in chkpt["state_dict"].items() if k.startswith(prefix)}
model_unet_fp32.load_state_dict(adapted_chkpt)

<All keys matched successfully>

Notes:  
Quantization is useful when it is required to serve large models on machines with limited memory, or when there’s a need to switch between models and reducing the I/O time is important.

Theoretical reductions:
* 2-4x reduction in memory bandwidth
* 2-4x faster inference due to savings in memory bandwidth and compute

https://spell.ml/blog/pytorch-quantization-X8e7wBAAACIAHPhT

## Dynamic quantization

Dynamic quantization is the easiest workflow when you do not have a pre-trained quantized model ready for use, but it has an important limitation: it currently supports only nn.Linear and nn.LSTM in qconfig_spec.

In [26]:
import torch.quantization

# Dynamically quantize the Linear layers of model_unet to int8.
quantized_model = torch.quantization.quantize_dynamic(
    model_unet,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

So, this type of quantization is absolutely useless for the segmentation model

## Static quantization

Static quantization works by fine-tuning the quantization algorithm on a test dataset after initial model training is complete. This additional scoring process is not used to fine-tune the model—only to adjust the quantization algorithm parameters

To get the most performance out of static quantization, you also need to use module fusion. Module fusion is the technique of combining ("fusing") sequences of high-level layers, e.g. Conv2d + Batchnorm, into a single combined layer. 
This improves performance by pushing the combined sequence of operations into the low-level library, allowing it to be computed in one shot, e.g. without having to surface an intermediate representation back to the PyTorch Python process. This speeds things up and leads to more accurate results, albeit at the cost of debuggability.

### PyTorch Lightning quantization

In [82]:
# Switch the network to eval mode, then wrap it in the Runner with is_quant=True.
# Note: the name model_unet_fp32 is rebound from the bare network to the Runner here.
model_unet_fp32.eval()
model_unet_fp32 = Runner(model=model_unet_fp32, classes=["barcode"], is_quant=True)

In [83]:
# Entry stub: converts tensors from floating point to quantized.
setattr(model_unet_fp32, "quant", torch.quantization.QuantStub())
# Exit stub: converts tensors from quantized back to floating point.
setattr(model_unet_fp32, "dequant", torch.quantization.DeQuantStub())

In [84]:
# modify forward pass
# model_unet_fp32.forward = wrap_quantize_forward_context(model=model_unet_fp32, func=model_unet_fp32.forward)

# def wrap_quantize_forward_context(model: "pl.LightningModule", func):
#     """Decorator to wrap forward path as it is needed to quantize inputs and dequantize outputs for in/out
#     compatibility."""
#     # todo: consider using registering hook before/after forward
#     @functools.wraps(func)
#     def wrapper(data):
#         data = model.quant(data)
#         data = func(data)
#         data = model.dequant(data)
#         return data

#     return wrapper

In [85]:
# Attach the default 'fbgemm' quantization config to the wrapped model.
model_unet_fp32.qconfig = torch.quantization.get_default_qconfig(
    'fbgemm'
)

In [86]:
# # model fuze 

# from pytorch_lightning.utilities.exceptions import MisconfigurationException

# modules_to_fuse=[['_blocks']]

# def _recursive_hasattr(obj, attribs: str, state: bool = True) -> bool:
#     """recursive check if model has some layers denoted with '.'."""
#     print(attribs)
#     if "." in attribs:
#         attrib, attribs = attribs.split(".", 1)
#         if hasattr(obj, attrib):
#             return _recursive_hasattr(getattr(obj, attrib), attribs, state)
#         return False
#     return state and hasattr(obj, attribs)

# def _check_feasible_fuse(model: "pl.LightningModule") -> bool:
#     if not modules_to_fuse:
#         return False
#     for group in modules_to_fuse:
#         if not all(_recursive_hasattr(model, m) for m in group):
#             raise MisconfigurationException(
#                 f"You have requested to fuse {group} but one or more of them is not your model attributes"
#             )
            
# _check_feasible_fuse(model=model_unet_fp32.model.encoder)

# # fuse modules
# model_fp32_fused = torch.quantization.fuse_modules(model_unet_fp32.model, [['_conv_stem', 'BatchNorm2d']])

In [87]:
# Produce a prepared copy of the model for static quantization; inplace=False
# (the default) leaves model_unet_fp32 itself untouched.
model_unet_fp32_prepared = torch.quantization.prepare(
    model_unet_fp32,
    inplace=False,
)

In [None]:
# CPU-only trainer for the prepared model.
trainer_static = pl.Trainer(accelerator="cpu")

# Run the prepared model over the validation loader; presumably this doubles as the
# calibration pass for the observers inserted by prepare().
metrics = trainer_static.test(model_unet_fp32_prepared, val_dl_unet)

In [89]:
# Convert the prepared model into its int8 counterpart; inplace=False (the default)
# returns a new module.
model_int8_static = torch.quantization.convert(
    model_unet_fp32_prepared,
    inplace=False,
)

In [90]:
# Show what the `quant` stub became after conversion.
print(model_int8_static.quant)

Quantize(scale=tensor([0.0374]), zero_point=tensor([57]), dtype=torch.quint8)


In [95]:
# Take element [0][0] of the first validation batch (presumably a single image, TODO confirm).
# next(iter(...)) is used because the iterator's `.next()` alias is a Python 2 idiom
# that not every iterator provides.
batch = next(iter(val_dl_unet))[0][0].squeeze().float()
# Quantize the sample with a leading batch axis added and feed it to the converted network.
model_int8_static.model(model_int8_static.quant(batch[None, ...]))

NotImplementedError: Could not run 'aten::thnn_conv2d_forward' with arguments from the 'QuantizedCPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::thnn_conv2d_forward' is only available for these backends: [CPU, CUDA, BackendSelect, Named, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, UNKNOWN_TENSOR_TYPE_ID, AutogradMLC, AutogradHPU, AutogradNestedTensor, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, Tracer, Autocast, Batched, VmapMode].

CPU: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/build/aten/src/ATen/RegisterCPU.cpp:16286 [kernel]
CUDA: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/build/aten/src/ATen/RegisterCUDA.cpp:20674 [kernel]
BackendSelect: fallthrough registered at /opt/conda/conda-bld/pytorch_1623448265233/work/aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Named: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
ADInplaceOrView: fallthrough registered at /opt/conda/conda-bld/pytorch_1623448265233/work/aten/src/ATen/core/VariableFallbackKernel.cpp:60 [backend fallback]
AutogradOther: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradCPU: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradCUDA: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradXLA: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
UNKNOWN_TENSOR_TYPE_ID: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradMLC: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradHPU: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradNestedTensor: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradPrivateUse1: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradPrivateUse2: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradPrivateUse3: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
Tracer: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/TraceType_0.cpp:9750 [kernel]
Autocast: fallthrough registered at /opt/conda/conda-bld/pytorch_1623448265233/work/aten/src/ATen/autocast_mode.cpp:255 [backend fallback]
Batched: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/aten/src/ATen/BatchingRegistrations.cpp:1019 [backend fallback]
VmapMode: fallthrough registered at /opt/conda/conda-bld/pytorch_1623448265233/work/aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]


In [96]:
# trainer_static.test(model_int8_static, val_dl_unet)

### Pure pytorch quantization process

In [None]:
# Register the quantize / dequantize stubs as named submodules of the model.
model_unet_fp32.add_module("quant", torch.quantization.QuantStub())
model_unet_fp32.add_module("dequant", torch.quantization.DeQuantStub())

In [19]:
# Choose the backend on which the quantized kernels will run.
torch.backends.quantized.engine = 'fbgemm'

# Attach a global qconfig describing which observers to insert.
# 'fbgemm' targets server inference, 'qnnpack' targets mobile inference.
# Other choices (symmetric vs. asymmetric quantization, MinMax vs. L2Norm
# calibration) can also be configured at this point.
model_unet_fp32.qconfig = torch.quantization.get_default_qconfig(
    'fbgemm'
)

In [None]:
# Fuse the activations to preceding layers, where applicable.
# This needs to be done manually depending on the model architecture.
# Common fusions include `conv + relu` and `conv + batchnorm + relu`
# NOTE(review): [['conv', 'relu']] looks like the placeholder list from the PyTorch
# tutorial; confirm that model_unet_fp32 really exposes submodules with these names
# before running this cell.
model_unet_fp32 = torch.quantization.fuse_modules(model_unet_fp32, [['conv', 'relu']])

In [20]:
# Prepare the model for static quantization: observers are inserted so that
# activation tensors can be observed during calibration. inplace=False (the
# default) returns a prepared copy.
model_unet_fp32_prepared = torch.quantization.prepare(
    model_unet_fp32,
    inplace=False,
)

https://pytorch.org/tutorials/advanced/static_quantization_tutorial.html  
https://pytorch.org/blog/introduction-to-quantization-on-pytorch/

In [None]:
# calibrate the prepared model to determine quantization parameters for activations
# in a real world setting, the calibration would be done with a representative dataset
# NOTE(review): `input_fp32` is not defined in the cells visible here; confirm it is
# created earlier in the notebook before running this cell.
model_unet_fp32_prepared(input_fp32)

In [22]:
# Turn the observed model into a quantized one. Conversion quantizes the weights,
# computes and stores the scale and bias value used with each activation tensor,
# and swaps key operators for their quantized implementations.
model_int8 = torch.quantization.convert(
    model_unet_fp32_prepared,
    inplace=False,
)

## QAT

Quantization Aware Training (QAT) mimics the effects of quantization during training: The computations are carried-out in floating-point precision but the subsequent quantization effect is taken into account. The weights and activations are quantized into lower precision only for inference, when training is completed.

### Pure pytorch qat

In [None]:
# specify quantization config for QAT
qat_model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')

# prepare QAT (modifies qat_model in place)
torch.quantization.prepare_qat(qat_model, inplace=True)

# run training

# convert to quantized version, removing dropout, to check for accuracy on each epoch
# (the word "epoch" had been fused onto the assignment below, producing a variable
# named `epochquantized_model` instead of `quantized_model`)
quantized_model = torch.quantization.convert(qat_model.eval(), inplace=False)

### PyTorch Lightning QAT

In [99]:
# Runner wrapping the FP32 network for the quantization-aware training run.
qmodel = Runner(model=model_unet_fp32, classes=["barcode"], lr=1e-5, scheduler_T=3 * len(train_dl_unet))

# Lightning's quantization-aware-training callback.
qcb = QuantizationAwareTraining(quantize_on_fit_end=True, input_compatible=True)

# Single-epoch trainer on all visible GPUs, with the QAT callback attached.
trainer_unet = pl.Trainer(max_epochs=1, gpus=-1, callbacks=[qcb])

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [100]:
# Run the quantization-aware training on the training loader only (no validation loader is passed).
trainer_unet.fit(qmodel, train_dl_unet)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type         | Params
-------------------------------------------
0 | model     | UnetPlusPlus | 13.7 M
1 | criterion | DiceLoss     | 0     
2 | metrics   | ModuleDict   | 0     
3 | quant     | QuantStub    | 0     
4 | dequant   | DeQuantStub  | 0     
-------------------------------------------
13.7 M    Trainable params
0         Non-trainable params
13.7 M    Total params
54.861    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

In [101]:
# Take element [0][0] of the first validation batch (presumably a single image, TODO confirm).
# next(iter(...)) is used because the iterator's `.next()` alias is a Python 2 idiom
# that not every iterator provides.
batch = next(iter(val_dl_unet))[0][0].squeeze().float()
# Quantize the sample with a leading batch axis added and feed it to the QAT-trained network.
qmodel.model(qmodel.quant(batch[None, ...]))

NotImplementedError: Could not run 'aten::thnn_conv2d_forward' with arguments from the 'QuantizedCPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::thnn_conv2d_forward' is only available for these backends: [CPU, CUDA, BackendSelect, Named, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, UNKNOWN_TENSOR_TYPE_ID, AutogradMLC, AutogradHPU, AutogradNestedTensor, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, Tracer, Autocast, Batched, VmapMode].

CPU: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/build/aten/src/ATen/RegisterCPU.cpp:16286 [kernel]
CUDA: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/build/aten/src/ATen/RegisterCUDA.cpp:20674 [kernel]
BackendSelect: fallthrough registered at /opt/conda/conda-bld/pytorch_1623448265233/work/aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Named: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
ADInplaceOrView: fallthrough registered at /opt/conda/conda-bld/pytorch_1623448265233/work/aten/src/ATen/core/VariableFallbackKernel.cpp:60 [backend fallback]
AutogradOther: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradCPU: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradCUDA: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradXLA: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
UNKNOWN_TENSOR_TYPE_ID: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradMLC: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradHPU: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradNestedTensor: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradPrivateUse1: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradPrivateUse2: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
AutogradPrivateUse3: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/VariableType_0.cpp:9848 [autograd kernel]
Tracer: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/torch/csrc/autograd/generated/TraceType_0.cpp:9750 [kernel]
Autocast: fallthrough registered at /opt/conda/conda-bld/pytorch_1623448265233/work/aten/src/ATen/autocast_mode.cpp:255 [backend fallback]
Batched: registered at /opt/conda/conda-bld/pytorch_1623448265233/work/aten/src/ATen/BatchingRegistrations.cpp:1019 [backend fallback]
VmapMode: fallthrough registered at /opt/conda/conda-bld/pytorch_1623448265233/work/aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]


# OpenVino

In [7]:
# !pip install openvino
# !pip install nncf

In [8]:
# Spatial size used for the dummy ONNX input and the NNCF sample size.
IMAGE_SIZE = [224, 224]

# Directory passed to NNCF as its log dir.
OUTPUT_DIR = "./output"

# Destinations of the exported ONNX models.
fp32_onnx_path = "./model_unet_fp32.onnx"
int8_onnx_path = "./model_unet_int8.onnx"

In [9]:
# Prefer CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using {device} device")

Using cuda device


In [10]:
# Build the FP32 network together with its matching input-normalisation function.
model_unet_fp32, preprocessing_fn_unet = BarcodeSegmentation(model="UnetPlusPlus")
preprocessing_unet = get_preprocessing(preprocessing_fn_unet)

# map_location="cpu" makes the checkpoint loadable on hosts without a GPU even if its
# tensors were saved from CUDA; load_state_dict copies values into the model's own
# parameters, and the model is moved to `device` explicitly where needed.
chkpt = torch.load("./checkpoint-UnetPlusPlus-epoch=09.ckpt", map_location="cpu")

# Keep only the entries whose key starts with 'model.' and drop that prefix so the
# keys line up with the bare network's state dict.
prefix = 'model.'
n_clip = len(prefix)
adapted_chkpt = {k[n_clip:]: v for k, v in chkpt["state_dict"].items() if k.startswith(prefix)}
model_unet_fp32.load_state_dict(adapted_chkpt)

<All keys matched successfully>

In [11]:
# gather paths to images
dataset_folder_to_use = Path("./Barcodes_v1.1_merged")

# Sorted listings of every file in each split's image folder.
images_train = sorted((dataset_folder_to_use / "train" / "Images").glob("*"))
images_val = sorted((dataset_folder_to_use / "val" / "Images").glob("*"))
images_test = sorted((dataset_folder_to_use / "test" / "Images").glob("*"))
print("images_train: ", images_train[:1])

# Sorted listings of every file in each split's mask folder.
masks_train = sorted((dataset_folder_to_use / "train" / "Mask").glob("*"))
masks_val = sorted((dataset_folder_to_use / "val" / "Mask").glob("*"))
masks_test = sorted((dataset_folder_to_use / "test" / "Mask").glob("*"))
print("masks_train: ", masks_train[:1])

# Mask stems per split, stored as sets so the membership tests below are O(1).
check_trian = {x.stem for x in masks_train}
check_val = {x.stem for x in masks_val}
check_test = {x.stem for x in masks_test}

# Keep only the images that have a mask with the same stem.
images_train = [x for x in images_train if x.stem in check_trian]
images_val = [x for x in images_val if x.stem in check_val]
images_test = [x for x in images_test if x.stem in check_test]


# Each split is compared against its own masks (the val and test lines previously
# re-checked the train lists, so they could never report a problem).
print(f"is len of train images == len of train masks: {len(images_train) == len(masks_train)}")
print(f"is len of val images == len of val masks: {len(images_val) == len(masks_val)}")
print(f"is len of test images == len of test masks: {len(images_test) == len(masks_test)}")

# Verify that image i and mask i refer to the same sample in every split, since the
# datasets pair them by position.
for split, imgs, msks in (
    ("train", images_train, masks_train),
    ("val", images_val, masks_val),
    ("test", images_test, masks_test),
):
    for img, msk in zip(imgs, msks):
        if Path(img).stem != Path(msk).stem:
            print("Error!")
            raise AssertionError(f"{split}: image {img} does not match mask {msk}")

images_train:  [PosixPath('Barcodes_v1.1_merged/train/Images/0011210009585_1.jpg')]
masks_train:  [PosixPath('Barcodes_v1.1_merged/train/Mask/0_0_0.png')]
is len of train images == len of train masks: True
is len of val images == len of val masks: True
is len of test images == len of test masks: True


In [12]:
# Datasets: the train split uses the default Transforms(), val and test use the "val" segment.
train_dataset = SegmentationDataSet(
    images=images_train, masks=masks_train,
    transform=Transforms(), preprocessing=preprocessing_unet,
)
val_dataset = SegmentationDataSet(
    images=images_val, masks=masks_val,
    transform=Transforms(segment="val"), preprocessing=preprocessing_unet,
)
test_dataset = SegmentationDataSet(
    images=images_test, masks=masks_test,
    transform=Transforms(segment="val"), preprocessing=preprocessing_unet,
)

# Training loader: incomplete final batch is dropped; shuffle is left at the DataLoader default.
train_dl_unet = DataLoader(train_dataset, BATCH_SIZE, pin_memory=False, num_workers=4, drop_last=True)

# Evaluation loaders: fixed order, every sample kept.
val_dl_unet = DataLoader(val_dataset, BATCH_SIZE, pin_memory=False, shuffle=False, num_workers=4, drop_last=False)
test_dl_unet = DataLoader(test_dataset, BATCH_SIZE, pin_memory=False, shuffle=False, num_workers=4, drop_last=False)

In [12]:
# Evaluate the FP32 network on the test split using all visible GPUs.
runner_unet = Runner(model=model_unet_fp32, classes=["barcode"])

trainer = pl.Trainer(gpus=-1)
score = trainer.test(runner_unet, dataloaders=test_dl_unet)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test/Classification Loss': 0.0664256140589714,
 'Test/Classification Loss_epoch': 0.0664256140589714,
 'Test/IOUScore': 0.9522693157196045,
 'Test/mAP': {'map': tensor(0.5790, device='cuda:0'),
              'map_50': tensor(0.8629, device='cuda:0'),
              'map_75': tensor(0.6746, device='cuda:0'),
              'map_large': tensor(0.7689, device='cuda:0'),
              'map_medium': tensor(0.7370, device='cuda:0'),
              'map_per_class': tensor(-1., device='cuda:0'),
              'map_small': tensor(0.4314, device='cuda:0'),
              'mar_1': tensor(0.6757, device='cuda:0'),
              'mar_10': tensor(0.7186, device='cuda:0'),
              'mar_100': tensor(0.7186, device='cuda:0'),
              'mar_100_per_class': tensor(-1., device='cuda:0'),
              'mar_large': tensor(0.8727, device='cuda:0'),
              'mar_medium': tensor(0.8234, de

Export the FP32 model to ONNX, which is supported by OpenVINO™ Toolkit, to benchmark it in comparison with the INT8 model.

In [13]:
# Use the non-memory-efficient swish in the encoder before tracing, and place the
# model on the selected device.
model_unet_fp32.encoder.set_swish(memory_efficient=False)
model_unet_fp32 = model_unet_fp32.to(device)

# Random single-image batch that only fixes the traced input shape.
dummy_input = torch.randn(1, 3, *IMAGE_SIZE).to(device)

torch.onnx.export(
    model_unet_fp32,
    dummy_input,
    fp32_onnx_path,
    opset_version=10,
)
print(f"FP32 ONNX model was exported to {fp32_onnx_path}.")

Configure NNCF parameters to specify compression

In [15]:
# NNCF settings: quantization with range initialisation and batch-norm adaptation.
nncf_config_dict = {
    "input_info": {"sample_size": [1, 3, *IMAGE_SIZE]},
    "log_dir": str(OUTPUT_DIR),
    "compression": {
        "algorithm": "quantization",
        "initializer": {
            # number of samples used for each initialisation step
            "range": {"num_init_samples": 15000},
            "batchnorm_adaptation": {"num_bn_adaptation_samples": 4000},
        },
    },
}

# Wrap the plain dict in an NNCFConfig object.
nncf_config = NNCFConfig.from_dict(nncf_config_dict)

In [16]:
# Register the training loader with the config so the initializers configured
# above have data to draw from.
nncf_config = register_default_init_args(nncf_config, train_dl_unet)

Create a quantized model from a pre-trained FP32 model and configuration object.

In [17]:
# Returns a compression controller (used later for the ONNX export) and the compressed model.
# NOTE(review): confirm whether model_unet_fp32 itself is modified by this call.
compression_ctrl, model = create_compressed_model(model_unet_fp32, nncf_config)

In [19]:
# Evaluate the NNCF-compressed model on the test split, this time on CPU.
runner_unet = Runner(model=model, classes=["barcode"])

trainer = pl.Trainer(accelerator="cpu")
score = trainer.test(runner_unet, dataloaders=test_dl_unet)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'Test/Classification Loss': 0.0980178639292717,
 'Test/Classification Loss_epoch': 0.0980178639292717,
 'Test/IOUScore': 0.9451008439064026,
 'Test/mAP': {'map': tensor(0.6616),
              'map_50': tensor(0.8492),
              'map_75': tensor(0.7644),
              'map_large': tensor(0.8664),
              'map_medium': tensor(0.8352),
              'map_per_class': tensor(-1.),
              'map_small': tensor(0.4125),
              'mar_1': tensor(0.7536),
              'mar_10': tensor(0.7810),
              'mar_100': tensor(0.7810),
              'mar_100_per_class': tensor(-1.),
              'mar_large': tensor(0.9091),
              'mar_medium': tensor(0.9019),
              'mar_small': tensor(0.4940)}}
--------------------------------------------------------------------------------


**IOUScore of fp32 model**: 0.9523  
**IOUScore of int8 model**: 0.9451

Export INT8 model to ONNX

In [37]:
# Silence tracer and user warnings emitted while exporting.
warnings.filterwarnings(action="ignore", category=TracerWarning)
warnings.filterwarnings(action="ignore", category=UserWarning)

# Export the compressed model to ONNX through the NNCF controller.
compression_ctrl.export_model(int8_onnx_path)
print(f"INT8 ONNX model exported to {int8_onnx_path}.")

INT8 ONNX model exported to ./model_unet_int8.onnx.


In [41]:
# Batch of one, three channels, followed by the configured spatial size.
input_shape = [1, 3] + list(IMAGE_SIZE)
input_shape

[1, 3, 224, 224]

In [39]:
# Locations of the exported ONNX models (same values as in the configuration cell).
int8_onnx_path = "./model_unet_int8.onnx"
fp32_onnx_path = "./model_unet_fp32.onnx"

'$fp32_onnx_path'

In [16]:
# Display the preprocessing function to read off the mean/std used for normalisation.
preprocessing_fn_unet

functools.partial(<function preprocess_input at 0x7f0fd8a28950>, input_space='RGB', input_range=[0, 1], mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

In [22]:
!mo --input_model "model_unet_fp32.onnx" --input_shape "[1, 3, 224, 224]" --scale "255" --mean_values "[0.485, 0.456, 0.406]" --scale_values "[0.229, 0.224, 0.225]" --data_type FP16 --output_dir "output/fp32"

Model Optimizer arguments:
Common parameters:
	- Path to the Input Model: 	/home/jupyter/barcodes/segnet/model_unet_fp32.onnx
	- Path for generated IR: 	/home/jupyter/barcodes/segnet/output/fp32
	- IR output name: 	model_unet_fp32
	- Log level: 	ERROR
	- Batch: 	Not specified, inherited from the model
	- Input layers: 	Not specified, inherited from the model
	- Output layers: 	Not specified, inherited from the model
	- Input shapes: 	[1, 3, 224, 224]
	- Mean values: 	[123.675, 116.28 , 103.53]
	- Scale values: 	[58.395, 57.12 , 57.375]
	- Scale factor: 	Not specified
	- Precision of IR: 	FP16
	- Enable fusing: 	True
	- Enable grouped convolutions fusing: 	True
	- Move mean values to preprocess section: 	None
	- Reverse input channels: 	False
ONNX specific parameters:
	- Inference Engine found in: 	/opt/conda/lib/python3.7/site-packages/openvino
Inference Engine version: 	2021.4.2-3976-0943ed67223-refs/pull/539/head
Model Optimizer version: 	2021.4.2-3976-0943ed67223-refs/pull/539/head


In [23]:
!mo --input_model "model_unet_int8.onnx" --input_shape "[1, 3, 224, 224]" --scale "255" --mean_values "[0.485, 0.456, 0.406]" --scale_values "[0.229, 0.224, 0.225]" --data_type FP16 --output_dir "output/int8"

Model Optimizer arguments:
Common parameters:
	- Path to the Input Model: 	/home/jupyter/barcodes/segnet/model_unet_int8.onnx
	- Path for generated IR: 	/home/jupyter/barcodes/segnet/output/int8
	- IR output name: 	model_unet_int8
	- Log level: 	ERROR
	- Batch: 	Not specified, inherited from the model
	- Input layers: 	Not specified, inherited from the model
	- Output layers: 	Not specified, inherited from the model
	- Input shapes: 	[1, 3, 224, 224]
	- Mean values: 	[123.675, 116.28, 103.53]
	- Scale values: 	[58.395, 57.12, 57.375]
	- Scale factor: 	Not specified
	- Precision of IR: 	FP16
	- Enable fusing: 	True
	- Enable grouped convolutions fusing: 	True
	- Move mean values to preprocess section: 	None
	- Reverse input channels: 	False
ONNX specific parameters:
	- Inference Engine found in: 	/opt/conda/lib/python3.7/site-packages/openvino
Inference Engine version: 	2021.4.2-3976-0943ed67223-refs/pull/539/head
Model Optimizer version: 	2021.4.2-3976-0943ed67223-refs/pull/539/head
[ 

Next, we will measure the inference performance of the FP32 and INT8 models. To do this, we use the Benchmark Tool, OpenVINO’s inference performance measurement tool. By default, the Benchmark Tool runs inference for 60 seconds in asynchronous mode on CPU. It returns inference speed as latency (milliseconds per image) and throughput (frames per second) values.

In [31]:
def parse_benchmark_output(benchmark_output) -> None:
    """Print the summary lines of a ``benchmark_app`` run.

    Args:
        benchmark_output: the captured output, either as an iterable of lines
            (what ``output = !benchmark_app ...`` yields) or as a single string,
            which is split into lines first.

    Lines that are empty, start with ``[`` or start with two spaces are dropped;
    the remaining lines are printed one per line.
    """
    # A plain string would otherwise be iterated character by character.
    if isinstance(benchmark_output, str):
        lines = benchmark_output.splitlines()
    else:
        lines = benchmark_output
    parsed_output = [line for line in lines if line and not line.startswith(("[", "  "))]
    print(*parsed_output, sep='\n')


# Benchmark the IR generated from the FP32 model: CPU device, async API, 60 seconds.
# `!cmd` assignment captures the command's output as a list of lines.
print('Benchmark FP32 model (IR)')
benchmark_output = !benchmark_app -m "./output/fp32/model_unet_fp32.xml" -d CPU -api async -t 60
parse_benchmark_output(benchmark_output)

# Same benchmark for the IR generated from the INT8 model.
print('\nBenchmark INT8 model (IR)')
benchmark_output = !benchmark_app -m "./output/int8//model_unet_int8.xml" -d CPU -api async -t 60
parse_benchmark_output(benchmark_output)

Benchmark FP32 model (IR)
Count:      520 iterations
Duration:   60833.47 ms
Latency:    465.50 ms
Throughput: 8.55 FPS

Benchmark INT8 model (IR)
Count:      776 iterations
Duration:   60435.36 ms
Latency:    301.43 ms
Throughput: 12.84 FPS


In [30]:
# Ask the Inference Engine for the full name of the CPU the benchmarks ran on.
ie = IECore()
ie.get_metric(
    device_name="CPU",
    metric_name="FULL_DEVICE_NAME",
)

'           Intel(R) Xeon(R) CPU @ 2.30GHz'