In [1]:
from vision.ssd.mobilenetv1_ssd import create_mobilenetv1_ssd, create_mobilenetv1_ssd_predictor

In [2]:
import torch
from glob import glob
import cv2
from vision.ssd.data_preprocessing import PredictionTransform
import numpy as np

In [3]:
label_path = 'models/voc.txt'
class_names = [name.strip() for name in open(label_path).readlines()]

In [4]:
net = create_mobilenetv1_ssd(len(class_names), is_test=True, device='cpu', quantized=True)
net.load('models/mobilenet-v1-ssd-mp-0_675.pth')
net.cpu().eval()

  self.load_state_dict(torch.load(model, map_location=lambda storage, loc: storage))


SSD(
  (base_net): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (1): Sequential(
      (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace=True)
    )
    (2): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(64, 128, kernel_size=(1, 1), st

In [5]:
transform = PredictionTransform(300, np.array([127, 127, 127]), 128.0)

orig_image = cv2.imread("imgs/photo_2024-11-20_15-40-16.jpg")
image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB)

In [6]:
predictor = create_mobilenetv1_ssd_predictor(net, candidate_size=200, device='cpu')

In [7]:
boxes, labels, probs = predictor.predict(image, 10, 0.4)

Inference time:  0.10707521438598633


In [8]:
boxes

tensor([[ 85.8516, 223.7919, 227.4944, 370.2946],
        [235.7860,   0.6387, 499.8279, 377.8770]])

In [9]:
predictor.trace(image, "checkpoints/traced_m_v1.pt", onnx=False)

Trace time:  1.5798430442810059


In [10]:
net

SSD(
  (base_net): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (1): Sequential(
      (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace=True)
    )
    (2): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(64, 128, kernel_size=(1, 1), st

In [11]:
ffuse_list = [
    ["base_net.0.0", "base_net.0.1", "base_net.0.2"],

    ["base_net.1.0", "base_net.1.1", "base_net.1.2"],
    ["base_net.1.3", "base_net.1.4", "base_net.1.5"],

    ["base_net.2.0", "base_net.2.1", "base_net.2.2"],
    ["base_net.2.3", "base_net.2.4", "base_net.2.5"],

    ["base_net.3.0", "base_net.3.1", "base_net.3.2"],
    ["base_net.3.3", "base_net.3.4", "base_net.3.5"],

    ["base_net.4.0", "base_net.4.1", "base_net.4.2"],
    ["base_net.4.3", "base_net.4.4", "base_net.4.5"],   

    ["base_net.5.0", "base_net.5.1", "base_net.5.2"],
    ["base_net.5.3", "base_net.5.4", "base_net.5.5"], 

    ["base_net.6.0", "base_net.6.1", "base_net.6.2"],
    ["base_net.6.3", "base_net.6.4", "base_net.6.5"],

    ["base_net.7.0", "base_net.7.1", "base_net.7.2"],
    ["base_net.7.3", "base_net.7.4", "base_net.7.5"],

    ["base_net.8.0", "base_net.8.1", "base_net.8.2"],
    ["base_net.8.3", "base_net.8.4", "base_net.8.5"],

    ["base_net.9.0", "base_net.9.1", "base_net.9.2"],
    ["base_net.9.3", "base_net.9.4", "base_net.9.5"],

    ["base_net.10.0", "base_net.10.1", "base_net.10.2"],
    ["base_net.10.3", "base_net.10.4", "base_net.10.5"],

    ["base_net.11.0", "base_net.11.1", "base_net.11.2"],
    ["base_net.11.3", "base_net.11.4", "base_net.11.5"],

    ["base_net.12.0", "base_net.12.1", "base_net.12.2"],
    ["base_net.12.3", "base_net.12.4", "base_net.12.5"],

    ["base_net.13.0", "base_net.13.1", "base_net.13.2"],
    ["base_net.13.3", "base_net.13.4", "base_net.13.5"],

    ['extras.0.0', 'extras.0.1'],
    ['extras.0.2', 'extras.0.3'],

    ['extras.1.0', 'extras.1.1'],
    ['extras.1.2', 'extras.1.3'],

    ['extras.2.0', 'extras.2.1'],
    ['extras.2.2', 'extras.2.3'],

    ['extras.3.0', 'extras.3.1'],
    ['extras.3.2', 'extras.3.3'],
]

torch.quantization.fuse_modules(net, ffuse_list, inplace=True)

SSD(
  (base_net): Sequential(
    (0): Sequential(
      (0): ConvReLU2d(
        (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
      (1): Identity()
      (2): Identity()
    )
    (1): Sequential(
      (0): ConvReLU2d(
        (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32)
        (1): ReLU(inplace=True)
      )
      (1): Identity()
      (2): Identity()
      (3): ConvReLU2d(
        (0): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1))
        (1): ReLU(inplace=True)
      )
      (4): Identity()
      (5): Identity()
    )
    (2): Sequential(
      (0): ConvReLU2d(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64)
        (1): ReLU(inplace=True)
      )
      (1): Identity()
      (2): Identity()
      (3): ConvReLU2d(
        (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1))
        (1): ReLU(inplace=True)
      )
      (4): Identit

In [12]:
predictor.trace(image, "checkpoints/traced_m_f_v1.pt", onnx=False)

Trace time:  1.487236499786377


In [13]:
net.qconfig = torch.quantization.get_default_qconfig("qnnpack")

In [14]:
for name, value in net.__dict__.items():
    print(f"{name}: {type(value)}")

training: <class 'bool'>
_parameters: <class 'collections.OrderedDict'>
_buffers: <class 'collections.OrderedDict'>
_non_persistent_buffers_set: <class 'set'>
_backward_pre_hooks: <class 'collections.OrderedDict'>
_backward_hooks: <class 'collections.OrderedDict'>
_is_full_backward_hook: <class 'NoneType'>
_forward_hooks: <class 'collections.OrderedDict'>
_forward_hooks_with_kwargs: <class 'collections.OrderedDict'>
_forward_hooks_always_called: <class 'collections.OrderedDict'>
_forward_pre_hooks: <class 'collections.OrderedDict'>
_forward_pre_hooks_with_kwargs: <class 'collections.OrderedDict'>
_state_dict_hooks: <class 'collections.OrderedDict'>
_state_dict_pre_hooks: <class 'collections.OrderedDict'>
_load_state_dict_pre_hooks: <class 'collections.OrderedDict'>
_load_state_dict_post_hooks: <class 'collections.OrderedDict'>
_modules: <class 'collections.OrderedDict'>
num_classes: <class 'int'>
source_layer_indexes: <class 'list'>
is_test: <class 'bool'>
config: <class 'module'>
quan

In [15]:
model_fp32_prepared = torch.quantization.prepare(net, inplace=True)

In [16]:
model_fp32_prepared

SSD(
  (base_net): Sequential(
    (0): Sequential(
      (0): ConvReLU2d(
        (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (1): ReLU(inplace=True)
        (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
      )
      (1): Identity()
      (2): Identity()
    )
    (1): Sequential(
      (0): ConvReLU2d(
        (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32)
        (1): ReLU(inplace=True)
        (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
      )
      (1): Identity()
      (2): Identity()
      (3): ConvReLU2d(
        (0): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1))
        (1): ReLU(inplace=True)
        (activation_post_process): HistogramObserver(min_val=inf, max_val=-inf)
      )
      (4): Identity()
      (5): Identity()
    )
    (2): Sequential(
      (0): ConvReLU2d(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), 

In [17]:
images = glob("imgs/*")
images

['imgs/photo_2024-11-20_15-40-16.jpg',
 'imgs/photo_2024-11-20_15-40-15.jpg',
 'imgs/photo_2024-11-20_15-40-12.jpg',
 'imgs/photo_2024-11-20_15-40-09.jpg',
 'imgs/photo_2024-11-20_15-40-14.jpg']

In [18]:
for image_c in images:
    image_c = cv2.imread(image_c)
    image_c = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB)
    image_c = transform(image_c)
    image_c = image_c.unsqueeze(0).cpu()
    with torch.no_grad():
        out = model_fp32_prepared(image_c) 

In [19]:
model_fp32_prepared

SSD(
  (base_net): Sequential(
    (0): Sequential(
      (0): ConvReLU2d(
        (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (1): ReLU(inplace=True)
        (activation_post_process): HistogramObserver(min_val=0.0, max_val=2.597787857055664)
      )
      (1): Identity()
      (2): Identity()
    )
    (1): Sequential(
      (0): ConvReLU2d(
        (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32)
        (1): ReLU(inplace=True)
        (activation_post_process): HistogramObserver(min_val=0.0, max_val=5.082990646362305)
      )
      (1): Identity()
      (2): Identity()
      (3): ConvReLU2d(
        (0): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1))
        (1): ReLU(inplace=True)
        (activation_post_process): HistogramObserver(min_val=0.0, max_val=6.782979965209961)
      )
      (4): Identity()
      (5): Identity()
    )
    (2): Sequential(
      (0): ConvReLU2d(
        (0): Conv2d(64, 64, kernel_size=

In [20]:
model_quantized = torch.quantization.convert(model_fp32_prepared, inplace=True)

In [21]:
predictor = create_mobilenetv1_ssd_predictor(model_quantized, candidate_size=200, device='cpu')
boxes, labels, probs = predictor.predict(image, 10, 0.3)

Inference time:  0.06466794013977051


In [22]:
predictor.trace(image, "checkpoints/traced_m_f_q_v1.pt", onnx=False)

Trace time:  2.215386152267456


In [23]:
boxes

tensor([[ 86.3826, 224.7834, 227.4442, 370.9722],
        [238.4839,  -2.0321, 497.7625, 377.0321]])