```
!git clone https://github.com/amdegroot/ssd.pytorch.git
# Requires the checkpoint to be downloaded first (see the wget cell below).
!mv ssd300_mAP_77.43_v2.pth ssd.pytorch
import os
os.chdir('ssd.pytorch')
!ls
```

In [1]:
!wget https://s3.amazonaws.com/amdegroot-models/ssd300_mAP_77.43_v2.pth

--2021-11-21 17:27:24--  https://s3.amazonaws.com/amdegroot-models/ssd300_mAP_77.43_v2.pth
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.4.30
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.4.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 105151288 (100M) [application/x-www-form-urlencoded]
Saving to: ‘ssd300_mAP_77.43_v2.pth’


2021-11-21 17:27:25 (57.2 MB/s) - ‘ssd300_mAP_77.43_v2.pth’ saved [105151288/105151288]



In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn as nn
import torch.nn.init as init
from torch.autograd import Function, Variable
import os

import cv2
import numpy as np

In [2]:
# Layer specifications for the VGG backbone, keyed by input resolution.
# An integer is the output-channel count of a 3x3 conv; 'M' is a 2x2 max-pool
# and 'C' is the same pool with ceil_mode=True (see vgg()).
base = {
    '300': [
        64, 64, 'M',
        128, 128, 'M',
        256, 256, 256, 'C',
        512, 512, 512, 'M',
        512, 512, 512,
    ],
    '512': [],
}

# Per-channel mean subtracted from each image before it is fed to the network.
dataset_mean = (104, 117, 123)

In [3]:
class L2Norm(nn.Module):
    """Channel-wise L2 normalisation followed by a learnable per-channel scale.

    Every spatial position of the input is divided by its L2 norm taken over
    dim 1 (plus a small epsilon) and then multiplied by ``weight``.

    Args:
        n_channels: length of the learnable ``weight`` vector.
        scale: value every element of ``weight`` is initialised to.
    """

    def __init__(self, n_channels, scale):
        super(L2Norm, self).__init__()
        self.n_channels = n_channels
        # Store the value as given. The previous ``scale or None`` turned a
        # scale of 0 into None, which constant_() cannot fill a tensor with.
        self.gamma = scale
        self.eps = 1e-10
        # torch.empty makes the "uninitialised until reset_parameters()" intent
        # explicit (replaces the legacy torch.Tensor(n) constructor).
        self.weight = nn.Parameter(torch.empty(self.n_channels))
        self.reset_parameters()

    def reset_parameters(self):
        """Fill ``weight`` with the initial scale ``gamma``."""
        init.constant_(self.weight, self.gamma)

    def forward(self, x):
        """Return ``x`` normalised over dim 1 and scaled by ``weight``.

        The unsqueeze pattern below places ``weight`` on dim 1 of a 4-D tensor.
        """
        # eps is added after the sqrt so an all-zero position divides by eps
        # instead of by zero.
        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
        # Out-of-place division: the caller's tensor is left unmodified.
        x = torch.div(x, norm)
        out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x
        return out

In [4]:
class SSD(nn.Module):
    """VGG backbone of SSD, used here purely as a feature extractor.

    Only the VGG layers are executed; ``forward`` returns the activation of
    the last layer in ``base`` (the ReLU after conv7 when the list is built by
    ``vgg()``).

    Args:
        base: iterable of ``nn.Module`` layers, applied in order.
    """

    def __init__(self, base):
        super(SSD, self).__init__()

        # SSD network
        self.vgg = nn.ModuleList(base)

        # Not used by forward(); kept so the module's state-dict keys are
        # unchanged and the checkpoint keeps loading without missing keys.
        self.L2Norm = L2Norm(512, 20)

    def forward(self, x):
        """Run ``x`` through every VGG layer in order and return the result.

        The earlier version also evaluated ``self.L2Norm`` on the conv4_3
        activation and discarded the result; that dead computation is gone.
        Iterating the list directly also removes the hard-coded split at
        layer 23, which raised IndexError for shorter layer lists.
        """
        for layer in self.vgg:
            x = layer(x)
        return x

In [5]:
# This function is derived from torchvision VGG make_layers()
# https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py
def vgg(cfg, i, batch_norm=False):
    """Build the ordered list of VGG layers used as the SSD backbone.

    Args:
        cfg: sequence of layer specs. An integer adds a 3x3 conv (padding 1)
            with that many output channels followed by a ReLU; 'M' adds a
            2x2/stride-2 max-pool; 'C' adds the same pool with ceil_mode=True.
        i: number of input channels of the first conv.
        batch_norm: when true, a BatchNorm2d is placed between each conv from
            ``cfg`` and its ReLU.

    Returns:
        A list of modules: the layers described by ``cfg`` followed by pool5,
        the dilated conv6, ReLU, the 1x1 conv7 and a final ReLU.
    """
    modules = []
    channels = i
    for spec in cfg:
        if spec == 'C':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True))
            continue
        if spec == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue

        # Anything else is the output-channel count of a conv block.
        modules.append(nn.Conv2d(channels, spec, kernel_size=3, padding=1))
        if batch_norm:
            modules.append(nn.BatchNorm2d(spec))
        modules.append(nn.ReLU(inplace=True))
        channels = spec

    # Fixed tail appended after the configurable part.
    modules.append(nn.MaxPool2d(kernel_size=3, stride=1, padding=1))
    modules.append(nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6))
    modules.append(nn.ReLU(inplace=True))
    modules.append(nn.Conv2d(1024, 1024, kernel_size=1))
    modules.append(nn.ReLU(inplace=True))
    return modules

In [6]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
# Build the backbone from the '300' layer spec with 3 input channels.
the_model = SSD(vgg(base['300'], 3))

In [8]:
# strict=False: the checkpoint also contains the extras/loc/conf heads, which
# this backbone-only model does not define.
# map_location='cpu' lets a checkpoint saved from GPU tensors load on any
# machine; the model is moved to `device` in a later cell.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a
# trusted source.
the_model.load_state_dict(
    torch.load('ssd300_mAP_77.43_v2.pth', map_location='cpu'),
    strict=False,
)

_IncompatibleKeys(missing_keys=[], unexpected_keys=['extras.0.weight', 'extras.0.bias', 'extras.1.weight', 'extras.1.bias', 'extras.2.weight', 'extras.2.bias', 'extras.3.weight', 'extras.3.bias', 'extras.4.weight', 'extras.4.bias', 'extras.5.weight', 'extras.5.bias', 'extras.6.weight', 'extras.6.bias', 'extras.7.weight', 'extras.7.bias', 'loc.0.weight', 'loc.0.bias', 'loc.1.weight', 'loc.1.bias', 'loc.2.weight', 'loc.2.bias', 'loc.3.weight', 'loc.3.bias', 'loc.4.weight', 'loc.4.bias', 'loc.5.weight', 'loc.5.bias', 'conf.0.weight', 'conf.0.bias', 'conf.1.weight', 'conf.1.bias', 'conf.2.weight', 'conf.2.bias', 'conf.3.weight', 'conf.3.bias', 'conf.4.weight', 'conf.4.bias', 'conf.5.weight', 'conf.5.bias'])

In [9]:
# Move the model's parameters to the selected device; the returned module's
# repr is what the cell displays.
the_model.to(device)

SSD(
  (vgg): ModuleList(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, cei

In [10]:
# Extract backbone features for every image listed in the VOC2012 train split
# and store them as one .npy file per image.
train_out_dir = './data/coco_voc/train2012/'
# np.save does not create missing directories, so make sure the target exists.
os.makedirs(train_out_dir, exist_ok=True)

with torch.no_grad(), open('/mnt/data/VOCdevkit/VOC2012/ImageSets/Main/train.txt', 'r') as fp:

    for i, line in enumerate(fp):

        line = line.strip()
        if not line:
            continue

        img_path = '/mnt/data/VOCdevkit/VOC2012/JPEGImages/' + line + '.jpg'
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None for a missing or unreadable file; fail
            # with the offending path instead of an opaque error from resize.
            raise FileNotFoundError(f'could not read image: {img_path}')

        img = cv2.resize(img, (300, 300)).astype(np.float32)
        # In-place subtraction keeps the float32 dtype, so no second cast is needed.
        img -= dataset_mean
        # to rgb
        img = img[:, :, (2, 1, 0)]
        x = torch.as_tensor(img, device=device).permute(2, 0, 1)
        y = the_model(x.unsqueeze(0))
        np.save(train_out_dir + line + '.npy', y.detach().cpu().numpy())

        if i % 100 == 0:
            print(f'{i} images processed')

0 images processed
100 images processed
200 images processed
300 images processed
400 images processed
500 images processed
600 images processed
700 images processed
800 images processed
900 images processed
1000 images processed
1100 images processed
1200 images processed
1300 images processed
1400 images processed
1500 images processed
1600 images processed
1700 images processed
1800 images processed
1900 images processed
2000 images processed
2100 images processed
2200 images processed
2300 images processed
2400 images processed
2500 images processed
2600 images processed
2700 images processed
2800 images processed
2900 images processed
3000 images processed
3100 images processed
3200 images processed
3300 images processed
3400 images processed
3500 images processed
3600 images processed
3700 images processed
3800 images processed
3900 images processed
4000 images processed
4100 images processed
4200 images processed
4300 images processed
4400 images processed
4500 images processed


In [12]:
# Extract backbone features for every image listed in the VOC2012 val split
# and store them as one .npy file per image.
# NOTE(review): this cell mirrors the train-split cell; consider factoring both
# into one function parameterised by split file, output directory and log interval.
val_out_dir = './data/coco_voc/val2012/'
# np.save does not create missing directories, so make sure the target exists.
os.makedirs(val_out_dir, exist_ok=True)

with torch.no_grad(), open('/mnt/data/VOCdevkit/VOC2012/ImageSets/Main/val.txt', 'r') as fp:

    for i, line in enumerate(fp):

        line = line.strip()
        if not line:
            continue

        img_path = '/mnt/data/VOCdevkit/VOC2012/JPEGImages/' + line + '.jpg'
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None for a missing or unreadable file; fail
            # with the offending path instead of an opaque error from resize.
            raise FileNotFoundError(f'could not read image: {img_path}')

        img = cv2.resize(img, (300, 300)).astype(np.float32)
        # In-place subtraction keeps the float32 dtype, so no second cast is needed.
        img -= dataset_mean
        # to rgb
        img = img[:, :, (2, 1, 0)]
        x = torch.as_tensor(img, device=device).permute(2, 0, 1)
        y = the_model(x.unsqueeze(0))
        np.save(val_out_dir + line + '.npy', y.detach().cpu().numpy())

        if i % 500 == 0:
            print(f'{i} images processed')

0 images processed
500 images processed
1000 images processed
1500 images processed
2000 images processed
2500 images processed
3000 images processed
3500 images processed
4000 images processed
4500 images processed
5000 images processed
5500 images processed


## Testing stuff below
**Stop running here.** The cells below are scratch checks on a single image.
(They can be removed in later iterations.)

In [10]:
# Load one sample image and apply the same preprocessing as the extraction loops.
sample_path = '/mnt/data/VOCdevkit/VOC2012/JPEGImages/2012_002961.jpg'
img = cv2.imread(sample_path)
if img is None:
    # cv2.imread returns None for a missing or unreadable file.
    raise FileNotFoundError(f'could not read image: {sample_path}')
img = cv2.resize(img, (300, 300)).astype(np.float32)
# In-place subtraction keeps the float32 dtype, so no second cast is needed.
img -= dataset_mean
# to rgb
img = img[:, :, (2, 1, 0)]

In [11]:
# Sanity check: (height, width, channels) after the 300x300 resize.
img.shape

(300, 300, 3)

In [12]:
# HWC -> CHW, then move the input to the device the model lives on; a CPU
# tensor fed to a model on another device raises a device-mismatch error.
x = torch.from_numpy(img).permute(2, 0, 1).to(device)

In [13]:
# Inference only: skip autograd bookkeeping, as the extraction loops do.
with torch.no_grad():
    # Add the batch dimension the model expects.
    y = the_model(x.unsqueeze(0))

In [14]:
# Shape of the returned feature map: (batch, channels, height, width).
y.shape

torch.Size([1, 1024, 19, 19])