## Demo 

### Directory

First look at the project directory:

*VOCdevkit:* holds the training data

*weights:* the weights file

*Config.py:* some default configuration

*Test.py:* test the recognition of a single photo

*Train.py:* the py file for training

*augmentation.py:* data augmentation; its main purpose is to expand the training data

*detection.py:* filters the raw recognition results; it is called from Test.py

*l2norm.py:* performs L2 normalisation

*loss_function.py:* compute the loss function

*ssd_net_vgg.py:* implementation of the ssd model

*utils.py:* tool classes

*voc0712.py:* a custom dataset class that extracts and normalizes the data from the VOC dataset

*visdom_op.py:* define visdom class to visualize the training process

### Build Model

We build the model in the file *ssd_net_vgg.py*, following the logic of the paper we implement.

### Calculate Default Box

The code is in the *utils.py* file:

In [None]:
def default_prior_box():
    """Build the default (prior) boxes for every layer in ``Config.feature_map``.

    For each cell ``(row, col)`` of an ``f x f`` feature map, boxes are emitted
    as ``[cx, cy, w, h]`` in this order:

    * a square of side ``s_k = Config.sk[k] / Config.image_size``;
    * a square of side ``sqrt(s_k * Config.sk[k + 1] / Config.image_size)``;
    * for every ``ar`` in ``Config.aspect_ratios[k]``, a box of size
      ``(s_k * sqrt(ar), s_k / sqrt(ar))`` followed by its transpose.

    Returns:
        A list with one tensor per feature layer. Each tensor is viewed as
        ``(f, f, -1)``, moved to CUDA when ``Config.use_cuda`` is set, and
        clamped in place to the range [0, 1].
    """
    layers = []
    for level, fmap_size in enumerate(Config.feature_map):
        # These three values depend only on the layer, not on the cell.
        cells_per_side = Config.image_size / Config.steps[level]
        scale = Config.sk[level] / Config.image_size
        next_scale = sqrt(scale * Config.sk[level + 1] / Config.image_size)

        values = []
        for row, col in product(range(fmap_size), repeat=2):
            cx = (col + 0.5) / cells_per_side
            cy = (row + 0.5) / cells_per_side

            values.extend((cx, cy, scale, scale))
            values.extend((cx, cy, next_scale, next_scale))
            for ar in Config.aspect_ratios[level]:
                root = sqrt(ar)
                values.extend((cx, cy, scale * root, scale / root))
                values.extend((cx, cy, scale / root, scale * root))

        boxes = torch.Tensor(values)
        if Config.use_cuda:
            boxes = boxes.cuda()
        boxes = boxes.view(fmap_size, fmap_size, -1).contiguous()
        boxes.clamp_(max=1, min=0)
        layers.append(boxes)

    return layers

The function generates the default boxes, matching the number given in the paper. The final output is a list of 6 tensors, one per feature layer, each holding the default boxes produced for that layer.

### Calculate Loss 

The loss function is in *loss_function.py*; its core part is:

In [None]:
class LossFun(nn.Module):
    """SSD training loss: localisation loss plus confidence loss.

    The localisation term is a summed smooth-L1 loss over the priors matched to
    a ground-truth box. The confidence term is a summed cross-entropy over
    those positive priors plus hard-mined negatives (at most three negatives
    per positive, per image). Both terms are divided by the number of positive
    priors in the batch.
    """

    def __init__(self):
        super().__init__()

    def forward(self, prediction, targets, priors_boxes):
        """Compute the two loss terms for one batch.

        Args:
            prediction: pair ``(loc_data, conf_data)``; each is an iterable of
                per-layer tensors whose first dimension is the batch size.
            targets: per-image tensors; all columns but the last are used as
                the ground-truth box, the last column as the class label.
            priors_boxes: iterable of per-layer prior-box tensors, each
                reshaped here to ``(-1, 4)``.

        Returns:
            ``(loss_l, loss_c)`` as double-precision scalars: the localisation
            and confidence losses, each divided by the number of positives.
        """
        loc_data, conf_data = prediction
        # Flatten the per-layer outputs into (batch, num_priors, ...) tensors.
        loc_data = torch.cat([o.view(o.size(0), -1, 4) for o in loc_data], 1)
        conf_data = torch.cat([o.view(o.size(0), -1, Config.class_num) for o in conf_data], 1)
        priors_boxes = torch.cat([o.view(-1, 4) for o in priors_boxes], 0)
        if Config.use_cuda:
            loc_data = loc_data.cuda()
            conf_data = conf_data.cuda()
            priors_boxes = priors_boxes.cuda()

        batch_num = loc_data.size(0)  # batch size
        box_num = loc_data.size(1)    # number of default boxes per image

        # Per-prior regression targets and class targets. They are allocated
        # uninitialised; utils.match writes one row per image.
        target_loc = torch.Tensor(batch_num, box_num, 4)
        target_conf = torch.LongTensor(batch_num, box_num)
        if Config.use_cuda:
            target_loc = target_loc.cuda()
            target_conf = target_conf.cuda()

        # Match every image's ground truth against the priors; the results are
        # stored in target_loc[batch_id] and target_conf[batch_id].
        for batch_id in range(batch_num):
            target_truths = targets[batch_id][:, :-1].data
            target_labels = targets[batch_id][:, -1].data
            if Config.use_cuda:
                target_truths = target_truths.cuda()
                target_labels = target_labels.cuda()
            utils.match(0.5, target_truths, priors_boxes, target_labels,
                        target_loc, target_conf, batch_id)

        # Positive priors are those assigned a non-background class (> 0).
        pos = target_conf > 0
        pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)

        # Localisation loss over positive priors only.
        pre_loc_xij = loc_data[pos_idx].view(-1, 4)
        tar_loc_xij = target_loc[pos_idx].view(-1, 4)
        # reduction='sum' replaces the deprecated size_average=False.
        loss_loc = F.smooth_l1_loss(pre_loc_xij, tar_loc_xij, reduction='sum')

        batch_conf = conf_data.view(-1, Config.class_num)

        # Per-prior confidence loss, used only to rank negatives below.
        loss_c = utils.log_sum_exp(batch_conf) - batch_conf.gather(1, target_conf.view(-1, 1))
        loss_c = loss_c.view(batch_num, -1)
        # Zero the positives so that only negatives compete in the ranking.
        loss_c[pos] = 0

        # Hard negative mining: idx_rank[b, p] is the rank of prior p when the
        # priors of image b are sorted by descending loss.
        _, loss_idx = loss_c.sort(1, descending=True)
        _, idx_rank = loss_idx.sort(1)

        num_pos = pos.long().sum(1, keepdim=True)
        num_neg = torch.clamp(3 * num_pos, max=pos.size(1) - 1)
        neg = idx_rank < num_neg.expand_as(idx_rank)

        # Confidence loss over the union of positives and mined negatives.
        pos_idx = pos.unsqueeze(2).expand_as(conf_data)
        neg_idx = neg.unsqueeze(2).expand_as(conf_data)
        conf_p = conf_data[pos_idx | neg_idx].view(-1, Config.class_num)
        targets_weighted = target_conf[pos | neg]
        loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum')

        # Normalise by the number of positives. Clamp to 1 so that a batch
        # without any positive prior yields zero losses instead of NaN/inf.
        N = num_pos.sum().double().clamp(min=1)
        loss_l = loss_loc.double() / N
        loss_c = loss_c.double() / N
        return loss_l, loss_c


### Match function

We consider this function one of the more difficult parts.

In [None]:
def match(threshold, truths, priors, labels, loc_t, conf_t, idx):
    """Assign a ground-truth box and class to every prior of one image.

    Each prior is matched to the ground-truth box it overlaps most. In
    addition, every ground-truth box keeps the prior that overlaps it most,
    regardless of the threshold. Priors whose best overlap is below
    ``threshold`` are labelled 0 (background); all other priors get
    ``label + 1``. Nothing is returned: results are written into
    ``loc_t[idx]`` and ``conf_t[idx]``.

    Args:
        threshold: (float) minimum overlap for a prior to count as a match.
        truths: (tensor) ground-truth boxes, one row per object.
        priors: (tensor) default boxes, one row per prior.
        labels: (tensor) class label of each ground-truth box.
        loc_t: (tensor) output; row ``idx`` receives the encoded location
            target for each prior.
        conf_t: (tensor) output; row ``idx`` receives the class target for
            each prior.
        idx: (int) index of the current image within the batch.
    """
    # Overlap matrix, indexed [ground truth, prior]. point_form presumably
    # converts priors to (x_min, y_min, x_max, y_max) -- confirm in the helper.
    iou = jaccard(truths, point_form(priors))

    # Best prior for each ground truth, and best ground truth for each prior.
    _, top_prior_per_truth = iou.max(1)
    top_iou_per_prior, top_truth_per_prior = iou.max(0)

    # Give each ground truth's best prior an overlap of 2 so that it always
    # survives the threshold test below.
    top_iou_per_prior.index_fill_(0, top_prior_per_truth, 2)

    # Point those priors at their ground truth. When one prior is the best for
    # several ground truths, the last one wins.
    for truth_id, prior_id in enumerate(top_prior_per_truth):
        top_truth_per_prior[prior_id] = truth_id

    matched_boxes = truths[top_truth_per_prior]        # [num_priors, 4]
    matched_labels = labels[top_truth_per_prior] + 1   # [num_priors]
    matched_labels[top_iou_per_prior < threshold] = 0  # background

    # encode() turns the matched boxes into regression targets relative to the
    # priors; (0.1, 0.2) are presumably the variances -- confirm in the helper.
    loc_t[idx] = encode(matched_boxes, priors, (0.1, 0.2))
    conf_t[idx] = matched_labels


### Training

The core part of training looks like this:

In [None]:
def train():
    """Train the SSD network on the VOC dataset and save checkpoints.

    Builds the dataset and loader, initialises the network (VGG weights are
    loaded from ``./weights/vgg16_reducedfc.pth``), then runs SGD until
    ``Config.max_iter`` iterations or 1000 epochs, whichever comes first.
    A checkpoint is written every 10000 iterations, and a final one to
    ``weights/ssd_voc_120000.pth`` when training ends.
    """
    dataset = voc0712.VOCDetection(root=Config.dataset_root,
                           transform=augmentations.SSDAugmentation(Config.image_size,
                                                     Config.MEANS))
    # NOTE(review): the sampler generator is created on CUDA unconditionally,
    # even when Config.use_cuda is False -- confirm this is intended.
    data_loader = data.DataLoader(dataset, Config.batch_size,
                                  num_workers=Config.data_load_number_worker,
                                  shuffle=True, collate_fn=detection_collate,
                                  pin_memory=True,generator=torch.Generator(device='cuda'))

    net = ssd_net_vgg.SSD()
    vgg_weights = torch.load('./weights/vgg16_reducedfc.pth')

    # visualization setting
    vis = setup_visdom()
    vis_step = 3

    # Initialise every layer first, then overwrite the VGG part with the
    # loaded weights.
    net.apply(weights_init)
    net.vgg.load_state_dict(vgg_weights)
    if Config.use_cuda:
        net = torch.nn.DataParallel(net)
        net = net.cuda()
    net.train()
    loss_fun = loss_function.LossFun()
    optimizer = optim.SGD(net.parameters(), lr=Config.lr, momentum=Config.momentum,
                          weight_decay=Config.weight_decacy)

    # The default boxes depend only on Config, so build them once instead of
    # on every training step.
    priors = utils.default_prior_box()

    iteration = 0  # global step counter across epochs
    step_index = 0
    before_epoch = -1
    for epoch in range(1000):
        for step, (img, target) in enumerate(data_loader):
            if Config.use_cuda:
                img = img.cuda()
                target = [ann.cuda() for ann in target]
            # NOTE(review): img is presumably already a tensor here, which
            # would make this conversion redundant -- confirm.
            img = torch.Tensor(img)
            loc_pre, conf_pre = net(img)
            optimizer.zero_grad()
            loss_l, loss_c = loss_fun((loc_pre, conf_pre), target, priors)
            loss = loss_l + loss_c
            loss.backward()
            optimizer.step()
            # Log every third iteration and on the first step of each epoch.
            if iteration % 3 == 0 or before_epoch != epoch:
                print('epoch : ',epoch,' iter : ',iteration,' step : ',step,' loss : ',loss.data,'loss_l:',loss_l.data,'loss_c:',loss_c.data)
                before_epoch = epoch
            iteration += 1

            if vis and iteration % vis_step == 0:
                visdom_line(vis, y=[loss], x=iteration, win_name='loss')
                visdom_line(vis, y=[loss_c], x=iteration, win_name='loss_c')
                visdom_line(vis, y=[loss_l], x=iteration, win_name='loss_l')
            if iteration in Config.lr_steps:
                step_index += 1
                adjust_learning_rate(optimizer, Config.gamma, step_index)
            if iteration % 10000 == 0 and iteration != 0:
                torch.save(net.state_dict(), 'weights/ssd300_VOC_' +
                           repr(iteration) + '.pth')
            # Stop as soon as the budget is reached rather than finishing the
            # current epoch.
            if iteration >= Config.max_iter:
                break
        if iteration >= Config.max_iter:
            break
    torch.save(net.state_dict(), 'weights/ssd_voc_120000.pth')

### Test for one image

**Run this part of the code and you will see the detection result on an image. Note that the Jupyter notebook file must be placed in the project's root directory.**

In [None]:
import torch

from torch.autograd import Variable
from detection import *
from ssd_net_vgg import *
from voc0712 import *
import torch.nn as nn
import numpy as np
import cv2
import utils
# With a GPU present, make CUDA float tensors the default tensor type.
if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
# One BGR colour per class index, used for the bounding-box outlines.
colors_tableau = [(255, 255, 255), (31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
                 (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
                 (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
                 (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
                 (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229),(158, 218, 229),(158, 218, 229)]

net = SSD()    # initialize SSD
# Wrap in DataParallel before loading; presumably the checkpoint was saved from
# a wrapped model, so its keys carry the `module.` prefix -- confirm.
net = torch.nn.DataParallel(net)
net.eval()
net.load_state_dict(torch.load('./weights/ssd300_VOC_120000.pth',map_location=lambda storage, loc: storage))
img_id = 60
image = cv2.imread('./test1.jpg', cv2.IMREAD_COLOR)
# cv2.imread returns None instead of raising when the file cannot be read.
if image is None:
    raise FileNotFoundError("could not read image './test1.jpg'")
x = cv2.resize(image, (300, 300)).astype(np.float32)
# Subtract the per-channel means while the image is still in cv2's BGR order,
# then reverse the channel axis.
x -= (104.0, 117.0, 123.0)
x = x.astype(np.float32)
x = x[:, :, ::-1].copy()
# plt.imshow(x)
x = torch.from_numpy(x).permute(2, 0, 1)
xx = Variable(x.unsqueeze(0))     # wrap tensor in Variable
if torch.cuda.is_available():
    xx = xx.cuda()
softmax = nn.Softmax(dim=-1)
# NOTE(review): this cell uses lowercase `config` while the training code uses
# `Config`; it must come from one of the star imports -- confirm.
detect = Detect(config.class_num, 0, 200, 0.01, 0.45)
priors = utils.default_prior_box()

# Inference only: no autograd graph is needed.
with torch.no_grad():
    y = net(xx)

    loc,conf = y
    loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
    conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

    detections = detect.forward(loc.view(loc.size(0), -1, 4),softmax(conf.view(conf.size(0), -1,config.class_num)), torch.cat([o.view(-1, 4) for o in priors], 0)).data

labels = VOC_CLASSES
top_k=10

# plt.imshow(rgb_image)  # plot the image for matplotlib

# scale each detection back up to the image
scale = torch.Tensor(image.shape[1::-1]).repeat(2)
slots_per_class = detections.size(2)
for i in range(detections.size(1)):
    j = 0
    # Walk the detections of class i while the score stays at or above 0.4.
    # j is bounded so that a class whose every slot passes the threshold
    # cannot index past the end.
    while j < slots_per_class and detections[0,i,j,0] >= 0.4:
        score = detections[0,i,j,0]
        label_name = labels[i-1]
        display_txt = '%s: %.2f'%(label_name, score)
        pt = (detections[0,i,j,1:]*scale).cpu().numpy()
        color = colors_tableau[i]
        cv2.rectangle(image,(int(pt[0]),int(pt[1])), (int(pt[2]),int(pt[3])), color, 2)
        cv2.putText(image, display_txt, (int(pt[0]), int(pt[1]) + 10), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255,255,255), 1, 8)
        j+=1
cv2.imshow('test',image)
cv2.waitKey(100000)


The input image is ![avatar](test1.jpg)

The output image is ![avatar](result.jpg)