# Import Packages

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import numpy as np
import pandas as pd
import cv2

# Load Config File

In [2]:
def load_config(path='yolov3.cfg'):
    """Parse an INI-like network config file into a list of section dicts.

    Each ``[section]`` header starts a new dict whose ``'type'`` entry is the
    text between the brackets; every following ``key = value`` line is stored
    in that dict with both sides stripped (values stay strings).

    Args:
        path: Location of the config file. Defaults to ``'yolov3.cfg'`` in the
            current working directory.

    Returns:
        The section dicts in file order. Blank lines and lines whose first
        non-blank character is ``#`` are ignored.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-header line contains no ``=``.
    """
    modules = []
    module = {}
    with open(path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            # Strip BEFORE filtering so whitespace-only lines and indented
            # comments are skipped instead of reaching the parser below.
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            if line.startswith('['):
                if module:
                    modules.append(module)
                module = {'type': line[1:-1]}
            else:
                # Split on the first '=' only so values may contain '='.
                key, value = line.split('=', 1)
                module[key.strip()] = value.strip()
    if module:
        modules.append(module)
    return modules

# Parsed config sections; element 0 is skipped when the network layers are built (YOLO.create_modules).
modules = load_config()

# Create Model

## Define model

In [3]:
class ConvLayer(nn.Module):
    """Convolution stage built from one ``convolutional`` config section.

    Sub-modules are registered, in this order, as ``conv``, ``norm`` (only
    when the section has a ``batch_normalize`` key) and ``leaky`` (only when
    ``activation`` is ``'leaky'``). ``out_f`` holds the output channel count.
    """

    def __init__(self, idx, in_f, module):
        """Build the stage.

        Args:
            idx: Position of the layer in the network (not used here).
            in_f: Number of input channels.
            module: Config dict with string values for ``filters``, ``size``,
                ``stride`` and ``activation``.
        """
        super().__init__()
        self.create_module(idx, in_f, module)

    def create_module(self, idx, in_f, module):
        """Create and register the conv / norm / activation sub-modules."""
        self.out_f = int(module['filters'])
        k = int(module['size'])
        step = int(module['stride'])
        has_bn = 'batch_normalize' in module

        # The convolution carries its own bias only when no batch norm follows.
        self.conv = nn.Conv2d(
            in_channels=in_f,
            out_channels=self.out_f,
            kernel_size=k,
            stride=step,
            padding=(k - 1) // 2,
            bias=not has_bn,
        )
        if has_bn:
            self.norm = nn.BatchNorm2d(self.out_f)
        if module['activation'] == 'leaky':
            self.leaky = nn.LeakyReLU(0.1, inplace=True)

    def forward(self, x):
        """Apply the registered sub-modules to ``x`` in registration order."""
        out = x
        for stage in self.children():
            out = stage(out)
        return out

class ShortcutLayer(nn.Module):
    """Element-wise sum of two feature maps.

    ``pos`` is the index (into the list of earlier layer outputs) of the
    tensor the caller should pass as the second operand.
    """

    def __init__(self, pos):
        super().__init__()
        self.pos = pos

    def forward(self, x, y):
        """Return ``x + y``."""
        summed = x + y
        return summed

class RouteLayer(nn.Module):
    """Concatenate earlier feature maps along the channel dimension.

    ``pos`` is parsed from a comma-separated string into a list of integer
    indices identifying which earlier outputs the caller should gather.
    """

    def __init__(self, pos):
        super().__init__()
        self.pos = list(map(int, pos.split(',')))

    def forward(self, x):
        """Join the sequence of tensors ``x`` on dimension 1."""
        return torch.cat(x, dim=1)

class UpsampleLayer(nn.Module):
    """Thin wrapper around ``nn.Upsample`` with a fixed scale factor."""

    def __init__(self, scale):
        super().__init__()
        upsampler = nn.Upsample(scale_factor=scale)
        self.up = upsampler

    def forward(self, x):
        """Return ``x`` enlarged by the configured scale factor."""
        enlarged = self.up(x)
        return enlarged

class YOLOLayer(nn.Module):
    """Decode a raw detection feature map into boxes in input-image pixels.

    Batch size, number of boxes per cell, attribute count and device are all
    taken from the incoming tensor and ``anchors`` rather than from module
    globals, and the cell-offset grids are built by broadcasting so that
    non-square feature maps are decoded correctly.

    Args:
        anchors: Sequence of ``(width, height)`` anchor pairs, one per box
            predicted in each grid cell.
        net_width: Width in pixels of the network input. When ``None`` the
            module-level ``WIDTH`` constant is read at forward time, which is
            what the layer always did before this parameter existed.
    """

    def __init__(self, anchors=None, net_width=None):
        super().__init__()
        self.anchors = anchors
        self.net_width = net_width

    def get_boxes(self, inputs=None):
        """Turn activated predictions into absolute boxes.

        Args:
            inputs: Tensor shaped ``(batch, rows, cols, boxes, attrs)`` whose
                x/y entries are already sigmoid-activated. Modified in place.
                ``self.stride`` must have been set (``forward`` does this).

        Returns:
            ``inputs`` viewed as ``(batch, rows * cols * boxes, attrs)`` with
            the first four attributes being centre x, centre y, width and
            height scaled by ``self.stride``.
        """
        batch_size, rows, cols, _, num_attrs = inputs.shape
        device, dtype = inputs.device, inputs.dtype

        # Column index feeds x, row index feeds y; broadcasting replaces the
        # repeat/view construction that was only valid when rows == cols.
        grid_x = torch.arange(cols, device=device, dtype=dtype).view(1, 1, cols, 1)
        grid_y = torch.arange(rows, device=device, dtype=dtype).view(1, rows, 1, 1)
        anchor_w = torch.tensor([w / self.stride for w, _ in self.anchors],
                                device=device, dtype=dtype)
        anchor_h = torch.tensor([h / self.stride for _, h in self.anchors],
                                device=device, dtype=dtype)

        # bx, by, bw, bh in grid-cell units
        inputs[..., 0] += grid_x
        inputs[..., 1] += grid_y
        inputs[..., 2] = torch.exp(inputs[..., 2]) * anchor_w
        inputs[..., 3] = torch.exp(inputs[..., 3]) * anchor_h

        # Grid-cell units -> pixels of the network input
        inputs[..., :4] *= self.stride

        return inputs.view(batch_size, -1, num_attrs)

    def forward(self, inputs=None, targets=None):
        """Decode one detection feature map.

        Args:
            inputs: Tensor shaped ``(batch, boxes * attrs, rows, cols)`` where
                ``boxes == len(self.anchors)``.
            targets: Unused.

        Returns:
            Tensor shaped ``(batch, rows * cols * boxes, attrs)``.
        """
        batch_size = inputs.shape[0]
        num_box = len(self.anchors)
        # Names kept for compatibility: input_w is the size of dim -2 and
        # input_h the size of dim -1 of the incoming feature map.
        self.input_w = inputs.shape[-2]
        self.input_h = inputs.shape[-1]
        net_width = WIDTH if self.net_width is None else self.net_width
        # Horizontal pixels per grid cell: divide by the column count (dim -1).
        self.stride = net_width / inputs.shape[-1]

        # contiguous() yields a private copy, so the in-place edits below do
        # not touch the tensor handed in by the caller.
        inputs = (inputs.view(batch_size, num_box, -1, self.input_w, self.input_h)
                  .permute(0, 3, 4, 1, 2)
                  .contiguous())

        # Sigmoid on x, y, objectness and class scores
        inputs[..., :2] = torch.sigmoid(inputs[..., :2])
        inputs[..., 4:] = torch.sigmoid(inputs[..., 4:])

        return self.get_boxes(inputs=inputs)

    
class YOLO(nn.Module):
    """Network assembled from a list of parsed config sections.

    ``self.layers`` holds one module per config section after the first, in
    file order, so a layer's index equals its section index minus one.
    """

    def __init__(self, modules):
        """Build the layer list.

        Args:
            modules: Section dicts; element 0 is treated as the network-level
                section and is not turned into a layer.
        """
        super().__init__()
        self.layers = nn.ModuleList()
        self.create_modules(modules)

    def create_modules(self, modules):
        """Instantiate one layer per section and track output channel counts.

        Raises:
            ValueError: On a section type this class cannot build. Skipping
                it would shift every later layer index.
        """
        # Input channels come from the first section when it says so, else 3.
        in_channels = int(modules[0].get('channels', 3)) if modules else 3
        # out_fs[i] is the channel count produced by layer i. Keeping it
        # aligned with the layer index makes positive route indices resolve
        # to the same layer that forward() reads from its outputs list.
        out_fs = []
        for idx, module in enumerate(modules[1:]):
            prev_f = out_fs[-1] if out_fs else in_channels
            kind = module['type']
            if kind == 'convolutional':
                t = ConvLayer(idx, prev_f, module)
                f = t.out_f
            elif kind == 'shortcut':
                t = ShortcutLayer(int(module['from']))
                f = prev_f
            elif kind == 'route':
                t = RouteLayer(module['layers'])
                f = sum(out_fs[i] for i in t.pos)
            elif kind == 'upsample':
                t = UpsampleLayer(int(module['stride']))
                f = prev_f
            elif kind == 'yolo':
                mask = [int(i) for i in module['mask'].split(',')]
                flat = [int(value) for value in module['anchors'].split(',')]
                # Anchors are stored as a flat w,h,w,h,... list; mask picks pairs.
                anchors = [(flat[2 * i], flat[2 * i + 1]) for i in mask]
                t = YOLOLayer(anchors=anchors)
                f = prev_f
            else:
                raise ValueError(
                    'Unsupported layer type {!r} at index {}'.format(kind, idx))
            self.layers.append(t)
            out_fs.append(f)

    @staticmethod
    def _fill(target, weights, offset):
        """Copy the next ``target.numel()`` values of ``weights`` into ``target``.

        Returns the offset just past the consumed values.
        """
        count = target.numel()
        chunk = torch.from_numpy(weights[offset:offset + count])
        target.data.copy_(chunk.view_as(target.data))
        return offset + count

    def load_weights(self, weight_path=None):
        """Load convolution and batch-norm parameters from a binary file.

        The file is read as five int32 header values followed by float32
        values. For every ``ConvLayer``, in order: batch-norm bias, weight,
        running mean and running variance when the layer has a ``norm``
        sub-module, otherwise the convolution bias; then the convolution
        weights.

        Args:
            weight_path: Path of the weight file.
        """
        with open(weight_path, 'rb') as f:
            np.fromfile(f, dtype=np.int32, count=5)  # header, skipped
            weights = np.fromfile(f, dtype=np.float32)

        idx_w = 0
        for idx, layer in enumerate(self.layers):
            if not isinstance(layer, ConvLayer):
                continue

            # Ask the layer itself whether it has batch norm instead of
            # consulting a global config list.
            if hasattr(layer, 'norm'):
                for name in ('bias', 'weight', 'running_mean', 'running_var'):
                    idx_w = self._fill(getattr(layer.norm, name), weights, idx_w)
            else:
                idx_w = self._fill(layer.conv.bias, weights, idx_w)

            idx_w = self._fill(layer.conv.weight, weights, idx_w)

            print('Loaded to Conv #{}, weight index is {}'.format(idx, idx_w))

    def forward(self, x):
        """Run the network.

        Args:
            x: Input batch for the first layer.

        Returns:
            The outputs of every ``YOLOLayer`` concatenated on dimension 1.
        """
        outputs = []
        yolo_outputs = []

        # Iterate this instance's layers, not a module-level model object.
        for layer in self.layers:
            if isinstance(layer, (ConvLayer, UpsampleLayer)):
                x = layer(x)
            elif isinstance(layer, ShortcutLayer):
                x = layer(x, outputs[layer.pos])
            elif isinstance(layer, RouteLayer):
                x = layer([outputs[i] for i in layer.pos])
            elif isinstance(layer, YOLOLayer):
                yolo_outputs.append(layer(inputs=x))
                # A detection layer passes its input through unchanged so
                # relative indices of later layers stay valid.
                x = outputs[-1]

            outputs.append(x)

        return torch.cat(yolo_outputs, 1)

## Create model & load weights

In [4]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Build the network from the parsed config sections and move it to DEVICE.
model = YOLO(modules)
model.to(DEVICE)
# Fill the convolution / batch-norm parameters from the binary weight file.
model.load_weights('./weights/yolov3.weights')

Loaded to Conv #0, weight index is 992
Loaded to Conv #1, weight index is 19680
Loaded to Conv #2, weight index is 21856
Loaded to Conv #3, weight index is 40544
Loaded to Conv #5, weight index is 114784
Loaded to Conv #6, weight index is 123232
Loaded to Conv #7, weight index is 197472
Loaded to Conv #9, weight index is 205920
Loaded to Conv #10, weight index is 280160
Loaded to Conv #12, weight index is 576096
Loaded to Conv #13, weight index is 609376
Loaded to Conv #14, weight index is 905312
Loaded to Conv #16, weight index is 938592
Loaded to Conv #17, weight index is 1234528
Loaded to Conv #19, weight index is 1267808
Loaded to Conv #20, weight index is 1563744
Loaded to Conv #22, weight index is 1597024
Loaded to Conv #23, weight index is 1892960
Loaded to Conv #25, weight index is 1926240
Loaded to Conv #26, weight index is 2222176
Loaded to Conv #28, weight index is 2255456
Loaded to Conv #29, weight index is 2551392
Loaded to Conv #31, weight index is 2584672
Loaded to Conv 

# Test model

## Processing input

In [70]:
# Minimum objectness score for a predicted box to be kept.
THRESH = 0.3
# Number of images per forward pass.
BATCH_SIZE = 1
# Network input size in pixels.
WIDTH = 416
HEIGHT = 416
# Boxes predicted per grid cell.
NUM_BOX = 3

# Class names, indexed by predicted class id.
LABELS = [
    "person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck",
    "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
    "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
    "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
    "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
    "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana",
    "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
    "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
    "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]


def letterbox_image(img, inp_dim):
    """Resize ``img`` to fit ``inp_dim`` keeping its aspect ratio, then pad.

    Args:
        img: Image array indexed as (row, column, channel).
        inp_dim: Target ``(width, height)``.

    Returns:
        Array of shape ``(height, width, 3)``: the resized image centred on a
        canvas filled with 128, with every value divided by 255.0.
    """
    src_h, src_w = img.shape[0], img.shape[1]
    w, h = inp_dim

    # One scale for both axes so the picture is not distorted.
    ratio = min(w / src_w, h / src_h)
    new_w = int(src_w * ratio)
    new_h = int(src_h * ratio)
    resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC)

    canvas = np.full((h, w, 3), 128)
    top = (h - new_h) // 2
    left = (w - new_w) // 2
    canvas[top:top + new_h, left:left + new_w, :] = resized_image

    return canvas / 255.0


img = cv2.imread('test_person.jpg')
if img is None:
    # cv2.imread reports an unreadable or missing file by returning None
    # rather than raising, so fail here with a clear message.
    raise FileNotFoundError("Could not read image 'test_person.jpg'")
'''
img_resized = cv2.resize(img[:,:,::-1], (WIDTH, HEIGHT))
img_tensor = torch.tensor(img_resized, dtype=torch.float32).unsqueeze(0).permute(0,3,1,2).contiguous()
img_tensor /= 255
'''
'''
img = cv2.resize(img, None, fx=0.4, fy=0.4)
blob = cv2.dnn.blobFromImage(img, scalefactor=0.00392, size=(WIDTH, HEIGHT), mean=(0, 0, 0), swapRB=True, crop=False)
img_tensor = torch.tensor(blob)
'''

# Letterbox to the network input size, reverse the channel order, and move
# channels first before adding the batch dimension.
img_ = letterbox_image(img, (WIDTH, HEIGHT))
img_ = img_[:,:,::-1].transpose((2,0,1)).copy()
img_tensor = torch.from_numpy(img_).float().unsqueeze(0).to(DEVICE)

with torch.no_grad():
    result = model(img_tensor)
    # Keep only the rows whose objectness (column 4) reaches THRESH; the
    # boolean mask also collapses the batch dimension.
    result = result[result[...,4] >= THRESH]

    result = result.detach().cpu()
    print(result.shape)
    


torch.Size([48, 85])


## Decode outputs

In [77]:
def xywh2wywy(inputs=None):
    """Convert centre-format boxes to corner format.

    Args:
        inputs: 2-D tensor whose first four columns are centre x, centre y,
            width and height.

    Returns:
        Tensor with columns ``x_min, y_min, x_max, y_max``, one row per box.
    """
    cx, cy = inputs[:, 0], inputs[:, 1]
    half_w = inputs[:, 2] / 2
    half_h = inputs[:, 3] / 2
    corners = (cx - half_w, cy - half_h, cx + half_w, cy + half_h)
    return torch.stack(corners, dim=1)


def iou(b1=None, b2=None):
    """Intersection over union of two corner-format boxes.

    Each box unpacks to ``(x1, y1, x2, y2)``. Side lengths are computed as
    ``max - min + 1``, i.e. both end coordinates count as covered.

    Returns:
        Intersection area divided by union area.
    """
    ax1, ay1, ax2, ay2 = b1
    bx1, by1, bx2, by2 = b2

    # Overlap extent per axis, clamped at zero for disjoint boxes.
    overlap_w = max(min(ax2, bx2) - max(ax1, bx1) + 1, 0)
    overlap_h = max(min(ay2, by2) - max(ay1, by1) + 1, 0)
    inter_area = overlap_w * overlap_h

    area_a = (ax2 - ax1 + 1) * (ay2 - ay1 + 1)
    area_b = (bx2 - bx1 + 1) * (by2 - by1 + 1)

    return inter_area / (area_a + area_b - inter_area)


def nms(inputs=None, NMS_TRESH=None):
    """Suppress same-label boxes that overlap an earlier box too much.

    Rows are visited in their given order. Column 4 is the box score and
    column 5 the label. A later row with the same label whose IoU with the
    current row exceeds ``NMS_TRESH`` gets its score set to 0 in ``inputs``
    itself; rows already zeroed neither suppress nor get re-tested.

    Returns:
        The rows of ``inputs`` whose score is still above 0.
    """
    total = inputs.shape[0]
    for i in range(total - 1):
        keeper = inputs[i]
        if keeper[4] == 0:
            continue
        for j in range(i + 1, total):
            other = inputs[j]
            # Only live boxes carrying the same label compete.
            if not (keeper[5] == other[5] and other[4] > 0):
                continue
            if iou(keeper[:4], other[:4]) > NMS_TRESH:
                inputs[j, 4] = 0

    survivors = inputs[:, 4] > 0
    return inputs[survivors]


def draw_boxes(img, inputs):
    """Draw labelled rectangles for each detection onto ``img``.

    Box corners (columns 0-3 of each row, in network-input pixels) are mapped
    back to the image by removing the padding offset and dividing by the
    resize scale derived from ``WIDTH`` / ``HEIGHT``. Column 5 is the label
    index into ``LABELS``. Colours are drawn at random on every call.

    Returns:
        The same ``img`` object, drawn on in place.
    """
    img_h, img_w = img.shape[0], img.shape[1]
    scale = min(HEIGHT / img_h, WIDTH / img_w)
    offset_x = (WIDTH - img_w * scale) / 2
    offset_y = (HEIGHT - img_h * scale) / 2

    colors = np.random.uniform(0, 255, size=(80, 3))

    def to_source(value, offset):
        # Network-input coordinate -> integer pixel in the original image.
        return int((value - offset) / scale)

    for b in inputs:
        x_min = to_source(b[0], offset_x)
        x_max = to_source(b[2], offset_x)
        y_min = to_source(b[1], offset_y)
        y_max = to_source(b[3], offset_y)

        label = int(b[5])
        color = colors[label]

        cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, 1)
        # Caption sits 5 px above the top-left corner.
        cv2.putText(img, LABELS[label], (x_min, y_min - 5),
                    cv2.FONT_HERSHEY_PLAIN, 1, color, 1)

    return img


def process_output(result=None, nms_thresh=None):
    """Reduce raw detections to final boxes and draw them on the image.

    Args:
        result: 2-D tensor, one row per detection: centre x, centre y, width,
            height, objectness, then one score per class.
        nms_thresh: IoU above which a lower-ranked box of the same label is
            suppressed. When ``None`` the module-level ``NMS_THRESH`` is used
            if it exists, otherwise 0.4.
            NOTE(review): 0.4 is an assumed default because this file never
            defines NMS_THRESH - confirm the intended value.

    Returns:
        ``(image, boxes)`` where ``image`` is the module-level ``img`` with
        the boxes drawn on it in place, and ``boxes`` has columns
        ``x_min, y_min, x_max, y_max, objectness, label, label score``.
    """
    if nms_thresh is None:
        # Avoid a NameError when the global constant has not been defined.
        nms_thresh = globals().get('NMS_THRESH', 0.4)

    # Select the best label and its score for every bounding box
    max_pos = torch.argmax(result[:, 5:], 1)
    max_score = result[torch.arange(result.size(0)), max_pos + 5]
    result = torch.cat((result[:, :5],
                        max_pos.float().unsqueeze(1),
                        max_score.unsqueeze(1)), 1)

    # Convert xywh to rounded corner coordinates, then sort by objectness
    # so that NMS keeps the most confident box of each overlapping group.
    result[:, :4] = xywh2wywy(result[:, :4]).round()
    result = result[torch.argsort(result[:, 4], descending=True)]

    # Non-maximum suppression
    result = nms(inputs=result, NMS_TRESH=nms_thresh)

    # Draw boxes
    _img = draw_boxes(img, result)

    return (_img, result)

In [78]:
# Post-process the thresholded detections and get the annotated image back.
_img, res = process_output(result)
# Show the image in a window named "test" and block until a key is pressed.
cv2.imshow("test", _img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [94]:
# Columns 4 onward of the kept boxes: objectness, label index, label score.
res[:,4:]

tensor([[ 0.9977,  0.0000,  0.9993],
        [ 0.9906,  0.0000,  0.9996],
        [ 0.9862,  0.0000,  0.9999],
        [ 0.9823,  0.0000,  0.9999],
        [ 0.9810,  0.0000,  0.9998],
        [ 0.8458, 26.0000,  0.9904],
        [ 0.7545,  0.0000,  0.9997],
        [ 0.7540, 13.0000,  0.9962],
        [ 0.5296,  1.0000,  0.9938],
        [ 0.4900, 73.0000,  0.9986],
        [ 0.4552,  0.0000,  0.9998],
        [ 0.4308, 73.0000,  0.9991],
        [ 0.4054, 13.0000,  0.9146],
        [ 0.3979, 73.0000,  0.9992],
        [ 0.3902, 73.0000,  0.9981],
        [ 0.3865, 73.0000,  0.9983],
        [ 0.3590, 73.0000,  0.9993],
        [ 0.3546, 73.0000,  0.9983],
        [ 0.3520, 73.0000,  0.9959],
        [ 0.3405, 73.0000,  0.9986],
        [ 0.3302, 73.0000,  0.9969],
        [ 0.3288, 73.0000,  0.9986],
        [ 0.3257, 73.0000,  0.9980],
        [ 0.3072, 73.0000,  0.9982],
        [ 0.3042, 73.0000,  0.9975]])

In [97]:
# Overlap between kept boxes 7 and 12 (corner coordinates are in columns 0-3).
iou(res[7,:4], res[12,:4])

tensor(0.3606)