In [1]:
from ast import literal_eval

import cv2
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.layers import (
    Add,
    Input,
    Conv2D,
    Lambda,
    MaxPool2D,
    LeakyReLU,
    Concatenate,
    UpSampling2D,
    ZeroPadding2D,
    BatchNormalization)
from tensorflow.keras import Model
from tensorflow.keras.regularizers import l2

In [2]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving all of
# it up front. The setting is applied to every visible GPU because TensorFlow
# requires memory growth to be configured identically on all of them.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Memory growth can only be set before the GPUs are initialised;
        # keep going (best effort) but make the failure visible.
        print(f'Could not enable GPU memory growth on {gpu}: {err}')

In [3]:
# --- YOLOv3 configuration -------------------------------------------------
YOLO_INPUT_SIZE = 416
YOLO_STRIDES = [8, 16, 32]
YOLO_ANCHOR_PER_SCALE = 3
YOLO_MAX_BBOX_PER_SCALE = 100
YOLO_IOU_LOSS_THRESH = 0.5

# Anchor (width, height) pairs in pixels, one group of three per stride,
# listed in the same order as YOLO_STRIDES.
YOLO_ANCHORS = [
    [[10, 13], [16, 30], [33, 23]],
    [[30, 61], [62, 45], [59, 119]],
    [[116, 90], [156, 198], [373, 326]],
]

STRIDES = np.array(YOLO_STRIDES)
# Divide every anchor group by its own stride, i.e. express the anchors in
# grid-cell units of the matching scale. Resulting shape: (3, 3, 2).
ANCHORS = np.array(YOLO_ANCHORS) / STRIDES[:, np.newaxis, np.newaxis]

In [4]:
# Load the annotation table. Bracket access is used for the columns so that
# 'name' can never be mistaken for an attribute of the DataFrame object.
df = pd.read_csv('data/data.csv')
# The region column is stored as text; parse each cell into a Python object
# (load_annotations later reads its 'x', 'y', 'width' and 'height' keys).
df['region_shape_attributes'] = df['region_shape_attributes'].apply(literal_eval)
# Replace class names by integer ids; `le` keeps the id -> name mapping.
le = LabelEncoder()
df['name'] = le.fit_transform(df['name'])

In [5]:
# Run-level settings derived from the configuration and the loaded labels.
INPUT_SIZE = YOLO_INPUT_SIZE   # single source of truth for the network input size
CLASSES = len(le.classes_)     # number of distinct class names seen by the encoder
BATCH_SIZE = 1

### Backbone of YoloV3

In [6]:
class BatchNormalization(tf.keras.layers.BatchNormalization):
    """Batch normalisation that runs in inference mode whenever it is frozen.

    The effective ``training`` flag passed to the Keras implementation is
    ``training AND self.trainable``, so a layer with ``trainable = False``
    is always called with ``training=False``.

    The base class is referenced through ``tf.keras.layers`` rather than the
    imported name ``BatchNormalization`` so that re-running this cell does not
    make the class inherit from its own previous definition.
    """

    def call(self, x, training=False):
        """Apply batch normalisation to ``x``.

        Args:
            x: input tensor.
            training: requested mode; any falsy value (``False``/``None``)
                is treated as inference.

        Returns:
            The result of the parent ``call`` with the combined flag.
        """
        if not training:
            training = tf.constant(False)
        training = tf.logical_and(training, self.trainable)
        return super().call(x, training)


def DarknetConv(x, filters, size, downsample=False, activate=True, bn=True):
    """Apply one Darknet convolution unit to ``x``.

    The unit is a 2-D convolution optionally followed by batch normalisation
    and a LeakyReLU (slope 0.1).

    Args:
        x: input tensor.
        filters: number of convolution filters.
        size: convolution kernel size.
        downsample: when True the input is zero-padded on the top and left
            by one pixel and convolved with stride 2 and 'valid' padding;
            otherwise stride 1 with 'same' padding is used.
        activate: append the LeakyReLU activation.
        bn: append batch normalisation; the convolution has a bias term
            only when this is False.

    Returns:
        The output tensor of the unit.
    """
    strides, padding = (2, 'valid') if downsample else (1, 'same')
    if downsample:
        x = ZeroPadding2D(((1, 0), (1, 0)))(x)

    conv_layer = Conv2D(
        filters=filters,
        kernel_size=size,
        strides=strides,
        padding=padding,
        use_bias=not bn,
        kernel_regularizer=l2(0.0005),
        kernel_initializer=tf.random_normal_initializer(stddev=0.01),
        bias_initializer=tf.constant_initializer(0.))
    x = conv_layer(x)

    if bn:
        x = BatchNormalization()(x)
    if activate:
        x = LeakyReLU(alpha=0.1)(x)
    return x


def DarknetResidual(x, filters):
    """Residual unit: a 1x1 bottleneck and a 3x3 convolution added to the input.

    Args:
        x: input tensor; it is added element-wise to the branch output.
        filters: filters of the 3x3 convolution (the 1x1 uses ``filters // 2``).

    Returns:
        ``x`` plus the output of the two-convolution branch.
    """
    branch = DarknetConv(x, filters=filters // 2, size=1)
    branch = DarknetConv(branch, filters=filters, size=3)
    return Add()([x, branch])


def DarknetBlock(x, filters, blocks):
    """One Darknet stage: a stride-2 convolution followed by residual units.

    Args:
        x: input tensor.
        filters: number of filters used throughout the stage.
        blocks: how many ``DarknetResidual`` units follow the down-sampling
            convolution.

    Returns:
        The stage output tensor.
    """
    stage = DarknetConv(x, filters=filters, size=3, downsample=True)
    for _unit in range(blocks):
        stage = DarknetResidual(stage, filters)
    return stage


def Darknet(name=None):
    """Build the Darknet backbone as a Keras ``Model``.

    The model takes an RGB image of arbitrary spatial size and returns three
    feature maps, in this order: the output of the 256-filter stage, the
    output of the 512-filter stage and the output of the final 1024-filter
    stage. Each stage halves the resolution, so the three maps are down-sampled
    by 8, 16 and 32 relative to the input.

    Args:
        name: optional name for the returned model.

    Returns:
        ``Model(inputs, (x_36, x_61, x))``.
    """
    inputs = Input([None, None, 3])
    features = DarknetConv(inputs, filters=32, size=3)
    # NOTE(review): the reference Darknet-53 uses a single residual unit in
    # the 64-filter stage; confirm that 2 is intentional here.
    features = DarknetBlock(features, 64, 2)
    features = DarknetBlock(features, 128, 2)
    x_36 = DarknetBlock(features, 256, 8)
    x_61 = DarknetBlock(x_36, 512, 8)
    deepest = DarknetBlock(x_61, 1024, 4)

    return Model(inputs, (x_36, x_61, deepest), name=name)


### YoloV3 model

In [7]:
def YoloConv(filters, name=None):
    """Return a callable that applies the YOLO neck block as a sub-model.

    The returned function accepts either a single tensor or a tuple
    ``(x, x_skip)``. For a tuple, ``x`` is first reduced with a 1x1
    convolution, up-sampled by 2 and concatenated with ``x_skip``. In both
    cases five alternating 1x1 / 3x3 convolutions follow.

    Args:
        filters: filters of the 1x1 convolutions (the 3x3 ones use twice as many).
        name: optional name of the wrapped Keras ``Model``.
    """
    def yolo_conv(x_in):
        if not isinstance(x_in, tuple):
            x = inputs = Input(x_in.shape[1:])
        else:
            deep_in = Input(x_in[0].shape[1:])
            skip_in = Input(x_in[1].shape[1:])
            inputs = (deep_in, skip_in)

            # Merge the up-sampled deep features with the skip connection.
            x = DarknetConv(deep_in, filters, 1)
            x = UpSampling2D(2)(x)
            x = Concatenate()([x, skip_in])

        conv_plan = ((filters, 1), (filters * 2, 3),
                     (filters, 1), (filters * 2, 3),
                     (filters, 1))
        for width, kernel in conv_plan:
            x = DarknetConv(x, width, kernel)

        return Model(inputs, x, name=name)(x_in)
    return yolo_conv


def YoloOutput(classes=80, masks=None, strides=None):
    """Return a callable that decodes a raw head tensor into predictions.

    Args:
        classes: number of object classes.
        masks: the three anchor (width, height) pairs of this scale; they are
            multiplied by ``strides`` when the box size is decoded.
        strides: down-sampling factor of this scale.

    The returned function maps a tensor whose last axis holds
    ``3 * (5 + classes)`` channels to a tensor of shape
    ``(batch, size, size, 3, 5 + classes)`` laid out as
    ``[x, y, w, h, confidence, class probabilities...]``.
    """
    def yolo_output(x):
        feat_shape = tf.shape(x)
        batch_size = feat_shape[0]
        output_size = feat_shape[1]
        raw = tf.reshape(x, (-1, output_size, output_size, 3, 5 + classes))

        raw_dxdy = raw[..., 0:2]
        raw_dwdh = raw[..., 2:4]
        raw_conf = raw[..., 4:5]
        raw_prob = raw[..., 5:]

        # Grid of cell indices: channel 0 is the column (x), channel 1 the row (y).
        cells = tf.range(output_size, dtype=tf.int32)
        row_idx = tf.tile(cells[:, tf.newaxis], [1, output_size])
        col_idx = tf.tile(cells[tf.newaxis, :], [output_size, 1])
        xy_grid = tf.concat([col_idx[:, :, tf.newaxis],
                             row_idx[:, :, tf.newaxis]], axis=-1)
        xy_grid = tf.tile(xy_grid[tf.newaxis, :, :, tf.newaxis, :],
                          [batch_size, 1, 1, 3, 1])
        xy_grid = tf.cast(xy_grid, tf.float32)

        # Centre: sigmoid offset inside the cell plus the cell index, times stride.
        pred_xy = (tf.sigmoid(raw_dxdy) + xy_grid) * strides
        # Size: exponential scaling of the anchor, times stride.
        pred_wh = tf.exp(raw_dwdh) * masks * strides

        decoded = [
            tf.concat([pred_xy, pred_wh], axis=-1),
            tf.sigmoid(raw_conf),
            tf.sigmoid(raw_prob),
        ]
        return tf.concat(decoded, axis=-1)
    return yolo_output


def YoloV3(size=None, classes=80, training=False):
    """Assemble the full YOLOv3 network.

    Args:
        size: side length of the square input image (``None`` for dynamic).
        classes: number of object classes.
        training: when True the raw head tensor of every scale is emitted
            immediately before its decoded counterpart.

    Returns:
        A Keras ``Model`` whose outputs are ordered small, medium, large
        scale. With ``training=False`` there are three decoded tensors; with
        ``training=True`` there are six, alternating raw and decoded.
    """
    inputs = Input([size, size, 3], name='input')
    x_36, x_61, x = Darknet(name='yolo_darknet')(inputs)

    def detection_head(features):
        # Linear 1x1 convolution producing 3 anchors * (box + conf + classes).
        return DarknetConv(features, filters=3 * (classes + 5),
                           size=1, activate=False, bn=False)

    x = YoloConv(512, name='yolo_conv_0')(x)
    large_raw = detection_head(x)

    x = YoloConv(256, name='yolo_conv_1')((x, x_61))
    medium_raw = detection_head(x)

    x = YoloConv(128, name='yolo_conv_2')((x, x_36))
    small_raw = detection_head(x)

    outputs = []
    raw_heads = (small_raw, medium_raw, large_raw)
    for raw, scale_anchors, scale_stride in zip(raw_heads, ANCHORS, STRIDES):
        decoded = YoloOutput(classes, masks=scale_anchors,
                             strides=scale_stride)(raw)
        if training:
            outputs.append(raw)
        outputs.append(decoded)

    return Model(inputs, outputs)

### Utility functions

In [8]:
def image_preprocess(image, target_size, gt_boxes=None):
    """Letterbox ``image`` into a square canvas and rescale it to [0, 1].

    The image is resized with its aspect ratio preserved so that it fits a
    ``target_size`` x ``target_size`` canvas, centred on a canvas filled with
    128 and finally divided by 255.

    Args:
        image: array of shape (height, width, 3).
        target_size: side length of the square output.
        gt_boxes: optional array whose columns 0-3 are
            ``xmin, ymin, xmax, ymax``. It is updated IN PLACE (so an integer
            array keeps its dtype and the new coordinates are truncated).

    Returns:
        The padded image, or ``(padded image, gt_boxes)`` when boxes are given.
    """
    src_h, src_w, _ = image.shape
    dst_h = dst_w = target_size

    scale = min(dst_w / src_w, dst_h / src_h)
    new_w = int(scale * src_w)
    new_h = int(scale * src_h)
    resized = cv2.resize(image, (new_w, new_h))

    # Offsets that centre the resized image on the canvas.
    pad_x = (dst_w - new_w) // 2
    pad_y = (dst_h - new_h) // 2

    canvas = np.full(shape=[dst_h, dst_w, 3], fill_value=128.0)
    canvas[pad_y:pad_y + new_h, pad_x:pad_x + new_w, :] = resized
    canvas = canvas / 255.

    if gt_boxes is None:
        return canvas

    # Apply the same scale and shift to the box corners.
    gt_boxes[:, [0, 2]] = gt_boxes[:, [0, 2]] * scale + pad_x
    gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] * scale + pad_y
    return canvas, gt_boxes

In [20]:
def bbox_iou(bboxes1, bboxes2):
    """Intersection-over-union of boxes given as (cx, cy, w, h).

    Args:
        bboxes1: array/tensor whose last axis starts with ``cx, cy, w, h``.
        bboxes2: same layout; leading axes must broadcast against ``bboxes1``.

    Returns:
        IoU with the last axis removed. Pairs whose union area is zero
        yield 0 instead of NaN.
    """
    area1 = bboxes1[..., 2] * bboxes1[..., 3]
    area2 = bboxes2[..., 2] * bboxes2[..., 3]

    # Convert centre/size to corner coordinates.
    mins1 = bboxes1[..., 0:2] - bboxes1[..., 2:4] * 0.5
    maxs1 = bboxes1[..., 0:2] + bboxes1[..., 2:4] * 0.5
    mins2 = bboxes2[..., 0:2] - bboxes2[..., 2:4] * 0.5
    maxs2 = bboxes2[..., 0:2] + bboxes2[..., 2:4] * 0.5

    # Clamp width and height separately: clamping only their product would
    # turn two negative extents (boxes disjoint on both axes) into a
    # positive "intersection" area.
    inter_wh = tf.maximum(
        tf.minimum(maxs1, maxs2) - tf.maximum(mins1, mins2), 0.0)
    inter_area = inter_wh[..., 0] * inter_wh[..., 1]

    union_area = area1 + area2 - inter_area
    return tf.math.divide_no_nan(inter_area, union_area)


def bbox_giou(bboxes1, bboxes2):
    """Generalised IoU of boxes given as (cx, cy, w, h).

    GIoU is ``IoU - (C - U) / C`` where ``U`` is the union area and ``C`` the
    area of the smallest axis-aligned box enclosing both inputs.

    Args:
        bboxes1: array/tensor whose last axis starts with ``cx, cy, w, h``.
        bboxes2: same layout; leading axes must broadcast against ``bboxes1``.

    Returns:
        GIoU with the last axis removed. Divisions by a zero area yield 0
        instead of NaN.
    """
    area1 = bboxes1[..., 2] * bboxes1[..., 3]
    area2 = bboxes2[..., 2] * bboxes2[..., 3]

    # Convert centre/size to corner coordinates.
    mins1 = bboxes1[..., 0:2] - bboxes1[..., 2:4] * 0.5
    maxs1 = bboxes1[..., 0:2] + bboxes1[..., 2:4] * 0.5
    mins2 = bboxes2[..., 0:2] - bboxes2[..., 2:4] * 0.5
    maxs2 = bboxes2[..., 0:2] + bboxes2[..., 2:4] * 0.5

    # Clamp width and height separately: clamping only their product would
    # turn two negative extents (boxes disjoint on both axes) into a
    # positive "intersection" area.
    inter_wh = tf.maximum(
        tf.minimum(maxs1, maxs2) - tf.maximum(mins1, mins2), 0.0)
    inter_area = inter_wh[..., 0] * inter_wh[..., 1]

    union_area = area1 + area2 - inter_area
    iou = tf.math.divide_no_nan(inter_area, union_area)

    # Smallest box enclosing both inputs.
    enclose_wh = tf.maximum(
        tf.maximum(maxs1, maxs2) - tf.minimum(mins1, mins2), 0.0)
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]

    return iou - tf.math.divide_no_nan(enclose_area - union_area, enclose_area)

### Dataset

In [21]:
class Dataset(object):
    """Iterator yielding batches of letterboxed images and YOLOv3 targets.

    Iterating produces ``(batch_images, [small, medium, large])`` tuples until
    ``num_batchs`` batches have been served; the next call then resets the
    counter, shuffles the annotations and raises ``StopIteration`` so the
    object can be iterated again for another epoch.
    """

    def __init__(self, df, num_classes, batch_size,
                 input_size=416, loadimg=False, augment=False):
        """Collect the annotations and derive the batch bookkeeping.

        Args:
            df: DataFrame with the columns ``img_path``,
                ``region_shape_attributes`` (mapping with ``x``, ``y``,
                ``width``, ``height``) and ``name`` (integer class id).
            num_classes: number of object classes.
            batch_size: images per batch.
            input_size: side length of the square network input.
            loadimg: stored and forwarded to ``load_annotations``; it has no
                effect in the code of this class.
            augment: stored as ``data_aug``; not used in the code of this class.
        """
        self.num_classes = num_classes
        self.batch_size = batch_size
        self.input_size = input_size
        self.loadimg = loadimg
        self.data_aug = augment

        self.strides = STRIDES
        self.anchors = ANCHORS
        self.anchor_per_scale = YOLO_ANCHOR_PER_SCALE
        self.max_bbox_per_scale = YOLO_MAX_BBOX_PER_SCALE
        # Grid size of every detection scale, e.g. 416 // [8, 16, 32].
        self.output_size = self.input_size // STRIDES

        self.annotations = self.load_annotations(df, self.loadimg)
        self.num_samples = len(self.annotations)
        self.num_batchs = int(np.ceil(self.num_samples / self.batch_size))
        self.batch_count = 0

    def load_annotations(self, df, loadimg=False):
        """Group the annotation rows by image.

        Returns:
            A list of ``[img_path, boxes]`` entries in order of first
            appearance of each path, where ``boxes`` is a list of
            ``[xmin, ymin, xmax, ymax, class_id]``.
        """
        annotations = []
        # One pass over the frame; sort=False keeps first-appearance order.
        for img_path, rows in df.groupby('img_path', sort=False):
            boxes = []
            for rsa, class_ in zip(rows['region_shape_attributes'],
                                   rows['name']):
                xmin, ymin = rsa['x'], rsa['y']
                boxes.append([xmin, ymin,
                              xmin + rsa['width'], ymin + rsa['height'],
                              class_])
            annotations.append([img_path, boxes])
        return annotations

    def preprocess(self, annotaion):
        """Load one image and encode its boxes as per-scale label grids.

        Args:
            annotaion: ``[img_path, boxes]`` as produced by ``load_annotations``.

        Returns:
            ``(image, label_sbbox, label_mbbox, label_lbbox)``. Each label has
            shape ``(grid, grid, anchor_per_scale, 5 + num_classes)`` with the
            layout ``[cx, cy, w, h, objectness, smoothed one-hot class]``; the
            box is stored in input-image pixels.

        Raises:
            FileNotFoundError: if the image cannot be read.
        """
        image_path, bboxes = annotaion
        raw_image = cv2.imread(image_path)
        if raw_image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {image_path}')

        image, bboxes = image_preprocess(raw_image,
                                         target_size=self.input_size,
                                         gt_boxes=np.array(bboxes))

        num_scales = len(self.strides)
        label = [np.zeros((self.output_size[i],
                           self.output_size[i],
                           self.anchor_per_scale,
                           5 + self.num_classes)) for i in range(num_scales)]

        # Label smoothing: mix the one-hot vector with a uniform distribution.
        deta = 0.01
        uniform_distribution = np.full(self.num_classes,
                                       1.0 / self.num_classes)

        for bbox in bboxes:
            bbox_coor = bbox[:4]
            bbox_class_ind = int(bbox[4])

            onehot = np.zeros(self.num_classes, dtype=np.float64)
            onehot[bbox_class_ind] = 1.0
            smooth_onehot = onehot * (1 - deta) + deta * uniform_distribution

            # Corner format -> (cx, cy, w, h), then one copy per stride in
            # grid-cell units.
            bbox_xywh = np.concatenate(
                [(bbox_coor[2:] + bbox_coor[:2]) * 0.5,
                 bbox_coor[2:] - bbox_coor[:2]],
                axis=-1)
            bbox_xywh_scaled = (1.0 * bbox_xywh[np.newaxis, :]
                                / self.strides[:, np.newaxis])

            iou = []
            exist_positive = False
            for i in range(num_scales):
                # Anchors centred on the grid cell that contains the box centre.
                anchors_xywh = np.zeros((self.anchor_per_scale, 4))
                anchors_xywh[:, 0:2] = np.floor(
                    bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5
                anchors_xywh[:, 2:4] = self.anchors[i]

                # bbox_iou may return a tensor; coerce to NumPy so it can be
                # used as a boolean index below.
                iou_scale = np.asarray(
                    bbox_iou(bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh))
                iou.append(iou_scale)
                iou_mask = iou_scale > 0.3

                if np.any(iou_mask):
                    xind, yind = np.floor(
                        bbox_xywh_scaled[i, 0:2]).astype(np.int32)

                    label[i][yind, xind, iou_mask, :] = 0
                    label[i][yind, xind, iou_mask, 0:4] = bbox_xywh
                    label[i][yind, xind, iou_mask, 4:5] = 1.0
                    label[i][yind, xind, iou_mask, 5:] = smooth_onehot

                    exist_positive = True

            if not exist_positive:
                # No anchor passed the threshold: assign the single best one.
                best_anchor_ind = int(
                    np.argmax(np.array(iou).reshape(-1), axis=-1))
                best_detect, best_anchor = divmod(best_anchor_ind,
                                                  self.anchor_per_scale)
                xind, yind = np.floor(
                    bbox_xywh_scaled[best_detect, 0:2]).astype(np.int32)

                label[best_detect][yind, xind, best_anchor, :] = 0
                label[best_detect][yind, xind, best_anchor, 0:4] = bbox_xywh
                label[best_detect][yind, xind, best_anchor, 4:5] = 1.0
                label[best_detect][yind, xind, best_anchor, 5:] = smooth_onehot

        label_sbbox, label_mbbox, label_lbbox = label
        return image, label_sbbox, label_mbbox, label_lbbox

    def __repr__(self):
        return (f'Dataset Generator with {self.num_samples} datapoints '
                f'in {self.num_batchs} batches')

    def __iter__(self):
        return self

    def __len__(self):
        # Number of images (annotation entries), not number of batches.
        return self.num_samples

    def __next__(self):
        """Return the next batch, or end the epoch with ``StopIteration``."""
        if self.batch_count >= self.num_batchs:
            # Epoch finished: rewind and reshuffle for the next pass.
            self.batch_count = 0
            np.random.shuffle(self.annotations)
            raise StopIteration

        batch_images = np.zeros((self.batch_size, self.input_size,
                                 self.input_size, 3), dtype=np.float32)
        batch_slabels, batch_mlabels, batch_llabels = (
            np.zeros((self.batch_size, self.output_size[scale],
                      self.output_size[scale], self.anchor_per_scale,
                      5 + self.num_classes), dtype=np.float32)
            for scale in range(3))

        for i in range(self.batch_size):
            # Wrap around so the last (partial) batch is filled with samples
            # from the start of the list instead of indexing past its end.
            idx = (self.batch_count * self.batch_size + i) % self.num_samples
            annotation = self.annotations[idx]

            image, slabel, mlabel, llabel = self.preprocess(annotation)
            batch_images[i, ...] = image
            batch_slabels[i, ...] = slabel
            batch_mlabels[i, ...] = mlabel
            batch_llabels[i, ...] = llabel

        self.batch_count += 1
        return batch_images, [batch_slabels, batch_mlabels, batch_llabels]
