# Training Yolo V3

Here is an example of how to train a YOLOv3 model.

# Load Raw Dataset

In [None]:
import cv2
import numpy as np


# A sample of Dataset Generator
class DatasetGen(object):
    """Single-pass iterator that yields ``(image, boxes)`` training samples.

    Every sample is the same image read from ``img_path`` and resized to
    ``img_size`` x ``img_size``, paired with a fixed array of three example
    bounding boxes laid out as ``[x1, y1, x2, y2, class]``.

    Args:
        img_path: Path of the image file to read for every sample.
        img_size: Side length (in pixels) the image is resized to.
        imgs_cnt: Number of samples produced before iteration stops.

    Raises:
        FileNotFoundError: From ``__next__`` when the image cannot be read.
    """

    def __init__(self, img_path='./data/girl.png', img_size=416, imgs_cnt=1):
        self.img_path = img_path
        self.img_size = img_size
        self.imgs_cnt = imgs_cnt
        self.imgs_idx = 0

    def __next__(self):
        if self.imgs_idx >= self.imgs_cnt:
            raise StopIteration
        self.imgs_idx += 1

        # read image from local
        x = cv2.imread(self.img_path)
        if x is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with a clear message rather than inside cv2.resize.
            raise FileNotFoundError(
                'Could not read image: {}'.format(self.img_path))
        x = cv2.resize(x, (self.img_size, self.img_size))

        # bounding boxes [x1, y1, x2, y2, class]
        y = np.array([
            [0.18494931, 0.03049111, 0.9435849,  0.96302897, 0],
            [0.22494931, 0.01949111, 0.3435849,  0.56302897, 2],
            [0.01586703, 0.35938117, 0.01686703, 0.36938117, 56]])
        return x, y

    def __iter__(self):
        return self

# Test whether raw dataset can be loaded

In [None]:
# Smoke test: pull every sample from the generator and print the shape of the
# resized image and of the box array (three rows of [x1, y1, x2, y2, class]).
for x, y in DatasetGen():
    print(x.shape, y.shape)

# Preprocess Dataset

In [None]:
import cv2
import numpy as np


class DatasetPre(object):
    """Convert ``(image, boxes)`` samples into per-scale YOLO training targets.

    Args:
        img_size: Side length of the (square) input image in pixels.
        anchors: Array-like of shape ``(scales, anchors_per_scale, 2)`` holding
            anchor ``(width, height)`` pairs in pixels. Scale ``s`` is paired
            with a grid of ``(img_size // 32) * 2 ** s`` cells per side.

    Attributes:
        anchors: Anchors divided by ``img_size``.
        anchors_flat: ``anchors`` reshaped to ``(scales * anchors_per_scale, 2)``.
        anchors_info: One ``[scale_idx, anchor_idx, grid_size]`` row per entry
            of ``anchors_flat``, in the same order.
    """

    def __init__(self, 
                 img_size = 416, 
                 anchors = np.array([[(116, 90), (156, 198), (373, 326)],
                                    [(30, 61), (62, 45),(59, 119)], 
                                     [(10, 13), (16, 30), (33, 23)]])):
        anchors = np.asarray(anchors, dtype=np.float64)
        if anchors.ndim != 3 or anchors.shape[-1] != 2:
            raise ValueError(
                'anchors must have shape (scales, anchors_per_scale, 2), got {}'
                .format(anchors.shape))
        self.img_size = img_size
        self.anchors = anchors / img_size
        num_scales, anchors_per_scale = anchors.shape[:2]
        # Row k describes anchors_flat[k]; the scale index must be the outer
        # loop so the row order matches the row-major reshape below.
        self.anchors_info = np.array(
            [[scale_idx, anchor_idx, (self.img_size // 32) * (2 ** scale_idx)]
             for scale_idx in range(num_scales)
             for anchor_idx in range(anchors_per_scale)])
        self.anchors_flat = self.anchors.reshape(-1, self.anchors.shape[-1])

    @staticmethod
    def expand_repeat_axis(var, expand_axis, repat_cnt):
        """Insert a new axis at ``expand_axis`` and tile ``var`` ``repat_cnt`` times along it."""
        return np.expand_dims(var, expand_axis).repeat(repat_cnt, axis=expand_axis)

    # Transform
    def transform(self, x, y):
        """Scale the image and scatter the boxes into per-scale target grids.

        Args:
            x: Image array of shape ``(img_size, img_size, 3)``.
            y: Array of shape ``(boxes, 5)`` with rows
                ``[x1, y1, x2, y2, class]``; coordinates are expected to be
                normalized to ``[0, 1]``.

        Returns:
            ``(new_x, new_y)`` where ``new_x`` is ``x / 255`` and ``new_y`` is
            a list with one array per scale of shape
            ``(grid, grid, anchors_per_scale, 6)`` whose last axis is
            ``[x1, y1, x2, y2, obj, class]``.

        Raises:
            ValueError: If ``x`` does not have the expected shape.
        """
        expected_shape = (self.img_size, self.img_size, 3)
        if x.shape != expected_shape:
            raise ValueError('expected image of shape {}, got {}'.format(
                expected_shape, x.shape))
        new_x = x / 255

        # calculate anchor index for true boxes: compare box and anchor sizes
        # only (as if both shared the same corner) and keep the best IoU.
        box_xy = (y[..., 2:4] + y[..., 0:2]) / 2
        box_wh = y[..., 2:4] - y[..., 0:2]
        anchor_area = self.anchors_flat[..., 0] * self.anchors_flat[..., 1]
        box_area = box_wh[..., 0] * box_wh[..., 1]
        num_anchors = self.anchors_flat.shape[0]
        box_wh_er = self.expand_repeat_axis(box_wh, 1, num_anchors)
        intersection = (np.minimum(box_wh_er[..., 0], self.anchors_flat[..., 0]) *
                        np.minimum(box_wh_er[..., 1], self.anchors_flat[..., 1]))
        union = (self.expand_repeat_axis(box_area, -1, num_anchors)
                 + anchor_area - intersection)
        iou = intersection / union
        anchor_idxs = np.argmax(iou, axis=-1)

        # new_y: list [(grid, grid, anchors, [x1, y1, x2, y2, obj, class])]
        anchors_per_scale = self.anchors.shape[1]
        new_y = [np.zeros(((self.img_size // 32) * (2 ** scale_idx),
                           (self.img_size // 32) * (2 ** scale_idx),
                           anchors_per_scale, 6))
                 for scale_idx in range(self.anchors.shape[0])]

        for idx, anchor_idx in enumerate(anchor_idxs):
            i, j, grid_size = self.anchors_info[anchor_idx]
            # Cell containing the box centre. Clip so a centre of exactly 1.0
            # lands in the last cell, and use a full-width integer type so
            # large grids cannot wrap around.
            s_grid_xy = np.clip(np.floor(box_xy[idx] * grid_size),
                                0, grid_size - 1).astype(np.intp)
            s_box_loc = y[idx, 0:4]
            new_y[i][s_grid_xy[1], s_grid_xy[0], j] = [
                s_box_loc[0], s_box_loc[1], s_box_loc[2], s_box_loc[3],
                1, y[idx, 4]]
        return new_x, new_y

# Test whether dataset can be preprocessed

In [None]:
# Smoke test: run each raw sample through the preprocessor and print the shape
# of the scaled image followed by the shape of every per-scale target array.
dataset_preprocess = DatasetPre()
for x, y in DatasetGen():
    new_x, new_y = dataset_preprocess.transform(x, y)
    print(new_x.shape, [elem.shape for elem in new_y])

# Define Loss

In [None]:
import cv2
import numpy as np
import tensorflow as tf

def YoloLoss(anchors, classes=80, ignore_thresh=0.5):
    """Build a loss function for one YOLO output scale.

    Args:
        anchors: Anchor (width, height) pairs for this scale. They multiply
            ``exp(pred_wh)`` and divide the ground-truth width/height.
            NOTE(review): the ground-truth boxes are decoded from ``y_true``
            as-is, so anchors presumably must use the same units as the
            ``y_true`` boxes - confirm at the call site.
        classes: Number of class scores at the end of each prediction vector.
        ignore_thresh: Predictions with no object whose best IoU against the
            true boxes is at or above this value are excluded from the
            objectness loss.

    Returns:
        A ``yolo_loss(y_true, y_pred)`` function returning one loss value per
        batch element.
    """
    def yolo_boxes(pred):
        """Decode raw predictions into corner boxes, objectness and class scores.

        Returns ``(bbox, objectness, class_probs, pred_box)`` where ``bbox`` is
        (x1, y1, x2, y2) and ``pred_box`` is the sigmoid-activated xy joined
        with the raw wh, kept for the coordinate losses.
        """
        # pred: (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...classes))
        grid_size = tf.shape(pred)[1]
        box_xy, box_wh, objectness, class_probs = tf.split(pred, (2, 2, 1, classes), axis=-1)

        box_xy = tf.sigmoid(box_xy)
        objectness = tf.sigmoid(objectness)
        class_probs = tf.sigmoid(class_probs)
        pred_box = tf.concat((box_xy, box_wh), axis=-1)  # original xywh for loss

        # !!! grid[x][y] == (y, x)
        grid = tf.meshgrid(tf.range(grid_size), tf.range(grid_size))
        grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2)  # [gx, gy, 1, 2]

        # cell-relative offset + cell index, divided by the grid size
        box_xy = (box_xy + tf.cast(grid, tf.float32)) / tf.cast(grid_size, tf.float32)
        box_wh = tf.exp(box_wh) * anchors

        # centre/size -> corner coordinates
        box_x1y1 = box_xy - box_wh / 2
        box_x2y2 = box_xy + box_wh / 2
        bbox = tf.concat([box_x1y1, box_x2y2], axis=-1)

        return bbox, objectness, class_probs, pred_box

    def broadcast_iou(box_1, box_2):
        """Return the IoU of every box in ``box_1`` against every box in ``box_2``."""
        # box_1: (..., (x1, y1, x2, y2))
        # box_2: (N, (x1, y1, x2, y2))

        # broadcast boxes
        box_1 = tf.expand_dims(box_1, -2)
        box_2 = tf.expand_dims(box_2, 0)
        # new_shape: (..., N, (x1, y1, x2, y2))
        new_shape = tf.broadcast_dynamic_shape(tf.shape(box_1), tf.shape(box_2))
        box_1 = tf.broadcast_to(box_1, new_shape)
        box_2 = tf.broadcast_to(box_2, new_shape)

        # overlap extent per axis, floored at 0 for disjoint boxes
        int_w = tf.maximum(tf.minimum(box_1[..., 2], box_2[..., 2]) - tf.maximum(box_1[..., 0], box_2[..., 0]), 0)
        int_h = tf.maximum(tf.minimum(box_1[..., 3], box_2[..., 3]) - tf.maximum(box_1[..., 1], box_2[..., 1]), 0)
        int_area = int_w * int_h
        box_1_area = (box_1[..., 2] - box_1[..., 0]) * (box_1[..., 3] - box_1[..., 1])
        box_2_area = (box_2[..., 2] - box_2[..., 0]) * (box_2[..., 3] - box_2[..., 1])
        return int_area / (box_1_area + box_2_area - int_area)

    def yolo_loss(y_true, y_pred):
        """Sum the xy, wh, objectness and class losses for each batch element."""
        # 1. transform all pred outputs
        # y_pred: (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...cls))
        pred_box, pred_obj, pred_class, pred_xywh = yolo_boxes(y_pred)
        pred_xy = pred_xywh[..., 0:2]
        pred_wh = pred_xywh[..., 2:4]

        # 2. transform all true outputs
        # y_true: (batch_size, grid, grid, anchors, (x1, y1, x2, y2, obj, cls))
        true_box, true_obj, true_class_idx = tf.split(y_true, (4, 1, 1), axis=-1)
        true_xy = (true_box[..., 0:2] + true_box[..., 2:4]) / 2
        true_wh = true_box[..., 2:4] - true_box[..., 0:2]

        # give higher weights to small boxes
        box_loss_scale = 2 - true_wh[..., 0] * true_wh[..., 1]

        # 3. inverting the pred box equations
        # (express the true boxes in the same form as pred_xy / pred_wh)
        grid_size = tf.shape(y_true)[1]
        grid = tf.meshgrid(tf.range(grid_size), tf.range(grid_size))
        grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2)
        true_xy = true_xy * tf.cast(grid_size, tf.float32) - tf.cast(grid, tf.float32)
        true_wh = tf.math.log(true_wh / anchors)
        # log(0) = -inf for entries with zero width/height; replace with 0
        true_wh = tf.where(tf.math.is_inf(true_wh), tf.zeros_like(true_wh), true_wh)

        # 4. calculate all masks
        obj_mask = tf.squeeze(true_obj, -1)
        # ignore false positive when iou is over threshold
        # (per batch element: best IoU of each predicted box against the true
        # boxes selected by that element's object mask)
        best_iou = tf.map_fn(
            lambda x: tf.reduce_max(broadcast_iou(x[0], tf.boolean_mask(
                x[1], tf.cast(x[2], tf.bool))), axis=-1),
            (pred_box, true_box, obj_mask),
            tf.float32)
        ignore_mask = tf.cast(best_iou < ignore_thresh, tf.float32)

        # 5. calculate all losses
        xy_loss = obj_mask * box_loss_scale * tf.reduce_sum(tf.square(true_xy - pred_xy), axis=-1)
        wh_loss = obj_mask * box_loss_scale * tf.reduce_sum(tf.square(true_wh - pred_wh), axis=-1)
        obj_loss = tf.keras.losses.binary_crossentropy(true_obj, pred_obj)
        # object cells always count; empty cells count only where ignore_mask is 1
        obj_loss = obj_mask * obj_loss + (1 - obj_mask) * ignore_mask * obj_loss
        # TODO: use binary_crossentropy instead
        class_loss = obj_mask * tf.keras.losses.sparse_categorical_crossentropy(true_class_idx, pred_class)

        # 6. sum over (gridx, gridy, anchors) => (batch,)
        xy_loss = tf.reduce_sum(xy_loss, axis=(1, 2, 3))
        wh_loss = tf.reduce_sum(wh_loss, axis=(1, 2, 3))
        obj_loss = tf.reduce_sum(obj_loss, axis=(1, 2, 3))
        class_loss = tf.reduce_sum(class_loss, axis=(1, 2, 3))
        return xy_loss + wh_loss + obj_loss + class_loss
    return yolo_loss

In [None]:
import cv2
import numpy as np
import tensorflow as tf


# Anchor (width, height) pairs in pixels for a 416x416 input, one row per
# output scale (13x13, 26x26, 52x52 grids, matching DatasetPre's default).
input_size = 416
anchor_boxes = np.array([[(116, 90), (156, 198), (373, 326)], [(30, 61), (62, 45),(59, 119)],  [(10, 13), (16, 30), (33, 23)]], dtype=np.float32)
num_classes = 80
# YoloLoss decodes predictions and targets in coordinates normalized to [0, 1],
# so the pixel anchors have to be normalized by the input size as well.
loss = [YoloLoss(mask / input_size, classes=num_classes) for mask in anchor_boxes]
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# NOTE(review): assumes yolo3.h5 has one output per scale, ordered 13/26/52,
# each shaped (batch, grid, grid, anchors, 5 + num_classes) - confirm.
model = tf.keras.models.load_model('yolo3.h5')
model.compile(optimizer=optimizer, loss=loss)
callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(verbose=1),
    tf.keras.callbacks.EarlyStopping(patience=3, verbose=1),
    tf.keras.callbacks.ModelCheckpoint('yolov3_train_{epoch}.tf',verbose=1, save_weights_only=True),
    tf.keras.callbacks.TensorBoard(log_dir='logs')
]


def build_dataset(batch_size=1):
    """Run DatasetGen through DatasetPre and wrap the result as a tf.data.Dataset.

    Each element is ``(images, (targets_13, targets_26, targets_52))``.
    """
    preprocess = DatasetPre(img_size=input_size)
    images, targets = [], []
    for raw_x, raw_y in DatasetGen():
        sample_x, sample_y = preprocess.transform(raw_x, raw_y)
        images.append(sample_x)
        targets.append(sample_y)
    images = np.stack(images).astype(np.float32)
    # regroup [sample][scale] -> one stacked array per scale
    targets = tuple(np.stack(per_scale).astype(np.float32)
                    for per_scale in zip(*targets))
    return tf.data.Dataset.from_tensor_slices((images, targets)).batch(batch_size)


train_dataset = build_dataset()
# The sample generator only provides one image, so validation reuses the same
# data; replace with a held-out set for real training.
val_dataset = build_dataset()
history = model.fit(train_dataset, epochs=100, callbacks=callbacks, validation_data=val_dataset)