# YOLO Train

## Read data

In [1]:
import os
import xml.etree.ElementTree as ET
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import random
from tqdm import tqdm
from utils import box_iou, letterbox_image

In [2]:
# Dataset locations: annotation XML directory and the directory prefix for image files.
xml_path = '../oxford_data/annotations/xmls'
image_path = '../oxford_data/images/'
# Every entry of the annotation directory is treated as one sample.
file_list = os.listdir(xml_path)
# Number of object classes predicted by the detection heads.
num_classes = 2

In [3]:
# Class labels in index order (index 0 -> 'cat', index 1 -> 'dog').
classes_name = ['cat', 'dog']
# Anchors per detection layer; sizes the new output convolutions and the y inputs.
num_anchors = 3

In [4]:
# Read the anchors: the first line of the file is a comma-separated list of
# numbers, regrouped here into one (width, height) row per anchor.
with open('yolo_anchors.txt') as f:
    anchor_line = f.readline()
anchor_values = [float(token) for token in anchor_line.split(',')]
anchors = np.array(anchor_values).reshape(-1, 2)

Randomly sample a subset of the data for testing

In [5]:
# Seed the generator so the draw is repeatable for a given directory listing,
# then keep only the 1000 sampled annotation files.
random.seed(2018)
choose_file_index = random.sample(range(len(file_list)), 1000)
file_list = [file_list[position] for position in choose_file_index]
print(len(file_list))
print(file_list[:10])

1000
['miniature_pinscher_185.xml', 'Bengal_120.xml', 'american_bulldog_112.xml', 'havanese_158.xml', 'pug_175.xml', 'Russian_Blue_158.xml', 'Maine_Coon_203.xml', 'english_cocker_spaniel_128.xml', 'beagle_189.xml', 'japanese_chin_181.xml']


### Read images

In [6]:
def read_image(file_list, image_path):
    """
    Load the image belonging to each annotation file and letterbox it to 416x416.

    Arguments:
        file_list: xml file names; the image is looked up under the same
            name with the extension replaced by '.jpg'.
        image_path: image file path (directory that holds the .jpg files).

    Returns:
        X: uint8 array with the letterboxed images stacked along axis 0.
        image_wh: list with the original (width, height) of every image,
            in the same order as file_list.
    """
    images = []
    image_wh = []
    for file in tqdm(file_list):
        stem = os.path.splitext(file)[0]
        # The context manager closes the underlying file handle once the
        # pixel data has been copied into a numpy array.
        with Image.open(os.path.join(image_path, stem + '.jpg')) as image:
            image_wh.append(image.size)
            letterboxed = letterbox_image(image, (416, 416))
            images.append(np.array(letterboxed, dtype='uint8'))

    if not images:
        # Nothing to stack; return an empty batch instead of failing on an
        # unbound variable. Assumes 3-channel images -- TODO confirm.
        return np.zeros((0, 416, 416, 3), dtype='uint8'), image_wh

    # Stack once at the end: concatenating inside the loop re-copies every
    # previously loaded image on each iteration.
    X = np.stack(images, axis=0)
    return X, image_wh

In [7]:
# Load and letterbox all sampled images; image_wh keeps each original (width, height).
X, image_wh = read_image(file_list, image_path)
print(X.shape)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:11<00:00,  7.62it/s]


(1000, 416, 416, 3)


### Read boxes

In [8]:
def read_boxes(file_list, xml_path):
    """
    Parse the bounding boxes stored in each annotation file.

    Arguments:
        file_list: xml file names.
        xml_path: xml file path.

    Returns:
        A list with one array per file. Each array has one row per
        'object/bndbox' entry and the columns (xmin, ymin, xmax, ymax),
        with 1 subtracted from every coordinate.
    """
    def _column(tree, tag):
        # All values of one coordinate tag in the file, as a column vector.
        values = [int(node.text) for node in tree.iterfind('object/bndbox/' + tag)]
        return np.array(values).reshape(-1, 1)

    boxes = []
    for name in tqdm(file_list):
        tree = ET.ElementTree(file=xml_path + os.sep + name)
        xmin = _column(tree, 'xmin')
        xmax = _column(tree, 'xmax')
        ymin = _column(tree, 'ymin')
        ymax = _column(tree, 'ymax')
        # Shift all coordinates down by one.
        boxes.append(np.concatenate([xmin, ymin, xmax, ymax], axis=-1) - 1)

    return boxes

In [9]:
# One (num_objects, 4) box array per annotation file, in file_list order.
boxes = read_boxes(file_list, xml_path)
print(len(boxes))

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:02<00:00, 342.77it/s]


1000


Verify that every box lies inside its image and has a positive width and height

In [10]:
# Print every box that leaves its image or has a non-positive width/height.
# Nothing is printed when all annotations are consistent.
for name, file_boxes, (image_w, image_h) in zip(file_list, boxes, image_wh):
    for xmin, ymin, xmax, ymax in file_boxes:
        if xmin < 0:
            print(name, ' xmin=', xmin)
        if xmax >= image_w:
            print(name, ' xmax=', xmax)
        if ymin < 0:
            print(name, ' ymin=', ymin)
        if ymax >= image_h:
            print(name, ' ymax=', ymax)

        if xmax - xmin <= 0:
            print(name, ' xmax=', xmax, ' xmin=', xmin)
        if ymax - ymin <= 0:
            print(name, ' ymax=', ymax, ' ymin=', ymin)

### Read y classes

In [11]:
# A file whose name starts with a lowercase letter gets class index 1, any
# other file gets 0 (presumably the dataset capitalises cat breeds only --
# verify against the dataset description).
y_classes = np.array(
    [1 if name[0].islower() else 0 for name in file_list],
    dtype='int32',
)

print(y_classes.shape)

(1000,)


## Convert boxes to y format

In [12]:
def boxes_to_y(true_boxes, box_class, anchors, num_classes, image_wh):
    """
    Convert the true boxes of one image to the YOLO y format.

    Arguments:
        true_boxes: bbox absolute values in image_wh of one image, as
            (xmin, ymin, xmax, ymax), shape(?, 4).
        box_class: 0/1 - cat/dog; the same class is used for every box.
        anchors: anchor box size array; 9 rows of (w, h) are expected, three
            per detection layer, with rows 0-2 assigned to the 52x52 layer
            and rows 6-8 to the 13x13 layer.
        num_classes: total class num.
        image_wh: true input image size of (w, h).

    Returns:
        y_true: list of 3 float32 arrays, one per detection layer, with
            shapes (13, 13, 3, 5+num_classes), (26, 26, 3, 5+num_classes) and
            (52, 52, 3, 5+num_classes). For the cell and anchor responsible
            for a box the last axis holds (x, y, w, h, 1, one-hot class),
            where x, y, w, h are fractions of the 416x416 input; all other
            entries stay 0.
    """
    input_size = np.array([416, 416])
    image_wh = np.array(image_wh)

    # Map the corners from original-image pixels into the letterboxed
    # 416x416 frame: uniform scaling followed by centred padding.
    zoom_scale = np.min(input_size / image_wh)
    nopadding_wh = np.round(image_wh * zoom_scale)  # w,h==416 or one of w,h < 416.
    padding_offset = (input_size - nopadding_wh) // 2  # padding to 416.
    xymin = true_boxes[:, 0:2] * zoom_scale + padding_offset
    xymax = true_boxes[:, 2:4] * zoom_scale + padding_offset

    # Box centre and size, first in range(0, 416), centre then in range(0, 1).
    # True division keeps the exact centre; floor division on these float
    # widths would shift it by up to one pixel toward the top-left.
    boxes_wh = xymax - xymin
    boxes_xy = (xymin + boxes_wh / 2) / input_size

    # Grid cell containing each box centre, for every detection layer.
    grid_wh = [input_size // 32, input_size // 16, input_size // 8]  # [[13, 13], [26, 26], [52, 52]]
    grid_index = [np.floor(boxes_xy * grid_wh[i]).astype('int32') for i in range(3)]

    # Boxes and anchors centred on the origin as (xmin, ymin, xmax, ymax), so
    # that their IoU compares shape only. The anchor array does not depend on
    # the box and is built once.
    anchor_xymax = anchors / 2
    anchor_boxes = np.concatenate([-anchor_xymax, anchor_xymax], axis=-1)
    box_xymax = boxes_wh / 2
    centered_boxes = np.concatenate([-box_xymax, box_xymax], axis=-1)

    # create y_true.
    y_true = [np.zeros((grid_wh[i][1], grid_wh[i][0], 3, 5 + num_classes), dtype='float32') for i in range(3)]

    # iterate on each box
    num_boxes = true_boxes.shape[0]
    for box_index in range(num_boxes):
        # IoU of this box against all anchors (box_iou comes from utils).
        iou = box_iou(centered_boxes[box_index].reshape(1, -1), anchor_boxes)

        # select the best anchor; anchor rows 0-2 belong to the last (52x52)
        # layer, hence the reversed layer index.
        anchor_index = np.argmax(iou)
        layer_index = 2 - anchor_index // 3
        layer_anchor_index = anchor_index % 3

        box_xy = boxes_xy[box_index]  # shape(2,), range(0, 1)
        box_wh = boxes_wh[box_index] / input_size  # shape(2,), range(0, 1)

        # fill in y_true.
        w = grid_index[layer_index][box_index, 0]
        h = grid_index[layer_index][box_index, 1]
        y_true[layer_index][h, w, layer_anchor_index, :2] = box_xy
        y_true[layer_index][h, w, layer_anchor_index, 2:4] = box_wh
        y_true[layer_index][h, w, layer_anchor_index, 4:5] = 1
        y_true[layer_index][h, w, layer_anchor_index, 5 + box_class] = 1

    return y_true

### Get y_true

In [13]:
# Convert the boxes of every image first, then stack each detection layer
# exactly once. Concatenating onto the growing arrays inside the loop copies
# all previously collected targets again on every iteration.
per_image_targets = [
    boxes_to_y(image_boxes, image_class, anchors, num_classes, wh)
    for image_boxes, image_class, wh in zip(tqdm(boxes), y_classes, image_wh)
]
# zip(*...) regroups the per-image triples into one sequence per layer; with
# no images at all this leaves y_true as an empty list.
y_true = [np.stack(layer_targets, axis=0) for layer_targets in zip(*per_image_targets)]
del per_image_targets  # the stacked copies are all that is needed from here on
    

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:11<00:00, 14.07it/s]


In [14]:
# One target array per detection layer, batch dimension first.
print(y_true[0].shape, y_true[1].shape, y_true[2].shape)

(1000, 13, 13, 3, 7) (1000, 26, 26, 3, 7) (1000, 52, 52, 3, 7)


## Build new model

In [15]:
from keras.layers import Input, Conv2D, Lambda
from keras.models import Model, load_model
from keras.regularizers import l2
from keras.optimizers import Adam
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [16]:
def my_init(shape, dtype=None):
    """Kernel initializer: draw the weights from K.random_normal with stddev 1e-4."""
    small_stddev = 1e-4
    return K.random_normal(shape, dtype=dtype, stddev=small_stddev)

In [17]:
# Load the pretrained network and freeze every layer it already has, so that
# only the convolutions added below remain trainable.
model = load_model('yolo.h5')
for pretrained_layer in model.layers:
    pretrained_layer.trainable = False

# Outputs of the layers at positions -6, -5 and -4 (presumably the inputs of
# the three original detection convolutions -- verify against the summary).
head_inputs = [model.layers[position].output for position in (-6, -5, -4)]

# Put a new 1x1 detection convolution on each of them, sized for
# num_anchors * (num_classes + 5) channels.
y = [
    Conv2D(num_anchors*(num_classes+5), (1, 1),
           padding='same',
           kernel_initializer=my_init,
           kernel_regularizer=l2(5e-4),
           name='conv2d_%d' % (59 + head * 8))(feature_map)
    for head, feature_map in enumerate(head_inputs)
]

model = Model(model.input, y)



In [18]:
# Print the layer table of the rebuilt model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 416, 416, 3)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 416, 416, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 416, 416, 32) 128         conv2d_1[0][0]                   
__________________________________________________________________________________________________
leaky_re_lu_1 (LeakyReLU)       (None, 416, 416, 32) 0           batch_normalization_1[0][0]      
__________________________________________________________________________________________________
zero_paddi

batch_normalization_11 (BatchNo (None, 52, 52, 128)  512         conv2d_11[0][0]                  
__________________________________________________________________________________________________
leaky_re_lu_11 (LeakyReLU)      (None, 52, 52, 128)  0           batch_normalization_11[0][0]     
__________________________________________________________________________________________________
conv2d_12 (Conv2D)              (None, 52, 52, 256)  294912      leaky_re_lu_11[0][0]             
__________________________________________________________________________________________________
batch_normalization_12 (BatchNo (None, 52, 52, 256)  1024        conv2d_12[0][0]                  
__________________________________________________________________________________________________
leaky_re_lu_12 (LeakyReLU)      (None, 52, 52, 256)  0           batch_normalization_12[0][0]     
__________________________________________________________________________________________________
add_4 (Add

__________________________________________________________________________________________________
leaky_re_lu_22 (LeakyReLU)      (None, 52, 52, 256)  0           batch_normalization_22[0][0]     
__________________________________________________________________________________________________
add_9 (Add)                     (None, 52, 52, 256)  0           add_8[0][0]                      
                                                                 leaky_re_lu_22[0][0]             
__________________________________________________________________________________________________
conv2d_23 (Conv2D)              (None, 52, 52, 128)  32768       add_9[0][0]                      
__________________________________________________________________________________________________
batch_normalization_23 (BatchNo (None, 52, 52, 128)  512         conv2d_23[0][0]                  
__________________________________________________________________________________________________
leaky_re_l

batch_normalization_33 (BatchNo (None, 26, 26, 512)  2048        conv2d_33[0][0]                  
__________________________________________________________________________________________________
leaky_re_lu_33 (LeakyReLU)      (None, 26, 26, 512)  0           batch_normalization_33[0][0]     
__________________________________________________________________________________________________
add_14 (Add)                    (None, 26, 26, 512)  0           add_13[0][0]                     
                                                                 leaky_re_lu_33[0][0]             
__________________________________________________________________________________________________
conv2d_34 (Conv2D)              (None, 26, 26, 256)  131072      add_14[0][0]                     
__________________________________________________________________________________________________
batch_normalization_34 (BatchNo (None, 26, 26, 256)  1024        conv2d_34[0][0]                  
__________

zero_padding2d_5 (ZeroPadding2D (None, 27, 27, 512)  0           add_19[0][0]                     
__________________________________________________________________________________________________
conv2d_44 (Conv2D)              (None, 13, 13, 1024) 4718592     zero_padding2d_5[0][0]           
__________________________________________________________________________________________________
batch_normalization_44 (BatchNo (None, 13, 13, 1024) 4096        conv2d_44[0][0]                  
__________________________________________________________________________________________________
leaky_re_lu_44 (LeakyReLU)      (None, 13, 13, 1024) 0           batch_normalization_44[0][0]     
__________________________________________________________________________________________________
conv2d_45 (Conv2D)              (None, 13, 13, 512)  524288      leaky_re_lu_44[0][0]             
__________________________________________________________________________________________________
batch_norm

batch_normalization_55 (BatchNo (None, 13, 13, 512)  2048        conv2d_55[0][0]                  
__________________________________________________________________________________________________
leaky_re_lu_55 (LeakyReLU)      (None, 13, 13, 512)  0           batch_normalization_55[0][0]     
__________________________________________________________________________________________________
conv2d_56 (Conv2D)              (None, 13, 13, 1024) 4718592     leaky_re_lu_55[0][0]             
__________________________________________________________________________________________________
batch_normalization_56 (BatchNo (None, 13, 13, 1024) 4096        conv2d_56[0][0]                  
__________________________________________________________________________________________________
leaky_re_lu_56 (LeakyReLU)      (None, 13, 13, 1024) 0           batch_normalization_56[0][0]     
__________________________________________________________________________________________________
conv2d_57 

batch_normalization_69 (BatchNo (None, 52, 52, 128)  512         conv2d_71[0][0]                  
__________________________________________________________________________________________________
leaky_re_lu_69 (LeakyReLU)      (None, 52, 52, 128)  0           batch_normalization_69[0][0]     
__________________________________________________________________________________________________
conv2d_72 (Conv2D)              (None, 52, 52, 256)  294912      leaky_re_lu_69[0][0]             
__________________________________________________________________________________________________
batch_normalization_70 (BatchNo (None, 52, 52, 256)  1024        conv2d_72[0][0]                  
__________________________________________________________________________________________________
leaky_re_lu_70 (LeakyReLU)      (None, 52, 52, 256)  0           batch_normalization_70[0][0]     
__________________________________________________________________________________________________
conv2d_73 

## YOLO loss

In [19]:
def yolo_loss(args, anchors, num_classes):
    """
    Sum-of-squares YOLO loss over the three detection layers.

    Arguments:
        args: list holding the raw feature maps of the detection layers
            followed by the matching y_true tensors, in the same layer order.
        anchors: anchor size array indexed with a list of row indices (a
            numpy array of 9 (w, h) rows is expected -- the mask below
            addresses rows 0-8).
        num_classes: total class num.

    Returns:
        Scalar tensor: the coordinate, size, confidence and class losses
        summed over all layers, cells and anchors, divided by the batch size.
    """
    # Anchor rows used by each layer, in the order of the feature maps.
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    num_layers = len(anchor_mask)
    feats = args[:num_layers]
    y_true = args[num_layers:]
    input_wh = K.constant([416, 416])
    lambda_cord = K.constant(5)  # weight of the coordinate and size terms
    batch_size = K.shape(feats[0])[0]
    loss = 0

    # iterate along the detection layers.
    for i in range(num_layers):
        # Anchors of this layer. The count is taken from the mask row itself;
        # the number of layers only happens to be the same value.
        num_anchors = len(anchor_mask[i])
        anchors_tensor = K.reshape(K.constant(anchors[anchor_mask[i]]), [1, 1, 1, num_anchors, 2])

        # get grid: (x, y) index of every cell, shape (grid_h, grid_w, 1, 2).
        grid_hw = K.shape(feats[i])[1:3]
        grid_y = K.tile(K.reshape(K.arange(0, stop=grid_hw[0]), [-1, 1, 1, 1]), [1, grid_hw[1], 1, 1])
        grid_x = K.tile(K.reshape(K.arange(0, stop=grid_hw[1]), [1, -1, 1, 1]), [grid_hw[0], 1, 1, 1])
        grid = K.concatenate([grid_x, grid_y])
        grid = K.cast(grid, K.dtype(feats[i]))

        # get prediction values, expressed as fractions of the input image.
        feature = K.reshape(feats[i], [-1, grid_hw[0], grid_hw[1], num_anchors, num_classes + 5])
        box_xy = K.sigmoid(feature[..., :2])
        box_wh = K.exp(feature[..., 2:4])
        box_confidence = K.sigmoid(feature[..., 4:5])
        box_class_probs = K.sigmoid(feature[..., 5:])
        box_xy = (box_xy + grid) / K.cast(grid_hw[::-1], K.dtype(feature))
        box_wh = box_wh * anchors_tensor / K.cast(input_wh, K.dtype(feature))

        # get true values
        true_xy = y_true[i][..., :2]
        true_wh = y_true[i][..., 2:4]
        true_confidence = y_true[i][..., 4:5]
        true_class_probs = y_true[i][..., 5:]

        # calculate loss. Predictions are multiplied by true_confidence so
        # that only entries with an object contribute; the true values are
        # already 0 everywhere else.
        # shape(batch, grid_h, grid_w, num_anchors, 2)
        xy_loss = lambda_cord * K.square(true_confidence * box_xy - true_xy)

        wh_loss = lambda_cord * K.square(true_confidence * K.sqrt(box_wh) - K.sqrt(true_wh))

        # Confidence error, weighted 1 where an object exists and 0.5 elsewhere.
        confidence_loss = K.square(true_confidence - box_confidence)
        confidence_loss = true_confidence * confidence_loss + 0.5 * (1 - true_confidence) * confidence_loss

        class_loss = K.square(true_confidence * box_class_probs - true_class_probs)

        loss += K.sum(xy_loss) + K.sum(wh_loss) + K.sum(confidence_loss) + K.sum(class_loss)

    return loss / K.cast(batch_size, K.dtype(loss))

## Train

In [20]:
# One ground-truth input per detection layer (13x13, 26x26 and 52x52 grids).
y_tensor = [
    Input(shape=(grid, grid, num_anchors, num_classes+5), name='y_input_%d' % (index + 1))
    for index, grid in enumerate((13, 26, 52))
]

# The loss is computed inside the graph: the Lambda layer receives the model
# outputs followed by the ground-truth inputs and returns the yolo_loss value.
loss_inputs = list(model.output) + y_tensor
loss_layer = Lambda(
    yolo_loss,
    output_shape=(1,),
    name='yolo_loss',
    arguments={'anchors': anchors, 'num_classes': num_classes},
)(loss_inputs)

train_model = Model([model.input] + y_tensor, loss_layer)

In [21]:
# The 'yolo_loss' output already is the loss value, so the Keras loss function
# just returns y_pred and ignores the target it is given.
train_model.compile(optimizer=Adam(lr=0.001, clipnorm=1.), loss={'yolo_loss': lambda y_true, y_pred: y_pred})

In [22]:
# Images and the three y_true arrays are all fed as inputs; the zeros are
# placeholder targets, since the compiled loss ignores its target argument.
train_model.fit([X, *y_true], np.zeros(len(X)), batch_size=64, epochs=10, validation_split=0.1)

Train on 900 samples, validate on 100 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e6227206d8>

In [61]:
# Save `model`, i.e. the network that train_model was built on, without the
# ground-truth inputs and the loss layer.
model.save('yolo_train.h5')

### Test loss func

In [52]:
# Outputs of the three detection layers for the first two images.
y_pred = model.predict(X[0:2])

In [53]:
# Evaluate yolo_loss outside the training model: feed the predictions and the
# targets of the first two images through placeholders.
sess = K.get_session()
grid_sizes = [13 * (2 ** level) for level in range(3)]
input_x = [K.placeholder(shape=(None, grid, grid, 21)) for grid in grid_sizes]
input_y = [K.placeholder(shape=(None, grid, grid, 3, 7)) for grid in grid_sizes]
loss_tensor = yolo_loss([*input_x, *input_y], anchors, num_classes)

feed = {}
for placeholder, prediction in zip(input_x, y_pred):
    feed[placeholder] = prediction
for placeholder, target in zip(input_y, y_true):
    feed[placeholder] = target[0:2]
loss = sess.run(loss_tensor, feed_dict=feed)

In [None]:
# Loss value computed for the two-image sanity check.
print(loss)