## YOLO Training ##

In [1]:
import tensorflow as tf
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
from tensorflow import contrib
autograph = contrib.autograph
import yolonet_model
import time


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



**Define the parameters**

In [2]:
image_width = 448
image_height = 448
image_width_delta = 90   #448*1.2-448
image_height_delta = 90  #448*1.2-448 
batch_size = 64
valid_batch_size = 1
epoch_size = 26332
grids=7
lambda_noobj = 0.5
lambda_obj = 5.0
grid_width = image_width//grids
grid_height = image_height//grids
labels = ['person','bird','cat','cow','dog','horse','sheep','aeroplane','bicycle',
          'boat','bus','car','motorbike','train','bottle','chair','diningtable',
          'pottedplant','sofa','tvmonitor']

**Define the function to parse the tfrecord**

In [4]:
def _parse_function(example_proto):
    features = {"image": tf.FixedLenFeature([], tf.string, default_value=""),
                "height": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "width": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "channels": tf.FixedLenFeature([1], tf.int64, default_value=[3]),
                "colorspace": tf.FixedLenFeature([], tf.string, default_value=""),
                "img_format": tf.FixedLenFeature([], tf.string, default_value=""),
                "label": tf.VarLenFeature(tf.int64),
                "bbox_xmin": tf.VarLenFeature(tf.int64),
                "bbox_xmax": tf.VarLenFeature(tf.int64),
                "bbox_ymin": tf.VarLenFeature(tf.int64),
                "bbox_ymax": tf.VarLenFeature(tf.int64),
                "filename": tf.FixedLenFeature([], tf.string, default_value="")
               }
    parsed_features = tf.parse_single_example(example_proto, features)
    
    label = tf.expand_dims(parsed_features["label"].values, 0)
    label = tf.cast(label, tf.float32)
    height = parsed_features["height"]
    width = parsed_features["width"]
    channels = parsed_features["channels"]

    #Generate the random crop offset
    random_width_start = tf.random.uniform([1], minval=0, maxval=image_width_delta, dtype=tf.dtypes.int64)
    random_height_start = tf.random.uniform([1], minval=0, maxval=image_height_delta, dtype=tf.dtypes.int64)
    random_start = tf.concat([random_height_start, random_width_start, tf.constant([0], dtype=tf.dtypes.int64)], axis=0)
    
    #Adjust the bbox coordinates with random crop offset
    def f1():
        xmin = tf.expand_dims(parsed_features["bbox_xmin"].values, 0)
        xmax = tf.expand_dims(parsed_features["bbox_xmax"].values, 0)
        ymin = tf.expand_dims(parsed_features["bbox_ymin"].values, 0)
        ymax = tf.expand_dims(parsed_features["bbox_ymax"].values, 0)
        xmin = xmin - random_width_start
        xmin = tf.clip_by_value(xmin, 0, image_width)
        xmax = xmax - random_width_start
        xmax = tf.clip_by_value(xmax, 0, image_width)
        ymin = ymin - random_height_start
        ymin = tf.clip_by_value(ymin, 0, image_height)
        ymax = ymax - random_height_start
        ymax = tf.clip_by_value(ymax, 0, image_height)
        return xmin, xmax, ymin, ymax
    #Adjust the bbox coordinates with image flipped and random crop offset
    def f2():
        xmin = tf.expand_dims(parsed_features["bbox_xmin"].values, 0)
        xmax = tf.expand_dims(parsed_features["bbox_xmax"].values, 0)
        ymin = tf.expand_dims(parsed_features["bbox_ymin"].values, 0)
        ymax = tf.expand_dims(parsed_features["bbox_ymax"].values, 0)
        xmin_temp = xmin - random_width_start
        xmax_temp = xmax - random_width_start
        xmin = image_width - tf.clip_by_value(xmax_temp, 0, image_width)
        xmax = image_width - tf.clip_by_value(xmin_temp, 0, image_width)
        ymin = ymin - random_height_start
        ymin = tf.clip_by_value(ymin, 0, image_height)
        ymax = ymax - random_height_start
        ymax = tf.clip_by_value(ymax, 0, image_height)
        return xmin, xmax, ymin, ymax
    
    #Generate the random flip flag
    random_flip = tf.random.uniform([1], minval=0, maxval=1, dtype=tf.dtypes.float32)
    #Get the random flip and crop image coordinates
    xmin, xmax, ymin, ymax = tf.cond(tf.less(random_flip[0], 0.5), f1, f2)
    image_raw = tf.image.decode_jpeg(parsed_features["image"], channels=3)
    image_sliced = tf.slice(image_raw, random_start, [image_height, image_width, -1])
    image_decoded = tf.image.convert_image_dtype(image_sliced, tf.float32)
    image_flipped = tf.cond(tf.less(random_flip[0], 0.5), lambda:image_decoded, lambda:tf.image.flip_left_right(image_decoded))
    image_train = tf.image.per_image_standardization(image_flipped)
    
    #Calculate the boxes center point
    box_center_x = xmin+(xmax-xmin)//2
    box_center_y = ymin+(ymax-ymin)//2
    #Calculate the boxes relate to which grid
    grid_id = (tf.ceil(box_center_y/grid_height)-1)*grids + tf.ceil(box_center_x/grid_width) - 1
    grid_id = tf.cast(grid_id, tf.float32)
    #Calculate and normalize the bbox center and width by grids
    center_x_percent = box_center_x%grid_width/grid_width
    center_x_percent = tf.cast(center_x_percent, tf.float32)
    center_y_percent = box_center_y%grid_height/grid_height
    center_y_percent = tf.cast(center_y_percent, tf.float32)
    box_width = (xmax-xmin)/image_width
    box_width = tf.cast(box_width, tf.float32)
    box_height = (ymax-ymin)/image_height
    box_height = tf.cast(box_height, tf.float32)
    #Generate the new bbox vector for label
    bbox = tf.concat(axis=0, values=[grid_id, center_x_percent, center_y_percent, box_width, box_height, label])
    bbox = tf.transpose(bbox, [1, 0])

    return image_train, parsed_features["filename"], image_raw, bbox

**Construct the train dataset**

In [5]:
with tf.device('/cpu:0'):
    train_files = tf.data.Dataset.list_files("train_tf/*.tfrecord")
    dataset_train = train_files.interleave(tf.data.TFRecordDataset, cycle_length=4, num_parallel_calls=4)
    dataset_train = dataset_train.shuffle(buffer_size=epoch_size)
    dataset_train = dataset_train.repeat(100)
    dataset_train = dataset_train.map(_parse_function, num_parallel_calls=12)
    dataset_train = dataset_train.padded_batch(batch_size, \
                                               padded_shapes=([None,None,None], [], \
                                                              [None,None,None], [None,None]))
    dataset_train = dataset_train.prefetch(batch_size)
    iterator = tf.data.Iterator.from_structure(dataset_train.output_types, dataset_train.output_shapes)
    image_train, filename, image_decoded, bbox = iterator.get_next()
    train_init_op = iterator.make_initializer(dataset_train)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


**Verify the train data is correct**

In [None]:
with tf.Session() as sess:
    sess.run(train_init_op)
    images_r, images_t_r, filename_r, bbox_run = sess.run([image_decoded, image_train, filename, bbox])
    
#Show the random crop and flip image with the bbox
image_index = 9     #select one image in the batch
image = images_t_r[image_index]
image_bbox = bbox_run[image_index]

for i in range(image_bbox.shape[0]):
    if image_bbox[i][3]==0.0 or image_bbox[i][4]==0.0:
        continue
    else:
        center_x = grid_width*image_bbox[i][1]+image_bbox[i][0]%grids*grid_width
        center_y = grid_height*image_bbox[i][2]+image_bbox[i][0]//grids*grid_height
        xmin = int(center_x - image_bbox[i][3]*image_width//2)
        xmax = int(center_x + image_bbox[i][3]*image_width//2)
        ymin = int(center_y - image_bbox[i][4]*image_height//2)
        ymax = int(center_y + image_bbox[i][4]*image_height//2)
        cv2.rectangle(image, (xmin,ymin), (xmax,ymax), (0,255,0), 2)

plt.imshow(image)

**Define the IOU and LOSS calculation function**

In [6]:
#Calculate the IOU, the merged is two rect vectors combined
#Each rect vecor has 4 elements, xmin, xmax, ymin and ymax
def calculate_IOU(merged):
    rect1 = merged[:4]
    rect2 = merged[4:8]
    IOU = 0.0
    IOU_area = 0.0
    xmin=0.0
    xmax=0.0
    ymin=0.0
    ymax= 0.0
    rect1_area=0.0
    rect2_area= 0.0
    if (rect1[0]>=rect2[1] or rect2[0]>=rect1[1] or rect1[2]>=rect2[3] or rect2[2]>=rect1[3]):
        IOU = 0.0
    else:
        xmin=tf.maximum(rect1[0], rect3[0])
        xmax=tf.minimum(rect1[1], rect3[1])
        ymin=tf.maximum(rect1[2], rect3[2])
        ymax=tf.minimum(rect1[3], rect3[3])
        IOU_area=(xmax-xmin)*(ymax-ymin)
        rect1_area=(rect1[1]-rect1[0])*(rect1[3]-rect1[2])
        rect3_area=(rect3[1]-rect3[0])*(rect3[3]-rect3[2])
        IOU=IOU_area / (rect1_area+rect3_area-IOU_area)
    return IOU
tf_calculate_IOU = autograph.to_graph(calculate_IOU)

#Calculate the loss. Based on the YOLO V1 paper
#The pred is the prediction vector with shape [batchsize*7*7,30].
#The label is the vector with shape [batchsize*7*7,26].
def loss_func(pred, label):
    #Divide the pred and label vectors by the mask for with object or non object.
    mask_obj = label[:,1]>0.0
    mask_noobj = label[:,1]<1.0
    pred_obj = tf.boolean_mask(pred, mask_obj)
    label_obj = tf.boolean_mask(label, mask_obj)
    pred_noobj = tf.boolean_mask(pred, mask_noobj)
    label_noobj = tf.boolean_mask(label, mask_noobj)
    #Calculate the no obj prediction error
    loss_noobj = tf.reduce_sum(tf.square(pred_noobj[:,0])+tf.square(pred_noobj[:,5]))
    loss_classes = tf.reduce_sum(tf.square(pred_obj[:,10:]-label_obj[:,6:]))
    #Calculate the prediction box coordinates
    center_x1 = pred_obj[:,1:2]*grid_width
    center_y1 = pred_obj[:,2:3]*grid_height
    xmin_1 = center_x1 - pred_obj[:,3:4]**2*image_width//2
    xmax_1 = center_x1 + pred_obj[:,3:4]**2*image_width//2
    ymin_1 = center_y1 - pred_obj[:,4:5]**2*image_height//2
    ymax_1 = center_y1 + pred_obj[:,4:5]**2*image_height//2
    center_x2 = pred_obj[:,6:7]*grid_width
    center_y2 = pred_obj[:,7:8]*grid_height
    xmin_2 = center_x2 - pred_obj[:,8:9]**2*image_width//2
    xmax_2 = center_x2 + pred_obj[:,8:9]**2*image_width//2
    ymin_2 = center_y2 - pred_obj[:,9:10]**2*image_height//2
    ymax_2 = center_y2 + pred_obj[:,9:10]**2*image_height//2
    #Calculate the label box coordinates
    center_x = label_obj[:,2:3]*grid_width
    center_y = label_obj[:,3:4]*grid_height
    xmin = center_x - label_obj[:,4:5]*image_width//2
    xmax = center_x + label_obj[:,4:5]*image_width//2
    ymin = center_y - label_obj[:,5:6]*image_height//2
    ymax = center_y + label_obj[:,5:6]*image_height//2
    #Concat the prediction box and ground truth box and calculate the IOU
    merged1 = tf.concat([xmin_1,xmax_1,ymin_1,ymax_1,xmin,xmax,ymin,ymax], -1)
    merged2 = tf.concat([xmin_2,xmax_2,ymin_2,ymax_2,xmin,xmax,ymin,ymax], -1)
    IOU1 = tf.map_fn(tf_calculate_IOU, merged1)
    IOU2 = tf.map_fn(tf_calculate_IOU, merged2)
    #Select the higher IOU prediction box for coordination loss calculation
    IOU1_mask = tf.math.greater_equal(IOU1,IOU2)
    IOU2_mask = tf.math.greater(IOU2,IOU1)
    coord_IOU1 = tf.boolean_mask(pred_obj[:,:5], IOU1_mask)
    label_IOU1 = tf.boolean_mask(label_obj[:,2:6], IOU1_mask)
    coord_IOU2 = tf.boolean_mask(pred_obj[:,5:10], IOU2_mask)
    label_IOU2 = tf.boolean_mask(label_obj[:,2:6], IOU2_mask)
    loss_coord = lambda_obj * (tf.reduce_sum( \
                 tf.square(coord_IOU1[:,1]-label_IOU1[:,0]) + \
                 tf.square(coord_IOU1[:,2]-label_IOU1[:,1]) + \
                 tf.square(coord_IOU1[:,3]-tf.sqrt(label_IOU1[:,2])) + \
                 tf.square(coord_IOU1[:,4]-tf.sqrt(label_IOU1[:,3])))+ \
                 tf.reduce_sum( \
                 tf.square(coord_IOU2[:,1]-label_IOU2[:,0]) + \
                 tf.square(coord_IOU2[:,2]-label_IOU2[:,1]) + \
                 tf.square(coord_IOU2[:,3]-tf.sqrt(label_IOU2[:,2])) + \
                 tf.square(coord_IOU2[:,4]-tf.sqrt(label_IOU2[:,3]))))
    #Calculate the confidence for these two prediction boxes
    loss_confidence = tf.reduce_sum(tf.square(pred_obj[:,0]-IOU1)+tf.square(pred_obj[:,5]-IOU2))
    #Sum up all the loss parts
    loss = (loss_noobj+loss_classes+loss_coord+loss_confidence)/batch_size
    return loss

**Training**

In [7]:
image_train_batch = tf.placeholder(shape=[None, image_height, image_width, 3], dtype=tf.float32)
grids_vector_batch = tf.placeholder(shape=[None, grids*grids, 26], dtype=tf.float32)
result = yolonet_model.inference(image_train_batch, pretrain_trainable=False, wd=0.0005, pretrain_training=False, yolo_training=True)
result = tf.reshape(result, [-1,30])
grids_vector = tf.reshape(grids_vector_batch, [-1,26])
mse_loss = loss_func(result, grids_vector)
tf.add_to_collection('losses', mse_loss)
loss = tf.add_n(tf.get_collection('losses'), name='total_loss')

global_step = tf.Variable(0, trainable=False)
epoch_steps = int(epoch_size/batch_size)
boundaries = [epoch_steps*5,epoch_steps*55,epoch_steps*75]
values = [0.001, 0.01, 0.001, 0.0001]
learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optimize_op = optimizer.minimize(loss, global_step=global_step)

#Load the pretrain Imagenet weights
#For the first time training, cancel the comment of below codes.
'''
variables_list = []
for var in tf.all_variables():
    #The var with "new" means it's for object dection training, not included in Imagenet pretrain
    if 'new' in var.name or 'Variable' in var.name:
        continue
    else:
        variables_list.append(var)
saver=tf.train.Saver(variables_list)
'''
saver_yolo=tf.train.Saver()

with tf.Session() as sess:
    #For first time training, cancel the comment of below codes.
    '''
    sess.run(tf.global_variables_initializer())
    #Load the pretrained Imagenet weights
    saver.restore(sess, "/home/roy/AI/model_bn_loss/model.ckpt-105000")   
    sess.run(global_step.initializer)
    '''
    #Comment out below line if the first training
    saver_yolo.restore(sess, "model_yolo/model.ckpt-30000")   
    sess.run([train_init_op])
    total_loss = 0.0 
    starttime = time.time()
    while(True):
        try:
            images_run, bbox_run = sess.run([image_train, bbox])
            #Construct the grids_vector based on bbox_run
            batch_num, box_num, _ = bbox_run.shape
            grids_vector_list = []
            for i in range(batch_num):
                vector = np.zeros([grids*grids, 26], dtype=float)
                for j in range(box_num):
                    if bbox_run[i][j][3]==0.0 or bbox_run[i][j][4]==0.0:
                        continue
                    else:
                        grid_id = int(bbox_run[i][j][0])
                        vector[grid_id][0] = grid_id
                        vector[grid_id][1] = 1.0
                        vector[grid_id][2] = bbox_run[i][j][1]
                        vector[grid_id][3] = bbox_run[i][j][2]
                        vector[grid_id][4] = bbox_run[i][j][3]
                        vector[grid_id][5] = bbox_run[i][j][4]
                        label = int(bbox_run[i][j][5])
                        vector[grid_id][label+6] = 1.0
                grids_vector_list.append(vector)
            grids_vector_run = np.stack(grids_vector_list)
            
            loss_a, step, lr, _ = sess.run([loss, global_step, learning_rate, optimize_op], feed_dict={image_train_batch:images_run, grids_vector_batch:grids_vector_run})
            total_loss += loss_a
        
            if step%100==0:
                print("step: %i, Learning_rate:%f, Time: %is Loss: %f" \
                      %(step, lr, int(time.time()-starttime), total_loss/100))
                total_loss = 0.0
                starttime = time.time()
    
            if step%2000==0:
                save_path = saver_yolo.save(sess, "model_yolo/model.ckpt", global_step=global_step)
        except tf.errors.OutOfRangeError:
            break  


Instructions for updating:
Use keras.layers.batch_normalization instead.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from model_yolo/model.ckpt-30000
step: 30100, Learning_rate:0.001000, Time: 103s Loss: 1.224235
step: 30200, Learning_rate:0.001000, Time: 96s Loss: 1.198330
step: 30300, Learning_rate:0.001000, Time: 95s Loss: 1.204168
step: 30400, Learning_rate:0.001000, Time: 96s Loss: 1.225659
step: 30500, Learning_rate:0.001000, Time: 96s Loss: 1.218227
step: 30600, Learning_rate:0.001000, Time: 96s Loss: 1.200745
step: 30700, Learning_rate:0.001000, Time: 94s Loss: 1.185329
step: 30800, Learning_rate:0.001000, Time: 95s Loss: 1.213746
step: 30900, Learning_rate:0.000100, Time: 95s Loss: 1.177624
step: 31000, Learning_rate:0.000100, Time: 96s Loss: 1.207570
step: 31100, Learning_rate:0.000100, Time: 95s Loss: 1.193434
step: 31200, Learning_rate:0.000100, Time: 95s Loss: 1.170741
step: 31300, Learning_rate:0.000100, Time: 95s Loss: 1.189242
step: 31400, Learning_rate:0.000100,

step: 42200, Learning_rate:0.000100, Time: 94s Loss: 1.193948
step: 42300, Learning_rate:0.000100, Time: 94s Loss: 1.178660
step: 42400, Learning_rate:0.000100, Time: 95s Loss: 1.190530
step: 42500, Learning_rate:0.000100, Time: 94s Loss: 1.194576
step: 42600, Learning_rate:0.000100, Time: 94s Loss: 1.164123
step: 42700, Learning_rate:0.000100, Time: 95s Loss: 1.188197
step: 42800, Learning_rate:0.000100, Time: 94s Loss: 1.170009
step: 42900, Learning_rate:0.000100, Time: 95s Loss: 1.180124
step: 43000, Learning_rate:0.000100, Time: 94s Loss: 1.167312
step: 43100, Learning_rate:0.000100, Time: 94s Loss: 1.194800
step: 43200, Learning_rate:0.000100, Time: 94s Loss: 1.186283
step: 43300, Learning_rate:0.000100, Time: 94s Loss: 1.171472
step: 43400, Learning_rate:0.000100, Time: 95s Loss: 1.175364
step: 43500, Learning_rate:0.000100, Time: 94s Loss: 1.163143
step: 43600, Learning_rate:0.000100, Time: 94s Loss: 1.177204
step: 43700, Learning_rate:0.000100, Time: 94s Loss: 1.157613
step: 43

KeyboardInterrupt: 

## 