In [1]:
import numpy as np
import tensorflow as tf
import os
import xml.etree.ElementTree as ET
import cv2
import pickle
import copy

In [2]:
# The 20 PASCAL VOC object categories. The position of a name in this list is
# the class index used by the label encoding and by the detector output.
CLASSES = [
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'pottedplant',
    'sheep',
    'sofa',
    'train',
    'tvmonitor',
]

# Dataset root (a relative path) and the directories derived from it.
PASCAL_PATH = os.path.join('../', 'dataset')
CACHE_PATH = os.path.join(PASCAL_PATH, 'cache')      # pickled ground-truth labels
OUTPUT_DIR = os.path.join(PASCAL_PATH, 'output')
WEIGHTS_DIR = os.path.join(PASCAL_PATH, 'weights')

In [3]:
# Short alias for TF-Slim; the network-building and training code refers to it as `slim`.
slim = tf.contrib.slim

class YOLONet(object):
    """YOLO-style single-shot detector: network graph plus training loss.

    The constructor creates the `images` placeholder and the `logits` tensor.
    When `is_training` is true it additionally creates the `labels`
    placeholder, registers the four loss terms through `tf.losses.add_loss`
    and stores the value of `tf.losses.get_total_loss()` in `total_loss`.

    The flat network output of length `output_size` is laid out as
    [class scores | box confidences | box coordinates]; `boundary1` and
    `boundary2` are the split points between those three sections.
    """

    def __init__(self, is_training=True):
        """Build the graph.

        Args:
          is_training: when true, dropout is active and the loss is built.
        """
        self.classes = CLASSES
        self.num_class = len(self.classes)
        self.image_size = 448
        self.cell_size = 7
        self.boxes_per_cell = 2
        # Per cell: num_class scores + (confidence, x, y, w, h) for every box.
        self.output_size = (self.cell_size * self.cell_size) * \
            (self.num_class + self.boxes_per_cell * 5)
        self.scale = 1.0 * self.image_size / self.cell_size
        self.boundary1 = self.cell_size * self.cell_size * self.num_class
        self.boundary2 = self.boundary1 + \
            self.cell_size * self.cell_size * self.boxes_per_cell

        # Weights of the individual loss terms.
        self.object_scale = 1.0
        self.noobject_scale = 0.5
        self.class_scale = 1.0
        self.coord_scale = 5.0

        # Kept for callers that read it; the loss graph itself no longer
        # depends on a fixed batch size.
        self.batch_size = 45
        self.alpha = 0.1
        # offset[i, j, k] == j: the column index of every grid cell, repeated
        # for each box slot. Shape (cell_size, cell_size, boxes_per_cell).
        self.offset = np.transpose(np.reshape(
            np.array([np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell),
            (self.boxes_per_cell, self.cell_size, self.cell_size)), (1, 2, 0))

        self.images = tf.placeholder(
            tf.float32, [None, self.image_size, self.image_size, 3], name='images')
        self.logits = self.build_network(
            self.images, num_outputs=self.output_size, alpha=self.alpha,
            is_training=is_training)

        if is_training:
            self.labels = tf.placeholder(
                tf.float32,
                [None, self.cell_size, self.cell_size, 5 + self.num_class])
            self.loss_layer(self.logits, self.labels)
            self.total_loss = tf.losses.get_total_loss()

    def build_network(self, images, num_outputs, alpha,
                      keep_prob=0.5, is_training=True, scope='yolo'):
        """Stack the convolutional backbone and the fully connected head.

        Args:
          images: 4-D float tensor [batch, image_size, image_size, 3].
          num_outputs: width of the final linear layer.
          alpha: negative slope of the leaky ReLU used by every hidden layer.
          keep_prob: dropout keep probability before the last layer.
          is_training: enables dropout when true.
          scope: variable scope that wraps all layers.

        Returns:
          2-D tensor [batch, num_outputs] with no activation applied.
        """
        with tf.variable_scope(scope):
            with slim.arg_scope(
                [slim.conv2d, slim.fully_connected],
                activation_fn=leaky_relu(alpha),
                weights_regularizer=slim.l2_regularizer(0.0005),
                weights_initializer=tf.glorot_uniform_initializer()
                # alternative: tf.truncated_normal_initializer(0.0, 0.01)
            ):
                # Explicit padding + VALID conv instead of SAME for the
                # strided 7x7 convolution.
                net = tf.pad(
                    images, np.array([[0, 0], [3, 3], [3, 3], [0, 0]]),
                    name='pad_1')
                net = slim.conv2d(
                    net, 64, 7, 2, padding='VALID', scope='conv_2')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_3')
                net = slim.conv2d(net, 192, 3, scope='conv_4')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_5')
                net = slim.conv2d(net, 128, 1, scope='conv_6')
                net = slim.conv2d(net, 256, 3, scope='conv_7')
                net = slim.conv2d(net, 256, 1, scope='conv_8')
                net = slim.conv2d(net, 512, 3, scope='conv_9')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_10')
                net = slim.conv2d(net, 256, 1, scope='conv_11')
                net = slim.conv2d(net, 512, 3, scope='conv_12')
                net = slim.conv2d(net, 256, 1, scope='conv_13')
                net = slim.conv2d(net, 512, 3, scope='conv_14')
                net = slim.conv2d(net, 256, 1, scope='conv_15')
                net = slim.conv2d(net, 512, 3, scope='conv_16')
                net = slim.conv2d(net, 256, 1, scope='conv_17')
                net = slim.conv2d(net, 512, 3, scope='conv_18')
                net = slim.conv2d(net, 512, 1, scope='conv_19')
                net = slim.conv2d(net, 1024, 3, scope='conv_20')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_21')
                net = slim.conv2d(net, 512, 1, scope='conv_22')
                net = slim.conv2d(net, 1024, 3, scope='conv_23')
                net = slim.conv2d(net, 512, 1, scope='conv_24')
                net = slim.conv2d(net, 1024, 3, scope='conv_25')
                net = slim.conv2d(net, 1024, 3, scope='conv_26')
                net = tf.pad(
                    net, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]),
                    name='pad_27')
                net = slim.conv2d(
                    net, 1024, 3, 2, padding='VALID', scope='conv_28')
                net = slim.conv2d(net, 1024, 3, scope='conv_29')
                net = slim.conv2d(net, 1024, 3, scope='conv_30')
                # Move channels in front of the spatial axes before flattening.
                net = tf.transpose(net, [0, 3, 1, 2], name='trans_31')
                net = slim.flatten(net, scope='flat_32')
                net = slim.fully_connected(net, 512, scope='fc_33')
                net = slim.fully_connected(net, 4096, scope='fc_34')
                net = slim.dropout(
                    net, keep_prob=keep_prob, is_training=is_training,
                    scope='dropout_35')
                net = slim.fully_connected(
                    net, num_outputs, activation_fn=None, scope='fc_36')
        return net

    def calc_iou(self, boxes1, boxes2, scope='iou'):
        """Element-wise intersection over union of two box tensors.

        Args:
          boxes1: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4]
            holding (x_center, y_center, w, h).
          boxes2: 5-D tensor of the same layout as `boxes1`.
          scope: variable scope wrapping the ops.

        Returns:
          4-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL] with
          values clipped to [0, 1].
        """
        with tf.variable_scope(scope):
            # (x_center, y_center, w, h) -> (x1, y1, x2, y2)
            boxes1_t = tf.stack([boxes1[..., 0] - boxes1[..., 2] / 2.0,
                                 boxes1[..., 1] - boxes1[..., 3] / 2.0,
                                 boxes1[..., 0] + boxes1[..., 2] / 2.0,
                                 boxes1[..., 1] + boxes1[..., 3] / 2.0],
                                axis=-1)

            boxes2_t = tf.stack([boxes2[..., 0] - boxes2[..., 2] / 2.0,
                                 boxes2[..., 1] - boxes2[..., 3] / 2.0,
                                 boxes2[..., 0] + boxes2[..., 2] / 2.0,
                                 boxes2[..., 1] + boxes2[..., 3] / 2.0],
                                axis=-1)

            # Upper-left and lower-right corners of the overlap rectangle.
            lu = tf.maximum(boxes1_t[..., :2], boxes2_t[..., :2])
            rd = tf.minimum(boxes1_t[..., 2:], boxes2_t[..., 2:])

            # Negative extents mean "no overlap" and are clamped to zero.
            intersection = tf.maximum(0., rd - lu)
            inter_area = intersection[..., 0] * intersection[..., 1]

            area1 = boxes1[..., 2] * boxes1[..., 3]
            area2 = boxes2[..., 2] * boxes2[..., 3]

            # Lower bound keeps the division finite for degenerate boxes.
            union_area = tf.maximum(area1 + area2 - inter_area, 1e-10)
            return tf.clip_by_value(inter_area / union_area, 0.0, 1.0)

    def loss_layer(self, predicts, labels, scope='loss_layer'):
        """Register the class, object, no-object and coordinate losses.

        The batch dimension is inferred at run time, so any batch size may be
        fed; the loss values are unchanged for a batch of `self.batch_size`.

        Args:
          predicts: network output, shape (batch, output_size), i.e. (None, 1470)
            with the default configuration.
          labels: ground truth, shape (batch, cell_size, cell_size, 5 + num_class).
            Channel 0 is the "cell contains an object" flag, channels 1:5 are
            (x_center, y_center, w, h) and the rest is the one-hot class. The
            box values are divided by `image_size` here, i.e. they are expected
            in pixels of the resized input image.
          scope: variable scope wrapping the ops.
        """
        with tf.variable_scope(scope):
            # Split the flat output into its three sections.
            # Class scores: (batch, 7, 7, 20)
            predict_classes = tf.reshape(
                predicts[:, :self.boundary1],
                [-1, self.cell_size, self.cell_size, self.num_class])

            # Box confidences: (batch, 7, 7, 2)
            predict_scales = tf.reshape(
                predicts[:, self.boundary1:self.boundary2],
                [-1, self.cell_size, self.cell_size, self.boxes_per_cell])

            # Box coordinates: (batch, 7, 7, 2, 4)
            predict_boxes = tf.reshape(
                predicts[:, self.boundary2:],
                [-1, self.cell_size, self.cell_size, self.boxes_per_cell, 4])

            # Object-presence flag of the label: (batch, 7, 7, 1)
            response = tf.reshape(
                labels[..., 0],
                [-1, self.cell_size, self.cell_size, 1])

            # Ground-truth box of the label: (batch, 7, 7, 1, 4)
            boxes = tf.reshape(
                labels[..., 1:5],
                [-1, self.cell_size, self.cell_size, 1, 4])

            # Repeat it for every box slot and normalise by the image size:
            # (batch, 7, 7, 2, 4)
            boxes = tf.tile(boxes, [1, 1, 1, self.boxes_per_cell, 1]) / self.image_size

            # One-hot class of the label: (batch, 7, 7, 20)
            classes = labels[..., 5:]

            # Column index of every cell, shape (1, 7, 7, 2). It broadcasts
            # over the batch dimension, so no tiling is needed.
            offset = tf.reshape(
                tf.constant(self.offset, dtype=tf.float32),
                [1, self.cell_size, self.cell_size, self.boxes_per_cell])
            # Swapping the two grid axes yields the row index of every cell.
            offset_tran = tf.transpose(offset, (0, 2, 1, 3))

            # Predictions are cell-relative centres and square roots of the
            # normalised size; convert them to image-relative boxes for IoU.
            predict_boxes_tran = tf.stack(
                [(predict_boxes[..., 0] + offset) / self.cell_size,
                 (predict_boxes[..., 1] + offset_tran) / self.cell_size,
                 tf.square(predict_boxes[..., 2]),
                 tf.square(predict_boxes[..., 3])], axis=-1)

            # (batch, 7, 7, 2)
            iou_predict_truth = self.calc_iou(predict_boxes_tran, boxes)

            # Responsible-box mask: the box slot with the highest IoU in a
            # cell that contains an object. Shape (batch, 7, 7, 2).
            object_mask = tf.reduce_max(iou_predict_truth, 3, keepdims=True)
            object_mask = tf.cast(
                (iou_predict_truth >= object_mask), tf.float32) * response

            # Complement of the responsible-box mask.
            noobject_mask = tf.ones_like(object_mask, dtype=tf.float32) - object_mask

            # Ground truth expressed in the same parametrisation as the raw
            # predictions (cell-relative centre, square-rooted size).
            boxes_tran = tf.stack(
                [boxes[..., 0] * self.cell_size - offset,
                 boxes[..., 1] * self.cell_size - offset_tran,
                 tf.sqrt(boxes[..., 2]),
                 tf.sqrt(boxes[..., 3])], axis=-1)

            # class loss
            class_delta = response * (predict_classes - classes)
            class_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(class_delta), axis=[1, 2, 3]),
                name='class_loss') * self.class_scale

            # object loss: confidence of the responsible box should equal its IoU
            object_delta = object_mask * (predict_scales - iou_predict_truth)
            object_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3]),
                name='object_loss') * self.object_scale

            # no-object loss: confidence of every other box should be zero
            noobject_delta = noobject_mask * predict_scales
            noobject_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3]),
                name='noobject_loss') * self.noobject_scale

            # coordinate loss, only for responsible boxes
            coord_mask = tf.expand_dims(object_mask, 4)
            boxes_delta = coord_mask * (predict_boxes - boxes_tran)
            coord_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(boxes_delta), axis=[1, 2, 3, 4]),
                name='coord_loss') * self.coord_scale

            tf.losses.add_loss(class_loss)
            tf.losses.add_loss(object_loss)
            tf.losses.add_loss(noobject_loss)
            tf.losses.add_loss(coord_loss)
            
def leaky_relu(alpha):
    """Return a one-argument activation function with the slope baked in.

    The returned callable applies `tf.nn.leaky_relu` with the given `alpha`
    and the op name 'leaky_relu', so it can be passed wherever a plain
    `activation_fn(inputs)` callable is expected.
    """
    def op(inputs):
        activated = tf.nn.leaky_relu(inputs, alpha=alpha, name='leaky_relu')
        return activated

    return op
                

In [4]:
class pascal_voc(object):
    """PASCAL VOC2007 reader that produces YOLO grid labels and mini-batches.

    Every ground-truth entry is a dict with the keys 'imname' (image path),
    'label' (array of shape (cell_size, cell_size, 5 + number of classes))
    and 'flipped' (whether the image must be mirrored when it is read).
    """

    def __init__(self, phase, rebuild=False):
        """Load (or rebuild) the labels for `phase` and shuffle them.

        Args:
          phase: 'train' selects ImageSets/Main/trainval.txt, anything else
            selects test.txt.
          rebuild: when true the pickle cache is ignored and regenerated.
        """
        self.devkit_path = os.path.join(PASCAL_PATH, 'VOCdevkit')
        self.data_path = os.path.join(self.devkit_path, 'VOC2007')
        self.cache_path = CACHE_PATH
        self.batch_size = 45
        self.image_size = 448
        self.cell_size = 7
        self.classes = CLASSES
        self.class_to_ind = dict(zip(self.classes, range(len(self.classes))))
        self.flipped = True
        self.phase = phase
        self.rebuild = rebuild
        self.cursor = 0
        self.epoch = 1
        self.prepare()

    def get(self):
        """Return the next mini-batch as (images, labels).

        `images` has shape (batch_size, image_size, image_size, 3) and
        `labels` has shape (batch_size, cell_size, cell_size, 5 + num classes).
        When the cursor runs past the last entry the labels are reshuffled
        and the epoch counter is incremented.
        """
        # Derived from the class list instead of a literal 25 so that the
        # batch buffer always matches the stored label arrays.
        label_depth = 5 + len(self.classes)
        images = np.zeros((self.batch_size, self.image_size, self.image_size, 3))
        labels = np.zeros((self.batch_size, self.cell_size, self.cell_size, label_depth))

        for count in range(self.batch_size):
            entry = self.gt_labels[self.cursor]
            images[count, :, :, :] = self.image_read(entry['imname'], entry['flipped'])
            labels[count, :, :, :] = entry['label']
            self.cursor += 1
            if self.cursor >= len(self.gt_labels):
                np.random.shuffle(self.gt_labels)
                self.cursor = 0
                self.epoch += 1
        return images, labels

    def image_read(self, imname, flipped=False):
        """Read an image, resize it, convert it to RGB and scale it to [-1, 1].

        Args:
          imname: path of the image file.
          flipped: mirror the image horizontally when true.

        Raises:
          IOError: if OpenCV cannot read the file.
        """
        image = cv2.imread(imname)
        if image is None:
            # cv2.imread reports failure by returning None rather than raising.
            raise IOError('Failed to read image: ' + imname)
        image = cv2.resize(image, (self.image_size, self.image_size))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image = (image / 255.0) * 2.0 - 1
        if flipped:
            image = image[:, ::-1, :]
        return image

    def prepare(self):
        """Load the labels, optionally append mirrored copies, and shuffle.

        The result is stored in `self.gt_labels` and also returned.
        """
        gt_labels = self.load_labels()
        if self.flipped:
            print('Appending horizontally-flipped training examples ...')
            gt_labels_cp = copy.deepcopy(gt_labels)
            for entry in gt_labels_cp:
                entry['flipped'] = True
                # Mirror the grid along the column axis.
                mirrored = entry['label'][:, ::-1, :].copy()
                # Of (x, y, w, h) only the x centre changes under a
                # horizontal flip; update it in every occupied cell.
                occupied = mirrored[:, :, 0] == 1
                mirrored[occupied, 1] = self.image_size - 1 - mirrored[occupied, 1]
                entry['label'] = mirrored
            # The mirrored entries are appended after the original ones.
            gt_labels += gt_labels_cp
        np.random.shuffle(gt_labels)
        self.gt_labels = gt_labels
        return gt_labels

    def load_labels(self):
        """Return the list of ground-truth entries for the current phase.

        The list is read from the pickle cache when it exists and `rebuild`
        is false; otherwise it is built from the annotation files and written
        to the cache. Images without any annotated object are skipped.
        """
        cache_file = os.path.join(
            self.cache_path, 'pascal_' + self.phase + '_gt_labels.pkl')

        if os.path.isfile(cache_file) and not self.rebuild:
            print('Loading gt_labels from:' + cache_file)
            # NOTE(security): pickle.load executes arbitrary code from the
            # file; only use cache files that were produced locally.
            with open(cache_file, 'rb') as f:
                gt_labels = pickle.load(f)
            return gt_labels

        print('Processing gt_labels from: ' + self.data_path)

        if not os.path.exists(self.cache_path):
            os.makedirs(self.cache_path)

        if self.phase == 'train':
            txtname = os.path.join(self.data_path, 'ImageSets', 'Main', 'trainval.txt')
        else:
            txtname = os.path.join(self.data_path, 'ImageSets', 'Main', 'test.txt')
        with open(txtname, 'r') as f:
            self.image_index = [x.strip() for x in f.readlines()]

        gt_labels = []
        for index in self.image_index:
            label, num = self.load_pascal_annotation(index)
            if num == 0:
                continue
            imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
            gt_labels.append({'imname': imname,
                              'label': label,
                              'flipped': False})
        print('Saving gt_labels to: ' + cache_file)
        with open(cache_file, 'wb') as f:
            pickle.dump(gt_labels, f)
        return gt_labels

    def load_pascal_annotation(self, index):
        """Build the grid label of one image from its PASCAL VOC XML file.

        Args:
          index: image identifier (file name without extension).

        Returns:
          (label, num_objects): `label` has shape
          (cell_size, cell_size, 5 + num classes); `num_objects` is the number
          of <object> elements in the annotation, including those that were
          dropped because their cell was already occupied.

        Raises:
          IOError: if the image cannot be read.
        """
        imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
        # The image is only needed for its size, to rescale the boxes.
        im = cv2.imread(imname)
        if im is None:
            raise IOError('Failed to read image: ' + imname)
        h_ratio = 1.0 * self.image_size / im.shape[0]
        w_ratio = 1.0 * self.image_size / im.shape[1]

        label = np.zeros((self.cell_size, self.cell_size, 5 + len(self.classes)))
        filename = os.path.join(self.data_path, 'Annotations', index + '.xml')
        tree = ET.parse(filename)
        objs = tree.findall('object')

        for obj in objs:
            bbox = obj.find('bndbox')
            # Shift to 0-based pixels, rescale to the resized image and clamp.
            x1 = max(min((float(bbox.find('xmin').text) - 1) * w_ratio, self.image_size - 1), 0)
            y1 = max(min((float(bbox.find('ymin').text) - 1) * h_ratio, self.image_size - 1), 0)
            x2 = max(min((float(bbox.find('xmax').text) - 1) * w_ratio, self.image_size - 1), 0)
            y2 = max(min((float(bbox.find('ymax').text) - 1) * h_ratio, self.image_size - 1), 0)
            cls_ind = self.class_to_ind[obj.find('name').text.lower().strip()]
            # (x_center, y_center, w, h) in pixels of the resized image.
            boxes = [(x2 + x1) / 2.0, (y2 + y1) / 2.0, x2 - x1, y2 - y1]
            # Grid cell that contains the box centre.
            x_ind = int(boxes[0] * self.cell_size / self.image_size)
            y_ind = int(boxes[1] * self.cell_size / self.image_size)
            # Each cell holds at most one object; later ones are dropped.
            if label[y_ind, x_ind, 0] == 1:
                continue
            label[y_ind, x_ind, 0] = 1
            label[y_ind, x_ind, 1:5] = boxes
            # One-hot class.
            label[y_ind, x_ind, 5 + cls_ind] = 1
        return label, len(objs)

In [5]:
class Solver(object):
    """Training loop: SGD with staircase learning-rate decay and checkpoints."""

    # Print the loss every `summary_iter` steps and write a checkpoint every
    # `save_iter` steps.
    summary_iter = 100
    save_iter = 10000

    def __init__(self, net, data, weights_file='./YOLO_small.ckpt'):
        """Create the optimizer, the session and optionally restore weights.

        Args:
          net: network object exposing `images`, `labels` and `total_loss`.
          data: batch provider exposing `get()` -> (images, labels).
          weights_file: checkpoint to restore before training, or None to
            keep the freshly initialised variables.
        """
        self.net = net
        self.data = data
        self.max_iter = 25000
        self.initial_learning_rate = 0.001
        self.decay_steps = 10000
        self.decay_rate = 0.1
        self.staircase = True
        self.weights_file = weights_file

        # The saver is created before the global step and the train op exist,
        # so it only covers the variables defined up to this point.
        self.variable_to_restore = tf.global_variables()
        self.saver = tf.train.Saver(self.variable_to_restore, max_to_keep=None)
        self.global_step = tf.train.create_global_step()
        self.learning_rate = tf.train.exponential_decay(
            self.initial_learning_rate, self.global_step,
            self.decay_steps, self.decay_rate, self.staircase,
            name='learning_rate')
        self.optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=self.learning_rate)
        self.train_op = slim.learning.create_train_op(
            self.net.total_loss, self.optimizer, global_step=self.global_step)

        gpu_options = tf.GPUOptions()
        config = tf.ConfigProto(gpu_options=gpu_options)
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())

        if self.weights_file is not None:
            print('Restoring weights from: ' + self.weights_file)
            self.saver.restore(self.sess, self.weights_file)

    def _checkpoint_path(self, step):
        """Path of the checkpoint written after `step` training steps."""
        return os.path.join('.', 'yolo' + str(step) + '.ckpt')

    def train(self):
        """Run `max_iter` training steps.

        A checkpoint is written every `save_iter` steps and once more after
        the last step unless that step was just saved. Nothing is saved when
        no step was run.
        """
        last_saved_step = None
        for step in range(1, self.max_iter + 1):
            images, labels = self.data.get()
            feed_dict = {self.net.images: images,
                         self.net.labels: labels}

            loss, _ = self.sess.run([self.net.total_loss, self.train_op],
                                    feed_dict=feed_dict)
            if step % self.summary_iter == 0:
                print('step: {}, loss: {}'.format(step, loss))

            if step % self.save_iter == 0:
                self.saver.save(self.sess, self._checkpoint_path(step))
                last_saved_step = step

        # Final checkpoint; skipped when the loop never ran or when the last
        # step was already written by the periodic save above.
        if self.max_iter >= 1 and last_saved_step != self.max_iter:
            self.saver.save(self.sess, self._checkpoint_path(self.max_iter))
# Build the training graph, load the VOC2007 'train' labels and run the training loop.
yolo = YOLONet()
pascal = pascal_voc('train')
solver = Solver(yolo, pascal)
solver.train()   

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Loading gt_labels from:../dataset/cache/pascal_train_gt_labels.pkl
Appending horizontally-flipped training examples ...
Restoring weights from: ./YOLO_small.ckpt
INFO:tensorflow:Restoring parameters from ./YOLO_small.ckpt
step: 100, loss: 4.264373779296875
step: 200, loss: 4.742156982421875
step: 300, loss: 4.636838436126709
step: 400, loss: 4.487580299377441
step: 500, loss: 5.612963676452637
step: 600, loss: 4.921511650085449
step: 700, loss: 4.739345550537109
step: 800, loss: 4.191561698913574
step: 900, loss: 4.3981218338012695
step: 1000, loss: 4.486600875854492
step: 1100, loss: 4.5085954666137695
step: 1200, loss: 4.81608247756958
step: 1300, loss: 4.395411491394043
step: 1400, loss: 4.146206855773926
step: 1500, loss: 3.974743366241455
step: 1600, loss: 4.32240104675293
step: 1700, loss: 5.350527286529541
step: 1800, loss: 5.222931861877441
step: 1900, loss: 4.13148307800293
step: 2000, loss: 4.86304569244

In [11]:
# Clear the default graph before the inference network is built further down
# in this cell (presumably to drop the training graph created earlier).
from tensorflow.python.framework import ops
ops.reset_default_graph()
class Detector(object):
    """Runs a trained network on images and decodes its output into boxes.

    A detection is a list
    [class_name, x_center, y_center, width, height, probability].
    """

    def __init__(self, net, weight_file):
        """Open a session and restore the network weights.

        Args:
          net: network object exposing `images` and `logits`.
          weight_file: checkpoint path to restore.
        """
        self.net = net
        self.weights_file = weight_file

        self.classes = CLASSES
        self.num_class = len(self.classes)
        self.image_size = 448
        self.cell_size = 7
        self.boxes_per_cell = 2
        # Minimum class-specific confidence for a box to be kept.
        self.threshold = 0.2
        # Boxes overlapping a stronger box by more than this are suppressed.
        self.iou_threshold = 0.5
        # Split points of the flat network output:
        # [class scores | box confidences | box coordinates].
        self.boundary1 = self.cell_size * self.cell_size * self.num_class
        self.boundary2 = self.boundary1 + \
            self.cell_size * self.cell_size * self.boxes_per_cell

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        self.saver.restore(self.sess, self.weights_file)

    def draw_result(self, img, result):
        """Draw every detection (box, label bar and score) onto `img` in place."""
        # LINE_AA exists in OpenCV 3+; older builds only expose CV_AA.
        # Checking the attribute avoids comparing version strings.
        lineType = cv2.LINE_AA if hasattr(cv2, 'LINE_AA') else cv2.CV_AA
        for i in range(len(result)):
            x = int(result[i][1])
            y = int(result[i][2])
            # Half extents, because (x, y) is the box centre.
            w = int(result[i][3] / 2)
            h = int(result[i][4] / 2)

            cv2.rectangle(img, (x - w, y - h), (x + w, y + h), (0, 255, 0), 2)
            cv2.rectangle(img, (x - w, y - h - 20), (x + w, y - h), (125, 125, 125), -1)
            cv2.putText(
                img, result[i][0] + ' :%.2f' % result[i][5],
                (x - w + 5, y - h - 7), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                (0, 0, 0), 1, lineType)

    def detect(self, img):
        """Detect objects in a BGR image and return boxes in its own pixel scale."""
        img_h, img_w, _ = img.shape
        inputs = cv2.resize(img, (self.image_size, self.image_size))
        inputs = cv2.cvtColor(inputs, cv2.COLOR_BGR2RGB).astype(np.float32)
        inputs = (inputs / 255.0) * 2.0 - 1.0
        inputs = np.reshape(inputs, (1, self.image_size, self.image_size, 3))

        result = self.detect_from_cvmat(inputs)[0]

        # Boxes are decoded in network-input pixels; rescale to the original.
        for i in range(len(result)):
            result[i][1] *= (1.0 * img_w / self.image_size)
            result[i][2] *= (1.0 * img_h / self.image_size)
            result[i][3] *= (1.0 * img_w / self.image_size)
            result[i][4] *= (1.0 * img_h / self.image_size)

        return result

    def detect_from_cvmat(self, inputs):
        """Run the network on a preprocessed batch; one detection list per image."""
        net_output = self.sess.run(self.net.logits,
                                   feed_dict={self.net.images: inputs})
        results = []
        for i in range(net_output.shape[0]):
            results.append(self.interpret_output(net_output[i]))
        return results

    def interpret_output(self, output):
        """Decode one flat network output vector into a list of detections.

        Steps: split the vector, convert boxes to pixels of the network input,
        compute class-specific confidences, threshold them, sort by
        confidence and apply greedy non-maximum suppression.

        Args:
          output: 1-D array of length boundary2 + cells * boxes_per_cell * 4.
            It is not modified.

        Returns:
          List of [class_name, x_center, y_center, w, h, probability].
        """
        class_probs = np.reshape(
            output[0:self.boundary1],
            (self.cell_size, self.cell_size, self.num_class))
        scales = np.reshape(
            output[self.boundary1:self.boundary2],
            (self.cell_size, self.cell_size, self.boxes_per_cell))
        # Copy: np.reshape may return a view and the boxes are edited in place.
        boxes = np.reshape(
            output[self.boundary2:],
            (self.cell_size, self.cell_size, self.boxes_per_cell, 4)).copy()

        # offset[i, j, k] == j (column index); its transpose gives the row index.
        offset = np.array(
            [np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell)
        offset = np.transpose(
            np.reshape(offset, [self.boxes_per_cell, self.cell_size, self.cell_size]),
            (1, 2, 0))

        boxes[:, :, :, 0] += offset
        boxes[:, :, :, 1] += np.transpose(offset, (1, 0, 2))
        boxes[:, :, :, :2] = 1.0 * boxes[:, :, :, 0:2] / self.cell_size
        # The network predicts square roots of the normalised width/height.
        boxes[:, :, :, 2:] = np.square(boxes[:, :, :, 2:])

        boxes *= self.image_size

        # probs[i, j, b, c] = class score c of cell (i, j) * confidence of box b.
        probs = (scales[:, :, :, np.newaxis] *
                 class_probs[:, :, np.newaxis, :]).astype(np.float64)

        filter_mat_probs = np.array(probs >= self.threshold, dtype='bool')
        # Four index arrays (row, col, box, class), one entry per kept score.
        filter_mat_boxes = np.nonzero(filter_mat_probs)
        boxes_filtered = boxes[filter_mat_boxes[0],
                               filter_mat_boxes[1], filter_mat_boxes[2]]
        probs_filtered = probs[filter_mat_probs]
        # The class of each kept score is its own index on the class axis.
        # (An argmax over the boolean mask would give every entry of a box
        # the first class that passed the threshold.)
        classes_num_filtered = filter_mat_boxes[3]

        # Highest confidence first.
        argsort = np.array(np.argsort(probs_filtered))[::-1]
        boxes_filtered = boxes_filtered[argsort]
        probs_filtered = probs_filtered[argsort]
        classes_num_filtered = classes_num_filtered[argsort]

        # Greedy non-maximum suppression: a surviving box zeroes the score of
        # every later box that overlaps it too much.
        for i in range(len(boxes_filtered)):
            if probs_filtered[i] == 0:
                continue
            for j in range(i + 1, len(boxes_filtered)):
                if self.iou(boxes_filtered[i], boxes_filtered[j]) > self.iou_threshold:
                    probs_filtered[j] = 0.0

        filter_iou = np.array(probs_filtered > 0.0, dtype='bool')
        boxes_filtered = boxes_filtered[filter_iou]
        probs_filtered = probs_filtered[filter_iou]
        classes_num_filtered = classes_num_filtered[filter_iou]

        result = []
        for i in range(len(boxes_filtered)):
            result.append(
                [self.classes[classes_num_filtered[i]],
                 boxes_filtered[i][0],
                 boxes_filtered[i][1],
                 boxes_filtered[i][2],
                 boxes_filtered[i][3],
                 probs_filtered[i]])

        return result

    def iou(self, box1, box2):
        """Intersection over union of two (x_center, y_center, w, h) boxes.

        Returns 0.0 when the union has no area instead of dividing by zero.
        """
        tb = min(box1[0] + 0.5 * box1[2], box2[0] + 0.5 * box2[2]) - \
            max(box1[0] - 0.5 * box1[2], box2[0] - 0.5 * box2[2])
        lr = min(box1[1] + 0.5 * box1[3], box2[1] + 0.5 * box2[3]) - \
            max(box1[1] - 0.5 * box1[3], box2[1] - 0.5 * box2[3])
        inter = 0 if tb < 0 or lr < 0 else tb * lr
        union = box1[2] * box1[3] + box2[2] * box2[3] - inter
        if union <= 0:
            return 0.0
        return inter / union

    def camera_detector(self, cap, wait=10):
        """Detect on every frame of `cap` until a read fails.

        Args:
          cap: object with a `read()` method returning (ok, frame).
          wait: delay in milliseconds passed to cv2.waitKey between frames.
        """
        ret, frame = cap.read()

        while ret:
            result = self.detect(frame)

            self.draw_result(frame, result)
            cv2.imshow('Camera', frame)
            cv2.waitKey(wait)

            # Exactly one read per iteration; the loop condition checks it
            # before the frame is used.
            ret, frame = cap.read()

    def image_detector(self, imname, wait=0):
        """Detect on one image file, print the result and write 'demo.jpg'.

        Raises:
          IOError: if the image cannot be read.
        """
        image = cv2.imread(imname)
        if image is None:
            # cv2.imread reports failure by returning None rather than raising.
            raise IOError('Failed to read image: ' + imname)
        result = self.detect(image)
        print(result)
        self.draw_result(image, result)
        cv2.imwrite('demo.jpg', image)

# Rebuild the network with is_training=False, restore the checkpoint written
# after the final training step (25000) and run detection on one VOC2007 image.
yolo = YOLONet(False)
weight_file = './yolo25000.ckpt'
#weight_file='./YOLO_small.ckpt'
detector = Detector(yolo,weight_file)
imname = '../dataset/VOCdevkit/VOC2007/JPEGImages/000050.jpg'
detector.image_detector(imname)

INFO:tensorflow:Restoring parameters from ./yolo25000.ckpt
[['person', 180.51026548658098, 233.84686027254375, 232.38132681165425, 291.361825806754, 0.9480621218681335], ['person', 228.30344949449812, 133.53401848248072, 111.04434728622437, 112.0371605668749, 0.43928977847099304], ['bottle', 390.2549743652344, 226.4765841620309, 35.41623055934906, 86.59440279006958, 0.2403481900691986], ['diningtable', 348.6579486301967, 294.4240399769374, 196.0460628782, 129.76182358605521, 0.23747755587100983], ['person', 42.55091292517526, 198.42875003814697, 44.54611028943743, 209.22584193093437, 0.22830785810947418], ['person', 325.0153745923724, 149.037058864321, 90.14904499053955, 117.4522978918893, 0.21839332580566406]]


![detection](./demo.jpg)