In [1]:
import numpy as np
import os
import sys
import scipy
import cv2
import gc

#解析使用
import xml
from xml.etree import ElementTree as ET

from glob import glob

import keras.backend as K
from keras.applications import VGG19
from keras.models import Model
from keras.utils import to_categorical

import imageio
from skimage import transform

from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.svm import SVC #类别分类使用
from sklearn.linear_model import Ridge #bounding-box回归
from sklearn.externals import joblib

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import tensorflow as tf

from tensorflow.contrib import slim

In [3]:
import warnings

warnings.filterwarnings(action='error')

In [4]:
TRAIN_DATA_PATH = '../../../tensorflow2/dataset/VOCtrainval_11-May-2012/JPEGImages/'
TEST_DATA_PATH = '../../../tensorflow2/dataset/VOC2012test/JPEGImages/'

TRAIN_XML_PATH = '../../../tensorflow2/dataset/VOCtrainval_11-May-2012/Annotations/'
TEST_XML_PATH = '../../../tensorflow2/dataset/VOC2012test/Annotations/'

CLASSES_NUM = 20

STR = [
    'person',
    'bird','cat','cow','dog','horse','sheep',
    'aeroplane','bicycle','boat','bus','car','motorbike','train',
    'bottle','chair','diningtable','pottedplant','sofa','tvmonitor'
]

LABEL2STR = {idx:value for idx , value in enumerate(STR)}
STR2LABEL = {value:key for key,value in LABEL2STR.items()}
#STR2LABEL = {value:idx for idx , value in enumerate(STR)}

STR2LABEL['none'] = 'none' #先不使用part部分 只进行naive目标检测

#目标检测相关
IoU_THRESHOLD = 0.5

#SVM相关
SVM_IoU_THRESHOLD = 0.3

#NMS相关
NMS_IoU_THRESHOLD = 0.3 #or ~0.5

#bbox回归
BBOX_REGRESS_IoU_THRESHOLD = 0.6

In [5]:
xml_file_names_train = glob(TRAIN_XML_PATH + '*') #所有的xml文件 完整路径

#从xml文件中读出图片相关的信息

def xml_parse(xml_file):
    '''
    return filename , shape , name_boxes , crop_boxes
    xml文件中的shape格式为 (width height 3)
    '''
    xml_file = xml.dom.minidom.parse(xml_file)
    xml_file_docu_ele = xml_file.documentElement

    filename_list = xml_file_docu_ele.getElementsByTagName('filename')
    
    #filename_list可能有多个filename的 所以要索引0(此数据集中filename只有一个)
    filename = filename_list[0].childNodes[0].data #filename_list.firstChild.data

    #图像的尺寸信息
    size_list = xml_file_docu_ele.getElementsByTagName('size')

    for size in size_list:
        width_list = size.getElementsByTagName('width')
        width = int(width_list[0].childNodes[0].data)

        height_list = size.getElementsByTagName('height')
        height = int(height_list[0].childNodes[0].data)

        channel_list = size.getElementsByTagName('depth')
        channel = int(channel_list[0].childNodes[0].data)

    #一个文件中有多个object
    object_list = xml_file_docu_ele.getElementsByTagName('object')

    #多个object与多个object对应的详细信息
    name_boxes = [] #一个元素就是一个object
    crop_boxes = []

    for objects in object_list:
        #一次循环处理一个object信息
        #一个xml文件（即一个图像中）有多个object

        #name
        name_list = objects.getElementsByTagName('name')

        name_box = name_list[0].childNodes[0].data

        #bounding box points
        bndbox = objects.getElementsByTagName('bndbox')

        x1_list = bndbox[0].getElementsByTagName('xmin')
        x1 = int( round( float(x1_list[0].childNodes[0].data) ) )

        y1_list = bndbox[0].getElementsByTagName('ymin')
        y1 = int(round(float( y1_list[0].childNodes[0].data )))

        x2_list = bndbox[0].getElementsByTagName('xmax')
        x2 = int(round(float( x2_list[0].childNodes[0].data )))

        y2_list = bndbox[0].getElementsByTagName('ymax')
        y2 = int(round(float( y2_list[0].childNodes[0].data )))

        crop_box = [x1,x2,y1,y2]

        name_boxes.append(name_box)
        crop_boxes.append(crop_box)

    #crop_box:[x1 x2 y1 y2]
    return filename , name_boxes , np.array(crop_boxes) #filename调试使用


In [6]:
#xml_parse(xml_file_names_train[897])

In [7]:
class Image(object):
    '''
    图片的真实信息
    '''
    def __init__(self):
        self.img_file_names_train = glob(TRAIN_DATA_PATH+'*') #训练全路径信息
                
    def load(self , img_path_name = None):
        if not img_path_name:
            img_path_name = np.random.choice(self.img_file_names_train) #随机选择一张图片
            #img_path_idx = np.random.randint(0 , high = len(self.img_file_names_train)) #随机索引

        img_arr = cv2.imread(img_path_name) #BGR height*width*chanel
        
        xml_file_name = TRAIN_XML_PATH + img_path_name[-15:-4] +  '.xml'
        
        _ , name_boxes , crop_boxes = xml_parse(xml_file_name)
        
        labels = [] #存储与bndbox对应的 label信息

        for i in range(len(crop_boxes)): #多个object 
            labels.append(STR2LABEL.get(name_boxes[i] , 'none'))
        
        return img_arr , labels , crop_boxes
    

In [17]:
#reference: github:sualab
class Img_generator(object):
    def __init__(self):
        self.img_loader = Image()

    #计算bbox面积
    def bbox_area(self , bbox):
        w = bbox[1] - bbox[0]
        h = bbox[3] - bbox[2]
        
        return w*h
    
    #计算交并比
    def IoU(self , bbox_a , bbox_b):
        xmin_a = bbox_a[0]
        xmax_a = bbox_a[1]
        ymin_a = bbox_a[2]
        ymax_a = bbox_a[3]
        
        xmin_b = bbox_b[0]
        xmax_b = bbox_b[1]
        ymin_b = bbox_b[2]
        ymax_b = bbox_b[3]
        
        if   xmin_a < xmax_b <= xmax_a and (ymin_a < ymax_b <= ymax_a or ymin_a <= ymin_b < ymax_a):
            flag = True
        elif xmin_a <= xmin_b < xmax_a and (ymin_a < ymax_b <= ymax_a or ymin_a <= ymin_b < ymax_a):
            flag = True
        elif xmin_b < xmax_a <= xmax_b and (ymin_b < ymax_a <= ymax_b or ymin_b <= ymin_a < ymax_b):
            flag = True
        elif xmin_b <= xmin_a < xmax_b and (ymin_b < ymax_a <= ymax_b or ymin_b <= ymin_a < ymax_b):
            flag = True
        else:
            flag = False
        
        if flag:
            x_sorted_list = sorted([xmin_a, xmax_a, xmin_b, xmax_b])
            y_sorted_list = sorted([ymin_a, ymax_a, ymin_b, ymax_b])
            
            x_intersect_w = x_sorted_list[2] - x_sorted_list[1] #0 1 2 3
            y_intersect_h = y_sorted_list[2] - y_sorted_list[1] #0 1 2 3
            
            area_inter = x_intersect_w * y_intersect_h #计算重合面积
            
            union_area = self.bbox_area(bbox_a) + self.bbox_area(bbox_b) - area_inter
            
            return area_inter/union_area
        else:
            return 0.0
    
        
    def get_train_proposal(self , ground_truth_labels , ground_truth_coord , img_shape): #img_shape:[height width 3] 
        #[[1.3221,1.73145],[3.19275,4.00944],[5.05587,8009892],[9.47112,4.84053],[11.2364,10.0071]]
        #将上述的anchor信息（13*13坐标系中） 乘以32 转换为416*416坐标系中
        anchors = [[42.3072,55.4064],[102.168,128.30208],[161.78784,256.316544],[303.07584,154.89696],[359.5648,320.2272]] #只有宽高信息
        #grid cell尺寸 feature map尺寸
        grid_w = 13
        grid_h = 13
        
        oh = img_shape[0] #原图height
        ow = img_shape[1] #原图width
        
        labels = []
        label = np.zeros((13 , 13 , 5 , 5+20))
        
        for idx , (x_min , x_max , y_min , y_max) in enumerate(ground_truth_coord):
            x_min , y_min , x_max , y_max = x_min/ow , y_min/oh , x_max/ow , y_max/oh #变为在原图中的比例
            x_min , y_min , x_max , y_max = np.clip([x_min , y_min , x_max , y_max] , a_min=0.0 , a_max=1.0)
        
            anchor_boxes = np.array(anchors) / np.array([ow , oh]) #将anchors转换为在原图中的比例
            
            best_anchor = self._get_best_anchor(anchor_boxes , [x_max-x_min , y_max-y_min])
            
            #当前ground truth的中点落在哪一个grid cell中
            cx = int(np.floor((x_min+x_max)/2) * grid_w)
            cy = int(np.floor((y_min+y_max)/2) * grid_h)
            
            label[cy , cx , best_anchor , 0:4] = [x_min , y_min , x_max , y_max]
            label[cy , cx , best_anchor , 4] = 1.0
            label[cy , cx , best_anchor , 5+ground_truth_labels[idx]] = 1.0
        
        labels.append(label)
        
        return np.array(labels)
    
    def _get_best_anchor(self , anchors , box_wh):
        '''
        此处使用的坐标均在(0 1)范围内
        
        此处计算iou的时候 不考虑坐标位置 只考虑height and width
        '''
        box_wh = np.array(box_wh)
        
        best_iou = 0.0
        best_anchor = 0 #最好anchor的索引
        
        for k , anchor in enumerate(anchors):
            intersect_wh = np.maximum(np.minimum(box_wh , anchor) , 0.0)
            intersect_area = intersect_wh[0] * intersect_wh[1]
            
            box_area = box_wh[0] * box_wh[1]
            anchor_area = anchor[0] * anchor[1]
            
            iou = intersect_area / (box_area+anchor_area-intersect_area)
            
            if iou > best_iou:
                best_iou = iou
                best_anchor = k
            
        return best_anchor
    
    def load(self , img_path_name):
        '''
        img_path_name:绝对路径
        '''
        
        #图片数据 label ground_truth坐标信息
        img_arr , ground_truth_labels , ground_truth_coord = self.img_loader.load(img_path_name)
        
        labels = self.get_train_proposal(ground_truth_labels , ground_truth_coord , img_arr.shape)
        
        img_arr = cv2.resize(img_arr , (416 , 416))
        img_arr = img_arr / 127.5 - 1 #对下面的get_train_proposal没有影响
        
        '''
        resize 并 归一化像素值
        img_arr 为 BGR形式
        '''
        
        #[R G B] [123.68 116.779 103.939]
        #减去每个通道的像素平均值 归一化
        #因为cv2打开的形式为BGR
        #img_arr[:,:,0] = img_arr[:,:,0] - 103.939
        #img_arr[:,:,1] = img_arr[:,:,1] - 116.779
        #img_arr[:,:,2] = img_arr[:,:,2] - 123.680
        
        return np.expand_dims(img_arr , axis=0) , labels
    
    
    def load_test(self , img_path_name):
        img_arr = cv2.imread(img_path_name)
        
        img_arr_resize = cv2.resize(img_arr , (448 , 448))
        img_arr_resize_norm = img_arr_resize / 127.5 - 1
        
        return np.expand_dims(img_arr_resize_norm , axis=0) , img_arr


In [18]:
class Dataset(object):
    def __init__(self):
        self.img_generator = Img_generator()
        
        self.img_loader = Image()
        
        self.img_file_names_train = glob(TRAIN_DATA_PATH + '*')
        self.img_file_names_test = glob(TEST_DATA_PATH + '*')
    
    def get_batch(self):
        path = np.random.choice(self.img_file_names_train)
        
        x , labels = self.img_generator.load(path)
    
        return x , labels
    
    def get_batch_test(self , path):
        
        if not path:
            #未指定path 从测试目录中随机选一张图片测试
            path = np.random.choice(self.img_file_names_test)
            print('test image:', path)
        
        
        x , img_arr= self.img_generator.load_test(path)
        
        return x , img_arr

In [19]:
test = Dataset()

In [20]:
a,b = test.get_batch()

In [10]:
class DarkNet(object):
    def __init__(self , is_training=True):
        
        self.x = tf.placeholder(dtype=tf.float32 , shape=[1 , 416 , 416 , 3])        
        
        self.build() #构建网络产生输出
        
        if is_training:
            self.labels = tf.placeholder(dtype=tf.float32 , shape=[1 , 13 , 13 , 5 , (5+20)])
            self.loss()

    def build(self):
        #arch from paper
        def _batch_norm(_input):
            return slim.batch_norm(_input)
        
        def _weight_variable(shape , name):
            return tf.get_variable('weights_'+name , shape=shape , dtype=tf.float32 ,
                                    initializer = tf.initializers.truncated_normal(stddev=0.01) , trainable = True)
        
        def _bias_variable(shape , name):
            return tf.get_variable('biases_'+name , shape=shape , dtype=tf.float32 ,
                                    initializer = tf.initializers.constant(0.0))
        
        def _conv(_input , num_outputs , kernel_size , stride=1 , padding='SAME' , name='default' , is_activation=True):
            weight = _weight_variable(shape=[kernel_size , kernel_size , _input.get_shape().as_list()[-1] , num_outputs] , name=name)
            biases = _bias_variable(shape=[num_outputs] , name=name)
            
            if is_activation:
                #conv->norm->relu [->pooling]
                return tf.nn.leaky_relu( _batch_norm( tf.nn.conv2d(_input , weight , strides=[1,stride,stride,1] , padding=padding) + biases ) ,
                                     alpha=0.1)
            else:
                #conv
                return tf.nn.conv2d(_input , weight , strides=[1,stride,stride,1] , padding=padding) + biases
                          
        def _max_pool(_input , kernel_size=2 , stride=2 , padding='VALID'):
            return slim.max_pool2d(_input , kernel_size=kernel_size , stride=stride)
        
        
        #_conv中已放入batch-norm
        #darknet-19
        output = _conv(self.x , 32 , 3 , name='conv1')
        output = _max_pool(output)
           
        output = _conv(output , 64 , 3 , name='conv2')              
        output = _max_pool(output)
        
        output = _conv(output , 128 , 3 , name='conv3')
        output = _conv(output , 64 , 1 , name='conv4')
        output = _conv(output , 128 , 3 , name='conv5')
        output = _max_pool(output)
                
        output = _conv(output , 256 , 3 , name='conv6')
        output = _conv(output , 128 , 1 , name='conv7')
        output = _conv(output , 256 , 3 , name='conv8')
        output = _max_pool(output)
        
        output = _conv(output , 512 , 3 , name='conv9')
        output = _conv(output , 256 , 1 , name='conv10')
        output = _conv(output , 512 , 3 , name='conv11')
        output = _conv(output , 256 , 1 , name='conv12')
        output = _conv(output , 512 , 3 , name='conv13')
        #(26 26 512)
        fine_grained = output #细粒度
        
        output = _max_pool(output) #细粒度 passthrough layer需要
        #此时shape为（13 13 512）
        
        output = _conv(output , 1024 , 3 , name='conv14')
        output = _conv(output , 512 , 1 , name='conv15')
        output = _conv(output , 1024 , 3 , name='conv16')
        output = _conv(output , 512 , 1 , name='conv17')
        output = _conv(output , 1024 , 3 , name='conv18')
        
        #detection arch
        output = _conv(output , 1024 , 3 , name='conv19')
        output = _conv(output , 1024 , 3 , name='conv20')
        #(13 13 1024)
        '''
        细粒度与粗粒度合并
        '''
        fine_grained = _conv(fine_grained , 64 , 1 , name='passthrough')
        #(26 26 64)
        fine_grained = tf.space_to_depth(fine_grained , block_size=2)
        #(13 13 256)
        
        output = tf.concat((fine_grained , output) , axis=-1) #(13 13 256+1024) == (13 13 1280)

        output = _conv(output , 1024 , 3 , name='conv21')
        
        #最后一层不归一 不激活 不池化
        output = _conv(output , (5*(20+5)) , 1 , name='conv22' , is_activation=False)
        
        self.output = tf.reshape(output , shape=(-1 , 13 , 13 , 5 , 25))
        
        
    def loss(self):
        '''
        reference: github:sualab
        '''
                
        #self.labels
        
        loss_weights = [5.0 , 5.0 , 5.0 , 0.5 , 1.0]
        grid_h = 13
        grid_w = 13
        num_classws = 20
        anchors = np.array( [[42.3072,55.4064],[102.168,128.30208],[161.78784,256.316544],[303.07584,154.89696],[359.5648,320.2272]] ) #可以不需要进行np.array转换 直接可以用np函数处理
        
        cxcy = np.transpose([ np.tile(np.arange(grid_w) , grid_h) , np.repeat(np.arange(grid_h) , grid_w) ])
        cxcy = np.reshape(cxcy , (1,grid_h , grid_w , 1 , 2))
        
        '''
        将网络的输出进行切分
        '''
        txty = self.output[:,:,:,:,0:2]
        twth = self.output[:,:,:,:,2:4]
        confidence = tf.sigmoid(self.output[:,:,:,:,4])
        class_probs = tf.nn.softmax(self.output[:,:,:,:,5:] , axis=-1)
        
        
        
        
        lambda_coord = 5.0
        lambda_noobj = 0.5
        
        #[:,:,0]
        _mask = tf.cast( tf.greater( tf.slice(self.target,begin=[0,0,0],size=[7,7,1]) , np.zeros(shape=[7,7,1] , dtype=float) ) , dtype=tf.float32 )
        mask = tf.tile(_mask , multiples=[1 , 1 , 2]) #7*7*2
        #[:,:,[1,2]]
        self.loss_coord = tf.reduce_sum( tf.square( ( tf.slice(self.output,begin=[0,0,1],size=[7,7,2]) - tf.slice(self.target,begin=[0,0,1],size=[7,7,2]) ) * mask ) )
        
        '''========'''
        '''此处不修正 不出现给负数开方 出现nan情况'''
        #[:,:,[3,4]]
        self.loss_coord += tf.reduce_sum( tf.square( (tf.sqrt( tf.slice(self.output,begin=[0,0,3],size=[7,7,2]) ) - tf.sqrt( tf.slice(self.target,begin=[0,0,3],size=[7,7,2]))) * mask ) )
        '''========'''
        
        self.loss_coord = lambda_coord * self.loss_coord
        
        #[:,:,:[0,5]]
        self.loss_iou = tf.reduce_sum( tf.square( ( tf.slice(self.output,begin=[0,0,0],size=[7,7,1]) - tf.slice(self.target,begin=[0,0,0],size=[7,7,1])) * mask ) )
        self.loss_iou += tf.reduce_sum( tf.square( ( tf.slice(self.output,begin=[0,0,5],size=[7,7,1]) - tf.slice(self.target,begin=[0,0,5],size=[7,7,1])) * mask ) )
        #[:,:,:[0,5]]
        self.loss_iou += lambda_noobj * (tf.reduce_sum( tf.square( ( tf.slice(self.output,begin=[0,0,0],size=[7,7,1]) - tf.slice(self.target,begin=[0,0,0],size=[7,7,1]) ) * (1.0-mask) ) ) +\
                                    tf.reduce_sum( tf.square( ( tf.slice(self.output,begin=[0,0,5],size=[7,7,1]) - tf.slice(self.target,begin=[0,0,5],size=[7,7,1]) ) * (1.0-mask) ) ))
        
        mask = tf.tile(_mask , multiples=[1,1,20]) #7*7*20 (因为20 classes)
        #[:,:,10:]
        self.loss_cls = tf.reduce_sum( tf.square( ( tf.slice(self.output,begin=[0,0,10],size=[7,7,20]) - tf.slice(self.target,begin=[0,0,10],size=[7,7,20]) ) * mask ) )
        
        self.total_loss = self.loss_coord + self.loss_iou + self.loss_cls


In [11]:
test = DarkNet()

In [11]:
def display(img_arr , labels , bbox , name):    
    
    print('debug:' , img_arr.shape , labels , bbox)
    
    for i in range(len(labels)):

        x1 = bbox[i][0]
        x2 = bbox[i][1]
        y1 = bbox[i][2]
        y2 = bbox[i][3]

        img_arr = cv2.rectangle(img_arr , (x1 , y1) , (x2 , y2) , (255,255,255))

        img_arr = cv2.putText(img_arr , LABEL2STR[labels[i]] , org=(x1 , y1+10) , fontFace = cv2.FONT_HERSHEY_PLAIN , fontScale=1 , color = (255,255,255), thickness = 1)

    #plt.imshow(meta_img) #图像查看

    plt.imsave(arr=img_arr[: , : ,[2,1,0]] , fname = 'result/%s.jpg' % name) #保存图像


In [24]:
#refer:https://blog.csdn.net/two_vv/article/details/76769860

class YOLO_V2(object):
    '''
    完整模型
    '''
    
    def __init__(self , is_training = True):
        self._grid() #创建grid cell 坐标信息
        
        self.dataset = Dataset()
        
        self.filewriter_path = 'save/logs' #模型可视化
        self.checkpoint_path = 'save/model/' #模型持久化
                              
        self.model = DarkNet(is_training)
        
        self.sess = tf.Session()
        
        self.saver = tf.train.Saver(max_to_keep=2) #max_to_keep 最大保存5次模型  之后继续保存则会覆盖前面的模型
        
        if is_training:
            '''训练参数'''
            self.epoch = 100000
            
            self.global_step = tf.Variable(initial_value=0 , trainable=False)
            
            self.learning_rate = tf.train.exponential_decay(learning_rate=0.00001 , global_step=self.global_step,
                                                            decay_steps=900 , decay_rate=0.8 , staircase=True)
            
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.model.total_loss , global_step=self.global_step)
        
            #引入滑动平均
            self.ema = tf.train.ExponentialMovingAverage(decay=0.9) #滑动平均
            self.average_op = self.ema.apply(tf.trainable_variables()) #给所有的可训练变量应用滑动平均
            
            with tf.control_dependencies([self.optimizer]):
                self.train_op = tf.group(self.average_op)
            
            '''可视化'''
            self.sess.run(tf.global_variables_initializer())
            
            tf.summary.scalar('total_loss' , self.model.total_loss)
            self.merged_summary = tf.summary.merge_all() #merge all summaries in the default graph
            self.writer = tf.summary.FileWriter(self.filewriter_path , self.sess.graph) #可视化
            
    
    def _grid(self):
        #S*S grid cells
        x_slice = [ [64*i , 64*(i+1)] for i in [0,1,2,3,4,5,6] ]
        y_slice = [ [64*i , 64*(i+1)] for i in [0,1,2,3,4,5,6] ]
        
        self.grid = np.zeros(shape=[7 , 7] , dtype=list)
        
        for x_idx , x in enumerate(x_slice):
            for y_idx , y in enumerate(y_slice):
                self.grid[x_idx][y_idx] = y + x
            
    def train(self):
        
        if os.path.exists(self.checkpoint_path+'checkpoint'):
            self.saver.restore(self.sess , tf.train.latest_checkpoint(self.checkpoint_path))
        else:
            self.sess.run(tf.global_variables_initializer())
        
        for i in range(300):
            '''
            构造target的时候（即构造训练集的时候）每一个grid cell中只构造一个bounding box 即7*7*(1+4+20) #confidence_score(iou)+offset+p_i
            '''
            x , labels = self.dataset.get_batch()
            
            self.sess.run(self.train_op , feed_dict={self.model.x : x , self.labels.target : labels} )

            if i % 10 == 0:
                self.saver.save(self.sess , self.checkpoint_path + 'model.ckpt' , global_step = i)
                
                total_loss , summary = self.sess.run([self.model.total_loss , self.merged_summary] , feed_dict={self.model.x : x , self.model.labels : labels})
                        
                self.writer.add_summary(summary , global_step = i)
                                
                print(i , total_loss)
            
        self.writer.close() #event to disk and close the file

    def predict(self , path=None):
        if os.path.exists(self.checkpoint_path + 'checkpoint'):
            self.saver.restore(self.sess , tf.train.latest_checkpoint(self.checkpoint_path) )
            
            self._predict(path)
        else:
            print('no model!!!')
            return 
            
    def _predict(self , path):
        threshold_1 = 0.01
        threshold_nms = 0.7
        
        x , img_arr= self.dataset.get_batch_test(path)
        
        output = self.sess.run(self.model.output , feed_dict={self.model.x : x})
        self._output = output
        
        prod = self._confidence_p(output)
        bbox = self._bbox(output)
                
        pred_labels = []
        pred_bnds = []
        
        #step 1 set zero
        prod = np.greater(prod , threshold_1).astype(dtype=int) * prod #小于阈值置0
        
        #step 2 sort and nms
        for i in range(20):
            des_idx = np.argsort(-1 * prod[i]) #降序排列
            
            nms_idx = self._nms(prod[i][des_idx] , bbox[: , des_idx] , des_idx , threshold_nms) #返回的对应被丢弃的bnd box索引
            
            #将丢弃的confidence*pi置0
            prod[i][nms_idx] = 0
        
        #step 3 final prediction
        for i in range(98): #一次处理每一个bounding box
            if np.max(prod[: , i]) != 0:
                pred_idx = np.argmax(prod[: , i])
                
                pred_labels.append(pred_idx)
                pred_bnds.append( self._to_original(bbox[: , i] , img_arr.shape) )
        
        #step 4 draw in image
        display(img_arr , pred_labels , pred_bnds , 'first')
        
    def _to_original(self , bbox , shape):
        return [int( bbox[0]*(shape[1]/448) ) , int( bbox[1]*(shape[1]/448) ) , #x1 x2
                 int( bbox[2]*(shape[0]/448) ) , int( bbox[3]*(shape[0]/448) ) ] #y1 y2
    
    def _confidence_p(self , output):
        #置信度与分类概率的乘积
        prod = []
        
        for i in range(7):
            for j in range(7):
                prod.append(output[i,j,0] * output[i,j, 10:]) #bnd 1
                prod.append(output[i,j,5] * output[i,j, 10:]) #bnd 2
                
        prod = np.array(prod) #98*20
        prod = prod.T #20*98
        
        return prod            
    
    def _bbox(self , output):
        #将output中的坐标还原为448*448坐标系中
        def __bbox(bnd , grid):
            target_x = bnd[0]
            target_y = bnd[1]
            target_w = bnd[2]
            target_h = bnd[3]
            
            center_x = int( target_x * 64 + grid[0] )
            center_y = int( target_y * 64 + grid[2] )
            
            w = int( target_w * 448 )
            h = int( target_h * 448 )
            
            x1 = int( center_x - 0.5*w )
            x2 = int( center_x + 0.5*w )
            y1 = int( center_y - 0.5*h )
            y2 = int( center_y + 0.5*h )
            
            return np.clip([x1,x2,y1,y2] , a_min=0 , a_max=448) #对于跨越边界的框进行clip
        
        bbox = []
        
        for i in range(7):
            for j in range(7):
                bbox.append( __bbox(output[i,j,1:5] , self.grid[i][j]) ) #bnd 1
                bbox.append( __bbox(output[i,j,6:10], self.grid[i][j]) ) #bnd 2
    
        bbox = np.array(bbox) #98*4
        bbox = bbox.T #4*98
        
        return bbox
        

    def _nms(self , prod , bbox , des_idx , threshold_nms):
        '''
        prod已降序排列 bbox也对应降序 des_idx为在未降序的数组中的 降序索引(argsort)
        '''
        length = len(prod) #98
        lost_flag = [1]*length #标记丢弃的框 0表示丢弃
        
        max_score_idx = 0 #记录当前最大score的idx
        
        #对于score为0的bnd box不必进行nms 直接丢弃
        for i in range(length):
            if prod[i] == 0.0:
                lost_flag[i:] = [0] * (length - i) #因为prod已经降序 故出现0.0 后续全为0.0
                break
        
        while max_score_idx < length:
            max_score_rect = bbox[: , max_score_idx]
            
            for i in range(max_score_idx+1 , length):
                if lost_flag[i] == 1 and self._iou( max_score_rect , bbox[: , i] ) > threshold_nms: #大于阈值 丢弃
                    lost_flag[i] = 0

            max_score_idx_bak = max_score_idx #后续使用
            
            #让max_score_idx指向下一个没被丢弃的最大值
            for i in range(max_score_idx+1 , length):
                if lost_flag[i] == 1:
                    max_score_idx = i
                    break
            
            #说明max_score_idx没有移动过 即后续的都被丢弃了 提前终止循环
            if max_score_idx == max_score_idx_bak:
                break
            
        nms_idx = [] #用来存放丢弃的bnd box!!!

        for i in range(length):
            if lost_flag[i] == 0:
                nms_idx.append( des_idx[i] )
                
        return nms_idx
    
    
    def _iou(self , bbox_a , bbox_b):
        #计算bbox面积
        def __area(bbox):
            w = bbox[1] - bbox[0]
            h = bbox[3] - bbox[2]

            return w*h
    
        xmin_a = bbox_a[0]
        xmax_a = bbox_a[1]
        ymin_a = bbox_a[2]
        ymax_a = bbox_a[3]
        
        xmin_b = bbox_b[0]
        xmax_b = bbox_b[1]
        ymin_b = bbox_b[2]
        ymax_b = bbox_b[3]
        
        if   xmin_a < xmax_b <= xmax_a and (ymin_a < ymax_b <= ymax_a or ymin_a <= ymin_b < ymax_a):
            flag = True
        elif xmin_a <= xmin_b < xmax_a and (ymin_a < ymax_b <= ymax_a or ymin_a <= ymin_b < ymax_a):
            flag = True
        elif xmin_b < xmax_a <= xmax_b and (ymin_b < ymax_a <= ymax_b or ymin_b <= ymin_a < ymax_b):
            flag = True
        elif xmin_b <= xmin_a < xmax_b and (ymin_b < ymax_a <= ymax_b or ymin_b <= ymin_a < ymax_b):
            flag = True
        else:
            flag = False
        
        if flag:
            x_sorted_list = sorted([xmin_a, xmax_a, xmin_b, xmax_b])
            y_sorted_list = sorted([ymin_a, ymax_a, ymin_b, ymax_b])
            
            x_intersect_w = x_sorted_list[2] - x_sorted_list[1] #0 1 2 3
            y_intersect_h = y_sorted_list[2] - y_sorted_list[1] #0 1 2 3
            
            area_inter = x_intersect_w * y_intersect_h #计算重合面积
            
            union_area = __area(bbox_a) + __area(bbox_b) - area_inter
            
            return area_inter/union_area
        else:
            return 0.0

In [28]:
tf.reset_default_graph()

In [29]:
test = YOLO_V2()

In [30]:
test.train()

INFO:tensorflow:Restoring parameters from save/model/model.ckpt-290
0 11.899965
10 2.6669745
20 19.913555
30 11.7010765
40 3.1835704
50 29.869507
60 3.508064
70 5.7225046
80 3.1015394
90 4.790275
100 5.5282784
110 3.9910197
120 27.844131
130 5.255406
140 2.4926865
150 3.6026378
160 6.209631
170 18.191385
180 4.6523376
190 3.5999625
200 4.7379174
210 5.1759095
220 2.6724927
230 11.298651
240 6.9037914
250 3.919413
260 9.728306
270 3.6226234
280 5.5427094
290 9.907307


In [35]:
tf.reset_default_graph()

In [36]:
testt = YOLO_V2(is_training=False)

In [37]:
testt.predict()

INFO:tensorflow:Restoring parameters from save/model/model.ckpt-290
test image: ../../../tensorflow2/dataset/VOC2012test/JPEGImages\2008_007489.jpg
debug: (375, 500, 3) [10] [[95, 233, 178, 281]]


In [19]:
testt.predict()

INFO:tensorflow:Restoring parameters from save/model/model.ckpt-290
test image: ../../../tensorflow2/dataset/VOC2012test/JPEGImages\2010_006259.jpg
debug: (375, 500, 3) [5, 14, 3, 19, 10, 10, 15] [[-2, 17, 0, 129], [285, 304, 99, 114], [0, 0, 160, 160], [81, 261, 174, 285], [212, 372, 172, 272], [285, 285, 207, 220], [59, 225, 321, 321]]


In [17]:
testt._output[:,:,0]

array([[0.        , 0.        , 0.        , 0.        , 0.00819682,
        0.        , 0.        ],
       [0.06861994, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.0195321 ],
       [0.01040532, 0.        , 0.        , 0.        , 0.00409173,
        0.        , 0.        ],
       [0.06733071, 0.        , 0.        , 0.        , 0.        ,
        0.02720767, 0.        ],
       [0.        , 0.        , 0.12342051, 0.        , 0.0597523 ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ]], dtype=float32)

In [23]:
testt._output[:,:,0]

array([[0.        , 0.        , 0.        , 0.        , 0.01564434,
        0.        , 0.        ],
       [0.07016775, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.02026871],
       [0.00974074, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.07869903, 0.        , 0.        , 0.        , 0.        ,
        0.02900382, 0.        ],
       [0.        , 0.        , 0.13965729, 0.        , 0.07298322,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ]], dtype=float32)