In [75]:
import numpy as np
import os
import sys
import scipy
import cv2
import gc

#解析使用
import xml
from xml.etree import ElementTree as ET

from glob import glob

import keras.backend as K
from keras.applications import VGG19
from keras.models import Model
from keras.utils import to_categorical

import imageio
from skimage import transform

from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.svm import SVC #类别分类使用
from sklearn.linear_model import Ridge #bounding-box回归
from sklearn.externals import joblib

In [76]:
import tensorflow as tf

from tensorflow.contrib import slim

from ImageNet_classes import class_names #验证alexnet使用

In [77]:
# Image directories (paths point at the VOC 2012 trainval / test folders).
TRAIN_DATA_PATH = '../../../tensorflow2/dataset/VOCtrainval_11-May-2012/JPEGImages/'
TEST_DATA_PATH = '../../../tensorflow2/dataset/VOC2012test/JPEGImages/'

# Annotation (XML) directories matching the image directories above.
TRAIN_XML_PATH = '../../../tensorflow2/dataset/VOCtrainval_11-May-2012/Annotations/'
TEST_XML_PATH = '../../../tensorflow2/dataset/VOC2012test/Annotations/'

CLASSES_NUM = 20

# Class names; the position in this list is the integer label.
STR = [
    'person',
    'bird', 'cat', 'cow', 'dog', 'horse', 'sheep',
    'aeroplane', 'bicycle', 'boat', 'bus', 'car', 'motorbike', 'train',
    'bottle', 'chair', 'diningtable', 'pottedplant', 'sofa', 'tvmonitor',
]

# Integer label -> class name, and the inverse mapping.
LABEL2STR = dict(enumerate(STR))
STR2LABEL = {name: label for label, name in enumerate(STR)}

# 'part' annotations are not used for now (naive object detection only);
# unknown names map to the string 'none' rather than to an integer label.
STR2LABEL['none'] = 'none'

# Object-detection IoU threshold
IoU_THRESHOLD = 0.5

# SVM-related IoU threshold
SVM_IoU_THRESHOLD = 0.3

# Non-maximum-suppression IoU threshold (or ~0.5)
NMS_IoU_THRESHOLD = 0.3

# Bounding-box regression IoU threshold
BBOX_REGRESS_IoU_THRESHOLD = 0.6

In [19]:
xml_file_names_train = glob(TRAIN_XML_PATH + '*') # full paths of every entry in the training annotation directory

# Read the image-related information out of an annotation XML file

def xml_parse(xml_file):
    '''
    Parse one annotation XML file.

    Parameters
    ----------
    xml_file : path (or file object) accepted by xml.dom.minidom.parse.

    Returns
    -------
    filename : text of the first <filename> element (handy for debugging).
    name_boxes : list with the first <name> text of every <object>.
    crop_boxes : integer array of shape (num_objects, 4); every row is
        [x1, x2, y1, y2] taken from xmin / xmax / ymin / ymax of the first
        <bndbox> of the object, rounded to the nearest integer.
        An annotation without objects yields an array of shape (0, 4).
    '''
    # A bare `import xml` does not load the `xml.dom.minidom` submodule, so
    # import it explicitly here instead of relying on another package having
    # imported it as a side effect.
    from xml.dom import minidom

    root = minidom.parse(xml_file).documentElement

    def _text(node , tag):
        # Text content of the first descendant <tag> of `node`.
        return node.getElementsByTagName(tag)[0].childNodes[0].data

    def _coord(node , tag):
        # Coordinates may be written as floats; round to the nearest integer.
        return int(round(float(_text(node , tag))))

    # Only the first <filename> is used.
    filename = _text(root , 'filename')

    name_boxes = [] # one entry per <object>
    crop_boxes = []

    # One image (one XML file) can contain several objects.
    for obj in root.getElementsByTagName('object'):
        bndbox = obj.getElementsByTagName('bndbox')[0]

        name_boxes.append(_text(obj , 'name'))
        crop_boxes.append([
            _coord(bndbox , 'xmin') ,
            _coord(bndbox , 'xmax') ,
            _coord(bndbox , 'ymin') ,
            _coord(bndbox , 'ymax') ,
        ])

    # reshape keeps the result two-dimensional even when there are no objects,
    # so callers can always slice it as boxes[:, :2] / boxes[:, 2:].
    return filename , name_boxes , np.array(crop_boxes , dtype=int).reshape(-1 , 4)

#xml_parse(xml_file_names_train[10])

In [16]:
# Smoke test: parse one annotation file and display (filename, object names, boxes).
xml_parse(xml_file_names_train[897])

('2008_000281.jpg',
 ['car', 'car', 'person'],
 [[106, 186, 377, 419], [194, 283, 396, 444], [413, 429, 399, 444]])

In [18]:
class Image(object):
    '''
    Loads a training image together with its ground-truth annotation.
    '''
    def __init__(self):
        # Full paths of every entry in the training image directory.
        self.img_file_names_train = glob(TRAIN_DATA_PATH+'*')

    def load(self , img_path_name = None):
        '''
        Read one image and the labels / boxes from its annotation file.

        Parameters
        ----------
        img_path_name : path of the image; when omitted (or empty) a random
            training image is chosen.

        Returns
        -------
        img_arr : array returned by cv2.imread (BGR, height * width * channel).
        labels : one entry per object, looked up in STR2LABEL; names that are
            not in STR2LABEL give 'none'.
        crop_boxes : boxes as returned by xml_parse, rows are [x1, x2, y1, y2].

        Raises
        ------
        IOError : when cv2.imread cannot read the image.
        '''
        if not img_path_name:
            img_path_name = np.random.choice(self.img_file_names_train) # pick a random image

        img_arr = cv2.imread(img_path_name) # BGR height*width*channel

        # cv2.imread signals failure by returning None instead of raising.
        if img_arr is None:
            raise IOError('cannot read image: %s' % img_path_name)

        # Derive the annotation name from the file stem rather than from a
        # fixed-width slice, so stems of any length (and any extension) work.
        stem = os.path.splitext(os.path.basename(img_path_name))[0]
        xml_file_name = TRAIN_XML_PATH + stem + '.xml'

        _ , name_boxes , crop_boxes = xml_parse(xml_file_name)

        # One label per annotated object, in the same order as crop_boxes.
        labels = [STR2LABEL.get(name , 'none') for name in name_boxes]

        return img_arr , labels , crop_boxes
    

In [84]:
# Scratch check: exact float equality treats 0.0 and 1e-21 as different values.
0.0==0.000000000000000000001

False

In [90]:
# Scratch check: `<=` is satisfied by equal operands.
1 <=1

True

In [113]:
class Img_generator(object):
    def __init__(self):
        '''Create the Image loader used by load() to read images and annotations.'''
        self.img_loader = Image()

    # Area of a bounding box
    def bbox_area(self , bbox):
        '''Return width * height of a box given as [x1, x2, y1, y2].'''
        x_low , x_high , y_low , y_high = bbox[0] , bbox[1] , bbox[2] , bbox[3]

        return (x_high - x_low) * (y_high - y_low)
    
    # Intersection over union (IoU)
    def IoU(self , bbox_a , bbox_b):
        xmin_a = bbox_a[0]
        xmax_a = bbox_a[1]
        ymin_a = bbox_a[2]
        ymax_a = bbox_a[3]
        
        xmin_b = bbox_b[0]
        xmax_b = bbox_b[1]
        ymin_b = bbox_b[2]
        ymax_b = bbox_b[3]
        
        if   xmin_a < xmax_b <= xmax_a and (ymin_a < ymax_b <= ymax_a or ymin_a <= ymin_b < ymax_a):
            flag = True
        elif xmin_a <= xmin_b < xmax_a and (ymin_a < ymax_b <= ymax_a or ymin_a <= ymin_b < ymax_a):
            flag = True
        elif xmin_b < xmax_a <= xmax_b and (ymin_b < ymax_a <= ymax_b or ymin_b <= ymin_a < ymax_b):
            flag = True
        elif xmin_b <= xmin_a < xmax_b and (ymin_b < ymax_a <= ymax_b or ymin_b <= ymin_a < ymax_b):
            flag = True
        else:
            flag = False
        
        if flag:
            x_sorted_list = sorted([xmin_a, xmax_a, xmin_b, xmax_b])
            y_sorted_list = sorted([ymin_a, ymax_a, ymin_b, ymax_b])
            
            x_intersect_w = x_sorted_list[2] - x_sorted_list[1] #0 1 2 3
            y_intersect_h = y_sorted_list[2] - y_sorted_list[1] #0 1 2 3
            
            area_inter = x_intersect_w * y_intersect_h #计算重合面积
            
            union_area = self.bbox_area(bbox_a) + self.bbox_area(bbox_b) - area_inter
            
            return area_inter/union_area
        else:
            return 0.0
    
    
    def map2new(self , img_shape , ground_truth_coord , target_size=448):
        '''
        Map boxes from the original image coordinates to a square
        target_size * target_size coordinate system.

        Parameters
        ----------
        img_shape : shape of the original image, (height, width, ...).
        ground_truth_coord : boxes, one row per box as [x1, x2, y1, y2].
        target_size : side length of the target coordinate system
            (448 by default).

        Returns
        -------
        A new array with the rescaled coordinates truncated to integers.
        The input is left untouched, so calling this twice on the same boxes
        does not rescale them twice. Empty input gives an array of shape
        (0, 4).
        '''
        coords = np.array(ground_truth_coord) # np.array copies, the caller's data is not modified

        if coords.size == 0:
            return coords.reshape(0 , 4)

        original_height = img_shape[0]
        original_width = img_shape[1]

        coords[: , :2] = (coords[: , :2] * (target_size/original_width)).astype(int) #x1 x2
        coords[: , 2:] = (coords[: , 2:] * (target_size/original_height)).astype(int) #y1 y2

        return coords
        
    
    def get_train_proposal(self , img_arr , labels , ground_truth_coord):
        #get_train_proposal为关键函数
        
        def _center(gt):
            '''
            gt的中心坐标
            '''
            x = int( ( gt[0] + gt[1] ) / 2 )
            y = int( ( gt[2] + gt[3] ) / 2 )
        
            return [x,y]
        
        def _is_in_grid(gt_center , grid):
            '''
            判断gt的中心是否在此grid cell中
            '''
            if (grid[0] <= gt_center[0] <= grid[1]) and (grid[2] <= gt_center[1] <= grid[3]):
                return True
            else:
                return False
        
        def _target_gt(gt , grid):
            '''
            将gt变为target需要的格式
            '''
            center = _center(gt)
            #gt中心坐标在对应的grid中的偏移
            target_x = ( center[0] - grid[0] ) / 64
            target_y = ( center[1] - grid[2] ) / 64
            
            target_w = ( gt[1]-gt[0] ) / 448
            target_h = ( gt[3]-gt[2] ) / 448
            
            return [target_x , target_y , target_w , target_h]
            
        
        '''下面操作在448*448坐标系中进行'''
        
        #S*S grid cells
        x_slice = [ [64*i , 64*(i+1)] for i in [0,1,2,3,4,5,6] ]
        y_slice = [ [64*i , 64*(i+1)] for i in [0,1,2,3,4,5,6] ]
        
        grid = np.zeros(shape=[7 , 7] , dtype=list)
        
        for x_idx , x in enumerate(x_slice):
            for y_idx , y in enumerate(y_slice):
                grid[x_idx][y_idx] = y + x
        
        #训练样本中的y
        #[confidence x y w h]*2 + cls_score
        target = np.zeros(shape=[7 , 7 , 30] , dtype=float)
        
        for i in range(7):
            for j in range(7):
                for idx , gt in enumerate(ground_truth_coord):
                    
                    if _is_in_grid( _center(gt) , grid[i][j]):
                        '''
                        此gt的中心位于此grid cell中
                        '''
                        iou = self.IoU(gt , grid[i][j])
                        
                        #如果出现一个grid cell有多个gt对应 则只保留iou最高的前两个
                        if target[i][j][0] > target[i][j][5]:
                            if iou > target[i][j][5]:
                                target[i][j][5] = iou
                                target[i][j][6 : 10] = _target_gt(gt , grid[i][j])
                                target[i][j][ labels[idx] + 10 ] = 1.0
                        elif target[i][j][0] <= target[i][j][5]:
                            if iou > target[i][j][0]:
                                target[i][j][0] = iou
                                target[i][j][1 : 5] = _target_gt(gt , grid[i][j])
                                target[i][j][ labels[idx] + 10 ] = 1.0
                        #并入上面
                        #else:
                        #    #先执行此处
                        #    if iou > target[i][j][0]:
                        #        #只要大于 随意选一个位置即可
                        #        target[i][j][0] = iou
                        #        target[i][j][1 : 5] = _target_gt(gt)
                        #        target[i][j][ labels[idx] + 10 ] = 1.0
                
                #here
                #处理完一个grid cell
                #如果[i , j] grid只有一个gt与之对应 则将其翻倍（因为yolo v1一个grid对应两个bounding box）
                #只有一个gt与之对应 则只会出现在target[i][j][0 1 2 3 4]处
                if (target[i][j][0] != 0.0) and (target[i][j][5] == 0.0):
                    target[i][j][5:10] = target[i][j][0:5]
                    #cls_score是一样的 同一个位置
                                                  
        return np.array(target)
    
    
    def load(self , img_path_name):
        '''
        Produce one training sample.

        img_path_name : path of the image (absolute path).

        Returns the image resized to 448 * 448, scaled by / 127.5 - 1 and
        given a leading batch axis, together with the target array built by
        get_train_proposal. The channel order is BGR, as read by cv2.
        '''
        raw_img , labels , gt_boxes = self.img_loader.load(img_path_name)

        # Boxes are rescaled with the shape of the image BEFORE it is resized.
        gt_boxes = self.map2new(raw_img.shape , gt_boxes)

        resized_img = cv2.resize(raw_img , (448 , 448))
        # The pixel scaling has no influence on get_train_proposal below.
        scaled_img = resized_img / 127.5 - 1

        target = self.get_train_proposal(scaled_img , labels , gt_boxes)

        # add the batch axis in front
        return scaled_img[np.newaxis , ...] , target
    
    
    def get_test_proposal(self , img_arr):
        '''
        Generate the anchor boxes for a 600 (width) * 1000 (height) image.

        Parameters
        ----------
        img_arr : not used; the anchors only depend on the fixed layout below
            (kept for interface compatibility).

        Returns
        -------
        Integer array of shape (61 * 36 * 9, 4); every row is
        [x1, x2, y1, y2]. Anchors are ordered by feature-map row, then
        column, then scale, then ratio. Anchors crossing the image border
        are truncated to it.

        The row index (61 steps, image height) drives the y coordinate and
        the column index (36 steps, image width) drives the x coordinate, so
        every anchor satisfies x1 <= x2 <= 600 and y1 <= y2 <= 1000.
        '''
        STRIDE = 16               # feature-map cell -> image pixels
        FEATURE_MAP_HEIGHT = 61
        FEATURE_MAP_WIDTH = 36
        MAX_X = 600               # width of the resized image
        MAX_Y = 1000              # height of the resized image

        scales = [128 , 256 , 512]
        ratios = [[1,2] , [1,1] , [2,1]] # the scale is divided by [height_ratio width_ratio]

        anchors = [] # x1 x2 y1 y2, the layout used for the IoU computation

        for row in range(FEATURE_MAP_HEIGHT):
            y_center = row * STRIDE

            for col in range(FEATURE_MAP_WIDTH):
                x_center = col * STRIDE

                for scale in scales:
                    for height_ratio , width_ratio in ratios:
                        anchor_height = int(scale / height_ratio)
                        anchor_width = int(scale / width_ratio)

                        # truncate anchors that cross the image border
                        x_1 = max(int(x_center - anchor_width/2) , 0)
                        y_1 = max(int(y_center - anchor_height/2) , 0)
                        x_2 = min(int(x_center + anchor_width/2) , MAX_X)
                        y_2 = min(int(y_center + anchor_height/2) , MAX_Y)

                        anchors.append( [x_1 , x_2 , y_1 , y_2] )

        return np.array(anchors)
    
    def load_test(self , img_path_name):
        '''
        Produce one test sample.

        img_path_name : path of the image to read with cv2.

        Returns the image resized with cv2.resize(img, (600, 1000)), scaled
        by / 127.5 - 1 and given a leading batch axis, together with the
        anchors from get_test_proposal.

        Raises IOError when cv2.imread cannot read the image.
        '''
        img_arr = cv2.imread(img_path_name)

        # cv2.imread signals failure by returning None instead of raising.
        if img_arr is None:
            raise IOError('cannot read image: %s' % img_path_name)

        anchors = self.get_test_proposal(img_arr)

        img_arr = cv2.resize(img_arr , (600 , 1000))
        img_arr = img_arr / 127.5 - 1.0

        return np.expand_dims(img_arr , axis=0) , anchors

# class Img_generator

In [114]:
# Instantiate the generator for a manual check of the training-target encoding.
test = Img_generator()

In [117]:
# a: image with a leading batch axis, b: target array, both as returned by Img_generator.load.
a,b = test.load('../../../tensorflow2/dataset/VOCtrainval_11-May-2012/JPEGImages/2007_000129.jpg')

In [118]:
# Print the target vector of every cell of the 7 x 7 grid.
for i in range(7):
    for j in range(7):
        print(b[i][j])



[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 

In [None]:
class Dataset(object):
    def __init__(self):
        '''Create the sample generator and image loader and list the train / test image files.'''
        self.img_generator = Img_generator()
        
        self.img_loader = Image()
        
        # Full paths of every entry in the train / test image directories.
        self.img_file_names_train = glob(TRAIN_DATA_PATH + '*')
        self.img_file_names_test = glob(TEST_DATA_PATH + '*')
    
    def get_batch(self):
        path = np.random.choice(self.img_file_names_train)
        
        x , y_positive , y_negative = self.img_generator.load(path)
        
        y_pos_idx = np.random.choice( list( range( len(y_positive) ) ), size=128)
        y_positive = y_positive[y_pos_idx]
        
        y_neg_idx = np.random.choice( list( range( len(y_negative) ) ) , size=128 )
        y_negative = y_negative[y_neg_idx]
        
        y=np.concatenate((y_positive , y_negative) , axis=0)
        np.random.shuffle(y)
        
        return x , y
    
    def get_batch_test(self , path):
        '''
        返回图片的真实img_arr 未resize 未归一化
        注意cv2打开图片通道为BGR
        '''
        if not path:
            #未指定path 从测试目录中随机选一张图片测试
            path = np.random.choice(self.img_file_names_test)
        
        '''resize & norm/anchors/原图 '''
        x , anchors = self.img_generator.load_test(path)
        
        return x , anchors
    
    
    def target2coord(self , bbox_pred , img_arr , anchors):
        '''
        Decode predicted regression offsets into boxes.

        bbox_pred[i] holds the offsets (t_x, t_y, t_w, t_h) for anchors[i],
        which is a box in [x1, x2, y1, y2] layout. The centre is shifted by
        t_x / t_y times the anchor width / height and the size is multiplied
        by exp(t_w) / exp(t_h). The result is rounded to integers and clipped
        to the image given by img_arr (x1, y1 not below 0; x2 not above the
        width; y2 not above the height).

        Returns a list with one [x1, x2, y1, y2] list per prediction.
        '''
        height_limit = img_arr.shape[0]
        width_limit = img_arr.shape[1]

        def _decode(offsets , anchor):
            # anchor as centre / size; the centre uses integer floor division
            left , right , top , bottom = anchor[0] , anchor[1] , anchor[2] , anchor[3]

            anchor_w = right-left
            anchor_h = bottom-top
            anchor_cx = (left+right)//2
            anchor_cy = (top+bottom)//2

            # apply the predicted offsets
            box_cx = anchor_w*offsets[0]+anchor_cx
            box_cy = anchor_h*offsets[1]+anchor_cy
            box_w = anchor_w*np.exp(offsets[2])
            box_h = anchor_h*np.exp(offsets[3])

            # back to corner form
            x_min = 0.5*(2*box_cx-box_w)
            y_min = 0.5*(2*box_cy-box_h)
            x_max = x_min+box_w
            y_max = y_min+box_h

            x_min , y_min , x_max , y_max = [
                int(round(value)) for value in (x_min , y_min , x_max , y_max)
            ]

            # clip to the image
            return [
                max(x_min , 0) ,
                min(x_max , width_limit) ,
                max(y_min , 0) ,
                min(y_max , height_limit) ,
            ]

        return [_decode(offsets , anchors[i]) for i , offsets in enumerate(bbox_pred)]