In [1]:
import numpy as np
import tensorflow as tf

import cv2

from glob import glob

import multiprocessing


import os
import pandas as pd


import xml
from xml.etree import ElementTree as ET

from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
STR = [
    'person',
    'bird','cat','cow','dog','horse','sheep',
    'aeroplane','bicycle','boat','bus','car','motorbike','train',
    'bottle','chair','diningtable','pottedplant','sofa','tvmonitor'
]

LABEL2STR = {idx:value for idx , value in enumerate(STR)}
STR2LABEL = {value:key for key,value in LABEL2STR.items()}

In [3]:
TRAIN_DATA_PATH = '../../tensorflow2/dataset/VOCtrainval_11-May-2012/JPEGImages/'
TRAIN_XML_PATH = '../../tensorflow2/dataset/VOCtrainval_11-May-2012/Annotations/'

In [4]:
def xml_parse(xml_file):
    xml_file = xml.dom.minidom.parse(xml_file)
    xml_file_docu_ele = xml_file.documentElement

    filename_list = xml_file_docu_ele.getElementsByTagName('filename')
    #filename_list可能有多个filename的 所以要索引0(此数据集中filename只有一个)
    filename = filename_list[0].childNodes[0].data #filename_list.firstChild.data

    #图像的尺寸信息
    size_list = xml_file_docu_ele.getElementsByTagName('size')

    for size in size_list:
        width_list = size.getElementsByTagName('width')
        width = int(width_list[0].childNodes[0].data)

        height_list = size.getElementsByTagName('height')
        height = int(height_list[0].childNodes[0].data)

        channel_list = size.getElementsByTagName('depth')
        channel = int(channel_list[0].childNodes[0].data)

    shape = (width , height , channel)

    #一个文件中有多个object
    object_list = xml_file_docu_ele.getElementsByTagName('object')

    #多个object与多个object对应的详细信息
    name_boxes = [] #一个元素就是一个object
    crop_boxes = []

    for objects in object_list:
        #一次循环处理一个object信息
        #一个xml文件（即一个图像中）有多个object

        name_list = objects.getElementsByTagName('name')

        #name_box中第0个元素是object的名称 后面的是详细物体part的名称
        name_box = []

        for i in range(len(name_list)):
            name_box.append(name_list[i].childNodes[0].data)

        bndbox = objects.getElementsByTagName('bndbox')

        #crop_box中第0个元素是object的坐标 后面的为详细物体part的坐标
        crop_box = []

        for box in bndbox:

            int(round(float('1.0542')))
            x1_list = box.getElementsByTagName('xmin')
            x1 = int( round( float(x1_list[0].childNodes[0].data) ) )

            y1_list = box.getElementsByTagName('ymin')
            y1 = int(round(float( y1_list[0].childNodes[0].data )))

            x2_list = box.getElementsByTagName('xmax')
            x2 = int(round(float( x2_list[0].childNodes[0].data )))

            y2_list = box.getElementsByTagName('ymax')
            y2 = int(round(float( y2_list[0].childNodes[0].data )))

            crop_box.append([x1,x2,y1,y2])

        name_boxes.append(name_box)
        crop_boxes.append(crop_box)

    return filename , shape , STR2LABEL[name_boxes[0][0]] , crop_boxes[0][0]



In [5]:
FAST = True
ACCURATE = not FAST

if FAST:
    HEIGHT = 231
    WIDTH = 231
else:
    HEIGHT = 221
    WIDTH = 221

In [6]:
#调整尺寸
def resize(img , resize_type):
    if resize_type == 'reg':
        #resize_val 为(x,x)
        return cv2.resize(img , (256,256))
    
    elif resize_type == 'min':
        #resize_val scalar
        height = img.shape[0]
        width = img.shape[1]
        ratio = height / width
        
        if height<width:
            new_shape = (256 , int(256.0/ratio))
        else:
            new_shape = (int(256.0*ratio) , 256)
        
        return cv2.resize(img , new_shape)
    
    elif resize_type == 'max':
        #resize_val scalar
        height = img.shape[0]
        width = img.shape[1]
        ratio = height / width
        
        if height<width:
            new_shape = (int(256.0*ratio) , 256)
        else:
            new_shape = (256 , int(256.0/ratio))
            
        return cv2.resize(img , new_shape)
        
#随机裁剪
#def random_clip(img , crop_size):
#    shape = img.shape
#    
#    if shape[0] == crop_size[0] and shape[1] == crop_size[1]:
#        return img
#    
#    height_clip_domain = shape[0]-crop_size[0]
#    width_clip_domain = shape[1]-crop_size[1]
#    
#    height_clip_idx = np.random.randint(0 , height_clip_domain)
#    width_clip_idx = np.random.randint(0 , width_clip_domain)
#    
#    return img[height_clip_idx : height_clip_idx+crop_size[0] , width_clip_idx : width_clip_idx+crop_size[1] , :]


def central_clip(img , crop_size):
    img_height = img.shape[0]
    img_width = img.shape[1]
    
    height_r = img_height - crop_size[0]
    width_r = img_width - crop_size[1]
    
    top = height_r//2
    left = width_r//2
    
    return img[top : top+crop_size[0] , left : left+crop_size[1] , :]

In [7]:
def split_data(train_size = 0.8):
    # xml文件
    filenames = glob(pathname = '../../tensorflow2/dataset/VOCtrainval_11-May-2012/Annotations/*.xml')
    filenames = np.array(filenames)
    
    idx = list(range(len(filenames)))
    np.random.shuffle(idx)    
    
    train_idx = idx[ : int(len(idx) * train_size)]
    val_idx = idx[int(len(idx) * train_size) : ]
    
    return filenames[train_idx] , filenames[val_idx]

In [8]:
def preprocess_train(img , bbox):
    #训练数据预处理
    
    shape = img.shape
    height_ratio = HEIGHT/shape[0]
    width_ratio = WIDTH/shape[1]
    
    img = cv2.resize(img , (HEIGHT , WIDTH))
    #img = resize(img , 'max')
    #img = central_clip(img , crop_size=(HEIGHT , WIDTH))
    #img = random_clip(img , crop_size=(HEIGHT , WIDTH , 3))
    
    img = img/127.5 - 1.0
    
    bbox[0] = float(int(bbox[0]*width_ratio))
    bbox[2] = float(int(bbox[2]*width_ratio))
    
    bbox[1] = float(int(bbox[1]*height_ratio))
    bbox[3] = float(int(bbox[3]*height_ratio))
    
    return img , bbox
    

def preprocess_val(img , bbox):
    #验证数据预处理
    
    shape = img.shape
    height_ratio = HEIGHT/shape[0]
    width_ratio = WIDTH/shape[1]
    
    img = cv2.resize(img , (HEIGHT , WIDTH))
    #img = resize(img , 'max')
    #img = central_clip(img , crop_size=(HEIGHT , WIDTH))
    #img = random_clip(img , crop_size=(HEIGHT , WIDTH , 3))
    
    img = img/127.5 - 1.0
    
    bbox[0] = float(int(bbox[0]*width_ratio))
    bbox[2] = float(int(bbox[2]*width_ratio))
    
    bbox[1] = float(int(bbox[1]*height_ratio))
    bbox[3] = float(int(bbox[3]*height_ratio))
    # img = central_clip(img , crop_size=(HEIGHT , WIDTH))
    
    return img , bbox


In [9]:
train_filenames , val_filenames = split_data(train_size=0.9)

In [10]:
train_num = len(train_filenames)
val_num = len(val_filenames)

In [11]:
def next_batch(batch_size , is_training = True):
    data = []
    bboxes = []
    
    if is_training:
        #训练数据随机索引
        shuffle_idx = np.random.randint(low=0 , high=train_num , size=batch_size)
        
        for i in shuffle_idx:
            filename , _ , _ , bbox = xml_parse(train_filenames[i]) # _ shape label _
            
            img = cv2.imread(TRAIN_DATA_PATH + filename)
            img , bbox = preprocess_train(img , bbox)
            
            data.append(img)
            bboxes.append(bbox)
            
        return np.array(data) , np.array(bboxes)
            
    else:
        #验证数据随机索引
        shuffle_idx = np.random.randint(low=0 , high=val_num , size=batch_size)
        
        for i in shuffle_idx:
            filename , _ , _ , bbox = xml_parse(val_filenames[i])
            
            img = cv2.imread(TRAIN_DATA_PATH + filename)
            img , bbox = preprocess_val(img , bbox)
            
            data.append(img)
            bboxes.append(bbox)
            
        return np.array(data) , np.array(bboxes)
    

In [12]:
class OverFeat(object):

    def __init__(self , num_classes , model_type = 'fast'):
        self.TYPE = model_type
        
        self.EPOCH = 90 #paper
        
        self.BATCH_SIZE = 128
                
        self.KEEP_PROB = 0.5
        
        if model_type == 'accurate':
            self.HEIGHT = 221
            self.WIDTH = 221
            
            self.X = tf.placeholder(dtype=tf.float32 , shape=(None , self.HEIGHT , self.WIDTH , 3))
            self.y = tf.placeholder(dtype=tf.float32 , shape=(None , 4)) #tensorflow完成one-hot
        
            self.model_accurate()
        else:
            self.HEIGHT = 231
            self.WIDTH = 231
            
            self.X = tf.placeholder(dtype=tf.float32 , shape=(None , self.HEIGHT , self.WIDTH , 3))
            self.y = tf.placeholder(dtype=tf.float32 , shape=(None , 4)) #tensorflow完成one-hot
            
            self.model_fast()

    def model_fast(self):
        conv1 = self.conv(self.X , 11 , 11 , 96 , 4 , 4 , name='conv1')
        max_pooling1 = self.max_pooling(conv1 , 2 , 2 , 2 , 2 , name='pooling1')
        
        conv2 = self.conv(max_pooling1 , 5 , 5 , 256 , 1 , 1 , name='conv2')
        max_pooling2 = self.max_pooling(conv2 , 2 , 2 , 2 , 2 , name='pooling2')
        
        conv3 = self.conv(max_pooling2 , 3 , 3 , 512 , 1 , 1 , name='conv3')
        
        conv4 = self.conv(conv3 , 3 , 3 , 1024 , 1 , 1 , name='conv4')
    
        conv5 = self.conv(conv4 , 3 , 3 , 1024 , 1 , 1 , name='conv5')
        max_pooling5 = self.max_pooling(conv5 , 2 , 2 , 2 , 2 , name='pooling5')
        
        #conv6 = self.fcn(max_pooling5 , output_channel=3072 , name='fcn') #FCN形式 全连接变为卷积形式
        #===
        max_pooling5 = tf.layers.flatten(max_pooling5)
        #===
        
        fc6 = self.fc(max_pooling5 , 1024 , name='fc6') #应该是4096 显卡不行
        fc6 = tf.layers.dropout(fc6 , rate=1. - self.KEEP_PROB)
            
        fc7 = self.fc(fc6 , 1024 , name='fc7')
        fc7 = tf.layers.dropout(fc7 , rate=1. - self.KEEP_PROB)
        
        fc8 = self.fc(fc7 , 4 , name='fc8')
        
        self.logits = fc8
        
    
    def model_accurate(self):
        conv1 = self.conv(self.X , 7 , 7 , 96 , 2 , 2 , name='conv1')
        max_pooling1 = self.max_pooling(conv1 , 3 , 3 , 3 , 3 , name='pooling1')
        
        conv2 = self.conv(max_pooling1 , 7 , 7 , 256 , 1 , 1 , name='conv2')
        max_pooling2 = self.max_pooling(conv2 , 2 , 2 , 2 , 2 , name='pooling2')
        
        conv3 = self.conv(max_pooling2 , 3 , 3 , 512 , 1 , 1 , name='conv3')
        
        conv4 = self.conv(conv3 , 3 , 3 , 512 , 1 , 1 , name='conv4')
        
        conv5 = self.conv(conv4 , 3 , 3 , 1024 , 1 , 1 , name='conv5')
        
        conv6 = self.conv(conv5 , 3 , 3 , 1024 , 1 , 1 , name='conv6')
        max_pooling6 = self.max_pooling(conv6 , 3 , 3 , 3 , 3 , name='pooling6')
        
        #conv7 = self.fcn(max_pooling6 , output_channel = 4096 , name='fcn') #FCN形式 全连接变为卷积形式
        #===
        max_pooling6 = tf.layers.flatten(max_pooling6)
        #===
        
        fc7 = self.fc(max_pooling6 , 4096 , name='fc7')
        fc7 = tf.layers.dropout(fc7 , rate=1. - self.KEEP_PROB)
        
        fc8 = self.fc(fc7 , 1024 , name='fc8')
        fc8 = tf.layers.dropout(fc8 , rate=1. - self.KEEP_PROB)
                
        fc9 = self.fc(fc8 , 4 , name='fc9')
        
        self.logits = fc9
    
    def train(self):
        #训练使用
        loss = tf.reduce_mean( tf.square(tf.subtract(self.y , self.logits)) )
        #构建训练过程
        
        epoch = tf.Variable(initial_value=0 , name='epoch' , trainable=False)
        epoch_add = tf.assign_add(epoch , value=1) #对epoch加1 因为下面的lr需要变化
        
        learning_rate = tf.train.piecewise_constant(epoch , boundaries=[30,50,60,70,80] ,
                                                    values=[0.05,0.025,0.0125,0.00625,0.003125,0.0015625])
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate , momentum=0.6)
        
        train_op = optimizer.minimize(loss)
        
        with tf.Session() as sess:
            
            sess.run(tf.global_variables_initializer())
            
            for i in range(self.EPOCH):
                
                for j in range(train_num // self.BATCH_SIZE):
                    
                    data , labels = next_batch(batch_size=self.BATCH_SIZE)
                                        
                    _ , _loss = sess.run((train_op , loss) , feed_dict={self.X : data, self.y : labels})

                    print(_loss)
                        

                #for k in range(val_num // self.BATCH_SIZE):
                #    
                #    data , labels = next_batch(batch_size=self.BATCH_SIZE , is_training=False)
                #    
                #    _loss = sess.run(loss , feed_dict={self.X : data, self.y : labels})

                #    print('val' , _loss)
    
                sess.run(epoch_add)

    def predict(self):
        pass
    
    def conv(self , x , filter_height , filter_width , output_channel , stride_height , stride_width , name , padding='same'):
        
        return tf.layers.conv2d(x , output_channel , [filter_height , filter_width] , [stride_height , stride_width] , padding=padding ,
                             activation=tf.nn.relu , kernel_initializer = tf.initializers.random_normal(stddev=1e-2) ,
                             bias_initializer = tf.initializers.constant() , kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-5) ,
                             name=name , reuse=tf.AUTO_REUSE)
    
        #input_channel = x.get_shape().as_list()[-1]
        
        #with tf.variable_scope(name) as scope:
            #weights = tf.get_variable(name='weights' , shape=[filter_height , filter_width , input_channel , output_channel] , initializer=tf.random_normal_initializer(0.0 , 1e-2) , regularizer=tf.contrib.layers.l2_regularizer(scale=1e-5))
            #biases = tf.get_variable(name='biases' , shape=[output_channel] , initializer=tf.constant_initializer())
            #
            #conv = tf.nn.conv2d(x , weights , strides=[1 , stride_height , stride_width , 1] , padding=padding)
            #biases = tf.nn.bias_add(conv , biases)
            #
            #relu = tf.nn.relu(biases)
            
            #return relu
            
    #3*3 pooling
    def max_pooling(self , x , pooling_height , pooling_width , stride_height , stride_width  , name , padding='same'):
        #return tf.nn.max_pool(x , [1 , pooling_height , pooling_width , 1] , strides=[1 , stride_height , stride_width , 1] , padding=padding , name=name)
        
        return tf.layers.max_pooling2d(x , [pooling_height , pooling_width] , [stride_height , stride_width] , padding=padding , name=name)
    
    def fc(self , x , output_size , name):
        
        return tf.layers.dense(x , output_size , activation=tf.nn.relu , kernel_initializer=tf.initializers.random_normal(stddev=1e-2) ,
                               bias_initializer = tf.initializers.constant() , kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-5),
                               name = name , reuse=tf.AUTO_REUSE)
        
        #with tf.variable_scope(name) as scope:
        #    weights = tf.get_variable(name='weights' , shape=[input_size , output_size] , initializer=tf.random_normal_initializer())
        #    biases = tf.get_variable(name='biases' , shape=[output_size] , initializer=tf.constant_initializer())
        #    
        #    biases = tf.nn.bias_add(tf.matmul(x , weights) , biases)
        #    
        #    if relu:
        #        return tf.nn.relu(biases)
        #    else:
        #        return biases
        #
    
    #所有卷积层后紧跟的fc层变为卷积层方式
    def fcn(self , x , output_channel , name , padding='same'):
        if self.TYPE == 'accurate':
            return tf.layers.conv2d(x , output_channel , [6 , 6] , [1 , 1] , padding=padding ,
                             activation=tf.nn.relu , kernel_initializer = tf.initializers.random_normal(stddev=1e-2) ,
                             bias_initializer = tf.initializers.constant() , kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-5) ,
                             name=name , reuse=tf.AUTO_REUSE)
        else:
            return tf.layers.conv2d(x , output_channel , [5 , 5] , [1 , 1] , padding=padding ,
                             activation=tf.nn.relu , kernel_initializer = tf.initializers.random_normal(stddev=1e-2) ,
                             bias_initializer = tf.initializers.constant() , kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-5) ,
                             name=name , reuse=tf.AUTO_REUSE)
        
    def batch_norm(self , x , name):
        return tf.layers.batch_normalization(x , axis=-1 , training=self.IS_TRAINING , renorm=True , fused=True , name=name)

In [13]:
overfeat = OverFeat(20)


In [14]:
overfeat.train()

24347.93
26980.455
248441000000000.0
26741.86
26207.928
25236.951
26365.08
24358.262
25823.287
24814.4
25126.688
23460.188
25595.904


KeyboardInterrupt: 