## 1. import packages

In [5]:
# coding: utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import os
import numpy as np
import tensorflow as tf
import matplotlib

matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline

## 2.  obtain the train and val and test images/labels list

In [6]:

def get_files(file_dir, test_ratio, val_ratio):
    """
    说明：对整体的数据集进行训练集、验证集、测试集的划分，
         最终返回这3个集合对应的图像以及标签列表
    :type file_dir: str 总体数据的路径
    :type test_ratio: float [0, 1]
    :type val_ratio: float [0, 1]
    :returns (1)the list of train's images and train's labels
             (2)the list of val's images and val's labels
             (3)the list of test's images and test's labels
    """
    baidus = []  # 保存所有baidu图像所在的位置
    label_baidus = []  # 保存所有baidu图像的标签
    sogous = []
    label_sogous = []

    for file in os.listdir(file_dir):  # file_dir = datasets
        # print(file)
        if file.startswith('baidu'):
            file_list = os.listdir(os.path.join(file_dir, file))

            for subfile in file_list:  # baidu对应正确的类：1
                baidus.append(os.path.join(file_dir, file, subfile))  # 将图片的路径加入到列表中
                label_baidus.append(1)  # 添加对应的标签

        elif file.startswith('sogou'):
            file_list = os.listdir(os.path.join(file_dir, file))
            for subfile in file_list:  # sogou对应正确的类：0
                sogous.append(os.path.join(file_dir, file, subfile))
                label_sogous.append(0)

    print('There are %d baidu\nThere are %d sogou' % (len(baidus), len(sogous)))

    # Stack arrays in sequence horizontally (column wise).
    # 按顺序堆叠数组（按列方式）把baidu,sogou合起来组成一个list(img and label)
    image_list = np.hstack((baidus, sogous))
    label_list = np.hstack((label_baidus, label_sogous))
    # print("type(image_list): ", type(image_list))  # <class 'numpy.ndarray'>
    # print("image_list.dtype: ", image_list.dtype)
    # print("image_list.shape: ", image_list.shape)  # (38,)
    # print("image_list: ", image_list)

    # 利用shuffle打乱顺序
    temp = np.array([image_list, label_list])
    temp = temp.transpose()
    np.random.shuffle(temp)
    # print(temp.shape)  # (38, 2)

    # 从打乱的temp中再取出list(img和label)
    all_image_list = temp[:, 0]
    all_label_list = temp[:, 1]

    # number of all samples
    n_sample = len(all_label_list)
    # number of testing samples，return the upper boundary(int)
    n_test = math.ceil(n_sample * test_ratio)
    # number of val samples
    n_val = math.ceil((n_sample - n_test) * val_ratio)
    # number of training samples
    n_train = n_sample - n_test - n_val

    tra_images = all_image_list[0:n_train]
    tra_labels = all_label_list[0:n_train]
    tra_labels = [int(float(i)) for i in tra_labels]

    val_images = all_image_list[n_train:n_train + n_val]
    val_labels = all_label_list[n_train:n_train + n_val]
    val_labels = [int(float(i)) for i in val_labels]

    test_images = all_image_list[n_train + n_val:]
    test_labels = all_label_list[n_train + n_val:]
    test_labels = [int(float(i)) for i in test_labels]

    # print(np.array(tra_images).shape)  # (30, )
    # print(np.array(tra_labels).shape)  # (30, )
    # print(np.array(val_images).shape)  # (7, )
    # print(np.array(val_labels).shape)  # (7, )
    # print(np.array(test_images).shape)  # (7, )
    # print(np.array(test_labels).shape)  # (7, )
    return tra_images, tra_labels, val_images, val_labels, test_images, test_labels


## 3.  get batches

In [9]:

def get_batch(image, label, image_W, image_H, batch_size, capacity):
    """
    Args:
        image: list type
        label: list type
        image_W: image width
        image_H: image height
        batch_size: batch size
        capacity: the maximum elements in queue
    Returns:
        image_batch: 4D tensor [batch_size, width, height, 3], dtype=tf.float32
        label_batch: 1D tensor [batch_size], dtype=tf.int32
    """
    """
    step1：
        将上面生成的List传入get_batch() ，
        转换类型，产生一个输入队列queue，因为img和lab是分开的，
        所以使用tf.train.slice_input_producer()，
        然后用tf.read_file()从队列中读取图像
    """

    ########################################################
    # step1：
    #   将上面生成的List传入get_batch() ，
    #   转换类型，产生一个输入队列queue，因为img和lab是分开的，
    #   所以使用tf.train.slice_input_producer()，
    #   然后用tf.read_file()
    #   从队列中读取图像
    ########################################################
    image = tf.cast(image, tf.string)
    label = tf.cast(label, tf.int32)

    # make an input queue
    # 输入队列中原始的元素为文件列表中的所有文件
    input_queue = tf.train.slice_input_producer([image, label])

    label = input_queue[1]
    # read img from a queue
    image_contents = tf.read_file(input_queue[0])

    ########################################################
    # step2:将图像解码，不同类型的图像不能混在一起
    ########################################################
    image = tf.image.decode_jpeg(image_contents, channels=3)

    ########################################################
    # step3: data argumentation should go to here
    ########################################################
    image = tf.image.resize_image_with_crop_or_pad(image, image_W, image_H)
    # if you want to test the generated batches of images, you might want to comment the following line.

    # 如果想看到正常的图片，请注释掉111行（标准化）和 130行（image_batch = tf.cast(image_batch, tf.float32)）
    # 训练时，不要注释掉！
    image = tf.image.per_image_standardization(image)

    ########################################################
    # step4: 生成batch
    ########################################################
    image_batch, label_batch = tf.train.batch([image, label],
                                              batch_size=batch_size,
                                              num_threads=64,
                                              capacity=capacity)
    # 重新排列label，行数为[batch_size]
    label_batch = tf.reshape(label_batch, [batch_size])
    image_batch = tf.cast(image_batch, tf.float32)
    # print(image_batch.shape, label_batch.shape)  # (2, 208, 208, 3) (2,)
    # print(image_batch.dtype, label_batch.dtype)  # <dtype: 'float32'> <dtype: 'int32'>
    return image_batch, label_batch

## 4. testing

In [10]:
"""
if __name__ == '__main__':
    BATCH_SIZE = 2
    CAPACITY = 100 + 3 * BATCH_SIZE
    IMG_W = 256
    IMG_H = 256

    train_dir = r'C:\Users\xudongmei\Desktop\bak\bak_crop'
    test_ratio = 0.2
    val_ratio = 0.2

    tra_images, tra_labels, val_images, val_labels, test_images, test_labels \
        = get_files(train_dir, test_ratio, val_ratio)
    tra_images_batch, tra_labels_batch = get_batch(tra_images, tra_labels, IMG_W, IMG_H, BATCH_SIZE, CAPACITY)
    val_images_batch, val_labels_batch = get_batch(val_images, val_labels, IMG_W, IMG_H, BATCH_SIZE, CAPACITY)
    test_images_batch, test_labels_batch = get_batch(test_images, test_labels, IMG_W, IMG_H, BATCH_SIZE, CAPACITY)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        i = 0
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop() and i < 2:
                tra_img, tra_label = sess.run([tra_images_batch, tra_labels_batch])
                for i in range(BATCH_SIZE):
                    print("label: %d" % tra_label[i])
                    plt.imshow(tra_img[i, :, :, :])
                    plt.show()
                i += 1
        except tf.errors.OutOfRangeError:
            print('done!')

        finally:
            coord.request_stop()
        coord.join(threads)
"""

There are 21 baidu
There are 21 sogou
(26,)
(26,)
(7,)
(7,)
(9,)
(9,)
label: 1


ValueError: Floating point image RGB values must be in the 0..1 range.

<matplotlib.figure.Figure at 0x10a1d748>

label: 1


ValueError: Floating point image RGB values must be in the 0..1 range.

<matplotlib.figure.Figure at 0x10f97748>