# Dataset

In [4]:
from __future__ import print_function
import os
import tensorflow as tf
import numpy as np
from tensorflow.contrib.learn.python.learn.datasets import mnist

In [5]:
dataset1 = tf.data.Dataset.from_tensor_slices(tf.random_uniform([4, 10]))
print(dataset1.output_types)  # ==> "tf.float32"
print(dataset1.output_shapes)  # ==> "(10,)"

dataset2 = tf.data.Dataset.from_tensor_slices(
   (tf.random_uniform([4]),
    tf.random_uniform([4, 100], maxval=100, dtype=tf.int32)))
print(dataset2.output_types)  # ==> "(tf.float32, tf.int32)"
print(dataset2.output_shapes)  # ==> "((), (100,))"

dataset3 = tf.data.Dataset.zip((dataset1, dataset2))
print(dataset3.output_types)  # ==> (tf.float32, (tf.float32, tf.int32))
print(dataset3.output_shapes)  # ==> "(10, ((), (100,)))"


<dtype: 'float32'>
(10,)
(tf.float32, tf.int32)
(TensorShape([]), TensorShape([Dimension(100)]))
(tf.float32, (tf.float32, tf.int32))
(TensorShape([Dimension(10)]), (TensorShape([]), TensorShape([Dimension(100)])))


In [6]:
dataset = tf.data.Dataset.from_tensor_slices(
   {"a": tf.random_uniform([4]),
    "b": tf.random_uniform([4, 100], maxval=100, dtype=tf.int32)})
print(dataset.output_types)  # ==> "{'a': tf.float32, 'b': tf.int32}"
print(dataset.output_shapes)  # ==> "{'a': (), 'b': (100,)}"


{'a': tf.float32, 'b': tf.int32}
{'a': TensorShape([]), 'b': TensorShape([Dimension(100)])}


## One-hot Iterator

In [7]:
sess=tf.InteractiveSession()

In [8]:
dataset = tf.data.Dataset.range(100)
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()

for i in range(100):
  value = sess.run(next_element)
  assert i == value

## initializable Iterator

In [9]:
max_value = tf.placeholder(tf.int64, shape=[])
dataset = tf.data.Dataset.range(max_value)
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

sess.run(iterator.initializer, feed_dict={max_value: 10})
for i in range(10):
  value = sess.run(next_element)
  assert i == value

sess.run(iterator.initializer, feed_dict={max_value: 100})
for i in range(100):
  value = sess.run(next_element)
  assert i == value

## reinitializable Itarator

In [10]:
# 定义训练集和验证集
training_dataset = tf.data.Dataset.range(100).map(
    lambda x: x + tf.random_uniform([], -10, 10, tf.int64))
validation_dataset = tf.data.Dataset.range(50)

# reinitializable的iterator需要定义它的结构，我们使用训练集的结构来定义它。当然也要求验证集是同样的结构。
iterator = tf.data.Iterator.from_structure(training_dataset.output_types,
                                           training_dataset.output_shapes)
next_element = iterator.get_next()

training_init_op = iterator.make_initializer(training_dataset)
validation_init_op = iterator.make_initializer(validation_dataset)

# 运行20个Epoch
for _ in range(20):
  # 首先用训练集来初始化Iterator
  sess.run(training_init_op)
  for _ in range(100):
    sess.run(next_element)

  # 再使用验证集来初始化Iterator
  sess.run(validation_init_op)
  for _ in range(50):
    sess.run(next_element)


## feedable Iterator

In [11]:
# 定义训练集和验证集
# 这里的训练集后面使用了repeate，因此是可以无限词迭代的。
training_dataset = tf.data.Dataset.range(100).map(
    lambda x: x + tf.random_uniform([], -10, 10, tf.int64)).repeat()
validation_dataset = tf.data.Dataset.range(50)

# feedable iterator除了需要Dataset的结构外还需要一个string的placeholder
handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(
    handle, training_dataset.output_types, training_dataset.output_shapes)
next_element = iterator.get_next()

# reinitializable只有一个Iterator，通过session.run(init_op)来切换不同数据集
# 而feedable Iterator每个数据集有自己的Iterator，这样更加灵活，比如训练集我们是one-hot的Iterator
# 而验证集使用initializable的Iterator
training_iterator = training_dataset.make_one_shot_iterator()
validation_iterator = validation_dataset.make_initializable_iterator()

# 训练Iterator和验证Iterator都有一个string的handle，我们需要得到它，从而可以feed给feedable Iterator
training_handle = sess.run(training_iterator.string_handle())
validation_handle = sess.run(validation_iterator.string_handle())

# 无限的循环，需要自己interrupt它
while True:
  # 训练集上运行200次训练。注意训练dataset是用repeat得到的可遍历无限次的Iteartor
  # 因此下一次训练会接着从前面数据 
  for _ in range(200):
    sess.run(next_element, feed_dict={handle: training_handle})

  # 验证集需要我们初始化
  sess.run(validation_iterator.initializer)
  for _ in range(50):
    sess.run(next_element, feed_dict={handle: validation_handle})


KeyboardInterrupt: 

## get_next()

In [None]:
dataset = tf.data.Dataset.range(5)
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()
 
result = tf.add(next_element, next_element)

sess.run(iterator.initializer)
print(sess.run(result))  # ==> "0"
print(sess.run(result))  # ==> "2"
print(sess.run(result))  # ==> "4"
print(sess.run(result))  # ==> "6"
print(sess.run(result))  # ==> "8"
try:
  sess.run(result)
except tf.errors.OutOfRangeError:
  print("End of dataset")  # ==> "End of dataset"






In [None]:
dataset = tf.data.Dataset.range(5)
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()
 
result = tf.add(next_element, next_element)
sess.run(iterator.initializer)
while True:
  try:
    print(sess.run(result))
  except tf.errors.OutOfRangeError:
    break

In [None]:
dataset1 = tf.data.Dataset.from_tensor_slices(tf.random_uniform([4, 10]))
dataset2 = tf.data.Dataset.from_tensor_slices((tf.random_uniform([4]), tf.random_uniform([4, 100])))
dataset3 = tf.data.Dataset.zip((dataset1, dataset2))

iterator = dataset3.make_initializable_iterator()

sess.run(iterator.initializer)
next1, (next2, next3) = iterator.get_next()


## 使用dataset api来读取tfrecord数据并进行训练

生成tfrecord文件，和前面一样

In [None]:


save_dir = "../data"
# Download data to save_dir
data_sets = mnist.read_data_sets(save_dir,
                                 dtype=tf.uint8,
                                 reshape=False,
                                 validation_size=1000)

data_splits = ["train", "test", "validation"]
for d in range(len(data_splits)):
    print("saving " + data_splits[d])
    data_set = data_sets[d]
    filename = os.path.join(save_dir, data_splits[d] + '.tfrecords')
    writer = tf.python_io.TFRecordWriter(filename)
    print(data_set.images[0].shape, data_set.images[0].dtype)
    for index in range(data_set.images.shape[0]):
        image = data_set.images[index].tostring()
        # image = data_set.images[index].reshape(-1).astype(np.int32)
        example = tf.train.Example(features=tf.train.Features(feature={
            'height': tf.train.Feature(int64_list=tf.train.Int64List(value=
                                                                     [data_set.images.shape[1]])),
            'width': tf.train.Feature(int64_list=tf.train.Int64List(value=
                                                                    [data_set.images.shape[2]])),
            'depth': tf.train.Feature(int64_list=tf.train.Int64List(value=
                                                                    [data_set.images.shape[3]])),
            'label': tf.train.Feature(int64_list=tf.train.Int64List(value=
                                                                    [int(data_set.labels[index]), 2])),
        # 为了演示，这里给label多加一个整数2
            'image_raw': tf.train.Feature(bytes_list=tf.train.BytesList(value=
                                                                        [image]))}))
        # 'image_raw': tf.train.Feature(int64_list=tf.train.Int64List(value =
        #        image))}))
        writer.write(example.SerializeToString())
    writer.close()

In [None]:
def _parse_function(example_proto):
    features = {"image_raw": tf.FixedLenFeature([], tf.string),
                "label": tf.FixedLenFeature([2], tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, features)
    image = tf.decode_raw(parsed_features['image_raw'], tf.uint8)
    image.set_shape([784])
    image = tf.cast(image, tf.float32) * (1. / 255) - 0.5

    label = tf.cast(parsed_features['label'][0], tf.int32)
    return image, label


filenames = ["../data/train.tfrecords"]
dataset = tf.data.TFRecordDataset(filenames)
dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=10000)
dataset = dataset.batch(128)
dataset = dataset.repeat(10)
iterator = dataset.make_initializable_iterator()
images_batch, labels_batch = iterator.get_next()

W = tf.get_variable("W", [28 * 28, 10])
y_pred = tf.matmul(images_batch, W)
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y_pred, labels=labels_batch)
loss_mean = tf.reduce_mean(loss)
train_op = tf.train.AdamOptimizer().minimize(loss)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
init = tf.local_variables_initializer()
sess.run(init)
sess.run(iterator.initializer)
step = 0
while True:
    try:
        step += 1
        sess.run([train_op])
        if step % 500 == 0:
            loss_mean_val = sess.run([loss_mean])
            print(step)
            print(loss_mean_val)
    except tf.errors.OutOfRangeError:
        break

sess.close()