# Reading Data

In [1]:
from __future__ import print_function

import os
import re

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
%matplotlib inline

  from ._conv import register_converters as _register_converters


In [71]:
# Attribution string: an author handle followed by a repository URL.
author = "kyubyong. https://github.com/Kyubyong/tensorflow-exercises"

NOTE on notation

    _x, _y, _z, _X, _Y, _Z, ...: NumPy arrays
    x, y, z, X, Y, Z, ...: Tensors


## Placeholder

In [2]:
# Make data and save to npz.
# Each of the 100 rows is a random permutation of the digits 0..9. The first
# nine entries of a row become the input `_x`; the last entry is the target
# `_y`, i.e. the single digit that the input row does not contain.
# A dedicated, seeded RandomState makes the dataset identical on every
# "Restart & Run All" without touching NumPy's global random state.
rng = np.random.RandomState(42)
_x = np.zeros((100, 10), np.int32)
for i in range(100):
    _x[i] = rng.permutation(10)
_x, _y = _x[:, :-1], _x[:, -1]

# Create the output directory only when it is missing, then persist both arrays
# under the keys "_x" and "_y".
if not os.path.isdir('example'):
    os.makedirs('example')
np.savez('example/example.npz', _x=_x, _y=_y)

In [6]:
# Load data: read back the two arrays stored in the npz archive.
data = np.load('example/example.npz')
_x = data["_x"]
_y = data["_y"]

# Q1. Make a placeholder for x such that it should be of dtype=int32, shape=(None, 9).
# Inputs and targets
x_pl = tf.placeholder(shape=(None, 9), dtype=np.int32)
# The digits 0..9 sum to 45, so 45 minus the sum of the nine digits in a row
# is the one digit that row does not contain.
row_sum = tf.reduce_sum(x_pl, axis=1)
y_hat = 45 - row_sum

# Session: feed the whole array at once and show the first 30 predictions
# next to the first 30 true targets.
with tf.Session() as sess:
    feed = {x_pl: _x}
    _y_hat = sess.run(y_hat, feed_dict=feed)
    print("y_hat =", _y_hat[:30])
    print("true y =", _y[:30])

y_hat = [5 0 6 0 7 6 8 0 7 4 1 6 0 4 1 4 4 0 7 2 7 8 4 3 9 6 0 5 5 3]
true y = [5 0 6 0 7 6 8 0 7 4 1 6 0 4 1 4 4 0 7 2 7 8 4 3 9 6 0 5 5 3]


## TFRecord

In [11]:
tf.reset_default_graph()

# Load data
data = np.load('example/example.npz')
_x = data["_x"]
_y = data["_y"]

# Serialize: write one tf.train.Example per (input row, target) pair.
with tf.python_io.TFRecordWriter("example/tfrecord") as fout:
    for _row, _label in zip(_x, _y):
        ex = tf.train.Example()

        # Q2. Add each value to ex.
        # Layout of the populated features (values are from one sample row):
        #
        #   feature {
        #     key: "x"
        #     value {
        #       int64_list {
        #         value: 5
        #         value: 4
        #         value: 6
        #         value: 2
        #         value: 0
        #         value: 1
        #         value: 7
        #         value: 3
        #         value: 8
        #       }
        #     }
        #   }
        #   feature {
        #     key: "y"
        #     value {
        #       int64_list {
        #         value: 9
        #       }
        #     }
        #   }
        feature_map = ex.features.feature
        feature_map['x'].int64_list.value.extend(_row)   # nine input digits
        feature_map['y'].int64_list.value.append(_label)  # single target digit
        fout.write(ex.SerializeToString())

def read_and_decode_single_example(fname, num_epochs=1):
    """Build ops that read and parse one example at a time from a TFRecord file.

    Args:
      fname: Path of the TFRecord file to read.
      num_epochs: Number of passes the filename queue makes over `fname`
        before reads raise `tf.errors.OutOfRangeError`. Defaults to 1, the
        value that used to be hard-coded.

    Returns:
      A tuple `(x, y)` of int64 tensors parsed from a single serialized
      example: `x` has shape (9,) and `y` has shape (1,).

    Note:
      When `num_epochs` is set, callers must run
      `tf.local_variables_initializer()` and start the queue runners before
      evaluating the returned tensors.
    """
    # Create a string queue holding the file name(s) to read.
    fname_q = tf.train.string_input_producer(
        [fname], num_epochs=num_epochs, shuffle=True)
    # Q3. Create a TFRecordReader
    reader = tf.TFRecordReader()

    # Read the string queue; the returned record key is not needed.
    _, serialized_example = reader.read(fname_q)
    # Q4. Describe parsing syntax
    features = tf.parse_single_example(
        serialized_example,
        features={"x": tf.FixedLenFeature([9], tf.int64),
                  "y": tf.FixedLenFeature([1], tf.int64)}
        )
    # Output
    x = features['x']  # Tensor, shape=(9,), dtype=int64
    y = features['y']  # Tensor, shape=(1,), dtype=int64
    return x, y

# Ops
# x: Tensor of shape (9,), y: Tensor of shape (1,), both int64.
x, y = read_and_decode_single_example('example/tfrecord')
# y_hat: scalar int64 Tensor ("sub:0"), 45 minus the sum of the nine digits in x.
y_hat = 45 - tf.reduce_sum(x)

# Session
with tf.Session() as sess:
    #Q5. Initialize local variables
    sess.run(tf.local_variables_initializer())

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    fetches = [y, y_hat]
    try:
        # Pull one example per step until the coordinator asks to stop or the
        # input queue signals that it is exhausted.
        while True:
            if coord.should_stop():
                break
            _y, _y_hat = sess.run(fetches)
            print(_y[0], "==", _y_hat, end="; ")

    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()

    # Wait for threads to finish.
    coord.join(threads)

5 == 5; 0 == 0; 6 == 6; 0 == 0; 7 == 7; 6 == 6; 8 == 8; 0 == 0; 7 == 7; 4 == 4; 1 == 1; 6 == 6; 0 == 0; 4 == 4; 1 == 1; 4 == 4; 4 == 4; 0 == 0; 7 == 7; 2 == 2; 7 == 7; 8 == 8; 4 == 4; 3 == 3; 9 == 9; 6 == 6; 0 == 0; 5 == 5; 5 == 5; 3 == 3; 8 == 8; 1 == 1; 8 == 8; 2 == 2; 8 == 8; 5 == 5; 4 == 4; 9 == 9; 2 == 2; 1 == 1; 1 == 1; 2 == 2; 7 == 7; 9 == 9; 8 == 8; 2 == 2; 7 == 7; 3 == 3; 1 == 1; 0 == 0; 8 == 8; 6 == 6; 2 == 2; 1 == 1; 5 == 5; 1 == 1; 3 == 3; 7 == 7; 7 == 7; 9 == 9; 8 == 8; 1 == 1; 0 == 0; 1 == 1; 4 == 4; 7 == 7; 3 == 3; 8 == 8; 5 == 5; 6 == 6; 3 == 3; 5 == 5; 3 == 3; 8 == 8; 4 == 4; 9 == 9; 0 == 0; 3 == 3; 5 == 5; 8 == 8; 6 == 6; 4 == 4; 1 == 1; 6 == 6; 5 == 5; 5 == 5; 9 == 9; 5 == 5; 8 == 8; 6 == 6; 4 == 4; 2 == 2; 9 == 9; 7 == 7; 2 == 2; 9 == 9; 8 == 8; 2 == 2; 7 == 7; 9 == 9; Done training -- epoch limit reached


## Queue

In [17]:
tf.reset_default_graph()

# Load data
data = np.load('example/example.npz')
_x = data["_x"]
_y = data["_y"]

# Hyperparams
batch_size = 10  # Size of each mini-batch that is fed.
num_epochs = 2   # Number of epochs the data is fed for.

# Convert to tensors
x, y = tf.convert_to_tensor(_x), tf.convert_to_tensor(_y)

# Q6. Make slice queues
x_q, y_q = tf.train.slice_input_producer([x, y], num_epochs=num_epochs)

# Batching
x_batch, y_batch = tf.train.batch(tensors=[x_q, y_q], batch_size=batch_size)

# Targets
# y_hat, y_batch -> shape(10,)
batch_row_sum = tf.reduce_sum(x_batch, axis=1)
y_hat = 45 - batch_row_sum

# Session
with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())

    # Q7. Make a train.Coordinator and threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    fetches = [y_hat, y_batch]

    try:
        # Print prediction/target batches until the coordinator asks to stop
        # or the input queue signals that it is exhausted.
        while True:
            if coord.should_stop():
                break
            _y_hat, _y_batch = sess.run(fetches)
            print(_y_hat, "==", _y_batch)

    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()

    # Wait for threads to finish.
    coord.join(threads)


[5 4 2 0 3 6 6 1 0 7] == [5 4 2 0 3 6 6 1 0 7]
[6 0 0 9 9 2 7 3 7 5] == [6 0 0 9 9 2 7 3 7 5]
[8 9 7 9 4 4 1 3 8 1] == [8 9 7 9 4 4 1 3 8 1]
[8 4 7 8 2 7 8 9 4 0] == [8 4 7 8 2 7 8 9 4 0]
[7 8 2 8 5 2 9 0 1 7] == [7 8 2 8 5 2 9 0 1 7]
[1 8 1 6 4 5 5 6 5 0] == [1 8 1 6 4 5 5 6 5 0]
[9 4 1 3 7 6 2 9 5 8] == [9 4 1 3 7 6 2 9 5 8]
[5 5 3 3 6 3 1 5 5 0] == [5 5 3 3 6 3 1 5 5 0]
[1 8 6 4 7 4 0 8 1 9] == [1 8 6 4 7 4 0 8 1 9]
[2 1 8 8 6 2 4 7 3 2] == [2 1 8 8 6 2 4 7 3 2]
[9 9 1 5 2 1 1 7 4 1] == [9 9 1 5 2 1 1 7 4 1]
[6 7 3 8 6 3 1 5 3 8] == [6 7 3 8 6 3 1 5 3 8]
[8 8 8 5 4 6 1 3 7 5] == [8 8 8 5 4 6 1 3 7 5]
[2 7 1 4 6 7 4 7 6 7] == [2 7 1 4 6 7 4 7 6 7]
[0 1 9 2 5 7 9 4 3 6] == [0 1 9 2 5 7 9 4 3 6]
[0 0 2 0 9 0 2 5 5 5] == [0 0 2 0 9 0 2 5 5 5]
[8 8 6 9 9 2 6 8 7 3] == [8 8 6 9 9 2 6 8 7 3]
[5 8 0 2 4 2 9 8 1 0] == [5 8 0 2 4 2 9 8 1 0]
[0 4 5 8 5 8 4 0 9 4] == [0 4 5 8 5 8 4 0 9 4]
[3 7 4 7 8 1 1 6 2 3] == [3 7 4 7 8 1 1 6 2 3]
Done training -- epoch limit reached


## Read csv files

In [18]:
tf.reset_default_graph()

# Load data and append the target as a tenth column. A new name is used so the
# loaded `_x` is not overwritten and the cell can be re-run safely.
data = np.load('example/example.npz')
_x, _y = data["_x"], data["_y"]
_xy = np.concatenate((_x, np.expand_dims(_y, axis=1)), 1)

# Write to a csv file: one line per row, ten comma-separated integers.
# np.savetxt writes every row regardless of array size, whereas formatting the
# array with np.array_str abbreviates large arrays with "..." and would
# corrupt the file.
np.savetxt('example/example.csv', _xy, fmt='%d', delimiter=',')

# Hyperparams
batch_size = 10
# Defined here so the cell does not rely on a value left over from another cell.
num_epochs = 2
# The csv filename queue below has no epoch limit, so the number of batches to
# draw is computed explicitly: num_epochs passes over the rows.
num_batches = num_epochs * (len(_xy) // batch_size)

# Create a string queue
fname_q = tf.train.string_input_producer(["example/example.csv"])

# Q8. Create a TextLineReader
reader = tf.TextLineReader()

# Read the string queue; each `value` is one line of the csv file.
_, value = reader.read(fname_q)

# Q9. Decode value
# Ten integer columns, each defaulting to 0 when a field is empty.
record_defaults = [[0]] * 10
columns = tf.decode_csv(value, record_defaults=record_defaults)
x = tf.stack(columns[:-1])  # first nine columns: inputs
y = columns[-1]             # last column: target

# Batching
x_batch, y_batch = tf.train.shuffle_batch(
    [x, y], batch_size=batch_size, capacity=200, min_after_dequeue=100)

# Ops
y_hat = 45 - tf.reduce_sum(x_batch, axis=1)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        for _ in range(num_batches):
            _y_hat, _y_batch = sess.run([y_hat, y_batch])
            print(_y_hat, "==", _y_batch)
    finally:
        # Always ask the queue threads to stop, even if a step failed.
        coord.request_stop()

    # Wait for threads to finish.
    coord.join(threads)


[9 8 5 5 7 9 7 0 3 6] == [9 8 5 5 7 9 7 0 3 6]
[0 1 1 3 1 2 6 8 6 7] == [0 1 1 3 1 2 6 8 6 7]
[1 9 4 1 8 0 4 8 9 6] == [1 9 4 1 8 0 4 8 9 6]
[0 6 4 2 7 5 1 0 8 6] == [0 6 4 2 7 5 1 0 8 6]
[7 4 0 5 9 1 9 9 4 5] == [7 4 0 5 9 1 9 9 4 5]
[5 5 5 2 7 4 5 7 8 1] == [5 5 5 2 7 4 5 7 8 1]
[5 3 8 3 4 2 0 8 9 0] == [5 3 8 3 4 2 0 8 9 0]
[7 8 1 5 5 6 9 8 4 2] == [7 8 1 5 5 6 9 8 4 2]
[7 3 8 1 8 6 5 3 1 3] == [7 3 8 1 8 6 5 3 1 3]
[1 6 5 5 6 9 7 6 3 6] == [1 6 5 5 6 9 7 6 3 6]
[8 9 5 8 7 4 5 3 7 0] == [8 9 5 8 7 4 5 3 7 0]
[7 7 3 1 0 3 1 7 6 6] == [7 7 3 1 0 3 1 7 6 6]
[0 2 4 9 9 1 3 9 0 1] == [0 2 4 9 9 1 3 9 0 1]
[2 5 2 6 9 8 8 4 6 3] == [2 5 2 6 9 8 8 4 6 3]
[0 1 5 4 2 4 8 6 7 7] == [0 1 5 4 2 4 8 6 7 7]
[7 2 8 0 7 4 6 8 7 2] == [7 2 8 0 7 4 6 8 7 2]
[7 8 1 0 9 5 8 5 8 7] == [7 8 1 0 9 5 8 5 8 7]
[0 3 5 3 6 2 5 0 1 8] == [0 3 5 3 6 2 5 0 1 8]
[4 7 3 5 8 9 9 6 3 7] == [4 7 3 5 8 9 9 6 3 7]
[4 5 3 1 1 0 2 5 7 9] == [4 5 3 1 1 0 2 5 7 9]


## Read image files

In [23]:
tf.reset_default_graph()

# Hyperparams
batch_size = 10
num_epochs = 1

# Make fake images and save
# The arrays have four channels (RGBA). They are written as PNG because JPEG
# has no alpha channel, and the batch queue below requires every decoded image
# to have shape (10, 10, 4).
# NOTE(review): the original wrote ".jpg" files; whether those decoded to four
# channels depended on the installed matplotlib/Pillow -- confirm if JPEG
# input is specifically required.
for i in range(100):
    _img = np.random.randint(0, 256, size=(10, 10, 4)).astype(np.uint8)
    plt.imsave("example/image_{}.png".format(i), _img)

# Import the image files written above
images = tf.train.match_filenames_once('example/image_*.png')

# Create a string queue
fname_q = tf.train.string_input_producer(images, num_epochs=num_epochs, shuffle=True)

# Q10. Create a WholeFileReader
reader = tf.WholeFileReader()

# Read the string queue; each `value` is the full content of one file.
_, value = reader.read(fname_q)

# Q11. Decode value
img = tf.image.decode_image(value)

# Batching
# `shapes` lists one shape per batched tensor; it is given explicitly here
# instead of being inferred from `img`.
img_batch = tf.train.batch([img], shapes=[[10, 10, 4]], batch_size=batch_size)

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    num_samples = 0
    try:
        while not coord.should_stop():
            sess.run(img_batch)
            num_samples += batch_size
            print(num_samples, "samples have been seen")

    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()

    # Wait for threads to finish.
    coord.join(threads)

10 samples have been seen
20 samples have been seen
30 samples have been seen
40 samples have been seen
50 samples have been seen
60 samples have been seen
70 samples have been seen
80 samples have been seen
90 samples have been seen
100 samples have been seen
Done training -- epoch limit reached
