## Reading data into tensorflow graph

## resources
- [tf doc - reading data](https://www.tensorflow.org/how_tos/reading_data/): three main ways
- [tf doc - using queues and runners in multithreading](https://www.tensorflow.org/how_tos/threading_and_queues/): underlying mechanism for queues
- [tutorial part1](https://indico.io/blog/tensorflow-data-inputs-part1-placeholders-protobufs-queues/)
- [tutorial part2](https://indico.io/blog/tensorflow-data-input-part2-extensions/)


## get iris data
https://archive.ics.uci.edu/ml/datasets/Iris

In [4]:
import tensorflow as tf
import numpy as np
from os import path

In [5]:
# Record the TensorFlow version: the cells below use the 0.12-era API
# (tf.pack, tf.to_int32, queue runners), which later releases renamed or removed.
tf.__version__

'0.12.1'

### get data batch from csv

In [6]:
# Peek at the raw file: each row is four comma-separated floats followed by the
# species name, and the very first line is already data (there is no header row).
!head -n  5 ../../data/iris.data

5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa


In [27]:
def get_data_batch(csv_file, batch_size=64, skip_header_lines=0):
    """Build a queue-based input pipeline yielding shuffled batches from a CSV.

    Each row of the file is expected to hold four float features followed by
    the species name as a string.

    Args:
        csv_file: path of the CSV file to read.
        batch_size: number of examples in every dequeued batch.
        skip_header_lines: number of leading lines to drop each time the file
            is opened. Defaults to 0 because iris.data starts directly with a
            data row; skipping one line would silently discard a sample.

    Returns:
        A pair ``(batch_x, batch_y)`` of symbolic tensors: ``batch_x`` has
        shape ``[batch_size, 4]`` (features) and ``batch_y`` has shape
        ``[batch_size]`` (class index: 0 setosa, 1 versicolor, 2 virginica).
        Queue runners must be started before these tensors can be evaluated.
    """
    # every file-based pipeline starts from a queue that hands out file names
    files = tf.train.string_input_producer([csv_file])
    # the reader turns the file queue into a symbolic "next line" tensor
    reader = tf.TextLineReader(skip_header_lines=skip_header_lines)
    _, row = reader.read(files)
    # record_defaults fixes each column's type and the replacement for a
    # missing value: floats for the four features, a string for the label
    default_row = [[0.0], [0.0], [0.0], [0.0], [""]]
    parsed_row = tf.decode_csv(row, record_defaults=default_row)
    x = tf.pack(parsed_row[:4])
    label = parsed_row[4]
    # one-hot encode the species name by comparison, then take its index.
    # NOTE(review): a name matching none of the three gives an all-zero vector,
    # for which argmax presumably returns 0 (i.e. setosa) -- confirm if the
    # input may contain other labels.
    y = tf.argmax(tf.to_int32(tf.pack([
        tf.equal(label, "Iris-setosa"),
        tf.equal(label, "Iris-versicolor"),
        tf.equal(label, "Iris-virginica")
    ])), 0)
    # buffer single examples into shuffled batches
    batch_x, batch_y = tf.train.shuffle_batch([x, y], batch_size=batch_size,
                                              capacity=batch_size*5,
                                              min_after_dequeue=2*batch_size)
    return batch_x, batch_y

In [28]:
# Smoke test: pull one batch through the input pipeline and print its shape.
# The pipeline is built in a private graph so that re-running this cell (or
# running later cells) does not accumulate ops and queue runners on the
# global default graph.
with tf.Graph().as_default():
    bx, by = get_data_batch(path.abspath("../../data/iris.data"))

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            bbx, bby = sess.run([bx, by])
            print(bbx.shape, bby.shape)
        finally:
            # always stop and join the reader threads, even if sess.run raised
            coord.request_stop()
            coord.join(threads)

(64, 4) (64,)


## build a simple model to test it

In [43]:
def build_logistic_regression(x, y, n_features=4, n_classes=3, learning_rate=5e-3):
    """Add a softmax (multinomial logistic) regression model to the graph.

    Args:
        x: float feature tensor with ``n_features`` columns.
        y: integer class-index tensor, one entry per row of ``x``.
        n_features: number of input columns (4 for the iris features).
        n_classes: number of target classes (3 iris species).
        learning_rate: step size handed to the Adam optimizer.

    Returns:
        ``(train_op, test_op, predict_op)``: the optimizer step, the mean
        accuracy of the predictions against ``y``, and the predicted class
        index for each row.
    """
    w = tf.Variable(tf.zeros([n_features, n_classes]), dtype=tf.float32)
    b = tf.Variable(tf.zeros([n_classes]), dtype=tf.float32)
    logits = tf.matmul(x, w) + b
    # softmax is monotonic, so the arg-max of the logits is already the
    # arg-max of the class probabilities; no explicit softmax op is needed
    label_hat = tf.argmax(logits, 1)
    # keyword arguments keep the call valid on releases that reject positional
    # use; reduce the per-example losses to one scalar objective
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))

    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    test_op = tf.reduce_mean(tf.to_float(tf.equal(label_hat, y)))
    predict_op = label_hat

    return train_op, test_op, predict_op

In [44]:
# Build the whole model in a private graph: start_queue_runners launches every
# queue runner registered on the current graph, so sharing the global default
# graph would also start runners left behind by earlier cells or earlier runs.
with tf.Graph().as_default():
    # input pipeline: symbolic feature / label batches
    x, y = get_data_batch(path.abspath("../../data/iris.data"))
    # model ops wired onto the pipeline
    train_op, test_op, predict_op = build_logistic_regression(x, y)
    init_op = tf.global_variables_initializer()

    with tf.Session() as sess:
        # initialize the model variables
        sess.run(init_op)

        # start the reader threads that feed the data queues
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            # train the model
            for i in range(1000):
                sess.run(train_op)
                if i % 100 == 0:
                    # each run dequeues a fresh batch, so this is the accuracy
                    # on a batch of training data, not on a held-out set
                    print(sess.run(test_op))
        finally:
            # stop the data queue and gracefully wait for every thread,
            # even when training was interrupted by an error
            coord.request_stop()
            coord.join(threads)

0.140625
0.75
0.890625
0.921875
1.0
0.984375
0.96875
1.0
1.0
0.953125
