# Template for Preparing Data as TFRecords
TFRecord (*.tfrecord) is the recommended format for loading data into TensorFlow models.

This template shows how to save data as tfrecord files.

Relevant TensorFlow Docs
<ul>
    <li><a href='https://www.tensorflow.org/api_docs/python/tf/train/Int64List'>Int64List</a> and <a href='https://www.tensorflow.org/api_docs/python/tf/train/BytesList'>BytesList</a> and <a href='https://www.tensorflow.org/api_docs/python/tf/train/FloatList'>FloatList</a></li>
    <li><a href='https://www.tensorflow.org/versions/master/api_docs/python/tf/train/Feature'>Feature</a></li>
    <li><a href='https://www.tensorflow.org/versions/master/api_docs/python/tf/train/Example'>Example</a></li>
    <li><a href='https://www.tensorflow.org/api_docs/python/tf/python_io/TFRecordWriter'>TFRecordWriter</a></li>
    
</ul>

This template is based on the blog post tutorial <a href='http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html'>How to write into and read from a TFRecords file in TensorFlow</a> by Machine Learning Guru, and on the TensorFlow example script <a href='https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/how_tos/reading_data/convert_to_records.py'>convert_to_records.py</a>.

In [1]:
import tensorflow as tf

# Helper functions to create features with specified data type
def int64_feature(value):
    """Wrap a single integer `value` in a tf.train.Feature backed by an Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def bytes_feature(value):
    """Wrap a single bytes `value` in a tf.train.Feature backed by a BytesList."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def float_feature(value):
    """Wrap a single float `value` in a tf.train.Feature backed by a FloatList.

    The FloatList must be passed through the `float_list` field of Feature;
    handing it to `bytes_list` (as this helper previously did) puts a message
    of the wrong type into the field.
    """
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

In [2]:
def create_tfrecord(filename, examples, parse_fn):
    ''' Create a TFRecords file.

        inputs:
            filename - name of TFRecord file, <filename>.tfrecord
            examples - python dictionary of examples to write to <filename>.tfrecord
                     - dictionary keys can be different attributes of example, such
                       as the 3D image and a label, {'images': ..., 'labels': ...}
                     - the number of examples is read from .shape[0] of the first
                       value in the dictionary
            parse_fn - function that parses the example numbered index from examples
                       and returns its dictionary of features ready to be passed
                       to tf.train.Features(feature=...)
    '''
    first_key = list(examples.keys())[0]
    num_examples = examples[first_key].shape[0]

    # The context manager closes the writer even if parse_fn or a write fails,
    # so the file handle is released when the function returns.
    with tf.python_io.TFRecordWriter(filename) as writer:
        # Write each data exemplar to the TFRecord file
        for e in range(num_examples):
            # Parse this example
            features = parse_fn(examples, e)

            # Create Example out of Features
            example = tf.train.Example(features=tf.train.Features(feature=features))

            # Write Example to TFRecord file
            writer.write(example.SerializeToString())

            # Monitor progress: report every 1000 examples, skipping index 0
            if e != 0 and e % 1000 == 0:
                print(str(e) + ' examples written to ' + str(filename) + '.')

In [3]:
def parse(examples, index):
    ''' Parses the example numbered index from examples and returns its dictionary
            of features ready to be passed to tf.train.Features(feature=...)

        inputs:
            examples - dictionary of each example where
                examples['key'] is a list of some attribute for each example, such as 'image' or 'label'
                examples['key'][i] is the value of 'key' for example i
                this parser reads the 'images' and 'labels' keys
            index - the example to parse

        outputs:
            features - dictionary of features for the index element of the dataset,
                       with keys 'height', 'width', 'depth', 'label' and 'image_raw'
    '''
    image = examples['images'][index]
    label = examples['labels'][index]
    shape = image.shape  # first three entries are stored as height, width, depth

    features = {
        'height': int64_feature(shape[0]),
        'width': int64_feature(shape[1]),
        'depth': int64_feature(shape[2]),
        'label': int64_feature(int(label)),
        # tobytes() yields the same raw buffer as the deprecated tostring()
        # alias, which newer NumPy releases no longer provide.
        'image_raw': bytes_feature(image.tobytes())
    }

    return features
    


In [4]:
# Gather Data

##########################
###   Your Code Here   ###
##########################

from tensorflow.contrib.learn.python.learn.datasets import mnist

# Read MNIST from the local "MNIST/" directory as uint8 data, without
# flattening the images, and hold out 1000 examples for validation.
data_sets = mnist.read_data_sets(
    "MNIST/", dtype=tf.uint8, reshape=False, validation_size=1000)

# Training split: only the first 10000 examples are kept.
train_split = data_sets.train
train_images = train_split.images[:10000]
train_labels = train_split.labels[:10000]
print('Training images: ' + str(train_images.shape) + ' ' + str(train_images.dtype))
print('Training labels: ' + str(train_labels.shape) + ' ' + str(train_labels.dtype))
train_data = dict(images=train_images, labels=train_labels)

# Validation split: used in full.
val_split = data_sets.validation
val_images, val_labels = val_split.images, val_split.labels
print('Validation images: ' + str(val_images.shape) + ' ' + str(val_images.dtype))
print('Validation labels: ' + str(val_labels.shape) + ' ' + str(val_labels.dtype))
val_data = dict(images=val_images, labels=val_labels)


Extracting MNIST/train-images-idx3-ubyte.gz
Extracting MNIST/train-labels-idx1-ubyte.gz
Extracting MNIST/t10k-images-idx3-ubyte.gz
Extracting MNIST/t10k-labels-idx1-ubyte.gz
Training images: (10000, 28, 28, 1) uint8
Training labels: (10000,) uint8
Validation images: (1000, 28, 28, 1) uint8
Validation labels: (1000,) uint8


In [5]:
# Output paths for the training and validation TFRecord files.
train_file, val_file = 'mnist_train', 'mnist_val'

In [6]:
# Serialise each split to its own TFRecord file, logging before and after.
for start_msg, done_msg, tfrecord_path, split_data in (
        ('Creating train TFRecord ', 'Created train TFRecord ', train_file, train_data),
        ('Creating validation TFRecord ', 'Created validation TFRecord ', val_file, val_data)):
    print(start_msg + str(tfrecord_path))
    create_tfrecord(tfrecord_path, split_data, parse)
    print(done_msg + str(tfrecord_path) + '\n')

Creating train TFRecord mnist_train
1000 examples written to mnist_train.
2000 examples written to mnist_train.
3000 examples written to mnist_train.
4000 examples written to mnist_train.
5000 examples written to mnist_train.
6000 examples written to mnist_train.
7000 examples written to mnist_train.
8000 examples written to mnist_train.
9000 examples written to mnist_train.
Created train TFRecord mnist_train

Creating validation TFRecord mnist_val
Created validation TFRecord mnist_val

