# Simple TFRecord validation

This notebook helps visualize simple stats about a TFRecords dataset to make sure it's formatted properly.

In [1]:
import os
import os.path as op
import tensorflow as tf
import tensorflow_data_validation as tfdv

# Define stats_options to only look at classes. Don't try to load raw image data here
stats_options = tfdv.StatsOptions(feature_whitelist=['image/object/class/text'])

#tfr_fpaths = [op.join(os.environ['DATA_DIR'], 'divot_detect', 'rocks_sample', 'data_rocks_rocks_v2.record')]
tfr_fpaths = [op.join(os.environ['DATA_DIR'], 'divot_detect', 'craters_sample', 'craters_moon_20200701.record')]

In [2]:
def print_example_info(example_dict):
    """Print out some high level information about a TFRecord example"""
    
    
    print(f'\nImage file name: {example_dict["image/filename"].bytes_list.value[0]}')
    print(f'Beginning of image byte str: {example_dict["image/encoded"].bytes_list.value[0][:20]}')
    print(f'Image dimensions (w x h): {example_dict["image/width"].int64_list.value} x '
          f'{example_dict["image/height"].int64_list.value}')

    print(f'Class labels: {example_dict["image/object/class/text"].bytes_list}')

In [3]:
# Loop over each TFRecord filenames
for tfr_fpath in tfr_fpaths:
    print(f'Analyzing {tfr_fpath}')

    # Compute and visualize stats about number of unique classes 
    stats = tfdv.generate_statistics_from_tfrecord(data_location=tfr_fpath, stats_options=stats_options)
    raw_dataset = tf.data.TFRecordDataset(tfr_fpath)

    # This will automatically plot
    viz = tfdv.visualize_statistics(stats)
    
    # Get and print examples from the TFRecord dataset
    tf_iterator = tf.python_io.tf_record_iterator(tfr_fpath)
    example_dicts = [dict(tf.train.Example.FromString(ex).features.feature) 
                     for ex in tf_iterator]
    for example_dict in example_dicts:
        print_example_info(example_dict)




Analyzing /Users/wronk/Data/divot_detect/craters_sample/craters_moon_20200701.record




Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`



Image file name: b'./moon_images/tile_4000_29000.png'
Beginning of image byte str: b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00'
Image dimensions (w x h): [1000] x [1000]
Class labels: 

Image file name: b'./moon_images/tile_4000_27000.png'
Beginning of image byte str: b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00'
Image dimensions (w x h): [1000] x [1000]
Class labels: 

Image file name: b'./moon_images/tile_4000_20000.png'
Beginning of image byte str: b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00'
Image dimensions (w x h): [1000] x [1000]
Class labels: 

Image file name: b'./moon_images/tile_2000_19000.png'
Beginning of image byte str: b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00'
Image dimensions (w x h): [1000] x [1000]
Class labels: 

Image file name: b'./moon_images/tile_2000_16000.png'
Beginning of image byte str: b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x