In [1]:
# import required modules
import itertools
import time
import zipfile
import shutil
import json
import os
import sys
import logging
import glob
import numpy
import tensorflow as tf
import requests
import random
import logging

  from ._conv import register_converters as _register_converters


In [2]:
random.seed(900)
LOGGER = logging.getLogger()
LOGGER.setLevel(logging.DEBUG)
LOGGER.info("Logger in INFO mode")
LOGGER.debug("Logger in DEBUG mode")
LOGGER.debug("Logger in DEBUG mode")
REQUEST_TIMEOUT = 1.0

In [3]:
logging.basicConfig(level=logging.INFO,
                    format=('%(asctime)s (%(relativeCreated)d) %(levelname)s %(name)s'
                            ' [%(funcName)s:%(lineno)d] %(message)s'),
                    stream=sys.stdout)

In [4]:
path_to_charlie_root = "../../../.."
NOT_A_DAM_IMAGE_DIR = os.path.join(path_to_charlie_root,"data/imagery-6-7-2019_cropped/not_a_dam_images")
DAM_IMAGE_DIR = os.path.join(path_to_charlie_root,"data/imagery-6-7-2019_cropped/dam_images")
if not os.path.exists(NOT_A_DAM_IMAGE_DIR):
    raise ValueError("can't find %s'" % NOT_A_DAM_IMAGE_DIR)
if not os.path.exists(DAM_IMAGE_DIR):
    raise ValueError("can't find %s'" % DAM_IMAGE_DIR)
OUTPUTS_DIR = os.path.join(path_to_charlie_root,"data/TFRecords_imagery_6-7_cropped_made_7-22")
WORKSPACE_DIR = OUTPUTS_DIR

In [5]:
dam_file_list = [os.path.join(DAM_IMAGE_DIR, f)
                 for f in os.listdir(DAM_IMAGE_DIR) if f.endswith('clipped.png')]
not_a_dam_file_list = [os.path.join(NOT_A_DAM_IMAGE_DIR, f)
                       for f in os.listdir(NOT_A_DAM_IMAGE_DIR) if f.endswith('not_a_dam.png')]


In [6]:
len(dam_file_list)

30337

In [7]:
len(not_a_dam_file_list)

5899

In [8]:
all_images_file_list = dam_file_list+not_a_dam_file_list
random.seed(0)
random.shuffle(all_images_file_list)
len(all_images_file_list)

36236

In [9]:
holdout_set_portion = .2
validation_set_portion = .15
Dams_per_round = 1000 # = max_dams_per_record 
def int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def int64_list_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def bytes_list_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
def float_list_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

In [10]:
southaf_json_paths = os.listdir(os.path.join(path_to_charlie_root, 'data/imagery-6-7-2019_Cropped/southaf_set/json'))
if '122774_clipped.json' in southaf_json_paths:
    print('true')
else:
    print('false')

true


In [11]:
def make_TFRecords(dam_file_list,tf_record_iteration):    
    '''Function to make_TFRecords from a list of images_paths'''
    
    print('Starting to make TFRecords %d' % tf_record_iteration)
    
    random.seed(tf_record_iteration)
    random_numbers_list = [random.random() for x in range(0, len(dam_file_list))]
    random_number_iterator = 0
    for image_path in dam_file_list:
        image_string = tf.read_file(image_path)
        image_decoded = tf.image.decode_png(image_string).eval()
        image_string = open(image_path, 'rb').read()
        feature_dict = {
            'image/height': int64_feature(
                image_decoded.shape[0]),
            'image/width': int64_feature(
                image_decoded.shape[1]),
            'image/filename': bytes_feature(
                bytes(image_path, 'utf8')),
            'image/source_id': bytes_feature(
                bytes(image_path, 'utf8')),
            'image/encoded': bytes_feature(image_string),
            'image/format': bytes_feature(b'png'),
        }
        # if this image is a dam:
        json_path = image_path.replace('.png', '.json')
        if not os.path.exists(json_path):
            raise ValueError("can't find %s'" % json_path)
        if not 'not_a_dam' in image_path:
            dam_type = 'dam'
            with open(json_path, 'r') as json_file:
                image_metadata = json.load(json_file)
            xmin = image_metadata['pixel_bounding_box'][0] / float(image_decoded.shape[0])
            xmax = image_metadata['pixel_bounding_box'][2] / float(image_decoded.shape[0])
            ymin = image_metadata['pixel_bounding_box'][3] / float(image_decoded.shape[1])
            ymax = image_metadata['pixel_bounding_box'][1] / float(image_decoded.shape[1])
            if (xmin < 0 or ymin < 0 or xmax > 1 or ymax > 1):
                xmin = max(0, xmin)
                xmax = min(xmax, 1)
                ymin = max(0, ymin)
                ymax = min(ymax, 1)
            feature_dict.update({
                'image/object/bbox/xmin': float_list_feature([xmin]),
                'image/object/bbox/xmax': float_list_feature([xmax]),
                'image/object/bbox/ymin': float_list_feature([ymin]),
                'image/object/bbox/ymax': float_list_feature([ymax]),
                'image/object/class/label': int64_list_feature(
                    [1]),  # the '1' is type 1 which is a dam
                'image/object/class/text': bytes_list_feature(
                    [b'dam']),
            })
            tf_record = tf.train.Example(features=tf.train.Features(
                feature=feature_dict))
        else:
            dam_type = 'not_a_dam'
            with open(json_path, 'r') as json_file:
                image_metadata = json.load(json_file)
            tf_record = tf.train.Example(features=tf.train.Features(
                feature=feature_dict))
        random_number = random_numbers_list[random_number_iterator]
        random_number_iterator+=1
        json_path_basename = os.path.basename(json_path)
        if json_path_basename in southaf_json_paths: # both for dams & not_a_dams
            writer = southaf_writer
            log = southaf_log
        elif random_number < holdout_set_portion:
            writer = test_writer
            log = test_log
        elif random_number > (1-validation_set_portion):
            writer = validation_writer
            log = validation_log
        else:
            writer = training_writer
            log = training_log
        writer.write(tf_record.SerializeToString())
        # Add stats 
        log[dam_type] += 1
    return training_log, validation_log, test_log, southaf_log

In [12]:
# Make workspace outputs directories doesn't exist
directories_to_make = [WORKSPACE_DIR,
                       os.path.join(WORKSPACE_DIR,'training_set'),
                       os.path.join(WORKSPACE_DIR,'validation_set'),
                       os.path.join(WORKSPACE_DIR,'test_set'),
                       os.path.join(WORKSPACE_DIR,'southaf_set')]
for directory in directories_to_make:
    try:
        os.makedirs(directory)
    except OSError:
        pass

# Do the thing    
training_log = {'dam': 0, 'not_a_dam': 0}
validation_log = {'dam': 0, 'not_a_dam': 0}
test_log = {'dam': 0, 'not_a_dam': 0}
southaf_log = {'dam': 0, 'not_a_dam': 0}

#last_time = time.time()

training_writer_count = 0
validation_writer_count = 0
tf_record_iteration = 0
max_tf_record_iteration = int(len(all_images_file_list)/Dams_per_round)



while tf_record_iteration <= max_tf_record_iteration:
    
    with tf.Graph().as_default(), tf.Session() as sess:

        LOGGER.info('tf_record_iteration %d' % tf_record_iteration)

        # Open writers
        training_writer = tf.python_io.TFRecordWriter(os.path.join(WORKSPACE_DIR,
                     'training_set/dams_%d.record' % tf_record_iteration))
        validation_writer = tf.python_io.TFRecordWriter(os.path.join(WORKSPACE_DIR,
                         'validation_set/dams_%d.record' % tf_record_iteration))
        test_writer = tf.python_io.TFRecordWriter(os.path.join(WORKSPACE_DIR,
                         'test_set/dams_%d.record' % tf_record_iteration))
        southaf_writer = tf.python_io.TFRecordWriter(os.path.join(WORKSPACE_DIR,
                     'southaf_set/dams_%d.record' % tf_record_iteration))

        # Get a slice of the dataset
        slice_dam_images_list = all_images_file_list[tf_record_iteration*Dams_per_round:min((tf_record_iteration+1)*Dams_per_round,len(all_images_file_list))]

        #This is where I make TFRecords!
        training_log, validation_log, test_log, southaf_log = make_TFRecords(slice_dam_images_list,tf_record_iteration)

        # Close writers
        training_writer.close()
        validation_writer.close()
        southaf_writer.close()
        test_writer.close()

        # Advance loop
        tf_record_iteration += 1

        print('training_log',training_log)
        print('validation_log',validation_log)
        print('test_log',test_log)
        print('southaf_log',southaf_log)
        print('\n')

    
    
#@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)

2019-07-22 22:17:10,789 (144875) INFO root [<module>:32] tf_record_iteration 0
Starting to make TFRecords 0
training_log {'dam': 540, 'not_a_dam': 107}
validation_log {'dam': 116, 'not_a_dam': 25}
test_log {'dam': 164, 'not_a_dam': 35}
southaf_log {'dam': 11, 'not_a_dam': 2}


2019-07-22 22:19:27,245 (281331) INFO root [<module>:32] tf_record_iteration 1
Starting to make TFRecords 1
training_log {'dam': 1071, 'not_a_dam': 224}
validation_log {'dam': 246, 'not_a_dam': 56}
test_log {'dam': 317, 'not_a_dam': 61}
southaf_log {'dam': 21, 'not_a_dam': 4}


2019-07-22 22:21:43,338 (417424) INFO root [<module>:32] tf_record_iteration 2
Starting to make TFRecords 2
training_log {'dam': 1614, 'not_a_dam': 323}
validation_log {'dam': 372, 'not_a_dam': 73}
test_log {'dam': 487, 'not_a_dam': 82}
southaf_log {'dam': 37, 'not_a_dam': 12}


2019-07-22 22:23:54,323 (548409) INFO root [<module>:32] tf_record_iteration 3
Starting to make TFRecords 3
training_log {'dam': 2148, 'not_a_dam': 418}
validation

KeyboardInterrupt: 