This notebook requires TensorFlow 1.x and the `requests` package (`pip install requests`).

In [4]:
import itertools
import time
import zipfile
import shutil
import json
import os
import sys
import logging
import glob

import numpy
import tensorflow as tf

# Installed in addition
import requests

## Useless?
#from osgeo import gdal
#import shapely.wkb
#import shapely.prepared
#from retrying import retry

import random

In [5]:
# Install the root handler *before* logging anything: records emitted while the
# root logger has no handler are dropped (only WARNING and above reach
# logging's last-resort handler), so the first INFO/DEBUG messages used to be
# silently lost.
logging.basicConfig(level=logging.DEBUG,
                    format=('%(asctime)s (%(relativeCreated)d) %(levelname)s %(name)s'
                            ' [%(funcName)s:%(lineno)d] %(message)s'),
                    stream=sys.stdout)
LOGGER = logging.getLogger()
# basicConfig is a no-op when a handler is already installed, so set the level
# explicitly as well.
LOGGER.setLevel(logging.DEBUG)
LOGGER.info("Logger in INFO mode")
LOGGER.debug("Logger in DEBUG mode")

# Timeout, in seconds, passed to requests.get() by download_url_to_file.
REQUEST_TIMEOUT = 1.0

2019-06-13 01:50:56,079 (79548) DEBUG root [<module>:9] Logger in DEBUG mode


#### Dataset paths 

In [6]:
# Root of the repository, relative to this notebook.
path_to_charlie_root = "../../.."
NOT_A_DAM_IMAGE_DIR = os.path.join(path_to_charlie_root,"data/imagery-6-7-2019/not_a_dam_images")
DAM_IMAGE_DIR = os.path.join(path_to_charlie_root,"data/imagery-6-7-2019/dam_images")

TM_WORLD_BORDERS_URL = 'https://storage.googleapis.com/ecoshard-root/ipbes/TM_WORLD_BORDERS_SIMPL-0.3_md5_15057f7b17752048f9bd2e2e607fe99c.zip'

# Fail fast when an input directory is missing; the path is quoted on both
# sides (the previous message had a stray trailing quote only).
for required_dir in (NOT_A_DAM_IMAGE_DIR, DAM_IMAGE_DIR):
    if not os.path.exists(required_dir):
        raise ValueError("can't find '%s'" % required_dir)

# Everything this notebook writes (downloads, TFRecords) goes under here.
OUTPUTS_DIR = os.path.join(path_to_charlie_root,"data/making_TFRecords_temp_workspace")
WORKSPACE_DIR = OUTPUTS_DIR
    

Run just one of the cells below! (full dataset or subset)

In [8]:
## Development subset of the inputs, for faster iteration.
## The filters are plain filename-suffix matches, so e.g. '5140_clipped.png'
## also selects '115140_clipped.png'.

dam_file_list = [
    os.path.join(DAM_IMAGE_DIR, file_name)
    for file_name in os.listdir(DAM_IMAGE_DIR)
    if file_name.endswith('5140_clipped.png')
]
not_a_dam_file_list = [
    os.path.join(NOT_A_DAM_IMAGE_DIR, file_name)
    for file_name in os.listdir(NOT_A_DAM_IMAGE_DIR)
    if file_name.endswith('362_not_a_dam_bb.png')
]

In [8]:
## Full dataset: every clipped dam image and every PNG in the not-a-dam folder.

dam_file_list = [
    os.path.join(DAM_IMAGE_DIR, file_name)
    for file_name in os.listdir(DAM_IMAGE_DIR)
    if file_name.endswith('clipped.png')
]
not_a_dam_file_list = [
    os.path.join(NOT_A_DAM_IMAGE_DIR, file_name)
    for file_name in os.listdir(NOT_A_DAM_IMAGE_DIR)
    if file_name.endswith('.png')
]

In [9]:
# Pool both classes into one list and shuffle it in place so that every
# TFRecord slice taken later mixes dam and not-a-dam images.
# NOTE(review): the shuffle is unseeded, so the order differs on every run.
all_images_file_list = [*dam_file_list, *not_a_dam_file_list]
random.shuffle(all_images_file_list)
all_images_file_list

['../../../data/imagery-6-7-2019/not_a_dam_images/1118-1362_not_a_dam_bb.png',
 '../../../data/imagery-6-7-2019/not_a_dam_images/374-1362_not_a_dam_bb.png',
 '../../../data/imagery-6-7-2019/not_a_dam_images/351-1362_not_a_dam_bb.png',
 '../../../data/imagery-6-7-2019/not_a_dam_images/369-1362_not_a_dam_bb.png',
 '../../../data/imagery-6-7-2019/dam_images/5140_clipped.png',
 '../../../data/imagery-6-7-2019/not_a_dam_images/1144-1362_not_a_dam_bb.png',
 '../../../data/imagery-6-7-2019/dam_images/115140_clipped.png',
 '../../../data/imagery-6-7-2019/not_a_dam_images/1111-1362_not_a_dam_bb.png',
 '../../../data/imagery-6-7-2019/not_a_dam_images/1004-1362_not_a_dam_bb.png']

#### Parameters

In [10]:
# Threshold compared against a uniform random draw when an image is assigned
# to the training or the validation (dev) set.
dev_set_portion = 0.2
# Number of images written per TFRecord file (use 5 in dev mode).
DAMS_PER_RECORD = 500

In [11]:
def int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` backed by an Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)
def int64_list_feature(value):
    """Wrap an iterable of integers in a ``tf.train.Feature`` backed by an Int64List."""
    int64_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int64_list)
def bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature`` backed by a BytesList."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)
def bytes_list_feature(value):
    """Wrap an iterable of bytes values in a ``tf.train.Feature`` backed by a BytesList."""
    bytes_list = tf.train.BytesList(value=value)
    return tf.train.Feature(bytes_list=bytes_list)
def float_list_feature(value):
    """Wrap an iterable of floats in a ``tf.train.Feature`` backed by a FloatList."""
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)


#### Get South Africa geometry

In [12]:
def download_url_to_file(url, target_file_path):
    """Use requests to download a file.

    The payload is streamed to ``<target_file_path>.part`` and only renamed
    to ``target_file_path`` once the whole transfer succeeded, so a failed
    or interrupted download never leaves a truncated file at the final path
    (callers use the existence of that path to decide whether to download).

    Parameters:
        url (string): url to file.
        target_file_path (string): local path to download the file.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts
            or an HTTP error status.
        OSError: if the target directory or file cannot be written.

    """
    partial_file_path = target_file_path + '.part'
    try:
        target_dir = os.path.dirname(target_file_path)
        if target_dir:
            # exist_ok tolerates an existing directory while still surfacing
            # real failures such as permission errors.
            os.makedirs(target_dir, exist_ok=True)
        response = requests.get(url, stream=True, timeout=REQUEST_TIMEOUT)
        try:
            # Don't save an HTTP error page as if it were the requested file.
            response.raise_for_status()
            with open(partial_file_path, 'wb') as target_file:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    target_file.write(chunk)
        finally:
            response.close()
        os.replace(partial_file_path, target_file_path)
    except Exception:
        LOGGER.exception(
            'download of %s to %s failed', url, target_file_path)
        # Best-effort cleanup of the partial file; the original error is the
        # one that matters and is re-raised below.
        try:
            os.remove(partial_file_path)
        except OSError:
            pass
        raise

In [13]:
# NOTE(review): this cell uses `gdal`, `shapely.wkb` and `shapely.prepared`,
# whose imports are commented out in the import cell; it raises NameError
# until they are restored.
tm_world_borders_zip_path = os.path.join(
        WORKSPACE_DIR, os.path.basename(TM_WORLD_BORDERS_URL))
tm_world_borders_vector_path = os.path.join(
    WORKSPACE_DIR, 'TM_WORLD_BORDERS-0.3.shp')

if not os.path.exists(tm_world_borders_zip_path):
    download_url_to_file(TM_WORLD_BORDERS_URL, tm_world_borders_zip_path)
# Extract whenever the shapefile is missing, not only right after a fresh
# download: otherwise a run interrupted between download and extraction could
# never recover without deleting the zip by hand.
if not os.path.exists(tm_world_borders_vector_path):
    with zipfile.ZipFile(tm_world_borders_zip_path, 'r') as zip_ref:
        zip_ref.extractall(WORKSPACE_DIR)

tm_world_borders_vector = gdal.OpenEx(
    tm_world_borders_vector_path, gdal.OF_VECTOR)
if tm_world_borders_vector is None:
    # OpenEx reports failure by returning None unless GDAL exceptions are on.
    raise ValueError("can't open %s" % tm_world_borders_vector_path)
tm_world_borders_layer = tm_world_borders_vector.GetLayer()

# Prepared South Africa geometry, used later for point-in-polygon tests.
sa_geom_prep = None
for border_feature in tm_world_borders_layer:
    if border_feature.GetField('NAME') == 'South Africa':
        sa_geom = border_feature.GetGeometryRef()
        sa_geom_prep = shapely.prepared.prep(
            shapely.wkb.loads(sa_geom.ExportToWkb()))
        break
if sa_geom_prep is None:
    raise ValueError(
        "no 'South Africa' feature found in %s" % tm_world_borders_vector_path)
LOGGER.debug(sa_geom_prep)

2019-06-13 01:53:42,942 (246411) DEBUG urllib3.connectionpool [_new_conn:815] Starting new HTTPS connection (1): storage.googleapis.com:443
2019-06-13 01:53:43,341 (246810) DEBUG urllib3.connectionpool [_make_request:396] https://storage.googleapis.com:443 "GET /ecoshard-root/ipbes/TM_WORLD_BORDERS_SIMPL-0.3_md5_15057f7b17752048f9bd2e2e607fe99c.zip HTTP/1.1" 200 3461975


NameError: name 'gdal' is not defined

# Now Make TFRecords !

Utils: function that writes TFRecords from a list of image paths

In [None]:
def _normalized_bbox(pixel_bounding_box, image_height, image_width):
    """Scale a pixel bounding box to image fractions, clamped to [0, 1].

    Parameters:
        pixel_bounding_box (sequence): four pixel coordinates; indexes 0 and
            2 are read as the x min/max and indexes 3 and 1 as the y min/max
            (index order kept from the original implementation).
        image_height (int): image height in pixels.
        image_width (int): image width in pixels.

    Returns:
        tuple: (xmin, xmax, ymin, ymax) as fractions of the image size.

    """
    # x runs along the width and y along the height (the previous code
    # divided x by the height and y by the width).
    xmin = pixel_bounding_box[0] / float(image_width)
    xmax = pixel_bounding_box[2] / float(image_width)
    ymin = pixel_bounding_box[3] / float(image_height)
    ymax = pixel_bounding_box[1] / float(image_height)
    if (xmin < 0 or ymin < 0 or xmax >= 1 or ymax >= 1):
        LOGGER.warning('bounding box out of bounds %s %s %s %s',
                       xmin, xmax, ymin, ymax)
        xmin = max(0, xmin)
        xmax = min(xmax, 1)
        ymin = max(0, ymin)
        ymax = min(ymax, 1)
    return xmin, xmax, ymin, ymax


def make_TFRecords(dam_file_list, tf_record_iteration, test_set_portion=0.0):
    '''Write one TFRecord file per data split from a list of image paths.

    An image is treated as a dam when a ``.json`` file sits next to it (same
    path with '.png' replaced by '.json'); that file supplies the bounding
    box and the centroid. Dams whose centroid lies inside ``sa_geom_prep`` go
    to the South Africa set; every other image is assigned at random to the
    validation, test or training set.

    Relies on module-level state: ``WORKSPACE_DIR``, ``dev_set_portion``,
    ``sa_geom_prep`` and the four ``*_log`` counter dicts, which are updated
    in place.
    NOTE(review): ``shapely.geometry`` is used below but is not imported
    anywhere in the import cell; it must be importable for dam images.

    Parameters:
        dam_file_list (list): paths of the PNG images to serialize.
        tf_record_iteration (int): index used in the output file names
            (``<split>/dams_<index>.record``).
        test_set_portion (float): share of the random draw routed to the
            test set. Defaults to 0.0, i.e. no image goes to the test set.

    Returns:
        tuple: (training_log, validation_log, test_log, southaf_log).

    '''
    print('Starting to make TFRecords %d' % tf_record_iteration)

    record_name = 'dams_%d.record' % tf_record_iteration
    logs = {
        'training_set': training_log,
        'validation_set': validation_log,
        'test_set': test_log,
        'southaf_set': southaf_log,
    }

    with tf.Graph().as_default(), tf.Session() as sess:
        # Build the decode op once and feed each image through it, instead of
        # adding new read/decode ops to the graph for every image.
        encoded_png = tf.placeholder(tf.string)
        decoded_png = tf.image.decode_png(encoded_png)

        writers = {}
        try:
            for split_name in logs:
                writers[split_name] = tf.python_io.TFRecordWriter(
                    os.path.join(WORKSPACE_DIR, split_name, record_name))

            for image_path in dam_file_list:
                with open(image_path, 'rb') as image_file:
                    image_string = image_file.read()
                image_decoded = sess.run(
                    decoded_png, feed_dict={encoded_png: image_string})
                image_height = image_decoded.shape[0]
                image_width = image_decoded.shape[1]

                feature_dict = {
                    'image/height': int64_feature(image_height),
                    'image/width': int64_feature(image_width),
                    'image/filename': bytes_feature(
                        bytes(image_path, 'utf8')),
                    'image/source_id': bytes_feature(
                        bytes(image_path, 'utf8')),
                    'image/encoded': bytes_feature(image_string),
                    'image/format': bytes_feature(b'png'),
                }

                # Reset per image so metadata of a previous dam is never
                # reused for an image that has no JSON file.
                image_metadata = None
                json_path = image_path.replace('.png', '.json')
                if os.path.exists(json_path):
                    dam_type = b'dam'
                    with open(json_path, 'r') as json_file:
                        image_metadata = json.load(json_file)
                    xmin, xmax, ymin, ymax = _normalized_bbox(
                        image_metadata['pixel_bounding_box'],
                        image_height, image_width)
                    feature_dict.update({
                        'image/object/bbox/xmin': float_list_feature([xmin]),
                        'image/object/bbox/xmax': float_list_feature([xmax]),
                        'image/object/bbox/ymin': float_list_feature([ymin]),
                        'image/object/bbox/ymax': float_list_feature([ymax]),
                        'image/object/class/label': int64_list_feature(
                            [1]),  # the '1' is type 1 which is a dam
                        'image/object/class/text': bytes_list_feature(
                            [b'dam']),
                    })
                else:
                    dam_type = b'not_a_dam'

                tf_record = tf.train.Example(features=tf.train.Features(
                    feature=feature_dict))

                # Choose the split. Only images with metadata carry a
                # centroid, so only those can be tested against South Africa.
                in_south_africa = False
                if image_metadata is not None:
                    centroid = image_metadata['lng_lat_centroid']
                    in_south_africa = sa_geom_prep.contains(
                        shapely.geometry.Point(centroid[0], centroid[1]))

                if in_south_africa:
                    split_name = 'southaf_set'
                else:
                    draw = numpy.random.random()
                    if draw <= dev_set_portion:
                        split_name = 'validation_set'
                    elif draw <= dev_set_portion + test_set_portion:
                        split_name = 'test_set'
                    else:
                        split_name = 'training_set'

                writers[split_name].write(tf_record.SerializeToString())

                # Add stats
                logs[split_name][dam_type] += 1

            LOGGER.info(
                "training writer full creating %d instance",
                tf_record_iteration)
        finally:
            # Close every writer that was opened, even if an image failed.
            for writer in writers.values():
                writer.close()

        return training_log, validation_log, test_log, southaf_log

Full loop:

In [None]:
# Make the workspace output directories if they don't exist yet
directories_to_make = [WORKSPACE_DIR,
                       os.path.join(WORKSPACE_DIR,'training_set'),
                       os.path.join(WORKSPACE_DIR,'validation_set'),
                       os.path.join(WORKSPACE_DIR,'test_set'),
                       os.path.join(WORKSPACE_DIR,'southaf_set')]
for directory in directories_to_make:
    # exist_ok tolerates existing directories but still reports real
    # failures (e.g. permissions) instead of swallowing every OSError.
    os.makedirs(directory, exist_ok=True)


# Do the thing
# Per-split counters of written examples, updated by make_TFRecords.
training_log = {b'dam': 0, b'not_a_dam': 0}
validation_log = {b'dam': 0, b'not_a_dam': 0}
test_log = {b'dam': 0, b'not_a_dam': 0}
southaf_log = {b'dam': 0, b'not_a_dam': 0}

# Only referenced by the commented-out progress logging in make_TFRecords.
last_time = time.time()

training_writer_count = 0
validation_writer_count = 0

# One TFRecord per DAMS_PER_RECORD images. Iterating over the slice start
# offsets guarantees termination (the previous `while` loop never advanced
# its counter) and skips the empty trailing batch that appeared when the
# image count was an exact multiple of DAMS_PER_RECORD.
record_start_offsets = range(0, len(all_images_file_list), DAMS_PER_RECORD)
for tf_record_iteration, record_start in enumerate(record_start_offsets):
    print('tf_record_iteration %d' % tf_record_iteration)

    slice_dam_images_list = all_images_file_list[record_start:record_start + DAMS_PER_RECORD]

    training_log, validation_log, test_log, southaf_log = make_TFRecords(slice_dam_images_list,tf_record_iteration)

    print(training_log, validation_log, test_log, southaf_log)


#@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)


## Notes 

In [None]:
### Rich's original function (kept for reference)

def main():
    """Entry point of the original script (archived for reference).

    Serializes dam / not-a-dam images into TFRecord files under
    WORKSPACE_DIR ('dam_training_%d.record', 'dam_validation_%d.record' and
    'south_africa_%d.record') and finally writes the per-set counts to
    'write_stats.txt'.

    NOTE(review): not runnable as written. The `while True:` header and the
    construction of `dam_file_list` are commented out below, while the loop
    body is still indented and still uses `break` / `dam_file_list`.
    """
    try:
        os.makedirs(WORKSPACE_DIR)
    except OSError:
        pass

    # Counters of examples written to each set, keyed by class.
    training_log = {b'dam': 0, b'not_a_dam': 0}
    validation_log = {b'dam': 0, b'not_a_dam': 0}
    south_africa_log = {b'dam': 0, b'not_a_dam': 0}

    last_time = time.time()
    training_writer_count = 0
    validation_writer_count = 0
    tf_record_iteration = 0

    ## Subsets for Dev
    dam_file_iter = glob.iglob(os.path.join(DAM_IMAGE_DIR, '119*pped.png'))
    no_a_dam_file_iter = glob.iglob(os.path.join(NOT_A_DAM_IMAGE_DIR, '350*.png'))
    
    ## REAL DATASET FULL
    # dam_file_iter = glob.iglob(os.path.join(DAM_IMAGE_DIR, '*pped.png'))
    # no_a_dam_file_iter = glob.iglob(os.path.join(NOT_A_DAM_IMAGE_DIR, '*.png'))
    
    #  NOT USING THIS BELOW BECAUSE NOT DOING 50/50.
    #dam_list_iter = [
    #    path for path_tuple in zip(dam_file_iter, no_a_dam_file_iter)
    #    for path in path_tuple]

    #while True:
    #    # this makes DAMS_PER_RECORD list of files
    #    dam_file_list = [
    #        (path, dam_type) for path, dam_type in zip(
    #            itertools.islice(
    #                dam_list_iter,
    #                DAMS_PER_RECORD*tf_record_iteration,
    #                DAMS_PER_RECORD*(tf_record_iteration+1)),
    #            itertools.cycle([b'dam', b'not_a_dam']))]

        # NOTE(review): everything from here to the writer.close() calls is
        # the body of the commented-out `while True:` loop above.
        LOGGER.debug(dam_file_list)
        if not dam_file_list:
            break

        with tf.Graph().as_default(), tf.Session() as sess:
            training_writer = tf.python_io.TFRecordWriter(
                os.path.join(
                    WORKSPACE_DIR,
                    'dam_training_%d.record' % tf_record_iteration))
            validation_writer = tf.python_io.TFRecordWriter(
                os.path.join(
                    WORKSPACE_DIR,
                    'dam_validation_%d.record' % tf_record_iteration))
            south_africa_writer = tf.python_io.TFRecordWriter(os.path.join(
                WORKSPACE_DIR, 'south_africa_%d.record' %
                tf_record_iteration))

            for image_path, dam_type in dam_file_list:
                # Log progress at most once every 5 seconds.
                current_time = time.time()
                if current_time - last_time > 5.0:
                    LOGGER.info('training_log: %s', training_log)
                    LOGGER.info('validation_log: %s', validation_log)
                    LOGGER.info('south_africa_log: %s', south_africa_log)
                    LOGGER.info('training_writer_count: %d', training_writer_count)
                    LOGGER.info('validation_writer_count: %d', validation_writer_count)
                    last_time = current_time
                # looks like anything can be used here, including serializing
                # a tensor tf.serialize_tensor
                # The image is decoded only to obtain its shape; the raw PNG
                # bytes read on the third line are what gets stored.
                image_string = tf.read_file(image_path)
                image_decoded = tf.image.decode_png(image_string).eval()
                image_string = open(image_path, 'rb').read()
                feature_dict = {
                    'image/height': int64_feature(
                        image_decoded.shape[0]),
                    'image/width': int64_feature(
                        image_decoded.shape[1]),
                    'image/filename': bytes_feature(
                        bytes(image_path, 'utf8')),
                    'image/source_id': bytes_feature(
                        bytes(image_path, 'utf8')),
                    'image/encoded': bytes_feature(image_string),
                    'image/format': bytes_feature(b'png'),
                }
                if dam_type == b'dam':
                    # Dam metadata is read from the JSON file next to the image.
                    json_path = image_path.replace('.png', '.json')
                    with open(json_path, 'r') as json_file:
                        image_metadata = json.load(json_file)
                    # NOTE(review): x is divided by shape[0] (stored above as
                    # the height) and y by shape[1] (the width) -- confirm
                    # whether this is intended or only valid for square images.
                    xmin = image_metadata['pixel_bounding_box'][0] / float(image_decoded.shape[0])
                    xmax = image_metadata['pixel_bounding_box'][2] / float(image_decoded.shape[0])
                    ymin = image_metadata['pixel_bounding_box'][3] / float(image_decoded.shape[1])
                    ymax = image_metadata['pixel_bounding_box'][1] / float(image_decoded.shape[1])
                    if (xmin < 0 or ymin < 0 or
                            xmax >= 1 or
                            ymax >= 1):
                        LOGGER.warning(
                            'bounding box out of bounds %s %s %s %s',
                            xmin, xmax, ymin, ymax)
                        xmin = max(0, xmin)
                        xmax = min(xmax, 1)
                        ymin = max(0, ymin)
                        ymax = min(ymax, 1)

                    feature_dict.update({
                        'image/object/bbox/xmin': float_list_feature([xmin]),
                        'image/object/bbox/xmax': float_list_feature([xmax]),
                        'image/object/bbox/ymin': float_list_feature([ymin]),
                        'image/object/bbox/ymax': float_list_feature([ymax]),
                        'image/object/class/label': int64_list_feature(
                            [1]),  # the '1' is type 1 which is a dam
                        'image/object/class/text': bytes_list_feature(
                            [b'dam']),
                    })
                    tf_record = tf.train.Example(features=tf.train.Features(
                        feature=feature_dict))

                    # Dams whose centroid falls inside sa_geom_prep go to the
                    # South Africa file and skip the train/validation split.
                    centroid = image_metadata['lng_lat_centroid']
                    if dam_type == b'dam' and sa_geom_prep.contains(
                            shapely.geometry.Point(centroid[0], centroid[1])):
                        writer = south_africa_writer
                        log = south_africa_log
                        writer.write(tf_record.SerializeToString())
                        log[dam_type] += 1
                        continue
                else:
                    tf_record = tf.train.Example(features=tf.train.Features(
                        feature=feature_dict))
                # Random split: a draw above dev_set_portion goes to training,
                # anything else to validation.
                if numpy.random.random() > dev_set_portion:
                    writer = training_writer
                    log = training_log
                else:
                    writer = validation_writer
                    log = validation_log
                writer.write(tf_record.SerializeToString())
                log[dam_type] += 1

            LOGGER.info(
                "training writer full creating %d instance" %
                tf_record_iteration)
            tf_record_iteration += 1
            training_writer.close()
            validation_writer.close()
            south_africa_writer.close()

    # Persist the final per-set counts.
    with open('write_stats.txt', 'w') as write_stats_file:
        write_stats_file.write(
            f"""validation: dam({validation_log[b'dam']}) not_a_dam({
                validation_log[b'not_a_dam']})\n"""
            f"""training: dam({training_log[b'dam']}) not_a_dam({
                training_log[b'not_a_dam']})\n"""
            f"""south_africa: dam({south_africa_log[b'dam']}) not_a_dam({
                south_africa_log[b'not_a_dam']})\n""")


# Disabled: a decorator with no function after it is a SyntaxError, and the
# `retrying` import that provides `retry` is commented out in the import cell.
#@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
