In [None]:
%reload_ext autoreload
%autoreload 2

from helpers import *
import numpy as np
import os
from datetime import datetime

# Root directory that is globbed for '/*/*/S2/Patches/*.tif' patch files when
# the filename list has to be built from scratch.
DATASET_ROOT_PATH = "../../data/HiltonOfFern_crop_field_training_cloud_free_available_area"
# Value handed to process_path together with the "clip" mode for every split.
CLIP_VALUE = 1811.6

# Record the wall-clock start time so the later progress messages can be
# compared against it.
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
tf.print("Script started running at ", current_time)



# Build the filtered list of Sentinel 2 patch filenames, or reload it when an
# earlier run already saved it under 'datasets/filtered_filenames'.
if not os.path.exists('datasets/filtered_filenames'):
    # List the Sentinel 2 patches in a seeded random order so that a later
    # split gets a representative mix rather than neighbouring images, and so
    # the run is repeatable.
    # NOTE(review): list_files reshuffles on each iteration by default, so the
    # subset seen by the count below and by the save may differ - confirm.
    list_ds = tf.data.Dataset.list_files(
        f"{DATASET_ROOT_PATH}/*/*/S2/Patches/*.tif",
        shuffle=True,
        seed=42,
    )
    total_products = tf.data.experimental.cardinality(list_ds).numpy()

    # Skip the first 90% of the shuffled filenames; only the remainder is
    # filtered and saved.
    # NOTE(review): despite the name, test_size is the number of files
    # skipped - confirm that keeping only ~10% of the data is intended.
    test_size = int(total_products * 0.90)
    list_ds = list_ds.skip(test_size)

    # Per the original author's note: drop Sentinel 2 images that are cloudy,
    # shadowed, or have missing data / sensor errors, and Sentinel 1 images
    # with lots of missing data. Everything is done in one predicate so the
    # dataset only has to be counted once.
    list_ds = list_ds.filter(filter_low_quality_data)
    now = datetime.now()
    current_time = f"{now:%H:%M:%S}"
    # NOTE(review): presumably the filter is lazy, so this marks when the
    # pipeline was defined rather than when filtering ran - confirm.
    tf.print("finished filtering at ", current_time)

    # After filtering, the number of items is no longer known to TensorFlow,
    # so count them by hand and assert that cardinality on the dataset.
    count = list_ds.reduce(0, lambda running_total, _: running_total + 1).numpy()
    tf.print(f"after filtering have {count!s} files")
    list_ds = list_ds.apply(tf.data.experimental.assert_cardinality(count))
    now = datetime.now()
    current_time = f"{now:%H:%M:%S}"
    tf.print("finished fixing cardinality at ", current_time)

    # Cache the filtered filename list so later runs take the reload branch.
    tf.data.experimental.save(list_ds, 'datasets/filtered_filenames')
    now = datetime.now()
    current_time = f"{now:%H:%M:%S}"
    tf.print("finished saving at ", current_time)
else:
    tf.print("Don't need to filter dataset as have saved dataset I can reload!")
    list_ds = tf.data.experimental.load('datasets/filtered_filenames')

now = datetime.now()
current_time = f"{now:%H:%M:%S}"
tf.print("Have filtered dataset at ", current_time)

def _process_and_save(split_ds, include_ratio, save_path):
    """Map one split through process_path, drop failing elements, and save it.

    Args:
        split_ds: dataset of filenames for one split, as returned by
            split_dataset.
        include_ratio: boolean forwarded as the last argument of process_path
            (per the original comments, whether to include the Sentinel 1
            VH/VV ratio).
        save_path: directory passed to tf.data.experimental.save.

    Returns:
        The processed dataset that was written to save_path.
    """
    processed = split_ds.map(
        lambda x: process_path(x, CLIP_VALUE, "clip", 0, include_ratio),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    # Dataset.apply returns a new dataset rather than modifying in place, so
    # the result must be kept; otherwise ignore_errors has no effect.
    processed = processed.apply(tf.data.experimental.ignore_errors())
    tf.data.experimental.save(processed, save_path)
    return processed


# Split into train, validation, test and verification sets before processing,
# so each processed split can be saved as soon as it is ready. Processing
# takes a long time and a crash should not mean starting from scratch.
train_ds, val_ds, test_ds, ver_ds = split_dataset(list_ds, tf.data.experimental.cardinality(list_ds).numpy())

# Each split is processed twice: once with the VH/VV ratio and once without.
# Every variant is saved to its own directory so no output overwrites another.
train_with_ratio = _process_and_save(train_ds, True, 'datasets/train_with_ratio')
train_without_ratio = _process_and_save(train_ds, False, 'datasets/train_without_ratio')

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
tf.print("Have training dataset at ", current_time)

val_with_ratio = _process_and_save(val_ds, True, 'datasets/val_with_ratio')
val_without_ratio = _process_and_save(val_ds, False, 'datasets/val_without_ratio')

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
tf.print("Have validation dataset at ", current_time)

test_with_ratio = _process_and_save(test_ds, True, 'datasets/test_with_ratio')
test_without_ratio = _process_and_save(test_ds, False, 'datasets/test_without_ratio')

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
tf.print("Have test dataset at ", current_time)

ver_with_ratio = _process_and_save(ver_ds, True, 'datasets/ver_with_ratio')
ver_without_ratio = _process_and_save(ver_ds, False, 'datasets/ver_without_ratio')

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
tf.print("Have verification dataset at ", current_time)

Script started running at  12:43:29
finished filtering at  12:43:30
../../data/HiltonOfFern_crop_field_training_cloud_free_available_area/20190225/ROI1/S2/Patches/S1_150c763c-0ee6-4611-a6ce-d9cc89f3420b_S2_d5bb8ed3-4ad7-4373-afd0-cfbfb5523da7_4352_1792_256x256.tif
Passed all filtering tests and keeping file
../../data/HiltonOfFern_crop_field_training_cloud_free_available_area/20190325/ROI1/S2/Patches/S1_ed18301b-01fd-45ed-982e-f403b709dd3c_S2_8431e608-b9ed-47f9-8042-f96a7a441675_1024_3328_256x256.tif
../../data/HiltonOfFern_crop_field_training_cloud_free_available_area/20190513/ROI1/S2/Patches/S1_e316e53f-5e52-486c-a5d5-e82e63d0c3bc_S2_780cee33-2b59-4171-8f6b-be8f221fbfca_1280_2560_256x256.tif
../../data/HiltonOfFern_crop_field_training_cloud_free_available_area/20190325/ROI1/S2/Patches/S1_ed18301b-01fd-45ed-982e-f403b709dd3c_S2_8431e608-b9ed-47f9-8042-f96a7a441675_2304_1024_256x256.tif
../../data/HiltonOfFern_crop_field_training_cloud_free_available_area/20181015/ROI1/S2/Patches/S1_53