In [1]:
import numpy as np
import tensorflow as tf
import glob

# Directory handed to write_data() in the next cell.
data_dir = "/lustre_scratch/duncanwp/combined_v3_typed_new_composite"

# Edge length, in pixels, of the square tiles cut out of every resized image.
IMG_SIZE = 448
# Size passed to tf.image.resize before tiling; an exact multiple of IMG_SIZE
# on both axes so the tiles cover the image with nothing left over.
INT_IMG_SIZE = (5 * IMG_SIZE, 3 * IMG_SIZE)  # == (2240, 1344)


def get_generator(all_data, all_labels):
    """Yield fixed-size tiles cut from every (image, label mask) pair.

    Each image is resized to ``INT_IMG_SIZE`` and divided by 255; each label
    mask is resized to the same size with nearest-neighbour interpolation.
    Both are then cut into non-overlapping ``IMG_SIZE`` x ``IMG_SIZE`` tiles.

    Args:
        all_data: Iterable of images. They are reshaped to three channels
            after tiling, so presumably (H, W, 3) -- confirm with the caller.
        all_labels: Iterable of label masks, paired with ``all_data`` by
            position. A trailing channel axis is added before resizing, so
            presumably (H, W) -- confirm with the caller.

    Yields:
        ``(tile, label_tile, has_label)`` where ``tile`` has shape
        ``(IMG_SIZE, IMG_SIZE, 3)``, ``label_tile`` has shape
        ``(IMG_SIZE, IMG_SIZE)`` and ``has_label`` is a scalar boolean tensor
        that is true when any value in ``label_tile`` is greater than zero.
    """
    # Strides equal the patch size, so neighbouring tiles never overlap.
    tiling = dict(sizes=[1, IMG_SIZE, IMG_SIZE, 1],
                  strides=[1, IMG_SIZE, IMG_SIZE, 1],
                  rates=[1, 1, 1, 1],
                  padding='VALID')

    for image, mask in zip(all_data, all_labels):
        # Bring both arrays to the common intermediate size.
        image_resized = tf.image.resize(image, INT_IMG_SIZE) / 255.
        # tf.image expects a channel axis, so give the mask one first.
        mask_resized = tf.image.resize(tf.expand_dims(mask, -1), INT_IMG_SIZE, 'nearest')

        # Cut the image into tiles and stack them along the leading axis...
        image_patches = tf.image.extract_patches(images=tf.expand_dims(image_resized, axis=0), **tiling)
        image_tiles = tf.reshape(image_patches, [-1, IMG_SIZE, IMG_SIZE, 3])

        # ...and do the same for the mask (the channel axis is dropped again).
        mask_patches = tf.image.extract_patches(images=tf.expand_dims(mask_resized, axis=0), **tiling)
        mask_tiles = tf.reshape(mask_patches, [-1, IMG_SIZE, IMG_SIZE])

        # One flag per tile: does it contain any labelled pixel?
        flat_masks = tf.reshape(mask_tiles, [-1, IMG_SIZE*IMG_SIZE])
        tile_has_label = tf.math.reduce_any(flat_masks > 0, axis=1)

        for idx in range(image_tiles.shape[0]):
            yield image_tiles[idx], mask_tiles[idx], tile_has_label[idx]
            

def create_mixed_dataset(training_dir, shuffle_size=1024, balance=None):
    """Build a dataset that randomly interleaves positive and negative records.

    Args:
        training_dir: Directory forwarded to ``create_dataset`` for both
            classes.
        shuffle_size: Shuffle buffer size forwarded to ``create_dataset``.
        balance: Pair of sampling weights in ``(positive, negative)`` order.
            ``None`` (the default) samples both classes equally.

    Returns:
        The dataset produced by ``tf.data.experimental.sample_from_datasets``
        over the positive and negative datasets with the given weights.
    """
    positive = create_dataset(training_dir, shuffle_size=shuffle_size, cls_label='positive')
    negative = create_dataset(training_dir, shuffle_size=shuffle_size, cls_label='negative')
    if balance is None:
        balance = (0.5, 0.5)
    # Previously this passed an undefined name (`datasets`), which raised
    # NameError on every call. The order here must match `balance`.
    sampled_ds = tf.data.experimental.sample_from_datasets([positive, negative], weights=balance)
    return sampled_ds

# Note: if we wanted fewer classes, we can use glob syntax to grab multiple classes at once,
# e.g. create_dataset(training_dir, cls_label="[67]")
# will take classes 6 & 7 together

def _parse_batch(record_batch, insize=None, outsize=None):
    """Decode serialized ``tf.Example`` protos into ``(data, mask)`` tensors.

    Args:
        record_batch: Serialized ``tf.Example`` proto(s) holding float
            features named ``'data'`` and ``'mask'``.
        insize: Shape of the ``'data'`` feature. Defaults to
            ``[IMG_SIZE, IMG_SIZE, 3]``.
        outsize: Shape of the ``'mask'`` feature. Defaults to
            ``[IMG_SIZE, IMG_SIZE]``.

    Returns:
        Tuple ``(data, mask)`` of parsed float32 tensors.
    """
    # `insize` / `outsize` used to be read as undefined globals, which raised
    # NameError. The defaults have the same element counts as the flattened
    # tiles that write_data() serialises, so those records parse correctly.
    if insize is None:
        insize = [IMG_SIZE, IMG_SIZE, 3]
    if outsize is None:
        outsize = [IMG_SIZE, IMG_SIZE]

    # Create a description of the features
    feature_description = {
        'data': tf.io.FixedLenFeature(insize, tf.float32),
        'mask': tf.io.FixedLenFeature(outsize, tf.float32),
    }

    # Parse the input `tf.Example` proto using the dictionary above
    example = tf.io.parse_example(record_batch, feature_description)
    return example['data'], example['mask']


def create_dataset(training_dir, shuffle_size=1024, cls_label='positive'):
    """Create a shuffled, parsed dataset from the TFRecord files of one class.

    Args:
        training_dir: Directory whose ``tfrecords`` sub-directory is searched.
        shuffle_size: Size of the shuffle buffer applied to the raw records.
        cls_label: Filename suffix selecting the class. It is inserted into a
            glob pattern, so glob syntax is accepted.

    Returns:
        A ``tf.data.Dataset`` of records decoded by ``_parse_batch``.
    """
    pattern = f"/{training_dir}/tfrecords/*_{cls_label}.tfrecords"
    matching_paths = glob.glob(pattern)
    raw_records = tf.data.TFRecordDataset(
        tf.data.Dataset.list_files(matching_paths),
        num_parallel_reads=tf.data.experimental.AUTOTUNE,
    )
    # Shuffle the still-serialized records, then decode each one.
    return raw_records.shuffle(shuffle_size).map(_parse_batch)

def _float_feature(list_of_floats):  # float32
    """Wrap a sequence of floats in a ``tf.train.Feature`` holding a ``FloatList``."""
    float_list = tf.train.FloatList(value=list_of_floats)
    return tf.train.Feature(float_list=float_list)


def write_data(training_dir):
    """Tile the training arrays and write every tile to its own TFRecord file.

    Arrays are loaded with ``shiptrack.load_numpy_arrays(training_dir)`` and
    tiled by ``get_generator``. Each tile is written to
    ``/{training_dir}/tfrecords/{index:04d}_{positive|negative}.tfrecords``;
    the suffix is ``positive`` when the tile contains any labelled pixel.
    Each record stores the flattened tile under ``'data'`` and the flattened
    mask under ``'mask'``, both as float lists.

    Args:
        training_dir: Directory passed to ``load_numpy_arrays`` and used as
            the parent of the ``tfrecords`` output directory.
    """
    import os

    from shiptrack import load_numpy_arrays
    from tqdm.notebook import tqdm

    all_data, all_labels = load_numpy_arrays(training_dir)

    # Make sure the output directory exists before the first writer is opened.
    out_dir = f"/{training_dir}/tfrecords"
    os.makedirs(out_dir, exist_ok=True)

    # Number of tiles get_generator yields per image; derived from the
    # constants rather than hard-coded so the progress total stays in sync.
    tiles_per_image = (INT_IMG_SIZE[0] // IMG_SIZE) * (INT_IMG_SIZE[1] // IMG_SIZE)
    tiles = get_generator(all_data, all_labels)

    for i, (data, label, has_shiptrack) in tqdm(enumerate(tiles), total=len(all_data) * tiles_per_image):
        cls_label = 'positive' if has_shiptrack else 'negative'
        feature = {
            'data': _float_feature(data.numpy().flatten()),
            'mask': _float_feature(label.numpy().flatten().astype(float)),
        }
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        # Serialise before opening the file so a failure here leaves no empty
        # file behind.
        serialized = example.SerializeToString()
        # The context manager closes the writer even if the write raises.
        with tf.io.TFRecordWriter(f"{out_dir}/{i:04d}_{cls_label}.tfrecords") as writer:
            writer.write(serialized)


2021-07-13 15:56:15.768199: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:
write_data(data_dir)

Segmentation Models: using `tf.keras` framework.
channels_last


  0%|          | 0/33075 [00:00<?, ?it/s]

2021-07-13 15:56:19.032384: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-13 15:56:19.177089: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:07:00.0 name: Tesla V100-SXM2-32GB-LS computeCapability: 7.0
coreClock: 1.44GHz coreCount: 80 deviceMemorySize: 31.72GiB deviceMemoryBandwidth: 772.48GiB/s
2021-07-13 15:56:19.180830: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 1 with properties: 
pciBusID: 0000:0a:00.0 name: Tesla V100-SXM2-32GB-LS computeCapability: 7.0
coreClock: 1.44GHz coreCount: 80 deviceMemorySize: 31.72GiB deviceMemoryBandwidth: 772.48GiB/s
2021-07-13 15:56:19.180867: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-07-13 15:56:19.186720: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcubl