In [1]:
from shiptrack import get_data, get_preprocessing, losses, fit_model
from segmentation_models import get_preprocessing
from segmentation_models import Unet
from segmentation_models.metrics import iou_score
import glob
    
from keras.optimizers import Adam
import tensorflow as tf

2021-07-13 17:45:52.554190: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


Segmentation Models: using `tf.keras` framework.
channels_last


In [2]:
# Hyperparams
epochs = 30
batch_size = 8
learning_rate = 0.01
augment = False
encoder_freeze = False
backbone = "resnet152"
test_prop = 5
loss = "bce_jaccard_loss"

INT_IMG_SIZE = (2240, 1344)
IMG_SIZE = 448

In [3]:
from datetime import datetime
now = datetime.now()

model_name = f"{now.strftime('%Y%m%d_%H%M%S')}_new_{backbone}_{loss}{'_augmented' if augment else ''}"
# System paths
training_dir = "/lustre_scratch/duncanwp/combined_v3_typed_new_composite"
tensorboard_dir = f"/lustre_scratch/duncanwp/tensorboard/{model_name}"
model_dir = f"/lustre_scratch/duncanwp/models/{model_name}"

In [4]:
def create_mixed_dataset(training_dir, shuffle_size=1024, balance=None):
# Balance should be a tuple of shape (2, ) describing the positive/negative weighting
    positive = create_dataset(training_dir, shuffle_size=shuffle_size, cls_label='positive')
    negative = create_dataset(training_dir, shuffle_size=shuffle_size, cls_label='negative')
    if balance is None:
        balance = (0.5, 0.5)
    sampled_ds=tf.data.experimental.sample_from_datasets(datasets, weights=balance)
    return sampled_ds

# Note, if we wanted fewer classes, we can use glob syntax to grab multiple classes as once
# e.g. create_dataset(2015,"[67]")
# will take classes 6 & 7 together

def _parse_batch(record_batch):
    # Create a description of the features
    feature_description = {
        'data': tf.io.FixedLenFeature((IMG_SIZE, IMG_SIZE, 3), tf.float32),
        'mask': tf.io.FixedLenFeature((IMG_SIZE, IMG_SIZE), tf.float32),
    }

    # Parse the input `tf.Example` proto using the dictionary above
    example = tf.io.parse_example(record_batch, feature_description)
    return example['data'], example['mask']


def create_dataset(training_dir, shuffle_size=1024, cls_label='positive'):

    AUTOTUNE = tf.data.experimental.AUTOTUNE
    fl = glob.glob(f"/{training_dir}/tfrecords/*_{cls_label}.tfrecords")
    files_ds = tf.data.Dataset.list_files(fl)
    ds = tf.data.TFRecordDataset(files_ds, num_parallel_reads=AUTOTUNE)
    ds = ds.shuffle(shuffle_size)
    ds = ds.map(lambda x: _parse_batch(x))
    return ds


ds_size = len(list(glob.glob(f"/{training_dir}/tfrecords/*_positive.tfrecords"))) # This assumes only taking positive examples
ds = create_dataset(training_dir)

train_split=0.8
val_split=0.1
test_split=0.1
train_size = int(train_split * ds_size)
val_size = int(val_split * ds_size)

train_ds = ds.take(train_size)    
val_ds = ds.skip(train_size).take(val_size)
test_ds = ds.skip(train_size).skip(val_size)


2021-07-13 17:46:11.253298: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-13 17:46:11.423931: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:07:00.0 name: Tesla V100-SXM2-32GB-LS computeCapability: 7.0
coreClock: 1.44GHz coreCount: 80 deviceMemorySize: 31.72GiB deviceMemoryBandwidth: 772.48GiB/s
2021-07-13 17:46:11.428393: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 1 with properties: 
pciBusID: 0000:0a:00.0 name: Tesla V100-SXM2-32GB-LS computeCapability: 7.0
coreClock: 1.44GHz coreCount: 80 deviceMemorySize: 31.72GiB deviceMemoryBandwidth: 772.48GiB/s
2021-07-13 17:46:11.443451: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-07-13 17:46:11.768413: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcubl

In [5]:
from keras.layers import Input
from keras.models import Model

# Automatically mirror training across all available GPUs
strategy = tf.distribute.MirroredStrategy(cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
with strategy.scope():

    model = Unet(backbone, encoder_weights='imagenet', encoder_freeze=encoder_freeze,
                 classes=1, activation='sigmoid')

#     print(model.summary())

    model.compile(Adam(learning_rate=learning_rate), loss=losses[loss], metrics=[iou_score])

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensor

In [6]:
from keras.callbacks import TensorBoard, ReduceLROnPlateau
tensorboard = TensorBoard(log_dir=tensorboard_dir, histogram_freq=0,
                          write_images=True, write_graph=False)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=5e-7, verbose=1)

2021-07-13 17:46:25.375062: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session initializing.
2021-07-13 17:46:25.375108: I tensorflow/core/profiler/lib/profiler_session.cc:141] Profiler session started.
2021-07-13 17:46:25.392996: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1611] Profiler found 2 GPUs
2021-07-13 17:46:25.579014: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcupti.so.11.2
2021-07-13 17:46:25.951317: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session tear down.
2021-07-13 17:46:25.963401: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1743] CUPTI activity buffer flushed


In [7]:

# def get_data_flow(data, labels, subset, batch_size=1):
#     # this is the augmentation configuration we will use for training
#     from keras.preprocessing.image import ImageDataGenerator
#     datagen = ImageDataGenerator(
#         shear_range=0.2,
#         zoom_range=0.2,
#         horizontal_flip=True,
#         validation_split=0.2)
#     generator = datagen.flow(
#         data, y=labels,
#         batch_size=batch_size if subset == 'training' else 1,
#         subset=subset)
#     return generator

def get_data_flow(data, labels, subset, batch_size=1):
    # we create two instances with the same arguments
    data_gen_args = dict(
                         shear_range=0.2,
                         zoom_range=0.2,
                         horizontal_flip=True,
                        )
    image_datagen = ImageDataGenerator(**data_gen_args)
    mask_datagen = ImageDataGenerator(**data_gen_args)
    # Provide the same seed and keyword arguments to the fit and flow methods
    seed = 1
    image_datagen.fit(data, augment=True, seed=seed)
    mask_datagen.fit(labels, augment=True, seed=seed)
    image_generator = image_datagen.flow(
        data,
        batch_size=batch_size,
        class_mode=None,
        seed=seed)
    mask_generator = mask_datagen.flow(
        labels,
        batch_size=batch_size,
        class_mode=None,
        seed=seed)
    # combine generators into one which yields image and masks
    train_generator = zip(image_generator, mask_generator)
    return train_generator

if augment:
    raise NotImplemented()

In [9]:
history = model.fit(train_ds.shuffle(1024).batch(8*2).prefetch(tf.data.AUTOTUNE), validation_data=val_ds.prefetch(tf.data.AUTOTUNE), verbose=1,
                    epochs=epochs, callbacks=[tensorboard, reduce_lr])

Epoch 1/30
      1/Unknown - 5s 5s/step - loss: 1.0010 - iou_score: 0.1565

2021-07-13 18:27:12.067429: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session initializing.
2021-07-13 18:27:12.067483: I tensorflow/core/profiler/lib/profiler_session.cc:141] Profiler session started.


      2/Unknown - 6s 2s/step - loss: 0.9294 - iou_score: 0.1936

2021-07-13 18:27:14.447324: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2021-07-13 18:27:14.461945: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1743] CUPTI activity buffer flushed
2021-07-13 18:27:14.619332: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:673]  GpuTracer has collected 7572 callback api events and 7569 activity events. 
2021-07-13 18:27:14.799429: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session tear down.
2021-07-13 18:27:15.026689: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: /lustre_scratch/duncanwp/tensorboard/20210713_174611_new_resnet152_bce_jaccard_loss/train/plugins/profile/2021_07_13_18_27_14
2021-07-13 18:27:15.155219: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to /lustre_scratch/duncanwp/tensorboard/20210713_174611_new_resnet152_bce_jaccard_loss/train/plugins/profile/2021_07_13

Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0003999999724328518.
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30

Epoch 00010: ReduceLROnPlateau reducing learning rate to 7.999999215826393e-05.
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30

Epoch 00015: ReduceLROnPlateau reducing learning rate to 1.599999814061448e-05.
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30

Epoch 00020: ReduceLROnPlateau reducing learning rate to 3.199999628122896e-06.
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30

Epoch 00025: ReduceLROnPlateau reducing learning rate to 6.399999165296323e-07.
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

Epoch 00030: ReduceLROnPlateau reducing learning rate to 5e-07.


In [11]:
test_ds

<SkipDataset shapes: ((448, 448, 3), (448, 448)), types: (tf.float32, tf.float32)>

In [13]:
score = model.evaluate(val_ds.take(10).prefetch(tf.data.AUTOTUNE), verbose=0)

print('Test loss    :', score[0])
print('Test accuracy:', score[1])

INFO:tensorflow:Error reported to Coordinator: Input 0 of layer bn_data is incompatible with the layer: expected ndim=4, found ndim=3. Full shape received: (224, 448, 3)
Traceback (most recent call last):
  File "/lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
    yield
  File "/lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/distribute/mirrored_run.py", line 334, in run
    self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
  File "/lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py", line 692, in wrapper
    return converted_call(f, args, kwargs, options=options)
  File "/lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/autograph/impl/api.py", line 382, in converted_call
    return _call_unconverted(f, args

ValueError: in user code:

    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:1323 test_function  *
        return step_function(self, iterator)
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:1314 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:1285 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:2833 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/distribute/mirrored_strategy.py:678 _call_for_each_replica
        return mirrored_run.call_for_each_replica(
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/distribute/mirrored_run.py:104 call_for_each_replica
        return _call_for_each_replica(strategy, fn, args, kwargs)
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/distribute/mirrored_run.py:245 _call_for_each_replica
        coord.join(threads)
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/training/coordinator.py:389 join
        six.reraise(*self._exc_info_to_raise)
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/six.py:703 reraise
        raise value
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/training/coordinator.py:297 stop_on_exception
        yield
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/distribute/mirrored_run.py:334 run
        self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:1307 run_step  **
        outputs = model.test_step(data)
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/keras/engine/training.py:1266 test_step
        y_pred = self(x, training=False)
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/keras/engine/base_layer.py:1030 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/keras/engine/functional.py:420 call
        return self._run_internal_graph(
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/keras/engine/functional.py:556 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/keras/engine/base_layer.py:1013 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    /lustre_scratch/duncanwp/conda-envs/shiptrack-env/lib/python3.9/site-packages/tensorflow/python/keras/engine/input_spec.py:215 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer bn_data is incompatible with the layer: expected ndim=4, found ndim=3. Full shape received: (224, 448, 3)


In [15]:
import os.path
# save Keras model for Tensorflow Serving

tf.saved_model.save(
    model,
    os.path.join(model_dir, 'model/1'))

2021-07-13 21:05:16.215624: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.



FOR DEVS: If you are overwriting _tracking_metadata in your class, this property has been used to save metadata in the SavedModel. The metadta field will be deprecated soon, so please move the metadata to a different file.
INFO:tensorflow:Assets written to: /lustre_scratch/duncanwp/models/20210713_174611_new_resnet152_bce_jaccard_loss/model/1/assets


In [None]:
model.save(os.path.join(model_dir, 'model/2'))