In [1]:
! pip install keras segmentation-models

Collecting keras
  Downloading keras-2.8.0-py2.py3-none-any.whl (1.4 MB)
     |████████████████████████████████| 1.4 MB 4.4 MB/s            
[?25hCollecting segmentation-models
  Using cached segmentation_models-1.0.1-py3-none-any.whl (33 kB)
Collecting efficientnet==1.0.0
  Using cached efficientnet-1.0.0-py3-none-any.whl (17 kB)
Collecting keras-applications<=1.0.8,>=1.0.7
  Using cached Keras_Applications-1.0.8-py3-none-any.whl (50 kB)
Collecting image-classifiers==1.0.0
  Using cached image_classifiers-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: keras-applications, image-classifiers, efficientnet, segmentation-models, keras
Successfully installed efficientnet-1.0.0 image-classifiers-1.0.0 keras-2.8.0 keras-applications-1.0.8 segmentation-models-1.0.1


In [2]:
from shiptrack import get_data, get_preprocessing, losses, fit_model
from segmentation_models import get_preprocessing
from segmentation_models import Unet, FPN
from segmentation_models.metrics import iou_score
import glob
    
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

2022-03-04 15:30:33.124935: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


Segmentation Models: using `tf.keras` framework.
channels_last


In [3]:
# Hyperparams
# Tunable settings for this training run; later cells read these module-level names.
epochs = 100  # number of epochs passed to model.fit
batch_size = 8  # batch size setting (NOTE(review): confirm the training cell reads this value rather than a literal)
learning_rate = 0.01  # initial learning rate handed to the Adam optimizer
augment = False  # enables the flip-augmentation map and adds "_augmented" to the model name
encoder_freeze = False  # forwarded to Unet(encoder_freeze=...)
backbone = "resnet152"  # backbone name passed to Unet and embedded in the model name
test_prop = 5  # NOTE(review): not referenced in the visible cells - confirm it is still needed
loss = "bce_jaccard_loss"  # lookup key into the imported `losses` object at compile time

INT_IMG_SIZE = (2240, 1344)  # NOTE(review): not referenced in the visible cells - confirm meaning/usage
IMG_SIZE = 448  # edge length used for the 'data' and 'mask' shapes in the TFRecord feature spec

In [4]:
from datetime import datetime
now = datetime.now()

# Run identifier: <timestamp>_new_<backbone>_<loss>, plus "_augmented" when augmentation is on
timestamp = now.strftime('%Y%m%d_%H%M%S')
augment_suffix = '_augmented' if augment else ''
model_name = f"{timestamp}_new_{backbone}_{loss}{augment_suffix}"
# System paths
training_dir = "/lustre_scratch/duncanwp/combined_v3_typed_new_composite"
tensorboard_dir = f"/lustre_scratch/duncanwp/tensorboard/{model_name}"
model_dir = f"/lustre_scratch/duncanwp/models/{model_name}"

In [5]:
def create_mixed_dataset(training_dir, shuffle_size=1024, balance=None):
    """Build a dataset that randomly interleaves positive and negative examples.

    Args:
        training_dir: Directory handed to ``create_dataset`` for both classes.
        shuffle_size: Shuffle buffer size handed to ``create_dataset``.
        balance: Tuple of shape (2,) giving the (positive, negative) sampling
            weights. Defaults to ``(1.0, 0.0)``, i.e. positive examples only.

    Returns:
        A ``tf.data.Dataset`` sampling from the positive and negative datasets
        according to ``balance``.
    """
    positive = create_dataset(training_dir, shuffle_size=shuffle_size, cls_label='positive')
    negative = create_dataset(training_dir, shuffle_size=shuffle_size, cls_label='negative')
    if balance is None:
        balance = (1.0, 0.0)
    # The two datasets must be passed explicitly; the order matches the
    # (positive, negative) order of the weights in `balance`.
    sampled_ds = tf.data.experimental.sample_from_datasets(
        [positive, negative], weights=balance)
    return sampled_ds

# Note: to load several classes at once, glob syntax can be used in cls_label,
# e.g. create_dataset(training_dir, cls_label="[67]")
# will take the files for classes 6 & 7 together

def _parse_batch(record_batch):
    """Decode serialized ``tf.Example`` protos into a ``(data, mask)`` pair.

    Both features are declared as fixed-length float32: 'data' with shape
    (IMG_SIZE, IMG_SIZE, 3) and 'mask' with shape (IMG_SIZE, IMG_SIZE).
    """
    image_shape = (IMG_SIZE, IMG_SIZE, 3)
    mask_shape = (IMG_SIZE, IMG_SIZE)

    # Feature spec describing what each record contains
    schema = dict(
        data=tf.io.FixedLenFeature(image_shape, tf.float32),
        mask=tf.io.FixedLenFeature(mask_shape, tf.float32),
    )

    parsed = tf.io.parse_example(record_batch, schema)
    return parsed['data'], parsed['mask']


def create_dataset(training_dir, shuffle_size=1024, cls_label='positive', seed=0):
    """Build a dataset of (data, mask) pairs from the TFRecord files of one class.

    Files matching ``<training_dir>/tfrecords/*_<cls_label>.tfrecords`` are read,
    shuffled and decoded with ``_parse_batch``.

    The element order is random but *repeatable*: every iteration over the
    returned dataset yields the same sequence (given tf.data's default
    deterministic execution). This matters because callers partition the
    dataset with ``take``/``skip``; if the order changed on each iteration the
    partitions would be redrawn every epoch and validation examples would leak
    into the training set. Per-epoch shuffling should be applied *after* the
    split (e.g. ``train_ds.shuffle(...)``).

    Args:
        training_dir: Directory containing the ``tfrecords`` sub-directory.
        shuffle_size: Buffer size of the record-level shuffle.
        cls_label: Class suffix of the files to load; glob syntax is allowed
            (e.g. ``"[67]"``).
        seed: Seed for the file-order shuffle and the record-level shuffle.

    Returns:
        A ``tf.data.Dataset`` yielding ``(data, mask)`` tensors.

    Raises:
        FileNotFoundError: If no TFRecord file matches the pattern.
    """
    import os
    import random

    pattern = os.path.join(training_dir, "tfrecords", f"*_{cls_label}.tfrecords")
    # Sort first so the seeded shuffle below does not depend on directory order
    files = sorted(glob.glob(pattern))
    if not files:
        raise FileNotFoundError(f"No TFRecord files match {pattern!r}")
    # Shuffle the file order once, up front, instead of on every iteration
    random.Random(seed).shuffle(files)

    AUTOTUNE = tf.data.experimental.AUTOTUNE
    # from_tensor_slices keeps the list order as given (no implicit reshuffle)
    files_ds = tf.data.Dataset.from_tensor_slices(files)
    ds = tf.data.TFRecordDataset(files_ds, num_parallel_reads=AUTOTUNE)
    # Fixed seed + no reshuffling keeps take()/skip() partitions disjoint across epochs
    ds = ds.shuffle(shuffle_size, seed=seed, reshuffle_each_iteration=False)
    ds = ds.map(_parse_batch, num_parallel_calls=AUTOTUNE)
    return ds


# Dataset size is taken as the number of positive TFRecord files.
# This assumes only positive examples are used (and presumably one example per file - TODO confirm).
positive_files = glob.glob(f"/{training_dir}/tfrecords/*_positive.tfrecords")
ds_size = len(positive_files)
ds = create_dataset(training_dir)

# Fractions of the dataset assigned to training and validation
train_split = 0.8
val_split = 0.2
# test_split=0.1
train_size = int(train_split * ds_size)
val_size = int(val_split * ds_size)

# First `train_size` elements train the model; the next `val_size` validate it
train_ds = ds.take(train_size)
held_out_ds = ds.skip(train_size)
val_ds = held_out_ds.take(val_size)
# test_ds = ds.skip(train_size).skip(val_size)


2021-12-10 17:28:40.466813: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-12-10 17:28:40.467965: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-12-10 17:28:40.786248: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:85:00.0 name: Tesla V100-SXM2-32GB-LS computeCapability: 7.0
coreClock: 1.44GHz coreCount: 80 deviceMemorySize: 31.72GiB deviceMemoryBandwidth: 772.48GiB/s
2021-12-10 17:28:40.790481: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 1 with properties: 
pciBusID: 0000:86:00.0 name: Tesla V100-SXM2-32GB-LS computeCapability: 7.0
coreClock: 1.44GHz coreCount: 80 deviceMemorySize: 31.72GiB deviceMemoryBandwidth: 772.48GiB/s
2021-12-10 17:28:40.790540: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0

In [6]:
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

# Mirror training across all available GPUs, using hierarchical-copy all-reduce
all_reduce = tf.distribute.HierarchicalCopyAllReduce()
strategy = tf.distribute.MirroredStrategy(cross_device_ops=all_reduce)
with strategy.scope():
    # TODO: I might want to explore without encoder weights again (especially if I get the augmentation working)
    # TODO: I could also explore the activations. 'swish' is a popular one but I'll need to renormalize my data first I think (-0.5)
    model = Unet(
        backbone,
        encoder_weights='imagenet',
        encoder_freeze=encoder_freeze,
        classes=1,
        activation='sigmoid',
    )

    # print(model.summary())

    # Build the optimizer inside the strategy scope, alongside the model
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer, loss=losses[loss], metrics=[iou_score])

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensor

In [7]:
from tensorflow.keras.callbacks import TensorBoard, ReduceLROnPlateau, ModelCheckpoint
# TensorBoard logging: histograms every 5 epochs, weight images on, graph off
tensorboard = TensorBoard(
    log_dir=tensorboard_dir,
    histogram_freq=5,
    write_graph=False,
    write_images=True,
)

# Scale the learning rate by 0.2 after 5 epochs without val_loss improvement (floor 5e-7)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=5e-7,
    verbose=1,
)

# Keep only the full model (not just weights) with the lowest val_loss seen so far
checkpoint_path = model_dir + '/model/checkpoint'
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
)

# TODO:
# Add an EarlyStopping callback (ModelCheckpoint is already configured above)

2021-12-10 17:28:52.809141: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2021-12-10 17:28:52.809190: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
2021-12-10 17:28:52.809237: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1365] Profiler found 2 GPUs
2021-12-10 17:28:52.809605: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcupti.so.11.0'; dlerror: libcupti.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.3/lib:/usr/local/cuda-11.3/lib64:/usr/local/cuda/compat
2021-12-10 17:28:53.034022: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcupti.so
2021-12-10 17:28:53.528191: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2021-12-10 17:28:53.537295: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1487] 

In [8]:
# def get_data_flow(data, labels, subset, batch_size=1):
#     # this is the augmentation configuration we will use for training
#     from keras.preprocessing.image import ImageDataGenerator
#     datagen = ImageDataGenerator(
#         shear_range=0.2,
#         zoom_range=0.2,
#         horizontal_flip=True,
#         validation_split=0.2)
#     generator = datagen.flow(
#         data, y=labels,
#         batch_size=batch_size if subset == 'training' else 1,
#         subset=subset)
#     return generator

def get_data_flow(data, labels, subset, batch_size=1):
    """Return an iterator of identically augmented (image, mask) batches.

    Two ``ImageDataGenerator`` instances with the same configuration and the
    same seed are used, one for the images and one for the masks, so that each
    image and its mask receive the same random shear / zoom / flip.

    Args:
        data: In-memory image array to augment.
            NOTE(review): Keras' array iterator expects rank-4 input
            (batch, height, width, channels) - confirm.
        labels: In-memory mask array aligned with ``data``.
            NOTE(review): masks without a channel axis would need one added
            before being passed here - confirm against the caller.
        subset: Unused; kept so existing callers keep working.
        batch_size: Number of samples per yielded batch.

    Returns:
        A ``zip`` iterator yielding ``(image_batch, mask_batch)`` tuples.
    """
    # Imported here because the notebook never imports it at module level
    from tensorflow.keras.preprocessing.image import ImageDataGenerator

    # we create two instances with the same arguments
    data_gen_args = dict(
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
    )
    image_datagen = ImageDataGenerator(**data_gen_args)
    mask_datagen = ImageDataGenerator(**data_gen_args)

    # No .fit() call: it is only required for featurewise normalisation / ZCA
    # whitening, none of which is enabled above.

    # The same seed for both flows keeps the image and mask transforms in sync.
    # `class_mode` is not passed: it belongs to flow_from_directory(), and
    # flow() rejects it as an unexpected keyword argument.
    seed = 1
    image_generator = image_datagen.flow(
        data,
        batch_size=batch_size,
        seed=seed)
    mask_generator = mask_datagen.flow(
        labels,
        batch_size=batch_size,
        seed=seed)
    # combine generators into one which yields image and masks
    train_generator = zip(image_generator, mask_generator)
    return train_generator

def augment_images(image_label, seed):
    """Apply the same random flips to an image and its mask.

    A left-right flip and an up-down flip are drawn independently. Within each
    flip the image and the mask share one seed, so they always stay aligned.

    Args:
        image_label: ``(image, label)`` pair; ``label`` has no channel axis.
        seed: Stateless seed of shape (2,), e.g. a ``(counter, counter)`` pair.

    Returns:
        The ``(image, label)`` pair after the random flips, with ``label``
        back in its original rank.
    """
    image, label = image_label

    # Derive one seed per flip type. Reusing a single seed for both flips would
    # make the two decisions identical (both flips or neither), so only the
    # identity and the 180-degree rotation could ever be produced.
    flip_seeds = tf.random.experimental.stateless_split(seed, num=2)
    lr_seed = flip_seeds[0, :]
    ud_seed = flip_seeds[1, :]

    # The flip ops need a channel axis; add one to the mask and strip it at the end
    label = label[..., tf.newaxis]

    # Random left-right flip (same seed => same decision for image and mask)
    image = tf.image.stateless_random_flip_left_right(image, seed=lr_seed)
    label = tf.image.stateless_random_flip_left_right(label, seed=lr_seed)

    # Random up-down flip, drawn independently of the left-right flip
    image = tf.image.stateless_random_flip_up_down(image, seed=ud_seed)
    label = tf.image.stateless_random_flip_up_down(label, seed=ud_seed)

    # Pad-and-random-crop and random brightness augmentations are currently disabled.
    return image, label[..., 0]

In [9]:
# import matplotlib.pyplot as plt

# def visualize(original, augmented):
#     fig = plt.figure()
#     plt.subplot(1,2,1)
#     plt.title('Original image')
#     plt.imshow(original)

#     plt.subplot(1,2,2)
#     plt.title('Augmented image')
#     plt.imshow(augmented)

# small_test, = test_ds.take(1)
# for i in range(3):
#     seed = (i, 0)
#     augmented_image, augmented_label = augment_images(small_test, seed)
#     print(augmented_label.shape)
#     visualize(small_test[0], augmented_image)
#     visualize(small_test[1], augmented_label)

In [10]:
if augment:
    # Pair every training example with a (counter, counter) tuple that
    # augment_images receives as its per-example stateless seed.
    counter = tf.data.experimental.Counter()
    seed_pairs = (counter, counter)
    seeded_train_ds = tf.data.Dataset.zip((train_ds, seed_pairs))
    train_ds = seeded_train_ds.map(augment_images, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Derive the batch sizes from the `batch_size` hyperparameter instead of literals.
# The training batch is scaled by the replica count (8 * 2 on the two-GPU run logged below).
global_batch_size = batch_size * strategy.num_replicas_in_sync
train_batches = train_ds.shuffle(1024).batch(global_batch_size).prefetch(tf.data.AUTOTUNE)
val_batches = val_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

history = model.fit(train_batches, validation_data=val_batches, verbose=1,
                    epochs=epochs, callbacks=[tensorboard, reduce_lr, model_checkpoint])

2021-12-10 17:28:53.737924: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-12-10 17:28:53.739609: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2195205000 Hz


Epoch 1/100
INFO:tensorflow:batch_all_reduce: 492 all-reduces with algorithm = hierarchical_copy, num_packs = 1
INFO:tensorflow:batch_all_reduce: 492 all-reduces with algorithm = hierarchical_copy, num_packs = 1


2021-12-10 17:30:56.737864: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2021-12-10 17:31:02.016145: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-12-10 17:31:02.698224: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11


      1/Unknown - 133s 133s/step - loss: 1.8987 - iou_score: 0.0123

2021-12-10 17:31:06.861139: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2021-12-10 17:31:06.861188: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.


      2/Unknown - 135s 2s/step - loss: 1.8204 - iou_score: 0.0132  

2021-12-10 17:31:09.265637: I tensorflow/core/profiler/lib/profiler_session.cc:71] Profiler session collecting data.
2021-12-10 17:31:09.298294: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1487] CUPTI activity buffer flushed
2021-12-10 17:31:09.802956: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:228]  GpuTracer has collected 7056 callback api events and 7044 activity events. 
2021-12-10 17:31:10.168109: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2021-12-10 17:31:10.574128: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: /lustre_scratch/duncanwp/tensorboard/20211210_172840_new_resnet152_bce_jaccard_loss/train/plugins/profile/2021_12_10_17_31_10
2021-12-10 17:31:10.724311: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to /lustre_scratch/duncanwp/tensorboard/20211210_172840_new_resnet152_bce_jaccard_loss/train/plugins/profile/2021_12_10



2021-12-10 17:34:37.054255: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /lustre_scratch/duncanwp/models/20211210_172840_new_resnet152_bce_jaccard_loss/model/checkpoint/assets
Epoch 2/100
  6/270 [..............................] - ETA: 2:10 - loss: 1.0650 - iou_score: 0.0156

In [None]:
# score = model.evaluate(test_ds.batch(8).prefetch(tf.data.AUTOTUNE), verbose=0)

# print('Test loss    :', score[0])
# print('Test accuracy:', score[1])

In [None]:
import os.path
# Export the Keras model as a SavedModel under <model_dir>/model/1 for TensorFlow Serving
export_dir = os.path.join(model_dir, 'model/1')
tf.saved_model.save(model, export_dir)