In [1]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [11]:
import tensorflow as tf
from tensorboard.plugins.hparams import api as hp
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
import numpy as np
import time

In [3]:
# Clear any logs from previous runs
import shutil
shutil.rmtree('logs')

## Initial setup

In [4]:
def build_vgg16():
    model = Sequential()
    
    # Block 1
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=img_shape))
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    # Block 2
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    # Block 3
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    # Block 4
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    # Block 5
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(Conv2D(512, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))

    # Flatten the output and create fully connected layers
    model.add(Flatten())
    model.add(Dense(4096, activation='relu'))
    model.add(Dense(4096, activation='relu'))
#     model.add(tf.keras.layers.Dropout(dropout))
    model.add(Dense(3, activation='softmax'))  
    
    return model

- Load dataset

In [5]:
batch_size = 16
img_height = 184
img_width = 216
img_size = (img_height, img_width)
img_shape = img_size + (3,)
ds_path = r"/drive0-storage/Gracia/dataset_1"

with tf.device("CPU"):
    train_ds = tf.keras.utils.image_dataset_from_directory(ds_path,
                                            validation_split = 0.2,
                                            subset = "training",
                                            seed = 123,
                                            image_size = img_size,
                                            batch_size = batch_size)

    val_ds = tf.keras.utils.image_dataset_from_directory(ds_path,
                                          validation_split = 0.2,
                                          subset = "validation",
                                          seed = 123,
                                          image_size = img_size,
                                          batch_size = batch_size)

Found 21408 files belonging to 3 classes.
Using 17127 files for training.
Found 21408 files belonging to 3 classes.
Using 4281 files for validation.


In [6]:
with tf.device("CPU"):
    AUTOTUNE = tf.data.AUTOTUNE

    val_batches = tf.data.experimental.cardinality(val_ds)
    test_dataset = val_ds.take(val_batches // 2)
    validation_dataset = val_ds.skip(val_batches // 2)

    train_dataset = train_ds.prefetch(buffer_size = AUTOTUNE)
    validation_dataset = validation_dataset.prefetch(buffer_size = AUTOTUNE)
    test_dataset = test_dataset.prefetch(buffer_size = AUTOTUNE)

* Hyperparameter setup

In [19]:
HP_LEARNING_RATE = hp.HParam('learning_rate', hp.Discrete([1e-5, 1e-4, 1e-3]))
HP_BATCH_SIZE = hp.HParam('batch_size', hp.Discrete([16, 32, 64]))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam', 'sgd']))

METRIC_ACCURACY = 'accuracy'
# METRIC_SPECIFICITY = 'specificity'
# METRIC_SENSITIVITY = 'sensitivity'
# METRIC_RECALL = 'recall'

with tf.summary.create_file_writer('vgg_logs/hparam_tuning').as_default():
    hp.hparams_config(
        hparams=[HP_LEARNING_RATE, HP_BATCH_SIZE, HP_OPTIMIZER],
        metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy')])
#                  hp.Metric(METRIC_SPECIFICITY, display_name='Specificity'),
#                  hp.Metric(METRIC_SENSITIVITY, display_name='Sensitivity'),
#                  hp.Metric(METRIC_RECALL, display_name='Recall')]
#     )

## Define training & run function

In [27]:
def model_training(hparams):
    model = build_vgg16()
    
    # Compile model
    optimizer = hparams[HP_OPTIMIZER]
    LR = hparams[HP_LEARNING_RATE]
    if optimizer == "adam":
        optimizer = tf.optimizers.Adam(learning_rate=LR)
    elif optimizer == "sgd":
        optimizer = tf.optimizers.SGD(learning_rate=LR)
        
    model.compile(optimizer=optimizer,
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False),
                  metrics=['accuracy'])
    
    # Train model
    EPOCH = 8

    model.fit(train_dataset,
        validation_data = validation_dataset,
        epochs = EPOCH,
        batch_size=hparams[HP_BATCH_SIZE],
        callbacks=[tf.keras.callbacks.TensorBoard(logdir),
                   hp.KerasCallback(logdir, hparams)]
        )
    _, test_accuracy = model.evaluate(test_dataset)
    return test_accuracy

In [26]:
def run(run_dir, hparams):
    st = time.time()
    with tf.summary.create_file_writer(run_dir).as_default():
        hp.hparams(hparams)
        accuracy = model_training(hparams)
        tf.summary.scalar(METRIC_ACCURACY, accuracy, step=1)
    et = time.time()
    runtime = et - st
    print(f'total runtime: {runtime:.3f} s\n')

In [28]:
session_num = 0
logdir = r'vgg_logs/hparam_tuning'

for learning_rate in HP_LEARNING_RATE.domain.values:
    for optimizer in HP_OPTIMIZER.domain.values:
        for batch_size in HP_BATCH_SIZE.domain.values:
            hparams = {
                      HP_LEARNING_RATE: learning_rate,
                      HP_OPTIMIZER: optimizer,
                      HP_BATCH_SIZE: batch_size
                      }
            run_name = "run-%d" % session_num
            print('--- Starting trial: %s' % run_name)
            print({h.name: hparams[h] for h in hparams})
            run('vgg_logs/hparam_tuning/' + run_name, hparams)
            session_num += 1

--- Starting trial: run-0
{'learning_rate': 1e-05, 'optimizer': 'adam', 'batch_size': 16}
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
total runtime: 795.330 s

--- Starting trial: run-1
{'learning_rate': 1e-05, 'optimizer': 'adam', 'batch_size': 32}
Epoch 1/8


2023-06-09 15:00:12.290713: W tensorflow/core/common_runtime/bfc_allocator.cc:290] Allocator (GPU_0_bfc) ran out of memory trying to allocate 310.64MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


UnknownError: Graph execution error:

Detected at node 'gradient_tape/sequential_8/conv2d_105/Conv2D/Conv2DBackpropInput' defined at (most recent call last):
    File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/usr/local/lib/python3.8/dist-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/usr/local/lib/python3.8/dist-packages/traitlets/config/application.py", line 978, in launch_instance
      app.start()
    File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/usr/local/lib/python3.8/dist-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/usr/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
      self._run_once()
    File "/usr/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
      handle._run()
    File "/usr/lib/python3.8/asyncio/events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/usr/local/lib/python3.8/dist-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/usr/local/lib/python3.8/dist-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 2885, in run_cell
      result = self._run_cell(
    File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 2940, in _run_cell
      return runner(coro)
    File "/usr/local/lib/python3.8/dist-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3139, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3318, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3378, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_82590/1010684322.py", line 15, in <module>
      run('vgg_logs/hparam_tuning/' + run_name, hparams)
    File "/tmp/ipykernel_82590/1629040698.py", line 5, in run
      accuracy = model_training_adam(hparams)
    File "/tmp/ipykernel_82590/3344560709.py", line 19, in model_training_adam
      model.fit(train_dataset,
    File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 997, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/usr/local/lib/python3.8/dist-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 576, in minimize
      grads_and_vars = self._compute_gradients(
    File "/usr/local/lib/python3.8/dist-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 634, in _compute_gradients
      grads_and_vars = self._get_gradients(
    File "/usr/local/lib/python3.8/dist-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 510, in _get_gradients
      grads = tape.gradient(loss, var_list, grad_loss)
Node: 'gradient_tape/sequential_8/conv2d_105/Conv2D/Conv2DBackpropInput'
CUDNN failed to allocate the scratch space for the runner or to find a working no-scratch runner.
	 [[{{node gradient_tape/sequential_8/conv2d_105/Conv2D/Conv2DBackpropInput}}]] [Op:__inference_train_function_149778]

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## Start runs

## Visualize in TensorBoard

In [3]:
%tensorboard --logdir vgg_logs/hparam_tuning