In [0]:
!nvidia-smi # P100
!git clone https://github.com/klinime/Tiny_Imagenet_200.git

Thu May 14 00:33:23 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
!git clone https://github.com/klinime/Tiny_Imagenet_200.git
!pip install neural_structured_learning

fatal: destination path 'Tiny_Imagenet_200' already exists and is not an empty directory.


In [0]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
import neural_structured_learning as nsl

# Mount Google Drive inside the Colab runtime; PATH below points into it.
from google.colab import drive
drive.mount('/content/gdrive')
import pathlib
import os

# Silence TensorFlow deprecation warnings by flipping the private flag in
# TensorFlow's deprecation utility module.
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

# Root directory on Drive; main() writes 'checkpoint/' and 'logs' beneath it.
PATH = '/content/gdrive/My Drive/CS182_Project/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Data augmentation based on:
# https://www.wouterbulten.nl/blog/tech/data-augmentation-using-tensorflow-data-dataset/

def hflip(img: tf.Tensor) -> tf.Tensor:
  """Randomly mirror `img` horizontally via tf.image.random_flip_left_right."""
  flipped = tf.image.random_flip_left_right(img)
  return flipped

# interpolation pads 0 instead, undesired so not used
# import tensorflow_addons as tfa
# def random_rotate(img: tf.Tensor) -> tf.Tensor:
#   return tfa.image.rotate(img, tf.random.uniform([], -0.5, 0.5))

def random_crop(img: tf.Tensor) -> tf.Tensor:
  """Resize `img` to 16 pixels larger in height and width, then take a random
  crop that has the original shape."""
  original_shape = tf.shape(img)
  # Only the two leading (spatial) dimensions are enlarged.
  enlarged_size = original_shape[:2] + 16
  enlarged = tf.image.resize(img, enlarged_size)
  return tf.image.random_crop(enlarged, original_shape)

def color_jitter(img: tf.Tensor) -> tf.Tensor:
  """Apply random hue, saturation, brightness and contrast changes, in that
  order, and return the jittered image."""
  hue_shifted = tf.image.random_hue(img, 0.04)
  saturated = tf.image.random_saturation(hue_shifted, 0.4, 1.6)
  brightened = tf.image.random_brightness(saturated, 0.08)
  return tf.image.random_contrast(brightened, 0.7, 1.3)

def noise_salt_pepper(img: tf.Tensor) -> tf.Tensor:
  """Overwrite a random subset of tensor elements with 0/1 noise.

  A binary mask drawn with p=0.01 selects the elements to corrupt; each
  selected element takes the value of a second binary mask drawn with p=0.5.
  """
  shape = tf.shape(img)
  corrupt = K.random_binomial(shape=shape, p=0.01)
  salt = K.random_binomial(shape=shape, p=0.5)
  out = (1 - corrupt) * img + corrupt * salt
  # Issue: https://github.com/tensorflow/tensorflow/issues/24520
  out.set_shape([None, None, None])
  return out

def noise_gaussian(img: tf.Tensor) -> tf.Tensor:
  """Add zero-mean normal noise with variance 0.01 to every element of `img`."""
  stddev = tf.sqrt(0.01)
  noise = tf.random.normal(tf.shape(img), 0, stddev)
  out = img + noise
  # Issue: https://github.com/tensorflow/tensorflow/issues/24520
  out.set_shape([None, None, None])
  return out

def noise_poisson(img: tf.Tensor) -> tf.Tensor:
  """Replace each element of `img` with a scaled Poisson sample.

  Every element is multiplied by `scale`, used as the rate of a Poisson
  draw, and the sample is divided by `scale` again. `scale` is
  exp(floor(log(N))), where N is the total number of elements in `img`.

  Args:
    img: float image tensor.

  Returns:
    Tensor with the same shape as `img`; static shape is reset to rank 3.
  """
  num_values = tf.cast(tf.math.reduce_prod(tf.shape(img)), tf.float32)
  scale = tf.math.exp(tf.math.floor(tf.math.log(num_values)))
  # A Poisson rate must be non-negative. In prepare_for_training this runs
  # after noise_gaussian and before clipping, so `img` may contain negatives.
  rate = tf.maximum(img, 0.0) * scale
  # Remove only the leading sample axis; a bare tf.squeeze would also drop
  # any genuine size-1 image dimension (e.g. a single channel).
  out = tf.squeeze(tf.random.poisson([1], rate), axis=0) / scale
  # Issue: https://github.com/tensorflow/tensorflow/issues/24520
  out.set_shape([None, None, None])
  return out

# Image loading based on:
# https://www.tensorflow.org/tutorials/load_data/images

AUTOTUNE = tf.data.experimental.AUTOTUNE

def load_train_data(folder):
  """Build the labelled training dataset from a `<class>/images/<file>` tree.

  Args:
    folder: path of the training directory; each sub-directory is one class.

  Returns:
    Tuple `(class_names, train_labeled_ds, train_image_count)`:
    the sorted numpy array of class-directory names, a tf.data dataset of
    `(float32 image in [0, 1], integer class index)` pairs, and the number of
    `.JPEG` files found anywhere under `folder`.
  """
  train_dir = pathlib.Path(folder)
  train_image_count = len(list(train_dir.glob('**/*.JPEG')))
  train_list_ds = tf.data.Dataset.list_files(str(train_dir/'*/images/*'))
  print('Discovered {} images for training data.'.format(train_image_count))

  # Only directories are classes; a stray file in the training directory
  # would otherwise add a bogus class and shift every later label index.
  class_names = np.array(sorted(
      item.name for item in train_dir.glob('*') if item.is_dir()))

  def process_path(file_path):
    # convert the path to a list of path components
    parts = tf.strings.split(file_path, os.path.sep)
    # third to last is the class-directory
    label = parts[-3] == class_names

    # encode label from string to int index in class_names
    label = tf.argmax(tf.dtypes.cast(label, tf.int32), axis=0)
    # load the raw data from the file as a string
    img = tf.io.read_file(file_path)
    # convert the compressed string to a 3D uint8 tensor
    img = tf.image.decode_jpeg(img, channels=3)
    # use 'convert_image_dtype' to convert to floats in the [0, 1] range
    img = tf.image.convert_image_dtype(img, tf.float32)
    return img, label

  train_labeled_ds = train_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
  return class_names, train_labeled_ds, train_image_count

def load_val_data(folder, class_names):
  """Build the labelled validation dataset.

  Images come from `<folder>/images/`, sorted by the integer after the last
  underscore of the file name. Labels come from the second tab-separated
  column of `<folder>/val_annotations.txt`, one per line. The two streams are
  zipped positionally.
  NOTE(review): this assumes the annotation lines are listed in the same
  numeric order as the sorted image files -- confirm against the data.

  Args:
    folder: path of the validation directory.
    class_names: array of class names; a label is the index of its match.

  Returns:
    Tuple `(val_labeled_ds, val_image_count)`.
  """
  val_dir = pathlib.Path(folder)
  val_image_count = len(list(val_dir.glob('*/*.JPEG')))
  val_path = str(val_dir/'images')

  def image_index(path):
    # Parse the file name only ('val_123.JPEG' -> 123), so dots or
    # underscores in parent directory names cannot corrupt the sort key.
    return int(pathlib.Path(path).stem.rsplit('_', 1)[-1])

  val_list_ds = tf.data.Dataset.from_tensor_slices(
      sorted(['{}/{}'.format(val_path, item.name)
              for item in val_dir.glob('images/*')],
             key=image_index))
  val_label_ds = tf.data.TextLineDataset(str(val_dir/'val_annotations.txt'))
  val_label_ds = val_label_ds.map(
      lambda x: tf.argmax(tf.dtypes.cast(
          tf.strings.split(x, '\t')[1] == class_names, tf.int32), axis=0),
      num_parallel_calls=AUTOTUNE, deterministic=True)
  print('Discovered {} images for validation data.'.format(val_image_count))

  def process_images(file_path):
    # Read, decode to 3 channels, and convert to float32 in [0, 1].
    img = tf.io.read_file(file_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.convert_image_dtype(img, tf.float32)
    return img

  # deterministic=True keeps element order so images and labels stay aligned.
  val_img_ds = val_list_ds.map(
      process_images, num_parallel_calls=AUTOTUNE, deterministic=True)
  val_labeled_ds = tf.data.Dataset.zip((val_img_ds, val_label_ds))
  return val_labeled_ds, val_image_count

def prepare_for_training(ds, batch_size, cache=True, shuffle_buffer_size=1000):
  """Turn a dataset of (image, label) pairs into an endless augmented batch stream.

  Pipeline: optional cache -> random augmentations -> clip to [0, 1] ->
  per-image standardization -> shuffle -> repeat -> batch -> prefetch.

  Args:
    ds: dataset yielding `(img, label)`.
    batch_size: number of examples per batch.
    cache: falsy for no caching, a string for a file-backed cache, any other
      truthy value for the default `ds.cache()`.
    shuffle_buffer_size: buffer size handed to `ds.shuffle`.

  Returns:
    The prepared dataset.
  """
  # use '.cache(filename)' to cache preprocessing work for big datasets
  if cache:
    ds = ds.cache(cache) if isinstance(cache, str) else ds.cache()

  def randomly_apply(augment, probability):
    """Return a map function that applies `augment` with the given probability."""
    def mapper(img, label):
      should_apply = tf.random.uniform([], 0, 1) < probability
      return tf.cond(should_apply, lambda: augment(img), lambda: img), label
    return mapper

  # Geometric / colour augmentations fire half of the time each ...
  for augment in (hflip, random_crop, color_jitter):
    ds = ds.map(randomly_apply(augment, 0.5), num_parallel_calls=AUTOTUNE)
  # ... while each noise model fires a fifth of the time.
  for augment in (noise_salt_pepper, noise_gaussian, noise_poisson):
    ds = ds.map(randomly_apply(augment, 0.2), num_parallel_calls=AUTOTUNE)

  def clip(img, label):
    return tf.clip_by_value(img, 0, 1), label

  def standardize(img, label):
    return tf.image.per_image_standardization(img), label

  ds = ds.map(clip, num_parallel_calls=AUTOTUNE)
  ds = ds.map(standardize, num_parallel_calls=AUTOTUNE)

  ds = ds.shuffle(buffer_size=shuffle_buffer_size)
  ds = ds.repeat()
  ds = ds.batch(batch_size)
  # 'prefetch' fetches batches in the background while the model is training
  return ds.prefetch(buffer_size=AUTOTUNE)

def prepare_for_validation(ds, batch_size, cache=True, shuffle_buffer_size=1000):
  """Prepare (image, label) pairs for validation, with no augmentation.

  Pipeline: optional cache -> clip to [0, 1] -> per-image standardization ->
  shuffle -> batch -> prefetch. The dataset is not repeated.

  Args:
    ds: dataset yielding `(img, label)`.
    batch_size: number of examples per batch.
    cache: falsy for no caching, a string for a file-backed cache, any other
      truthy value for the default `ds.cache()`.
    shuffle_buffer_size: buffer size handed to `ds.shuffle`.

  Returns:
    The prepared dataset.
  """
  if cache:
    ds = ds.cache(cache) if isinstance(cache, str) else ds.cache()

  def clip(img, label):
    return tf.clip_by_value(img, 0, 1), label

  def standardize(img, label):
    return tf.image.per_image_standardization(img), label

  ds = ds.map(clip, num_parallel_calls=AUTOTUNE)
  ds = ds.map(standardize, num_parallel_calls=AUTOTUNE)
  ds = ds.shuffle(buffer_size=shuffle_buffer_size)
  ds = ds.batch(batch_size)
  return ds.prefetch(buffer_size=AUTOTUNE)

def prepare_for_testing(folder, class_names, batch_size):
  """Build the unlabelled, ordered test dataset of standardized image batches.

  Images come from `<folder>/images/`, sorted by the integer after the last
  underscore of the file name. Every map is deterministic, so output order
  follows that sort.

  Args:
    folder: path of the test directory.
    class_names: not used by this function; kept in the signature.
    batch_size: number of images per batch.

  Returns:
    Dataset of image batches (clipped to [0, 1], then standardized per image).
  """
  test_dir = pathlib.Path(folder)
  test_image_count = len(list(test_dir.glob('*/*.JPEG')))
  test_path = str(test_dir/'images')

  def image_index(path):
    # Parse the file name only ('test_123.JPEG' -> 123), so dots or
    # underscores in parent directory names cannot corrupt the sort key.
    return int(pathlib.Path(path).stem.rsplit('_', 1)[-1])

  test_list_ds = tf.data.Dataset.from_tensor_slices(
      sorted(['{}/{}'.format(test_path, item.name)
              for item in test_dir.glob('images/*')],
             key=image_index))
  print('Discovered {} images for test data.'.format(test_image_count))

  def process_images(file_path):
    # Read, decode to 3 channels, and convert to float32 in [0, 1].
    img = tf.io.read_file(file_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.convert_image_dtype(img, tf.float32)
    return img

  ds = test_list_ds.map(process_images, num_parallel_calls=AUTOTUNE, deterministic=True)
  ds = ds.map(lambda img: tf.clip_by_value(img, 0, 1),
              num_parallel_calls=AUTOTUNE, deterministic=True)
  ds = ds.map(lambda img: tf.image.per_image_standardization(img),
              num_parallel_calls=AUTOTUNE, deterministic=True)
  ds = ds.batch(batch_size)
  ds = ds.prefetch(buffer_size=AUTOTUNE)
  return ds

def prepare_data(params, folder):
  """Load train/val data and package it for adversarial-regularized training.

  Args:
    params: dict with 'batch_size' and, optionally, 'val_ratio' (default 10).
      Each validation pass covers 1/val_ratio of the validation batches.
    folder: dataset root; 'train/' and 'val/' are appended to it.

  Returns:
    Tuple `(class_names, train_data_adv, val_data_adv, steps_per_epoch,
    validation_steps)`. Both datasets yield dicts with keys 'image' and
    'label'; both step counts are Python ints.
  """
  class_names, train_data, train_count = load_train_data(folder + 'train/')
  val_data, val_count = load_val_data(folder + 'val/', class_names)
  batch_size = params['batch_size']
  # Honour the caller's setting rather than a hard-coded constant; fall back
  # to the previous value of 10 when the key is absent.
  val_ratio = params.get('val_ratio', 10)
  train_data = prepare_for_training(train_data, batch_size)
  val_data = prepare_for_validation(val_data, batch_size)

  def to_features(x, y):
    # Repackage the (image, label) pair as a feature dict.
    return {'image': x, 'label': y}

  train_data_adv = train_data.map(to_features, num_parallel_calls=AUTOTUNE)
  val_data_adv = val_data.map(to_features,
                              num_parallel_calls=AUTOTUNE, deterministic=True)
  # Step counts are integers by contract; np.ceil alone returns a float.
  steps_per_epoch = int(np.ceil(train_count / batch_size))
  validation_steps = int(np.ceil(val_count / batch_size / val_ratio))
  return class_names, train_data_adv, val_data_adv, steps_per_epoch, validation_steps

In [0]:
from tensorflow.python.keras import backend
from tensorflow.python.keras.applications import imagenet_utils
from tensorflow.python.keras.engine import training
from tensorflow.python.keras.utils import data_utils
from tensorflow.keras import regularizers

BASE_WEIGHT_PATH = ('https://storage.googleapis.com/tensorflow/'
                    'keras-applications/mobilenet_v2/')
layers = tf.keras.layers

def MobileNetV2(input_shape=None,
                alpha=1.0,
                include_top=True,
                weights='imagenet',
                pooling=None,
                l2_reg=1e-4):
  """Build a MobileNetV2 backbone whose convolutions carry an L2 penalty.

  Args:
    input_shape: optional image shape tuple without the batch axis. When
      None, a default size of 224 is handed to `obtain_input_shape`.
    alpha: width multiplier applied to the filter count of every block.
    include_top: forwarded to `obtain_input_shape` as `require_flatten` and
      used to pick the pretrained weight file.
      NOTE(review): no classification layers are built here, so the
      with-top weight file presumably does not match this topology -- confirm
      before using include_top=True together with weights='imagenet'.
    weights: 'imagenet' to download pretrained weights, None for no loading,
      or any other value, which is passed to `model.load_weights`.
    pooling: 'avg' or 'max' to append global pooling; anything else leaves
      the 4D feature map as the output.
    l2_reg: L2 regularization factor for every convolution kernel, including
      those inside the inverted residual blocks.

  Returns:
    A Keras model named 'mobilenetv2_<alpha>_<rows>'.
  """
  channels_first = backend.image_data_format() == 'channels_first'
  row_axis, col_axis = (1, 2) if channels_first else (0, 1)
  standard_sizes = [96, 128, 160, 192, 224]

  # Use the requested resolution as the default size when it is one of the
  # standard square sizes; otherwise (including input_shape=None) use 224.
  default_size = 224
  if input_shape is not None:
    rows = input_shape[row_axis]
    cols = input_shape[col_axis]
    if rows == cols and rows in standard_sizes:
      default_size = rows

  input_shape = imagenet_utils.obtain_input_shape(
      input_shape,
      default_size=default_size,
      min_size=32,
      data_format=backend.image_data_format(),
      require_flatten=include_top,
      weights=weights)

  rows = input_shape[row_axis]
  cols = input_shape[col_axis]

  img_input = layers.Input(shape=input_shape)

  channel_axis = 1 if channels_first else -1

  first_block_filters = _make_divisible(32 * alpha, 8)
  x = layers.ZeroPadding2D(
      padding=imagenet_utils.correct_pad(img_input, 3),
      name='Conv1_pad')(img_input)
  x = layers.Conv2D(
      first_block_filters,
      kernel_size=3,
      strides=(2, 2),
      padding='valid',
      use_bias=False,
      kernel_regularizer=regularizers.l2(l2_reg),
      name='Conv1')(
          x)
  x = layers.BatchNormalization(
      axis=channel_axis, epsilon=1e-3, momentum=0.999, name='bn_Conv1')(
          x)
  x = layers.ReLU(6., name='Conv1_relu')(x)

  # (filters, stride, expansion) of inverted residual blocks 0..16, in order.
  block_specs = [
      (16, 1, 1),
      (24, 2, 6), (24, 1, 6),
      (32, 2, 6), (32, 1, 6), (32, 1, 6),
      (64, 2, 6), (64, 1, 6), (64, 1, 6), (64, 1, 6),
      (96, 1, 6), (96, 1, 6), (96, 1, 6),
      (160, 2, 6), (160, 1, 6), (160, 1, 6),
      (320, 1, 6),
  ]
  for block_id, (filters, stride, expansion) in enumerate(block_specs):
    x = _inverted_res_block(
        x, filters=filters, alpha=alpha, stride=stride, expansion=expansion,
        block_id=block_id, l2_reg=l2_reg)

  # no alpha applied to last conv as stated in the paper:
  # if the width multiplier is greater than 1 we
  # increase the number of output channels
  if alpha > 1.0:
    last_block_filters = _make_divisible(1280 * alpha, 8)
  else:
    last_block_filters = 1280

  x = layers.Conv2D(
      last_block_filters, kernel_size=1, use_bias=False, name='Conv_1',
      kernel_regularizer=regularizers.l2(l2_reg))(
          x)
  x = layers.BatchNormalization(
      axis=channel_axis, epsilon=1e-3, momentum=0.999, name='Conv_1_bn')(
          x)
  x = layers.ReLU(6., name='out_relu')(x)

  if pooling == 'avg':
    x = layers.GlobalAveragePooling2D()(x)
  elif pooling == 'max':
    x = layers.GlobalMaxPooling2D()(x)

  # Create model.
  model = training.Model(
      img_input, x, name='mobilenetv2_%0.2f_%s' % (alpha, rows))

  # Load weights.
  if weights == 'imagenet':
    # Weight files are only named for the standard square sizes. For any
    # other (or unspecified) resolution fall back to the 224 file, as the
    # stock Keras application does, rather than requesting a missing file.
    if rows == cols and rows in standard_sizes:
      weights_size = rows
    else:
      weights_size = 224
    suffix = '' if include_top else '_no_top'
    model_name = ('mobilenet_v2_weights_tf_dim_ordering_tf_kernels_' +
                  str(alpha) + '_' + str(weights_size) + suffix + '.h5')
    weights_path = data_utils.get_file(
        model_name, BASE_WEIGHT_PATH + model_name, cache_subdir='models')
    model.load_weights(weights_path)
  elif weights is not None:
    model.load_weights(weights)

  return model


def _inverted_res_block(inputs, expansion, stride, alpha, filters, block_id,
                        l2_reg=1e-4):
  """Inverted ResNet block: 1x1 expand -> 3x3 depthwise -> 1x1 project.

  Args:
    inputs: input feature-map tensor.
    expansion: multiplier on the input channel count for the expand conv.
    stride: stride of the depthwise convolution (1 or 2).
    alpha: width multiplier applied to `filters`.
    filters: nominal number of output channels before `alpha` and rounding.
    block_id: block index, used in layer names; block 0 has no expand stage.
    l2_reg: L2 regularization factor for the conv / depthwise kernels. It is
      an explicit argument so the block does not rely on a global `l2_reg`.

  Returns:
    The block output, with a residual connection added when the input and
    output channel counts match and the stride is 1.
  """
  channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1

  in_channels = backend.int_shape(inputs)[channel_axis]
  pointwise_conv_filters = int(filters * alpha)
  pointwise_filters = _make_divisible(pointwise_conv_filters, 8)
  x = inputs
  prefix = 'block_{}_'.format(block_id)

  if block_id:
    # Expand
    x = layers.Conv2D(
        expansion * in_channels,
        kernel_size=1,
        padding='same',
        use_bias=False,
        activation=None,
        kernel_regularizer=regularizers.l2(l2_reg),
        name=prefix + 'expand')(
            x)
    x = layers.BatchNormalization(
        axis=channel_axis,
        epsilon=1e-3,
        momentum=0.999,
        name=prefix + 'expand_BN')(
            x)
    x = layers.ReLU(6., name=prefix + 'expand_relu')(x)
  else:
    prefix = 'expanded_conv_'

  # Depthwise
  if stride == 2:
    # Explicit padding, paired with 'valid' padding in the strided conv below.
    x = layers.ZeroPadding2D(
        padding=imagenet_utils.correct_pad(x, 3),
        name=prefix + 'pad')(x)
  x = layers.DepthwiseConv2D(
      kernel_size=3,
      strides=stride,
      activation=None,
      use_bias=False,
      padding='same' if stride == 1 else 'valid',
      depthwise_regularizer=regularizers.l2(l2_reg),
      name=prefix + 'depthwise')(
          x)
  x = layers.BatchNormalization(
      axis=channel_axis,
      epsilon=1e-3,
      momentum=0.999,
      name=prefix + 'depthwise_BN')(
          x)

  x = layers.ReLU(6., name=prefix + 'depthwise_relu')(x)

  # Project
  x = layers.Conv2D(
      pointwise_filters,
      kernel_size=1,
      padding='same',
      use_bias=False,
      activation=None,
      kernel_regularizer=regularizers.l2(l2_reg),
      name=prefix + 'project')(
          x)
  x = layers.BatchNormalization(
      axis=channel_axis,
      epsilon=1e-3,
      momentum=0.999,
      name=prefix + 'project_BN')(
          x)

  if in_channels == pointwise_filters and stride == 1:
    return layers.Add(name=prefix + 'add')([inputs, x])
  return x

def _make_divisible(v, divisor, min_value=None):
  if min_value is None:
    min_value = divisor
  new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
  # Make sure that round down does not go down by more than 10%.
  if new_v < 0.9 * v:
    new_v += divisor
  return new_v

def create_model(name, img_dim, up_ratio, num_classes):
  """Assemble the classifier: upsample -> frozen MobileNetV2 -> dense logits.

  Args:
    name: name of the returned Keras model.
    img_dim: side length of the square RGB input image.
    up_ratio: integer factor by which the input is upsampled before it is
      fed to the backbone.
    num_classes: width of the final 'prediction' dense layer.

  Returns:
    A Keras model whose input is named 'image'.
  """
  image_in = keras.Input(
      shape=(img_dim, img_dim, 3), dtype='float32', name='image')
  upsample_layer = keras.layers.UpSampling2D(up_ratio, name='upsample')

  backbone_dim = img_dim * up_ratio
  backbone = MobileNetV2(
      input_shape=(backbone_dim, backbone_dim, 3),
      include_top=False,
      pooling='avg'
  )
  backbone.summary()
  # The backbone starts fully frozen.
  backbone.trainable = False

  head = keras.layers.Dense(num_classes, name='prediction')
  upsampled = upsample_layer(image_in)
  features = backbone(upsampled)
  logits = head(features)
  return keras.Model(inputs=image_in, outputs=logits, name=name)

In [0]:
def _compile_adv_model(adv_model, learning_rate):
  """(Re)compile `adv_model` with SGD (momentum 0.9) at `learning_rate`."""
  adv_model.compile(
      optimizer=keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9),
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
      metrics=['sparse_categorical_crossentropy', 'sparse_categorical_accuracy',
               'sparse_top_k_categorical_accuracy'])


def main(params):
  """Train the adversarially regularized classifier in three stages.

  Stage 0 trains with the whole backbone frozen at `lr`. Stage 1 unfreezes
  the last 12 backbone layers at `lr / 10`. Stage 2 additionally unfreezes
  backbone layers -30..-12 at `lr / 100`. Checkpoints and TensorBoard logs
  are written under PATH. Early stopping on the training loss is used in
  stages 0 and 1 only.

  Args:
    params: dict with keys 'img_dim', 'lr', 'batch_size', 'adv_multiplier',
      'adv_step_size', 'adv_grad_norm', 'init_epochs', 'ft0_epochs' and
      'ft1_epochs'.

  Returns:
    The trained `nsl.keras.AdversarialRegularization` model.
  """
  tf.random.set_seed(0) # is not deterministic though
  classes, train_data_adv, val_data_adv, steps_per_epoch, validation_steps = \
      prepare_data(params, 'Tiny_Imagenet_200/data/tiny-imagenet-200/')
  adv_config = nsl.configs.make_adv_reg_config(
      multiplier=params['adv_multiplier'],
      adv_step_size=params['adv_step_size'],
      adv_grad_norm=params['adv_grad_norm']
  )
  up_ratio = 2
  base_model = create_model(
      'base_model', params['img_dim'], up_ratio, len(classes))
  base_model.summary()
  adv_model = nsl.keras.AdversarialRegularization(
      base_model, label_keys=['label'], adv_config=adv_config)

  ckpt_callback = keras.callbacks.ModelCheckpoint(PATH + 'checkpoint/ckpt_{epoch:03d}')
  tsb_callback = keras.callbacks.TensorBoard(log_dir=PATH+'logs')
  stop_callback = keras.callbacks.EarlyStopping(
      monitor='loss', min_delta=0.001, patience=2, mode='min')

  def run_stage(initial_epoch, max_epochs, callbacks):
    """Fit for at most `max_epochs` more epochs; return the number run."""
    history = adv_model.fit(
        train_data_adv,
        initial_epoch=initial_epoch,
        epochs=initial_epoch + max_epochs,
        steps_per_epoch=steps_per_epoch,
        validation_data=val_data_adv,
        validation_steps=validation_steps,
        callbacks=callbacks,
        verbose=2
    )
    print(history.history)
    return len(history.history['loss'])

  # Stage 0: backbone frozen, only the prediction head trains.
  _compile_adv_model(adv_model, params['lr'])
  epochs_trained = run_stage(
      0, params['init_epochs'], [ckpt_callback, tsb_callback, stop_callback])

  # The backbone is named 'mobilenetv2_<alpha>_<rows>'. Derive the name from
  # the configured input size rather than hard-coding the 128-pixel case.
  v2_model = base_model.get_layer(
      name='mobilenetv2_1.00_{}'.format(params['img_dim'] * up_ratio))

  # Stage 1: unfreeze the last 12 backbone layers.
  v2_model.trainable = True
  finetune_layer_0 = -12
  print('\n\nStart finetuning at {}'.format(v2_model.layers[finetune_layer_0].name))
  for layer in v2_model.layers[:finetune_layer_0]:
    layer.trainable = False
  _compile_adv_model(adv_model, params['lr'] / 10)
  base_model.summary()
  epochs_trained += run_stage(
      epochs_trained, params['ft0_epochs'],
      [ckpt_callback, tsb_callback, stop_callback])

  # Stage 2: additionally unfreeze backbone layers -30..-12 (no early stop).
  finetune_layer_1 = -30
  print('\n\nStart finetuning at {}'.format(v2_model.layers[finetune_layer_1].name))
  for layer in v2_model.layers[finetune_layer_1:finetune_layer_0]:
    layer.trainable = True
  _compile_adv_model(adv_model, params['lr'] / 100)
  base_model.summary()
  epochs_trained += run_stage(
      epochs_trained, params['ft1_epochs'], [ckpt_callback, tsb_callback])

  # A further "finetune all" stage is intentionally disabled. It would
  # unfreeze v2_model.layers[:finetune_layer_1], recompile at lr / 1000 and
  # train for params['ft2_epochs'] without validation.

  return adv_model

In [0]:
# Entry point: hyperparameters for the run, then launch training.
if __name__ == '__main__':
  params = dict(
      img_dim=64,
      lr=0.01,
      batch_size=128,
      val_ratio=10,
      adv_multiplier=0.2,
      adv_step_size=0.2,
      adv_grad_norm='infinity',
      # max number of epochs to train for each stage of finetuning
      init_epochs=100, # 108s per epoch, P100 batch_size 128
      ft0_epochs=100,  # 111s per epoch, P100 batch_size 128
      ft1_epochs=100,  # 115s per epoch, P100 batch_size 128
      # ft2_epochs=100,
  )
  main(params)

# checkpoint_v2
# Epoch  78/144, val_loss: 3.1419 - val_sparse_categorical_accuracy: 0.4814 - val_sparse_top_k_categorical_accuracy: 0.7217
# Epoch  90/144, val_loss: 3.1792 - val_sparse_categorical_accuracy: 0.4814 - val_sparse_top_k_categorical_accuracy: 0.7168

# checkpoint_v2reg
# Epoch  91/154, val_loss: 7.5040 - val_sparse_categorical_accuracy: 0.5000 - val_sparse_top_k_categorical_accuracy: 0.7412
# Epoch  92/154, val_loss: 7.5210 - val_sparse_categorical_accuracy: 0.5010 - val_sparse_top_k_categorical_accuracy: 0.7285
# Epoch 113/154, val_loss: 7.5302 - val_sparse_categorical_accuracy: 0.4814 - val_sparse_top_k_categorical_accuracy: 0.7461
# Epoch 154/154, val_loss: 7.4702 - val_sparse_categorical_accuracy: 0.4941 - val_sparse_top_k_categorical_accuracy: 0.7432


Discovered 100000 images for training data.
Discovered 10000 images for validation data.
Model: "mobilenetv2_1.00_128"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 128, 128, 3) 0                                            
__________________________________________________________________________________________________
Conv1_pad (ZeroPadding2D)       (None, 129, 129, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
Conv1 (Conv2D)                  (None, 64, 64, 32)   864         Conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_Conv1 (BatchNormalization)   (None, 64, 64, 32)   128         Conv1[0][0]             