# Imports

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import math
import multiprocessing as mp
import os

from keras.applications.vgg16 import VGG16  #, preprocess_input
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.initializations import get_fans, normal
from keras.layers import Dense, Dropout, Flatten, GlobalAveragePooling2D, Input, merge
from keras.metrics import fmeasure, precision, recall
from keras.models import Model
from keras.optimizers import SGD
# from keras.preprocessing.image import ImageDataGenerator
from keras.regularizers import l2
from keras.utils.np_utils import to_categorical
import keras.backend as K
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import tensorflow as tf

# from breast_cancer import input_data
from preprocessing.image import ImageDataGenerator  # multiprocessing ImageDataGenerator

# Use a 10x10 default figure size for every matplotlib figure in this notebook.
plt.rcParams['figure.figsize'] = (10, 10)

# Settings

In [None]:
# os.environ['CUDA_VISIBLE_DEVICES'] = "0"

# Input geometry (square images of `size` x `size` x `channels`) and label count.
size = 224
channels = 3
classes = 3

# `p` / `val_p` pick the train / val image directories; `p` also tags the
# experiment name (as a percentage).
p = 0.01
val_p = 0.01

# Batching and hardware.
batch_size = 64  # 1/2 per GPU; for 2 GPUs, 64 has 1.2x systems speedup over 32
num_gpus = 2

# The experiment name encodes the settings that distinguish this run.
experiment = "vgg16-{p}%-{num_gpus}-gpu-{batch_size}-batch-size-sanity".format(
    p=p * 100,
    num_gpus=num_gpus,
    batch_size=batch_size,
)

# Setup experiment directory

In [None]:
def get_run_dir(path, new_run):
  """Create (or reuse) a numbered run directory under `path`.

  Runs live in sub-directories named "0", "1", "2", ...; the run number is
  derived from the number of entries currently present in `path`.

  # Arguments
      path: directory holding all runs of one experiment; created if missing.
      new_run: if true, start a fresh run numbered after the existing
          entries; otherwise continue the most recent run (run 0 when no
          run exists yet).
  # Returns
      Path of the run directory, which exists on return.
  """
  os.makedirs(path, exist_ok=True)
  num_experiments = len(os.listdir(path))
  if new_run:
    run = num_experiments  # run 0, 1, 2, ...
  else:
    # Continue training in the latest run.  `max` clamps the empty-directory
    # case to run 0; `min` here would always give 0 or -1.
    run = max(0, num_experiments - 1)
  run_dir = os.path.join(path, str(run))
  os.makedirs(run_dir, exist_ok=True)
  return run_dir

def get_experiment_dir(experiment, new_run=True):
  """Return a run directory for `experiment` under experiments/keras/.

  # Arguments
      experiment: experiment name, used as the directory name.
      new_run: forwarded unchanged to `get_run_dir`.
  # Returns
      Path of the run directory returned by `get_run_dir`.
  """
  return get_run_dir(os.path.join("experiments", "keras", experiment), new_run)

# Resolve (and create) the run directory for this experiment; `new_run` is
# left at its default of True.  Checkpoints and TensorBoard logs go here.
exp_dir = get_experiment_dir(experiment)
print(exp_dir)

# Create train & val data generators

In [None]:
def preprocess_input(x):
  """
  Preprocesses a tensor encoding a batch of images.

  Adapted from keras/applications/imagenet_utils.py

  The input is never modified: the work is done on a floating-point copy,
  so integer image batches (e.g. uint8) are accepted as well.

  # Arguments
      x: input Numpy tensor, 4D of shape (N, H, W, C), channels in RGB order.
  # Returns
      Preprocessed tensor: per-channel mean-centered with the channel axis
      reversed to BGR.  Floating-point inputs keep their dtype; any other
      dtype is converted to float32.
  """
  x = np.asarray(x)
  # `astype` returns a copy, so the caller's batch is left untouched.
  # Non-float dtypes are converted because the in-place float subtraction
  # below cannot be applied to integer arrays.
  dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float32
  x = x.astype(dtype)
  # Zero-center by subtracting mean pixel value per channel
  # based on means from a 50%, evenly-distributed sample.
  x[:, :, :, 0] -= 193.84669495
  x[:, :, :, 1] -= 144.05839539
  x[:, :, :, 2] -= 180.20623779
  x = x[:, :, :, ::-1]  # 'RGB'->'BGR'
  return x

# Multi-GPU exploitation
def split(x, num_splits):
  """Cut a batch into `num_splits` pieces of identical length.

  The batch is first divided with `np.array_split`; every piece is then
  truncated to `x.shape[0] // num_splits` rows, so a few examples may be
  dropped when the batch size is not a multiple of `num_splits`.

  # Arguments
      x: Numpy array whose first axis indexes examples.
      num_splits: number of pieces to produce.
  # Returns
      List of `num_splits` arrays, each with the same number of rows.
  """
  rows_per_split = x.shape[0] // num_splits
  trimmed = []
  for piece in np.array_split(x, num_splits):
    # Pieces may be one row longer than the floor size; cut them down.
    trimmed.append(piece[:rows_per_split])
  return trimmed

def gen_preprocessed_batch(batch_generator, num_gpus):
  """Wrap a batch generator so each batch is preprocessed and split per GPU.

  # Arguments
      batch_generator: iterable yielding (images, labels) batches.
      num_gpus: number of splits to cut every batch into.
  # Yields
      Tuple (image_splits, label_splits); each element is the list returned
      by `split`, with the images passed through `preprocess_input` first.
  """
  for images, labels in batch_generator:
    image_splits = split(preprocess_input(images), num_gpus)
    label_splits = split(labels, num_gpus)
    yield image_splits, label_splits

In [None]:
# Image directories follow the layout images/<stage>/<p>: the train set is
# selected by `p` and the validation set by `val_p`.
tr_save_dir = "images/{stage}/{p}".format(stage="train", p=p)
val_save_dir = "images/{stage}/{p}".format(stage="val", p=val_p)
print(tr_save_dir, val_save_dir)

In [None]:
# Create train & val image generators
try:
  # For interactive work, kill any existing pool.
  pool.terminate()
except NameError:
  # First execution of this cell: no pool exists yet.  Only this case is
  # ignored, so real failures (and Ctrl-C) are no longer swallowed.
  pass
# One worker pool is shared by the train and validation generators.
pool = mp.Pool(processes=8)
train_datagen = ImageDataGenerator(pool=pool)#, horizontal_flip=True, vertical_flip=True) #,
#                                    rotation_range=90, shear_range=0.1, zoom_range=0.5, fill_mode='reflect')
val_datagen = ImageDataGenerator(pool=pool)
# Both generators read class sub-directories and resize images to size x size.
train_generator_orig = train_datagen.flow_from_directory(tr_save_dir, batch_size=batch_size, target_size=(size, size))
val_generator_orig = val_datagen.flow_from_directory(val_save_dir, batch_size=batch_size, target_size=(size, size))

In [None]:
# Create train & val preprocessed generators
# Each wrapped generator yields (list of image splits, list of label splits)
# with one split per GPU, as produced by gen_preprocessed_batch.
train_generator = gen_preprocessed_batch(train_generator_orig, num_gpus)
val_generator = gen_preprocessed_batch(val_generator_orig, num_gpus)

## Get number of samples

In [None]:
# Actual number of examples.
tc = train_generator_orig.nb_sample
vc = val_generator_orig.nb_sample

# Effective number of examples for multi-GPU exploitation.
# Note: Multi-GPU exploitation for data parallelism splits mini-batches
# into a set of micro-batches to be run in parallel on each GPU, but
# Keras will view the set of micro-batches as a single batch with
# multiple sources of inputs (i.e. Keras will view a set of examples
# being run in parallel as a single example with multiple sources of
# inputs), so the effective number of samples will be divided by the
# number of GPUs.
# NOTE: the batch count is currently hard-coded to 100 per epoch; the
# commented-out expressions show the dataset-derived count it replaces, so
# `tc` / `vc` are only printed below and do not affect training length.
train_samples = int(batch_size / num_gpus * 100) # math.ceil(tc/batch_size))
val_samples = int(batch_size / num_gpus * 100) #math.ceil(vc/batch_size))

print(tc, vc)
print(train_samples, val_samples)

## Generate class weights for training

In [None]:
# Count the training examples per class label, then weight every class by
# (smallest class count / its own count) so the rarest class gets weight 1.
class_counts = np.bincount(train_generator_orig.classes)
class_weights = {
    label: weight
    for label, weight in zip(range(classes), class_counts.min() / class_counts)
}
print(class_counts)
print(class_weights)

## Plot random images (Optional)

In [None]:
def show_random_image(save_dir):
  """Display one randomly chosen image from a random class directory.

  A class id is drawn from 1..3 and a file is then drawn uniformly from
  `<save_dir>/<class id>`; the chosen path is printed before plotting.

  # Arguments
      save_dir: directory containing one sub-directory per class id.
  """
  class_id = np.random.randint(1, 4)  # upper bound is exclusive
  class_dir = os.path.join(save_dir, str(class_id))
  candidates = os.listdir(class_dir)
  chosen = candidates[np.random.randint(0, len(candidates))]
  fname = os.path.join(class_dir, chosen)
  print(fname)
  plt.imshow(Image.open(fname))

# show_random_image(tr_save_dir)

In [None]:
def plot(gen):
  """Show up to 36 images of the next batch from `gen` in a 6x6 grid.

  # Arguments
      gen: generator yielding (images, labels) batches; one batch is
          consumed.  Images are cast to uint8 for display and each label
          is written under its image.
  """
  rows, cols = 6, 6
  _, axes = plt.subplots(rows, cols)
  plt.setp(axes, xticks=[], yticks=[])
  plt.tight_layout()
  images, labels = next(gen)
  # Fill the grid row by row; cells beyond the batch size stay empty.
  shown = min(rows * cols, images.shape[0])
  for idx in range(shown):
    axis = axes[idx // cols][idx % cols]
    axis.imshow(images[idx].astype(np.uint8))
    axis.set_xlabel(labels[idx])

# Preview one batch from each raw (not preprocessed) generator.
# Note: every call pulls a batch from the generator via next().
plot(train_generator_orig)
plot(val_generator_orig)

# Training
1. Set up the pretrained VGG16 model with new input & output layers.
2. Train the new output layers (all others frozen).
3. Fine-tune [some subset of] the original layers.
4. Profit.

## Setup training metrics & callbacks

In [None]:
# Setup training metrics & callbacks
# Careful, TensorBoard callback could OOM with large validation set
# TODO: Add input images to TensorBoard output (maybe as a separate callback)
# TODO: Monitor size of input queues with callbacks
# Checkpoint file names are templated on validation loss and epoch number,
# and are written into the run directory alongside the TensorBoard logs.
model_filename = os.path.join(exp_dir, "{val_loss:.2f}-{epoch:02d}.hdf5")
checkpointer = ModelCheckpoint(model_filename)
tensorboard = TensorBoard(log_dir=exp_dir, write_graph=False)
callbacks = [checkpointer, tensorboard]
# NOTE: the evaluation cell later rebinds `metrics` to a dict of results;
# re-run this cell before compiling a model again in the same session.
metrics = ['accuracy', fmeasure, precision, recall]

## Setup VGG16 model

In [None]:
# Reset the Keras backend session so re-running this cell builds the model
# from a clean state.
K.clear_session()

# Custom final dense layer initializer
def my_init(shape, name=None):
  """Initialize weights from a Gaussian scaled by sqrt(1 / fan_in).

  # Arguments
      shape: shape of the weight tensor to create.
      name: optional name forwarded to the Keras `normal` initializer.
  # Returns
      The tensor produced by `normal(shape, scale, name=name)`.
  """
  # Only the fan-in is needed; the fan-out is discarded.
  fan_in, _ = get_fans(shape, dim_ordering=K.image_dim_ordering())
  scale = np.sqrt(1. / fan_in)
  return normal(shape, scale, name=name)

# Create model by replacing classifier of VGG16 model with new
# classifier specific to the breast cancer problem.
# The shared base model is built under the CPU device scope — presumably so
# that its weights are placed on the CPU and shared by all GPU replicas.
with tf.device("/cpu"):
  inputs = Input(shape=(size,size,channels))
  # Convolutional part of VGG16 only (include_top=False drops its classifier).
  vgg16 = VGG16(include_top=False)
  x = vgg16(inputs)
#   x = Flatten()(x)
  x = GlobalAveragePooling2D()(x)
#   x = Flatten(name='flatten')(x)
#   x = Dense(4096, activation='relu', name='fc1')(x)
#   x = Dense(4096, activation='relu', name='fc2')(x)
#   x = Dense(classes, activation='softmax', name='predictions')(x)
  # New softmax classifier over `classes` outputs, using the custom initializer.
  preds = Dense(classes, init=my_init, activation="softmax")(x)
  base_model = Model(input=inputs, output=preds, name="vgg16")

# Multi-GPU exploitation via a linear combination of GPU loss functions.
# One input/output pair is created per GPU; every pair calls the same
# `base_model`, so the resulting model has num_gpus inputs and outputs.
ins = []
outs = []
for i in range(num_gpus):
  with tf.device("/gpu:{}".format(i)):
    x = Input(shape=(size,size,channels))  # split of batch
    out = base_model(x)  # run split on shared model
    ins.append(x)
    outs.append(out)
model = Model(input=ins, output=outs)  # multi-GPU, data-parallel model

# Freeze all pre-trained VGG16 layers.
# This happens before compile, so only the new Dense layer is trained at first.
for layer in vgg16.layers:
  layer.trainable = False

# Compile model.
# Each output's loss is weighted by 1/num_gpus, so the total loss is the
# mean of the per-GPU losses.
# NOTE(review): Keras' SGD `decay` is a per-update, time-based decay
# (lr / (1 + decay * iterations)), not a multiplicative factor — confirm
# that 0.99 is intended.
optim = SGD(lr=0.1, momentum=0.9, decay=0.99, nesterov=True)
model.compile(optimizer=optim, loss="categorical_crossentropy",
              loss_weights=[1/num_gpus]*num_gpus, metrics=metrics)

In [None]:
# Explore model
# A dedicated loop name is used so the model-building variable `x` is not
# overwritten by this inspection loop.
for tensor in model.inputs + model.outputs + model.metrics_tensors + model.targets:
  print(tensor.name, tensor.device)  # check that tensor devices exploit multi-GPU

# List the VGG16 layers with their indices (the fine-tuning cell slices
# `vgg16.layers` by index).
for i, layer in enumerate(vgg16.layers):
  print(i, layer.name)

# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which would emit a stray "None").
base_model.summary()

In [None]:
# Visualize Model
# from IPython.display import SVG
# from keras.utils.visualize_util import model_to_dot
# SVG(model_to_dot(vgg16).create(prog='dot', format='svg'))

## Train new softmax classifier

In [None]:
# Dual-GPU speedup: ~1.7-1.8x
# Keras device placement improvements (metrics, losses) (no val or callbacks, full model):
#   batch_size=32,  2 gpus, 100 iters, no keras changes: 128s, 108s, 107s
#   batch_size=32,  2 gpus, 100 iters, w/ keras changes: 94s, 75s, 75s
#   batch_size=32,  1 gpu,  100 iters, w/ keras changes: 148s, 133s, 133s
#   batch_size=64,  2 gpus,  50 iters, w/ keras changes: 93s, 74s, 75s
#   batch_size=128, 2 gpus,  25 iters, w/ keras changes: 90s, 73s, 74s
# Train for `epochs` epoch(s) while every VGG16 layer is still frozen, so
# only the new classifier layer is updated.  `epochs` is reused by the
# fine-tuning cell as its starting epoch.
# NOTE(review): the model has one output per GPU; confirm that Keras applies
# this single flat `class_weight` dict to every output rather than ignoring it.
epochs = 1
model.fit_generator(train_generator, samples_per_epoch=train_samples,
                    validation_data=val_generator, nb_val_samples=val_samples,
                    nb_epoch=epochs, class_weight=class_weights, callbacks=callbacks,
                    max_q_size=8, nb_worker=1, pickle_safe=False)

## Fine-tune model

In [None]:
# Unfreeze some subset of the model and fine-tune by training slowly with low lr.
# NOTE(review): the inline comment below mentions "final 2 blocks", "exit
# flow" and "[11:]", which does not match the [15:] slice actually used —
# check the layer listing printed by the "Explore model" cell to confirm
# which layers this unfreezes.
for layer in vgg16.layers[15:]:  # unfreeze final 2 blocks + exit flow ([11:])
  layer.trainable = True

# Re-compile so the changed `trainable` flags take effect, with a much
# smaller learning rate than the first stage.
# NOTE(review): unlike the first compile, no `loss_weights` are passed here,
# so the total loss is presumably the plain sum of the per-GPU losses; the
# evaluation cell divides the total loss by num_gpus — confirm the two stay
# in agreement if either is changed.
optim = SGD(lr=0.0001, momentum=0.9)
model.compile(optimizer=optim, loss='categorical_crossentropy', metrics=metrics)

In [None]:
# Explore model
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which would emit a stray "None").
model.summary()

In [None]:
# Continue the epoch numbering where the previous training stage stopped and
# fine-tune for 15 more epochs.
# Note: `epochs` is updated in place, so re-running this cell advances the
# window by another 15 epochs.
initial_epoch = epochs
epochs = initial_epoch + 15
model.fit_generator(train_generator, samples_per_epoch=train_samples,
                    validation_data=val_generator, nb_val_samples=val_samples,
                    nb_epoch=epochs, initial_epoch=initial_epoch,
                    class_weight=class_weights, callbacks=callbacks,
                    max_q_size=8, nb_worker=1, pickle_safe=False)

## Evaluate model on validation set

In [None]:
# Evaluate the multi-output model on the validation generator and pair every
# returned value with its metric name.
raw_metrics = model.evaluate_generator(
    val_generator,
    val_samples=val_samples,
    max_q_size=8,
    nb_worker=1,
    pickle_safe=False,
)
labeled_metrics = list(zip(model.metrics_names, raw_metrics))

# Entries named exactly "loss" and entries whose name ends in "acc" are each
# summed and divided by the number of GPUs.
losses = [value for name, value in labeled_metrics if name == "loss"]
accuracies = [value for name, value in labeled_metrics if name.endswith("acc")]
loss, acc = (sum(values) / num_gpus for values in (losses, accuracies))

# Note: this rebinds `metrics`, which previously held the list of metric
# functions passed to compile().
metrics = dict(loss=loss, acc=acc)
print(labeled_metrics)
print(metrics)

## Save model

In [None]:
# Name the saved model after the validation accuracy and loss computed by
# the evaluation cell, then write it into the run directory.
filename = "{acc:.5}_acc_{loss:.5}_loss_model.hdf5".format(
    acc=metrics["acc"],
    loss=metrics["loss"],
)
model.save(os.path.join(exp_dir, filename))

# Cleanup

In [None]:
# Stop processes cleanly.  Otherwise, zombie processes will
# persist and hold onto GPU memory.
try:
  pool.terminate()
except NameError:
  # No pool was created in this session; nothing to stop.  Other errors
  # (and Ctrl-C) are deliberately not swallowed.
  pass
# Use a dedicated loop name: `p` is the data-fraction setting used by the
# directory and experiment-name cells and must not be overwritten with a
# Process object.
for child in mp.active_children():
  child.terminate()
# Notebook display: lists any child processes that are still alive.
mp.active_children()