# Imports

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import math
import os

from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.initializations import get_fans, normal
from keras.layers import Dense, Dropout, Flatten, GlobalAveragePooling2D, Input, merge
from keras.models import Model
from keras.optimizers import SGD
from keras.preprocessing.image import ImageDataGenerator
from keras.regularizers import l2
from keras.utils.np_utils import to_categorical
import keras.backend as K
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import tensorflow as tf

# from breast_cancer import input_data

plt.rcParams['figure.figsize'] = (10, 10)

# Settings

In [None]:
# os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# c = tf.ConfigProto()
# c.gpu_options.visible_device_list="0"
# sess = tf.Session(config=c)
# K.set_session(sess)

In [None]:
# Global settings shared by the cells below.
size = 256  # height & width of the model's input images
channels = 3  # number of channels in the model's input images
classes = 3  # number of output classes of the softmax classifier
p = 0.01  # training sample identifier; used in the train data path & experiment name
val_p = 0.01  # validation sample identifier; used in the val data path
batch_size = 128  # full batch size; each batch is split evenly across the GPUs
num_gpus = 2  # number of GPU towers the model is replicated on

In [None]:
# Experiment name; also used as the directory name that holds this experiment's runs.
experiment = "resnet50-{p}%-{num_gpus}-gpu-{batch_size}-batch-size-sanity".format(
    p=p, num_gpus=num_gpus, batch_size=batch_size)

# Setup experiment directory

In [None]:
def get_run_dir(path):
  """Create a new, uniquely numbered directory for this training run.

  Runs are stored as integer-named entries of `path` (0, 1, 2, ...).  The new
  run index is one greater than the largest existing numeric entry, so
  deleting an old run or having stray, non-numeric files in `path` cannot
  make the new index collide with an existing run directory.

  # Arguments
      path: parent directory holding all runs; created if missing.
  # Returns
      Path of the newly created run directory.
  """
  os.makedirs(path, exist_ok=True)
  # Only purely numeric names count as previous runs.
  previous_runs = [int(name) for name in os.listdir(path) if name.isdecimal()]
  run = max(previous_runs) + 1 if previous_runs else 0  # run 0, 1, 2, ...
  run_dir = os.path.join(path, str(run))
  os.makedirs(run_dir)
  return run_dir

def get_experiment_dir(experiment):
  """Make and return a fresh run directory under experiments/keras/<experiment>."""
  return get_run_dir(os.path.join("experiments", "keras", experiment))

# Create this run's output directory up front and show where it is.
exp_dir = get_experiment_dir(experiment)
print(exp_dir)

# Create train & val data generators

In [None]:
def preprocess_input(x):
  """
  Preprocesses a tensor encoding a batch of images.

  Adapted from keras/applications/imagenet_utils.py

  The input is left untouched: all work is done on a floating point copy,
  so callers can keep using their array and integer images are accepted.

  # Arguments
      x: input Numpy tensor, 4D of shape (N, H, W, C), channels in RGB order.
  # Returns
      Preprocessed tensor of the same shape, zero-centered per channel and
      with channels in BGR order.  Floating point inputs keep their dtype;
      any other dtype is converted to float32.
  """
  x = np.asarray(x)
  dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float32
  x = x.astype(dtype)  # `astype` copies, so the caller's array is never mutated
  # Zero-center by subtracting mean pixel value per channel
  # based on means from a 50%, evenly-distributed sample.
  x[:, :, :, 0] -= 193.84669495
  x[:, :, :, 1] -= 144.05839539
  x[:, :, :, 2] -= 180.20623779
  return x[:, :, :, ::-1]  # 'RGB'->'BGR'

# Multi-GPU exploitation
def split(x, num_splits):
  """Partition a batch along its first axis into `num_splits` equal pieces.

  Examples that would make the pieces uneven are dropped, so every returned
  piece holds exactly `x.shape[0] // num_splits` examples.
  """
  per_split = x.shape[0] // num_splits
  pieces = []
  for chunk in np.array_split(x, num_splits):
    # `array_split` may hand out chunks one example longer; trim them.
    pieces.append(chunk[:per_split])
  return pieces

def gen_preprocessed_batch(batch_generator, num_gpus):
  """Wrap a batch generator: preprocess the inputs, then split x and y per GPU.

  Yields a pair of lists, each with `num_gpus` entries: the preprocessed
  input splits and the matching label splits.
  """
  for images, labels in batch_generator:
    image_splits = split(preprocess_input(images), num_gpus)
    label_splits = split(labels, num_gpus)
    yield image_splits, label_splits

In [None]:
# Directories the train & val images are read from: images/<stage>/<sample>.
tr_save_dir = "images/{stage}/{p}".format(stage="train", p=p)
val_save_dir = "images/{stage}/{p}".format(stage="val", p=val_p)
print(tr_save_dir, val_save_dir)

In [None]:
# Augmentation is currently disabled; the augmented variant is kept for reference:
# train_datagen = ImageDataGenerator(rotation_range=360, shear_range=0.5, zoom_range=0.5,
#                                    horizontal_flip=True, vertical_flip=True)
train_datagen = ImageDataGenerator()
val_datagen = ImageDataGenerator()
# Only the directory & batch size are passed, so image size and label encoding
# come from `flow_from_directory` defaults — TODO confirm they match `size`.
train_generator_orig = train_datagen.flow_from_directory(tr_save_dir, batch_size=batch_size)
val_generator_orig = val_datagen.flow_from_directory(val_save_dir, batch_size=batch_size)

In [None]:
# Create train & val preprocessed generators.
# Each yielded batch is preprocessed and split into `num_gpus` pieces.
train_generator = gen_preprocessed_batch(train_generator_orig, num_gpus)
val_generator = gen_preprocessed_batch(val_generator_orig, num_gpus)

## Get number of samples

In [None]:
# Sample counts reported by the directory generators.
tc = train_generator_orig.nb_sample  # train count
vc = val_generator_orig.nb_sample  # val count
print(tc, vc)

## Generate class weights for training

In [None]:
# Number of training examples per class index.
class_counts = np.bincount(train_generator_orig.classes)
# Weight each class by (smallest class count / its own count): the rarest
# class gets weight 1.0 and more frequent classes get proportionally less.
class_weights = dict(zip(range(classes), min(class_counts) / class_counts))
print(class_counts)
print(class_weights)

# Plot random images (Optional)

In [None]:
def show_random_image(save_dir):
  """Display a randomly chosen image from a randomly chosen class folder.

  # Arguments
      save_dir: directory containing the class subdirectories "1", "2"
          and "3", each holding image files.
  """
  # Import PIL's Image locally under a distinct name: the graph-visualization
  # cell in this notebook imports `IPython.display.Image`, which rebinds the
  # module-level name `Image` to a class without an `open` method.
  from PIL import Image as PILImage
  c = np.random.randint(1, 4)  # class folder 1, 2 or 3 (upper bound is exclusive)
  class_dir = os.path.join(save_dir, str(c))
  files = os.listdir(class_dir)
  i = np.random.randint(0, len(files))
  fname = os.path.join(class_dir, files[i])
  print(fname)
  img = PILImage.open(fname)
  plt.imshow(img)

In [None]:
# show_random_image(tr_save_dir)

In [None]:
def plot_images(img_gen, batch_size):
  """Plot images from the first batch of a generator on a 6x6 grid.

  # Arguments
      img_gen: iterable yielding (images, labels) batches; only the first
          batch is consumed.
      batch_size: maximum number of images to draw from that batch.
  """
  # Adapted from https://github.com/stratospark/keras-multiprocess-image-data-generator
  r, c = 6, 6
  _, ax = plt.subplots(r, c)
  plt.setp(ax, xticks=[], yticks=[])
  plt.tight_layout()
  for imgs, _ in img_gen:
    # Never index past the images actually present: the batch can hold
    # fewer than `batch_size` images (e.g. a dataset smaller than one batch).
    count = min(batch_size, len(imgs), r * c)
    for k in range(count):
      ax[k // c][k % c].imshow(imgs[k].astype(np.uint8))
    break  # only the first batch is plotted

# plot_images(train_generator_orig, batch_size)

# Setup training callbacks

In [None]:
# Setup training callbacks
# Careful, TensorBoard callback could OOM with large validation set
# TODO: Add input images to TensorBoard output (maybe as a separate callback)
# TODO: Monitor size of input queues with callbacks
# Checkpoint filenames embed the validation loss and epoch number, so this
# callback presumably needs validation data during fitting — verify.
model_filename = os.path.join(exp_dir, "{val_loss:.2f}-{epoch:02d}.hdf5")
checkpointer = ModelCheckpoint(model_filename)
# TensorBoard logs go into the same run directory as the checkpoints.
tensorboard = TensorBoard(log_dir=exp_dir, write_graph=False)
callbacks = [checkpointer, tensorboard]

# Keras

## ResNet50
1. Setup ResNet50 pretrained model with new input & output layers.
2. Train new output layers (all others frozen).
3. Fine tune [some subset of the] original layers.
4. Profit.

In [None]:
# Custom final dense layer initializer
def my_init(shape, name=None, dim_ordering='tf'):
  """Draw initial weights from a Gaussian with scale sqrt(1 / fan_in)."""
  fans = get_fans(shape, dim_ordering=dim_ordering)  # (fan_in, fan_out)
  scale = np.sqrt(1. / fans[0])
  return normal(shape, scale, name=name)

In [None]:
# Start from a clean Keras/TensorFlow session so re-running this cell
# does not keep building on top of a previous graph.
K.clear_session()

# Create model by replacing classifier of ResNet50 model with new
# classifier specific for the breast cancer problem.
# The shared model is defined under the CPU device scope; each GPU tower
# below then reuses this one model (and thus one set of weights).
with tf.device("/cpu"):
  inputs = Input(shape=(size,size,channels))
  resnet50 = ResNet50(include_top=False)  # ResNet50 without its original classifier
  # resnet_smaller = Model(input=resnet50.input, output=resnet50.get_layer("activation_4").output)  # 1st residual block
  x = resnet50(inputs)
  x = Flatten()(x)
  # x = resnet_smaller(inputs)
  # x = GlobalAveragePooling2D()(x)
  # New softmax classifier with one output per class.
  preds = Dense(classes, init=my_init, activation="softmax")(x)
  base_model = Model(input=inputs, output=preds, name="resnet50")

# Multi-GPU exploitation via a linear combination of GPU loss functions.
# One input/output pair is created per GPU, all running the same base model.
ins = []
outs = []
for i in range(num_gpus):
  with tf.device("/gpu:{}".format(i)):
    x = Input(shape=(size,size,channels))  # split of initial batch
    out = base_model(x)  # run split on shared model
    ins.append(x)
    outs.append(out)
# The resulting model takes `num_gpus` inputs and produces `num_gpus` outputs.
model = Model(input=ins, output=outs)  # data-parallel model

# with tf.device("/gpu:0"):
#   x0 = Input(shape=(size,size,channels))  # first split of batch
#   out0 = model(x0)  # run first split on shared model
# with tf.device("/gpu:1"):
#   x1 = Input(shape=(size,size,channels))  # second split of batch
#   out1 = model(x1)  # run second split on shared model
# model = Model(input=[x0, x1], output=[out0, out1])  # data-parallel model

# Freeze all pre-trained ResNet layers.
# for layer in resnet50.layers:
#   layer.trainable = False

# # Add L2 regularization ("weight decay")
# for layer in resnet50.layers:
#   if hasattr(layer, 'W_regularizer'):
#     layer.W_regularizer = l2(1e-4)

# Compile model.
# TODO: 0.1 LR divided by 10 at 32k & 48k iterations, 0.9 momentum, l2 reg 1e-4
# NOTE(review): `decay=0.99` is not the step schedule described in the TODO
# above — confirm this decay rate is intended.
optim = SGD(lr=0.1, momentum=0.9, decay=0.99, nesterov=True)
# Each output's loss is weighted by 1/num_gpus, so the combined loss is the
# mean of the per-GPU losses rather than their sum.
model.compile(optimizer=optim, loss="categorical_crossentropy",
              loss_weights=[1/num_gpus]*num_gpus, metrics=['accuracy'])

In [None]:
# Show the device each model input tensor is assigned to.
for x in model.inputs:
  print(x.device)

In [None]:
# Show the device each model output tensor is assigned to.
for x in model.outputs:
  print(x.device)

In [None]:
# Show the name and device of each metric tensor.
for x in model.metrics_tensors:
  print(x.name, x.device)

In [None]:
# Build the train & test functions now (private Keras API), presumably so
# their ops are part of the graph inspected below — TODO confirm.
model._make_train_function()
model._make_test_function()

In [None]:
# Serialized definition of the default TensorFlow graph, used for the
# visualization below.
graph_def = tf.get_default_graph().as_graph_def()

In [None]:
# sorted([(n.name, n.device) for n in graph_def.node if n.device == ''])

In [None]:
# From: http://nbviewer.jupyter.org/github/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/deepdream/deepdream.ipynb
# Helper functions for TF Graph visualization
from IPython.display import clear_output, Image, display, HTML
def strip_consts(graph_def, max_const_size=32):
    """Copy graph_def, replacing the content of large constants with a short placeholder.

    A 'Const' node is considered large when its tensor content is longer
    than `max_const_size` bytes.
    """
    stripped = tf.GraphDef()
    for source_node in graph_def.node:
        node = stripped.node.add()
        node.MergeFrom(source_node)
        if node.op != 'Const':
            continue
        tensor = node.attr['value'].tensor
        num_bytes = len(tensor.tensor_content)
        if num_bytes > max_const_size:
            tensor.tensor_content = bytes("<stripped %d bytes>"%num_bytes, 'utf-8')
    return stripped
  
def rename_nodes(graph_def, rename_func):
    """Copy graph_def with rename_func applied to every node name and input reference."""
    renamed = tf.GraphDef()
    for source_node in graph_def.node:
        node = renamed.node.add()
        node.MergeFrom(source_node)
        node.name = rename_func(node.name)
        for idx, ref in enumerate(node.input):
            if ref[0] == '^':
                # Keep the leading '^' marker and rename only what follows it.
                node.input[idx] = '^' + rename_func(ref[1:])
            else:
                node.input[idx] = rename_func(ref)
    return renamed
  
def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph.

    Renders the graph inline in the notebook inside an iframe.

    # Arguments
        graph_def: a graph definition, or any object exposing
            `as_graph_def()` from which one is obtained.
        max_const_size: constants with more content bytes than this are
            stripped before rendering.
    """
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    # HTML snippet that loads the graph viewer component and hands it the
    # text form of the stripped graph; the element id is randomized.
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))
  
    # Embed the snippet via `srcdoc`; double quotes are escaped so the
    # snippet can sit inside the attribute value.
    iframe = """
        <iframe seamless style="width:800px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

# Visualize the network graph. Be sure to expand the "mixed" nodes to see
# their internal structure. We are going to visualize "Conv2D" nodes.
# The rename turns the first "_" of every node name into "/".
tmp_def = rename_nodes(graph_def, lambda s:"/".join(s.split('_',1)))
show_graph(tmp_def)

In [None]:
# `summary()` prints the table itself and returns None, so wrapping it in
# `print` would only add a stray "None" line to the output.
model.summary()

In [None]:
# from IPython.display import SVG
# from keras.utils.visualize_util import model_to_dot
# # SVG(model_to_dot(resnet50).create(prog='dot', format='svg'))
# SVG(model_to_dot(resnet_smaller).create(prog='dot', format='svg'))

In [None]:
# for i, layer in enumerate(resnet50.layers):
#   print(i, layer.name)

In [None]:
# Train the new softmax layer
# Note: Multi-GPU exploitation for data parallelism splits mini-batches
# into a set of micro-batches to be run in parallel on each GPU, but
# Keras will view the set of micro-batches as a single batch with
# multiple sources of inputs (i.e. Keras will view a set of examples
# being run in parallel as a single example with multiple sources of
# inputs), so the effective number of samples will be divided by the
# number of GPUs.
# GPU speedup:
#   1000s/epoch -> 620s/epoch (1.6x) for batch_size=32
# Keras device placement improvements (metrics, losses) (no val or callbacks):
#   batch_size=32,  2 gpus, 100 iters, no keras changes: 128s, 108s, 107s
#   batch_size=32,  2 gpus, 100 iters, w/ keras changes: 94s, 75s, 75s
#   batch_size=32,  1 gpu,  100 iters, w/ keras changes: 148s, 133s, 133s
#   batch_size=64,  2 gpus,  50 iters, w/ keras changes: 93s, 74s, 75s
#   batch_size=128, 2 gpus,  25 iters, w/ keras changes: 90s, 73s, 74s
# Sample counts are expressed in per-GPU micro-batch units (batch_size / num_gpus).
# Training uses a fixed 25 batches per epoch (100 / 4); the full-dataset batch
# count is left commented out at the end of the line.
# NOTE(review): `/` makes both counts floats — confirm Keras accepts that.
train_samples = batch_size / num_gpus * 100 / 4 #math.ceil(tc/batch_size)
val_samples = batch_size / num_gpus * math.ceil(vc/batch_size)
epochs = 1
# Validation and callbacks are disabled (commented out) for this first fit.
model.fit_generator(train_generator, samples_per_epoch=train_samples,
                    #validation_data=val_generator, nb_val_samples=val_samples,
                    nb_epoch=epochs, class_weight=class_weights, #callbacks=callbacks,
                    max_q_size=8, nb_worker=4, pickle_safe=True)

In [None]:
# Fine tune by unfreezing some subset of the model
# and training slowly with low lr.
# NOTE(review): the layer-freezing code earlier in this notebook is commented
# out, so these layers may already be trainable — confirm.
for layer in resnet50.layers[154:]:  # unfreeze final 2 residual blocks + exit flow
  layer.trainable = True

# Recompile with a much lower learning rate for fine tuning.
# NOTE(review): unlike the first compile, no `loss_weights` are passed here,
# so the combined loss is presumably the unweighted sum over GPUs — confirm
# this is intended.
optim = SGD(lr=0.0001, momentum=0.9)
model.compile(optimizer=optim, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# `summary()` prints the table itself and returns None, so wrapping it in
# `print` would only add a stray "None" line to the output.
model.summary()

In [None]:
# Continue for one more epoch, numbering epochs from where the previous fit ended.
initial_epoch = epochs
epochs = initial_epoch + 1
# This fit enables validation and the checkpoint/TensorBoard callbacks,
# and reads batches with a single worker without pickling.
model.fit_generator(train_generator, samples_per_epoch=train_samples,
                    validation_data=val_generator, nb_val_samples=val_samples,
                    nb_epoch=epochs, initial_epoch=initial_epoch,
                    class_weight=class_weights, callbacks=callbacks,
                    max_q_size=8, nb_worker=1, pickle_safe=False)

In [None]:
# Evaluate model on validation set
raw_metrics = model.evaluate_generator(val_generator, val_samples=val_samples,
                                       max_q_size=8, nb_worker=1, pickle_safe=False)
# Pair each reported value with its metric name.
labeled_metrics = list(zip(model.metrics_names, raw_metrics))
# Collapse the multi-output metrics into a single loss & accuracy:
# entries named exactly "loss", and entries whose name ends with "acc".
# NOTE(review): presumably "loss" is the combined loss and each GPU output
# reports its own "*acc" — verify against the printed `labeled_metrics`.
losses = [v for k,v in labeled_metrics if k == "loss"]
accuracies = [v for k,v in labeled_metrics if k.endswith("acc")]
# Divide the summed values by the number of GPUs.
loss = sum(losses) / num_gpus
acc = sum(accuracies) / num_gpus
metrics = {"loss": loss, "acc": acc}
print(labeled_metrics)
print(metrics)

In [None]:
# Save model
# The filename embeds the accuracy & loss computed above (5 significant
# digits each); the file is written into this run's directory.
filename = "{acc:.5}_acc_{loss:.5}_loss_model.hdf5".format(**metrics)
model.save(os.path.join(exp_dir, filename))

In [None]:
# Stop processes cleanly
# (otherwise, zombie processes will persist and hold onto GPU memory)
import multiprocessing as mp
# The loop variable is deliberately not named `p`: that name is the global
# sampling setting used for the data paths & experiment name, and rebinding
# it here would break re-running earlier cells.
for proc in mp.active_children():
  proc.terminate()
  proc.join(timeout=5)  # wait (bounded) so the terminated child is reaped
mp.active_children()