# Imports

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import math
import multiprocessing as mp
import os
import threading

from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.initializations import get_fans, normal
from keras.layers import Dense, Dropout, Flatten, GlobalAveragePooling2D, Input, merge
from keras.models import Model
from keras.optimizers import SGD
# from keras.preprocessing.image import ImageDataGenerator
from tools.image import ImageDataGenerator
from keras.regularizers import l2
from keras.utils.np_utils import to_categorical
import keras.backend as K
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import tensorflow as tf

# from breast_cancer import input_data

plt.rcParams['figure.figsize'] = (10, 10)

# Settings

In [None]:
# os.environ['CUDA_VISIBLE_DEVICES'] = ""  #0"
# c = tf.ConfigProto()
# c.gpu_options.visible_device_list="0"
# sess = tf.Session(config=c)
# K.set_session(sess)

In [None]:
# Global experiment settings read by the cells below.
SIZE = 256  # height and width of the model input, see Input(shape=...)
CHANNELS = 3  # channel count of the model input
FEATURES = SIZE * SIZE * CHANNELS  # number of values in one input image
CLASSES = 3  # width of the final softmax layer
p = 0.01  # training sample tag: picks images/train/<p> and is part of the experiment name
val_p = 0.01  # validation sample tag: picks images/val/<val_p>
batch_size = 64  # 1/2 per GPU
num_gpus = 1  # number of splits made from each batch and number of loss weights
use_caching = False  # NOTE(review): not referenced in the visible cells

In [None]:
# Name of this experiment; it becomes the directory name under experiments/keras/.
# The active name carries a "-sanity" suffix; the regular name is kept below it.
EXPERIMENT = "resnet50-{p}%-multi-gpu-{batch_size}-batch-size-sanity".format(p=p, batch_size=batch_size)
# EXPERIMENT = "resnet50-{p}%-multi-gpu-{batch_size}-batch-size".format(p=p, batch_size=batch_size)

# Setup experiment directory

In [None]:
def get_run_dir(path):
  """Create and return a new, uniquely numbered directory for this training run.

  Runs are numbered 0, 1, 2, ... inside `path`. The new run is numbered one
  past the highest existing numeric entry, so deleting an old run (or having
  unrelated files in `path`) can never make the new name collide with an
  existing run directory.

  # Arguments
      path: directory holding the run directories; created if missing.
  # Returns
      Path of the newly created run directory.
  """
  os.makedirs(path, exist_ok=True)
  # Only purely numeric names are run directories; ignore everything else.
  taken = [int(name) for name in os.listdir(path) if name.isdecimal()]
  run = max(taken, default=-1) + 1  # run 0, 1, 2, ...
  run_dir = os.path.join(path, str(run))
  os.makedirs(run_dir)
  return run_dir

def get_experiment_dir(experiment):
  """Return a freshly created run directory under experiments/keras/<experiment>."""
  return get_run_dir(os.path.join("experiments", "keras", experiment))

# Create this run's output directory and show its path. Later cells write
# checkpoints, TensorBoard logs and the final saved model into it.
exp_dir = get_experiment_dir(EXPERIMENT)
print(exp_dir)

# Create train & val data generators

In [None]:
def preprocess_input(x):
  """
  Preprocesses a tensor encoding a batch of images.

  Adapted from keras/applications/imagenet_utils.py. Note that this
  definition shadows the `preprocess_input` imported from
  keras.applications.resnet50 at the top of the file.

  The caller's array is never modified: the work is done on a copy, so the
  same raw batch can safely be plotted or preprocessed again afterwards.

  # Arguments
      x: input Numpy tensor, 4D of shape (N, H, W, C), channels in RGB order.
          Floating-point input keeps its dtype; any other dtype (e.g. uint8)
          is converted to float32 so the fractional means can be subtracted.
  # Returns
      Preprocessed tensor, zero-centered per channel, channels in BGR order.
  """
  x = np.asarray(x)
  dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float32
  # np.array always copies here, which keeps the in-place math below private.
  x = np.array(x, dtype=dtype)
  # Zero-center by subtracting mean pixel value per channel
  # based on means from a 50%, evenly-distributed sample.
  x[:, :, :, 0] -= 193.84669495
  x[:, :, :, 1] -= 144.05839539
  x[:, :, :, 2] -= 180.20623779
  # 'RGB'->'BGR'
  x = x[:, :, :, ::-1]
  return x

# Multi-GPU exploitation
def split(x, num_splits):
  """Cut a batch into `num_splits` pieces that all have the same length.

  The batch is first divided with np.array_split; each piece is then trimmed
  to the common length, which drops a few examples when the batch size is not
  a multiple of `num_splits`.
  """
  per_split = x.shape[0] // num_splits
  trimmed = []
  for piece in np.array_split(x, num_splits):
    trimmed.append(piece[:per_split])
  return trimmed

# class threadsafe_iter:
#     """Takes an iterator/generator and makes it thread-safe by
#     serializing call to the `next` method of given iterator/generator.
#     """
#     def __init__(self, it):
#         self.it = it
#         self.lock = threading.Lock()

#     def __iter__(self):
#         return self

#     def __next__(self):
#         with self.lock:
#             return next(self.it)


# def threadsafe_generator(f):
#     """A decorator that takes a generator function and makes it thread-safe.
#     """
#     def g(*a, **kw):
#         return threadsafe_iter(f(*a, **kw))
#     return g

# @threadsafe_generator
def gen_preprocessed_batch(batch_generator, num_gpus):
  """Lazily convert raw (x, y) batches into per-GPU splits.

  For every batch the images are preprocessed and then both images and labels
  are cut into `num_gpus` equal pieces; a (image_splits, label_splits) pair of
  lists is yielded.
  """
  for images, labels in batch_generator:
    image_splits = split(preprocess_input(images), num_gpus)
    label_splits = split(labels, num_gpus)
    yield image_splits, label_splits

In [None]:
# Image directories for each stage, keyed by the sample settings p / val_p.
tr_save_dir = "images/{stage}/{p}".format(stage="train", p=p)
val_save_dir = "images/{stage}/{p}".format(stage="val", p=val_p)
print(tr_save_dir, val_save_dir)

In [None]:
processes = 8
# Shut down a pool left over from an earlier run of this cell. On the first
# run `pool` does not exist yet (NameError); after that it may be None
# (AttributeError). Any other failure is a real error and must surface.
try:
    pool.terminate()
except (NameError, AttributeError):
    pass
pool = None
# pool = mp.Pool(processes=processes)

In [None]:
# Build the train/val image generators and their directory iterators.
# Augmentation is currently off; the commented-out call keeps the options
# that were tried.
# train_datagen = ImageDataGenerator(rotation_range=360, shear_range=0.5, zoom_range=0.5,
#                                    horizontal_flip=True, vertical_flip=True)
# ImageDataGenerator here is the project-local tools.image version, which takes
# a `pool` argument (presumably a worker pool for image loading — confirm).
train_datagen = ImageDataGenerator(pool=pool)
val_datagen = ImageDataGenerator(pool=pool)
train_generator_orig = train_datagen.flow_from_directory(tr_save_dir, batch_size=batch_size)
val_generator_orig = val_datagen.flow_from_directory(val_save_dir, batch_size=batch_size)

In [None]:
# Create train & val preprocessed generators
# Each one yields (list of image splits, list of label splits) with num_gpus
# entries per list.
train_generator = gen_preprocessed_batch(train_generator_orig, num_gpus)
val_generator = gen_preprocessed_batch(val_generator_orig, num_gpus)

In [None]:
def plot_images(img_gen, title, batch_size):
  """Plot the first batch produced by `img_gen` on a 6x6 grid.

  # Arguments
      img_gen: iterable yielding (images, labels) batches; only the first
          batch is consumed. Nothing is drawn if it yields no batch.
      title: text shown as the figure title.
      batch_size: upper bound on the number of images drawn.
  """
  rows, cols = 6, 6
  fig, ax = plt.subplots(rows, cols)
  fig.suptitle(title)
  plt.setp(ax, xticks=[], yticks=[])
  # The rect leaves headroom above the grid for the figure title.
  plt.tight_layout(rect=[0, 0.03, 1, 0.95])
  batch = next(iter(img_gen), None)
  if batch is None:
    return
  imgs, _ = batch
  # The batch may hold fewer than batch_size images, so bound by what is
  # actually there as well as by the grid size.
  count = min(rows * cols, batch_size, len(imgs))
  # ax.flat walks the grid row by row, matching index i*cols + j.
  for axis, img in zip(ax.flat, imgs[:count]):
    axis.imshow(img.astype(np.uint8))

plot_images(train_generator_orig, "hi", batch_size)

## Get number of samples

In [None]:
# Sample counts as reported by each directory iterator's nb_sample attribute;
# vc is used further down to size the validation pass.
tc = train_generator_orig.nb_sample
vc = val_generator_orig.nb_sample
print(tc, vc)

## Generate class weights for training

In [None]:
# Per-class sample counts and inverse-frequency weights: the rarest class gets
# weight 1.0 and every other class gets min_count / its_count (<= 1).
# Assumes train_generator_orig.classes holds one integer label per sample —
# TODO confirm.
class_counts = np.bincount(train_generator_orig.classes)
# NOTE(review): zip() stops at the shorter input, so if the highest-numbered
# class is absent from this sample its weight is silently dropped; a class with
# zero samples also makes min(class_counts) zero. Consider minlength=CLASSES
# and guarding against zero counts.
class_weights = dict(zip(range(CLASSES), min(class_counts) / class_counts))
print(class_counts)
print(class_weights)

# Fetch random image (Optional)

In [None]:
def show_random_image(save_dir):
  """Display one randomly chosen image from a random class folder.

  A class folder named "1", "2" or "3" under `save_dir` is picked first, then
  a file inside it; the chosen path is printed before the image is shown.
  """
  label = np.random.randint(1, 4)  # upper bound is exclusive
  folder = os.path.join(save_dir, str(label))
  candidates = os.listdir(folder)
  chosen = candidates[np.random.randint(0, len(candidates))]
  path = os.path.join(folder, chosen)
  print(path)
  plt.imshow(Image.open(path))

In [None]:
# show_random_image(tr_save_dir)

# Setup training callbacks

In [None]:
# Setup training callbacks
# Careful, TensorBoard callback could OOM with large validation set
# TODO: Add input images to TensorBoard output (maybe as a separate callback)
# TODO: Monitor size of input queues with callbacks
# Checkpoint filename template inside this run's directory. The {val_loss} and
# {epoch} fields are left unformatted here for the checkpoint callback to fill
# in, so fits that use it presumably need validation data — confirm.
model_filename = os.path.join(exp_dir, "{val_loss:.2f}-{epoch:02d}.hdf5")
checkpointer = ModelCheckpoint(model_filename)
# TensorBoard logs go to the same run directory.
tensorboard = TensorBoard(log_dir=exp_dir, write_graph=False)  #, histogram_freq=1, write_images=True)
callbacks = [checkpointer, tensorboard]

# Keras

## ResNet50
1. Setup ResNet50 pretrained model with new input & output layers.
2. Train new output layers (all others frozen).
3. Fine tune [some subset of the] original layers.
4. Profit.

In [None]:
# Custom final dense layer initializer
def my_init(shape, name=None, dim_ordering='tf'):
  """Gaussian initializer whose scale is sqrt(1 / fan_in) for the given shape."""
  fans = get_fans(shape, dim_ordering=dim_ordering)
  fan_in = fans[0]  # the second entry (fan_out) is not needed
  scale = np.sqrt(1. / fan_in)
  return normal(shape, scale, name=name)

In [None]:
K.clear_session()

In [None]:
# Create model by replacing classifier of ResNet50 model with new
# classifier specific for the breast cancer problem.
# with tf.device("/cpu:0"):  # This isn't needed, nor is the Sequential model needed
inputs = Input(shape=(SIZE,SIZE,CHANNELS))
# ResNet50 without its top classifier layers; used below as one callable unit.
resnet50 = ResNet50(include_top=False)
# resnet_smaller = Model(input=resnet50.input, output=resnet50.get_layer("activation_4").output)  # 1st residual block
x = resnet50(inputs)
x = Flatten()(x)
# x = resnet_smaller(inputs)
# x = GlobalAveragePooling2D()(x)
# New classifier head: one softmax unit per class, using the custom initializer.
preds = Dense(CLASSES, init=my_init, activation="softmax")(x)
model = Model(input=inputs, output=preds, name="resnet50")

# # Multi-GPU exploitation via a linear combination of GPU loss functions.
# with tf.device("/gpu:0"):
#   x0 = Input(shape=(SIZE,SIZE,CHANNELS))  # first split of batch
#   out0 = model(x0)  # run first split on shared model
# with tf.device("/gpu:1"):
#   x1 = Input(shape=(SIZE,SIZE,CHANNELS))  # second split of batch
#   out1 = model(x1)  # run second split on shared model
# model = Model(input=[x0, x1], output=[out0, out1])  # data-parallel model

# Freeze all pre-trained ResNet layers.
# NOTE(review): the freeze loop below is commented out, so no layer is frozen
# by this cell — confirm that is intended for the first training pass.
# for layer in resnet50.layers:
#   layer.trainable = False

# # Add L2 regularization ("weight decay")
# for layer in resnet50.layers:
#   if hasattr(layer, 'W_regularizer'):
#     layer.W_regularizer = l2(1e-4)

# Compile model.
# TODO: 0.1 LR divided by 10 at 32k & 48k iterations, 0.9 momentum, l2 reg 1e-4
# NOTE(review): decay=0.99 looks very aggressive if it is applied on every
# update, and it does not match the step schedule in the TODO — confirm.
optim = SGD(lr=0.1, momentum=0.9, decay=0.99, nesterov=True)
# One loss weight of 1/num_gpus per output, so the per-GPU losses average out.
model.compile(optimizer=optim, loss="categorical_crossentropy",
              loss_weights=[1/num_gpus]*num_gpus, metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
# from IPython.display import SVG
# from keras.utils.visualize_util import model_to_dot
# # SVG(model_to_dot(resnet50).create(prog='dot', format='svg'))
# SVG(model_to_dot(resnet_smaller).create(prog='dot', format='svg'))

In [None]:
# for i, layer in enumerate(resnet50.layers):
#   print(i, layer.name)

In [None]:
#pip3 install git+https://github.com/fbcotter/py3nvml.git
# Start the project-local system monitor before training; it is stopped and
# plotted in the cell after the fit.
import tools.sysmonitor as SM
sys_mon = SM.SysMonitor()
sys_mon.start()

In [None]:
# Train the new softmax layer
# Note: Multi-GPU exploitation for data parallelism splits mini-batches
# into a set of micro-batches to be run in parallel on each GPU, but
# Keras will view the set of micro-batches as a single batch with
# multiple sources of inputs (i.e. Keras will view a set of examples
# being run in parallel as a single example with multiple sources of
# inputs), so the effective number of samples will be divided by the
# number of GPUs.
# GPU speedup:
#   1000s/epoch -> 620s/epoch (1.6x) for batch_size=32
# num workers + queue size experiments (100 batches):
#   231s w/ 1 worker + 10 q-size; 117s w/ 2 workers + 10 q-size
#   77s w/ 4 workers + 10 q-size; 79 w/ 8 workers + 10 q-size
#   75s w/ 4 workers + 4 q-size; 77s w/ 4 workers + 8 q-size;
#   76s w/ 4 workers + 16 q-size; 76s w/ 8 workers + 16 q-size;
# Samples per epoch: 20 batches' worth per GPU for a short run; the
# commented-out factor would cover the whole training set instead.
# NOTE(review): `/` makes both sample counts floats — confirm fit_generator
# accepts non-integer sample counts.
train_samples = batch_size / num_gpus * 20  #math.ceil(tc/batch_size)
val_samples = batch_size / num_gpus * math.ceil(vc/batch_size)
epochs = 1
# Validation data and callbacks are commented out for this first fit.
model.fit_generator(train_generator, samples_per_epoch=train_samples,
                    #validation_data=val_generator, nb_val_samples=val_samples,
                    nb_epoch=epochs, class_weight=class_weights, #callbacks=callbacks,
                    max_q_size=8, nb_worker=1, pickle_safe=False)

In [None]:
# Stop the system monitor started before training and plot what it recorded.
# The title reports the monitored duration, the `processes` setting and the
# batch size.
sys_mon.stop()
title = '{0:.2f} seconds of computation, using {1} processes, batch size = {2}'.format(sys_mon.duration, processes, batch_size)
sys_mon.plot(title, True)

In [None]:
# NOTE(review): this cell repeats the stop/plot cell directly above it;
# presumably kept to re-render the plot — confirm stop() is safe to call twice
# or remove the duplicate.
sys_mon.stop()
title = '{0:.2f} seconds of computation, using {1} processes, batch size = {2}'.format(sys_mon.duration, processes, batch_size)
sys_mon.plot(title, True)

In [None]:
# Fine tune by unfreezing some subset of the model
# and training slowly with low lr.
# NOTE(review): the freeze loop in the model-building cell is commented out, so
# these layers may already be trainable — confirm this loop still has an effect.
for layer in resnet50.layers[154:]:  # unfreeze final 2 residual blocks + exit flow
  layer.trainable = True

# Recompile with a much lower learning rate for fine tuning.
# NOTE(review): unlike the first compile, no loss_weights are passed here.
optim = SGD(lr=0.0001, momentum=0.9)
model.compile(optimizer=optim, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
# Continue training from where the previous fit stopped: epoch numbering
# resumes at `initial_epoch` and one more epoch is run, this time with
# validation data and the callbacks enabled.
initial_epoch = epochs
epochs = initial_epoch + 1
model.fit_generator(train_generator, samples_per_epoch=train_samples,
                    validation_data=val_generator, nb_val_samples=val_samples,
                    nb_epoch=epochs, initial_epoch=initial_epoch,
                    class_weight=class_weights, callbacks=callbacks,
                    max_q_size=8, nb_worker=4, pickle_safe=True)

In [None]:
# Evaluate model on validation set
raw_metrics = model.evaluate_generator(val_generator, val_samples=val_samples,
                                       max_q_size=8, nb_worker=4, pickle_safe=True)
# Pair each metric name with its value.
labeled_metrics = list(zip(model.metrics_names, raw_metrics))
# Keep the entries named exactly "loss" and those whose name ends in "acc",
# then divide each sum by the number of GPUs to get one loss and one accuracy.
losses = [v for k,v in labeled_metrics if k == "loss"]
accuracies = [v for k,v in labeled_metrics if k.endswith("acc")]
loss = sum(losses) / num_gpus
acc = sum(accuracies) / num_gpus
metrics = {"loss": loss, "acc": acc}
print(labeled_metrics)
print(metrics)

In [None]:
# Save model
# The filename embeds the accuracy and loss computed on the validation
# generator, each formatted with precision 5, and goes into this run's directory.
filename = "{acc:.5}_acc_{loss:.5}_loss_model.hdf5".format(**metrics)
model.save(os.path.join(exp_dir, filename))

In [None]:
# Stop processes cleanly
# (otherwise, zombie processes will persist and hold onto GPU memory)
import multiprocessing as mp
# The loop variable is deliberately not called `p`: that name is the training
# sample setting used to build paths and the experiment name, and rebinding it
# to a Process object would break those cells when they are re-run.
for proc in mp.active_children():
  proc.terminate()
  # Reap the terminated child so it does not linger as a zombie; the wait is
  # bounded so a stuck child cannot hang the notebook.
  proc.join(timeout=5)
# Show whatever is still alive (expected: an empty list).
mp.active_children()

In [None]:
# `pool` stays None unless the mp.Pool line in the pool-setup cell is enabled,
# so only terminate a pool that actually exists.
if pool is not None:
  pool.terminate()

In [None]:
mp.active_children()