In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import math
import multiprocessing as mp
import os
import queue
import shutil
import threading

from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.initializations import get_fans, normal
from keras.layers import Activation, Dense, Dropout, Flatten, Input, Lambda, Permute, Reshape, merge
from keras.models import Model
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical
import keras.backend as K
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from breast_cancer import input_data

plt.rcParams['figure.figsize'] = (10, 6)

In [None]:
#from pyspark.sql import SparkSession
#spark = (SparkSession.builder.appName("KerasResNet50").getOrCreate())

In [None]:
# os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# c = tf.ConfigProto()
# c.gpu_options.visible_device_list="0"
# sess = tf.Session(config=c)
# K.set_session(sess)

In [None]:
# Experiment name; used below as the sub-directory name under tf_logs/keras and models/keras.
EXPERIMENT = "resnet50-collect-0.02-multi-gpu"

# Set up model and log directories

In [None]:
def get_run_dir(path):
  """Create a new, uniquely numbered directory for this training run.

  Runs are numbered 0, 1, 2, ... under `path`. Numbering starts at the number
  of entries already present in `path`. If that name is already taken (for
  example because an earlier run directory was deleted), the next free number
  is used instead of failing with FileExistsError.

  Args:
    path: Base directory holding one sub-directory per run. It is created if
      it does not exist yet.

  Returns:
    Path of the newly created run directory.
  """
  os.makedirs(path, exist_ok=True)
  run = len(os.listdir(path))  # run 0, 1, 2, ...
  while True:
    run_dir = os.path.join(path, str(run))
    try:
      os.makedirs(run_dir)
    except FileExistsError:
      run += 1  # name already in use -- try the next number
    else:
      return run_dir

def get_log_dir(experiment):
  """Make and return a fresh run directory under tf_logs/keras/<experiment>."""
  return get_run_dir(os.path.join("tf_logs", "keras", experiment))

def get_model_dir(experiment):
  """Make and return a fresh run directory under models/keras/<experiment>."""
  return get_run_dir(os.path.join("models", "keras", experiment))

# Allocate one fresh run directory for logs and one for model checkpoints.
# Note: every execution of this cell creates a new pair of directories on disk.
log_dir = get_log_dir(EXPERIMENT)
model_dir = get_model_dir(EXPERIMENT)
print(log_dir, model_dir)

# Read in train & val data

In [None]:
SIZE = 256  # image height and width (samples are reshaped to (N, CHANNELS, SIZE, SIZE) below)
CHANNELS = 3  # color channels per image
FEATURES = SIZE * SIZE * CHANNELS  # number of values in one flattened sample
CLASSES = 3  # number of classes predicted by the softmax layer
p = 0.02  # passed to input_data.read_train_val_data -- presumably the sampling fraction; TODO confirm

In [None]:
# NOTE: `spark` is not created in this notebook (the SparkSession cell above is
# commented out); it is presumably injected by the kernel -- TODO confirm.
train_df, val_df = input_data.read_train_val_data(spark, SIZE, CHANNELS, p)

# The validation set is replaced by a separately saved sample. Its value is kept
# under its own name so that `p` (used for the training data above) is never
# overwritten; otherwise re-running this cell would read the training data with
# the validation value.
val_p = 0.01  # just for val
# Debug generators -- use NumPy array
# tr_filename = os.path.join("data", "train_{}_sample_{}.parquet".format(p, SIZE))
val_filename = os.path.join("data", "val_{}_sample_{}.parquet".format(val_p, SIZE))
# train_df = spark.read.load(tr_filename)
val_df = spark.read.load(val_filename)

In [None]:
# Ask Spark to cache both DataFrames; they are read several times below
# (counts, channel means, class weights, conversion to arrays).
train_df.cache()
val_df.cache()

In [None]:
# Row counts of the train and validation sets; used later to derive
# train_samples / val_samples.
tc = train_df.count()
vc = val_df.count()
print(tc, vc)
# print(train_df.rdd.getNumPartitions(), val_df.rdd.getNumPartitions())

## Compute image channel means

In [None]:
# Channel means of the training images; `preprocess_input` below subtracts them
# from every image. Indexed there as means[0..2] (stated to be in RGB order).
means = input_data.compute_channel_means(train_df, CHANNELS, SIZE)
print(means.shape)
print(means)

## Generate class weights for training

In [None]:
# Class weights computed from the training set; passed as `class_weight` to
# `model.fit` below.
class_weights = input_data.gen_class_weights(train_df)
print(class_weights)

## Create asynchronous queuing batch generators

In [None]:
# # Create train & val generators
batch_size = 128  # batch size passed to model.fit / model.evaluate below
# train_generator_orig, train_ps, train_queues, train_stop_event = input_data.create_batch_generator(train_df.rdd, batch_size=batch_size)
# val_generator_orig, val_ps, val_queues, val_stop_event = input_data.create_batch_generator(val_df.rdd, batch_size=batch_size)

In [None]:
# # Debugging - Print queue sizes
# for q in train_queues + val_queues:
#   print(q.qsize())

# Keras

## ResNet50
1. Read in 256x256x3 dataframe.
2. Create random crops to 224x224x3.
3. Set up ResNet50 pretrained model with new input & output layers.
4. Train new output layers (all others frozen).
5. Fine tune additional layers.
6. Profit.

In [None]:
# Preprocess with slide image means
# Adapted from keras/applications/imagenet_utils.py
def preprocess_input(x, channel_means=None):
    """Preprocesses a tensor encoding a batch of images.

    Reverses the channel axis (RGB -> BGR) and zero-centers every channel by
    its mean. The subtraction is done in place on a reversed view of `x`, so
    the caller's array is modified as well.

    # Arguments
        x: input Numpy tensor, 4D, channels last, in RGB order. Must have a
            floating point dtype for the in-place subtraction to work.
        channel_means: optional per-channel means in RGB order, indexable as
            [0], [1], [2]. Defaults to the notebook-global `means`.
    # Returns
        Preprocessed tensor in BGR order (a view of the input).
    """
    if channel_means is None:
        channel_means = means  # notebook-global slide image means (RGB)
    # 'RGB'->'BGR'
    x = x[:, :, :, ::-1]
    # Zero-center by mean pixel
    # The means are stored in RGB, but the channels are now B, G, R, so the
    # indices must be reversed: channel 0 <- B, channel 1 <- G, channel 2 <- R.
    x[:, :, :, 0] -= channel_means[2]  # B (ImageNet: 103.939)
    x[:, :, :, 1] -= channel_means[1]  # G (ImageNet: 116.779)
    x[:, :, :, 2] -= channel_means[0]  # R (ImageNet: 123.68)
    return x

In [None]:
# def to_categorical(y, classes):
#   # Avoid cast to float64 as done in keras.utils.np_utils.to_categorical
#   n = len(y)
#   Y = np.zeros((n, classes), dtype=np.int32)
#   Y[np.arange(n), y] = 1
#   return Y

# TODO: Clean this up -- remove access to global variables
def gen_preprocessed_batch(batch_generator):
  """Yield (images, one-hot labels) pairs ready for the model.

  Each (xs, ys) batch from `batch_generator` is reshaped to (N,C,H,W), moved to
  channels-last (N,H,W,C), cast to float32 and passed through
  `preprocess_input`. Labels are shifted down by one before one-hot encoding
  (assumes labels start at 1 -- TODO confirm). Reads the notebook globals
  CHANNELS, SIZE and CLASSES.
  """
  for raw_images, raw_labels in batch_generator:
    channels_first = raw_images.reshape((-1, CHANNELS, SIZE, SIZE))  # shape (N,C,H,W)
    channels_last = channels_first.transpose((0, 2, 3, 1))  # shape (N,H,W,C)
    images = preprocess_input(channels_last.astype(np.float32))
    labels = to_categorical(raw_labels - 1, CLASSES)
    yield images, labels

In [None]:
# # Create train & val preprocessed generators
# train_generator = gen_preprocessed_batch(train_generator_orig)
# val_generator = gen_preprocessed_batch(val_generator_orig)

###
# Debug generators -- use NumPy array
def to_arrays(df):
  """Collect a DataFrame into preprocessed image and one-hot label arrays.

  Selects the "sample" and "tumor_score" columns, collects all rows to the
  driver and converts them to a channels-last float32 image array run through
  `preprocess_input`, plus one-hot labels (scores shifted down by one).
  Reads the notebook globals CHANNELS, SIZE and CLASSES.
  """
  selected = df.select("sample", "tumor_score")
  pairs = selected.rdd.map(
      lambda row: (np.array(row.sample.values).astype(np.uint8), row.tumor_score)
  ).collect()
  samples, scores = zip(*pairs)
  images = np.array(samples).reshape((-1, CHANNELS, SIZE, SIZE))  # shape (N,C,H,W)
  images = images.transpose((0, 2, 3, 1)).astype(np.float32)  # shape (N,H,W,C)
  labels = np.array(scores).astype(np.uint32)
  return preprocess_input(images), to_categorical(labels - 1, CLASSES)

# Materialize the whole train and validation sets as in-memory NumPy arrays
# (to_arrays collects every row to the driver).
train_x, train_y = to_arrays(train_df)
val_x, val_y = to_arrays(val_df)

# Multi-GPU exploitation
# Determine equal split size and split tensors, even if it means throwing away a few examples.
def split(x, num_splits):
  """Cut `x` along its first axis into `num_splits` pieces of equal length.

  `np.array_split` may return pieces that differ in length by one; each piece
  is truncated to `x.shape[0] // num_splits` rows, so surplus rows are dropped.

  Returns a list of `num_splits` arrays.
  """
  rows_per_split = x.shape[0] // num_splits
  pieces = []
  for chunk in np.array_split(x, num_splits):
    pieces.append(chunk[:rows_per_split])
  return pieces

num_gpus = 2
# Replace every array by a list holding one equal-sized piece per GPU.
train_x, train_y, val_x, val_y = [
    split(arr, num_gpus) for arr in (train_x, train_y, val_x, val_y)
]
###

In [None]:
# Training callbacks: TensorBoard logging into `log_dir` and model checkpoints
# written into `model_dir`, named by validation loss and epoch.
# Careful, TensorBoard callback could OOM with large validation set
tensorboard = TensorBoard(log_dir=log_dir)  #, histogram_freq=1, write_images=True)
model_filename = os.path.join(model_dir, "{val_loss:.2f}-{epoch:02d}.hdf5")
checkpointer = ModelCheckpoint(model_filename)
callbacks = [checkpointer, tensorboard]

In [None]:
# Custom final dense layer initializer
def my_init(shape, name=None, dim_ordering='tf'):
    """Gaussian initializer with scale sqrt(1 / fan_in).

    `fan_in` is taken from `get_fans(shape, dim_ordering)`; the scale is passed
    to Keras' `normal` initializer together with `shape` and `name`.
    """
    fan_in = get_fans(shape, dim_ordering=dim_ordering)[0]
    return normal(shape, np.sqrt(1. / fan_in), name=name)

In [None]:
# Create model by replacing classifier of ResNet50 model with new classifier
# specific for the breast cancer problem.
inputs = Input(shape=(SIZE,SIZE,CHANNELS))
resnet50 = ResNet50(include_top=False)
x = resnet50(inputs)
x = Flatten()(x)
preds = Dense(CLASSES, init=my_init, activation="softmax")(x)
tower = Model(input=inputs, output=preds)  # single-device model shared by all towers

# Multi-GPU exploitation via a linear combination of GPU loss functions.
# One input per GPU: each input receives one split of the batch and is run
# through the shared `tower` on its own device. The number of towers follows
# `num_gpus`, so it always matches the number of splits made from the data.
tower_inputs = [Input(shape=(SIZE,SIZE,CHANNELS)) for _ in range(num_gpus)]
tower_outputs = []
for gpu_index, tower_input in enumerate(tower_inputs):
  with tf.device("/gpu:{}".format(gpu_index)):
    tower_outputs.append(tower(tower_input))  # run this split on the shared model
model = Model(input=tower_inputs, output=tower_outputs)  # data-parallel model

# Freeze all pre-trained ResNet layers (must happen before compiling).
for layer in resnet50.layers:
  layer.trainable = False

# Compile model. Each tower's loss is weighted by 1/num_gpus so that the total
# loss is the average over the towers.
optim = SGD(lr=0.01, momentum=0.5, decay=0.0, nesterov=True)
# model.compile(optimizer=optim, loss="categorical_crossentropy", metrics=['accuracy'])
model.compile(optimizer=optim, loss="categorical_crossentropy",
              loss_weights=[1.0 / num_gpus] * num_gpus, metrics=['accuracy'])

In [None]:
# # Train the new softmax layer
# Sample counts rounded up to a whole number of batches; only the commented-out
# generator path below uses them.
train_samples = math.ceil(tc/batch_size) * batch_size
val_samples = math.ceil(vc/batch_size) * batch_size
epochs = 1  #10
# model.fit_generator(train_generator, samples_per_epoch=train_samples, nb_epoch=epochs,
#                     validation_data=val_generator, nb_val_samples=val_samples,
#                     class_weight=class_weights, callbacks=callbacks,
#                     nb_worker=1, pickle_safe=True)

###
# Debug generators -- use NumPy array
# model.fit(train_x, train_y, batch_size=batch_size, validation_data=[val_x, val_y],
#           nb_epoch=epochs, class_weight=class_weights, callbacks=callbacks)
# Multi-GPU: Speedup 1000s/epoch -> 620s/epoch (1.6x) for batch_size=32, 608s/epoch for batch_size=128
# TODO: Edit https://github.com/fchollet/keras/blob/c07d0e6448bb63762bc7a19d87814f6fba79fa32/keras/engine/training.py#L659-L659
#  to place each loss function on the device of `y_pred` w/ `with tf.device(y_pred.device):`
#  Speedup: 620s/epoch -> 646s (need to look into this more....)
# train_x / train_y / val_x / val_y are lists with one array per GPU (see `split`),
# matching the inputs and outputs of the data-parallel model.
model.fit(train_x, train_y, batch_size=batch_size, validation_data=[val_x, val_y],
          nb_epoch=epochs, class_weight=class_weights, callbacks=callbacks)
###

In [None]:
# Fine tune by unfreezing the rest of the model   ##final 2 identity blocks
# and train slowly with low lr.
# `resnet50` is the pre-trained base network that was frozen before the first
# training phase (no variable called `base_model` exists in this notebook).
for layer in resnet50.layers:  #[-21:]:
  layer.trainable = True

# Re-compile so that the changed `trainable` flags take effect. The per-tower
# losses keep the same 1/num_gpus weighting as in the first training phase, so
# the total loss stays an average and loss values remain comparable.
optim = SGD(lr=0.0001, momentum=0.9)
model.compile(optimizer=optim, loss='categorical_crossentropy',
              loss_weights=[1.0 / num_gpus] * num_gpus, metrics=['accuracy'])

In [None]:
# Continue the epoch numbering where the previous training phase stopped.
# Note: re-running this cell advances `epochs` by one again.
initial_epoch = epochs
epochs = initial_epoch + 1  #15
# model.fit_generator(train_generator, samples_per_epoch=train_samples, nb_epoch=epochs,
#                     validation_data=val_generator, nb_val_samples=val_samples,
#                     nb_worker=1, pickle_safe=True)

###
# Debug generators -- use NumPy array
model.fit(train_x, train_y, batch_size=batch_size, validation_data=[val_x, val_y],
          nb_epoch=epochs, initial_epoch=initial_epoch,
          class_weight=class_weights, callbacks=callbacks)
###

In [None]:
# # Evaluate model on validation set
# raw_metrics = model.evaluate_generator(val_generator, val_samples=val_samples)
# metrics = dict(zip(model.metrics_names, raw_metrics))
# print(metrics)

###
# Debug generators -- use NumPy array
# `raw_metrics` is paired with `model.metrics_names` to label each value.
raw_metrics = model.evaluate(val_x, val_y, batch_size=batch_size)
metrics = dict(zip(model.metrics_names, raw_metrics))
print(metrics)
###

In [None]:
# Multi-GPU - TODO: Generalize this
# Average the per-output accuracies and losses into one "acc" and one "loss".
named_metrics = list(zip(model.metrics_names, raw_metrics))
accuracies = [v for k, v in named_metrics if k.endswith("acc")]
losses = [v for k, v in named_metrics if k.endswith("_loss")]
if not losses:
  # No per-output loss entries (e.g. a single-output model): fall back to the
  # total "loss" entry instead of dividing by zero below.
  losses = [v for k, v in named_metrics if k == "loss"]
if not accuracies or not losses:
  raise ValueError(
      "No accuracy/loss entries found in metrics: {}".format(model.metrics_names))
metrics = {}
metrics["acc"] = sum(accuracies) / len(accuracies)
metrics["loss"] = sum(losses) / len(losses)
metrics

In [None]:
# Save the model into this run's model directory; the file name carries the
# averaged accuracy and loss.
filename = "{acc:.5}_acc_{loss:.5}_loss_model.hdf5".format(
    acc=metrics["acc"], loss=metrics["loss"])
model.save(os.path.join(model_dir, filename))

In [None]:
# TODO: Monitor size of input queues with callbacks