In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import math
import multiprocessing as mp
import os
import queue
import shutil
import threading

from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.initializations import get_fans, normal
from keras.layers import Activation, Convolution2D, Dense, Dropout, Flatten, Input, MaxPooling2D, Permute, Reshape
from keras.models import Model
from keras.optimizers import SGD
# from keras.utils.np_utils import to_categorical
import keras.backend as K
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from breast_cancer import input_data

# Default (width, height) used for every matplotlib figure drawn in this notebook
plt.rcParams['figure.figsize'] = (10, 6)

In [None]:
# Expose only CUDA device index 1 to this process.
# NOTE(review): presumably this has to run before TensorFlow first initializes
# the GPU in order to take effect -- confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"
# Disabled alternative: pick the device through an explicit TensorFlow session
# c = tf.ConfigProto()
# c.gpu_options.visible_device_list="0"
# sess = tf.Session(config=c)
# K.set_session(sess)

# Read in train & val data

In [None]:
SIZE = 256  # image side length; batches are reshaped to (N, CHANNELS, SIZE, SIZE) downstream
CHANNELS = 3  # number of channels per image
FEATURES = SIZE * SIZE * CHANNELS  # length of one flattened image
CLASSES = 3  # number of target classes (one-hot width and softmax output size)
p = 0.01  # NOTE(review): presumably a sampling fraction -- confirm against `read_train_val_data`
# NOTE(review): `spark` is not defined anywhere in this notebook; presumably the
# kernel provides it -- confirm.
train_df, val_df = input_data.read_train_val_data(spark, SIZE, CHANNELS, p)

In [None]:
# Row counts of the train and validation sets; `tc` and `vc` are reused later
# to size the training and validation passes.
tc = train_df.count()
vc = val_df.count()
print(tc, vc)
# Number of partitions behind each DataFrame
print(train_df.rdd.getNumPartitions(), val_df.rdd.getNumPartitions())

## Compute image channel means

In [None]:
# Channel means of the training images; `preprocess_input` subtracts them from
# every batch.
# NOTE(review): assumed to be indexable per channel in RGB order -- confirm
# against `compute_channel_means`.
means = input_data.compute_channel_means(train_df, CHANNELS, SIZE)
print(means.shape)
print(means)

## Generate class weights for training

In [None]:
# Class weights derived from the training set; passed to `fit_generator` as
# `class_weight` during training.
class_weights = input_data.gen_class_weights(train_df)
print(class_weights)

## Create asynchronous queuing batch generators

In [None]:
# Build the raw batch generators for the train and validation sets.
# Each call returns four values that are unpacked below; judging by the names
# these are the generator, its worker processes, its queues and a stop event
# -- TODO confirm against `create_batch_generator`.
batch_size = 32
(train_generator_orig, train_ps,
 train_queues, train_stop_event) = input_data.create_batch_generator(
    train_df.rdd, batch_size=batch_size)
(val_generator_orig, val_ps,
 val_queues, val_stop_event) = input_data.create_batch_generator(
    val_df.rdd, batch_size=batch_size)

In [None]:
# # Print queue sizes (for debugging)
# for q in train_queues + val_queues:
#   print(q.qsize())

# Keras

## "LeNet"-like Model

In [None]:
# Setup model and log directories
log_dir = os.path.join("tf_logs", "keras", "lenet")
model_dir = os.path.join("models", "keras", "lenet")
for path in [log_dir, model_dir]:
  # `makedirs` creates all intermediate dirs (unlike `os.mkdir`), and
  # `exist_ok=True` makes re-running this cell a no-op without a separate,
  # race-prone `os.path.exists` check.
  os.makedirs(path, exist_ok=True)

In [None]:
# # Clear out any existing Keras logs and model checkpoints
# for path in [log_dir, model_dir]:
#   if os.path.exists(path):
#     #os.rmdir(path)  # fails if directory is not empty
#     shutil.rmtree(path)

# # Reset any current Keras session
# import keras.backend as K
# K.clear_session()  # reset TensorFlow session for iterative work

In [None]:
# Preprocess with slide image means
def preprocess_input(x, dim_ordering='default', channel_means=None):
    """Converts a batch of RGB images to BGR and zero-centers each channel.

    Note: this definition shadows the `preprocess_input` imported from
    `keras.applications.resnet50` at the top of the notebook.

    # Arguments
        x: input Numpy tensor, 4D, with channels in RGB order: on axis 1
            for 'th' ordering, on the last axis for 'tf' ordering. It
            should be a floating-point array. The means are subtracted in
            place through a reversed view, so the caller's array is
            modified too.
        dim_ordering: 'tf', 'th', or 'default' to use the ordering
            reported by `K.image_dim_ordering()`.
        channel_means: optional per-channel means in RGB order, indexable
            as `[0]`, `[1]`, `[2]`. Defaults to the notebook-level `means`.
    # Returns
        Preprocessed tensor with channels in BGR order (a view onto `x`).
    """
    if dim_ordering == 'default':
        dim_ordering = K.image_dim_ordering()
    assert dim_ordering in {'tf', 'th'}

    if channel_means is None:
        channel_means = means  # slide image means computed earlier in the notebook

    # `channel_means` is stored in RGB order
    red_mean = channel_means[0]
    green_mean = channel_means[1]
    blue_mean = channel_means[2]

    if dim_ordering == 'th':
        # 'RGB'->'BGR'
        x = x[:, ::-1, :, :]
        # Zero-center by mean pixel; after the flip the channels are B, G, R
        x[:, 0, :, :] -= blue_mean
        x[:, 1, :, :] -= green_mean
        x[:, 2, :, :] -= red_mean
    else:
        # 'RGB'->'BGR'
        x = x[:, :, :, ::-1]
        # Zero-center by mean pixel; after the flip the channels are B, G, R
        x[:, :, :, 0] -= blue_mean
        x[:, :, :, 1] -= green_mean
        x[:, :, :, 2] -= red_mean
    return x

In [None]:
def to_categorical(y, classes):
  """One-hot encode integer class indices into an int32 matrix.

  Local replacement for `keras.utils.np_utils.to_categorical` that keeps the
  result as int32 instead of casting to float64.

  # Arguments
    y: 1D sequence or array of integer class indices.
    classes: number of columns in the output (total number of classes).
  # Returns
    int32 array of shape (len(y), classes) with a 1 at column y[i] of row i.
  """
  num_rows = len(y)
  one_hot = np.zeros(shape=(num_rows, classes), dtype=np.int32)
  row_index = np.arange(num_rows)
  one_hot[row_index, y] = 1
  return one_hot

# TODO: `preprocess_input` still reads the notebook-level `means`
def gen_preprocessed_batch(batch_generator, channels=None, size=None, classes=None):
  """Yield Keras-ready (images, one-hot labels) batches from raw batches.

  # Arguments
    batch_generator: iterable yielding (xs, ys) pairs. Each `xs` must be
      reshapeable to (N, channels, size, size); each `ys` must support
      `ys - 1` and hold 1-based integer class labels.
    channels: channels per image. Defaults to the notebook-level CHANNELS.
    size: image height and width. Defaults to the notebook-level SIZE.
    classes: number of classes. Defaults to the notebook-level CLASSES.
  # Yields
    Tuples of (float32 images of shape (N, size, size, channels) passed
    through `preprocess_input`, int32 one-hot labels of shape (N, classes)).
  """
  # Defaults are resolved here, inside the generator body, so the notebook
  # globals are only read once iteration actually starts.
  channels = CHANNELS if channels is None else channels
  size = SIZE if size is None else size
  classes = CLASSES if classes is None else classes

  for xs, ys in batch_generator:
    images = (xs.reshape((-1, channels, size, size))  # shape (N,C,H,W)
                .transpose((0, 2, 3, 1))  # shape (N,H,W,C)
                .astype(np.float32))
    # Labels are 1-based; shift them to 0-based before one-hot encoding
    yield preprocess_input(images), to_categorical(ys - 1, classes)

In [None]:
# Create train & val preprocessed generators
# `gen_preprocessed_batch` is a generator function, so these calls do no work
# yet: batches are only pulled from the underlying generators once something
# iterates over them.
train_generator = gen_preprocessed_batch(train_generator_orig)
val_generator = gen_preprocessed_batch(val_generator_orig)

In [None]:
# Setup training callbacks
# Checkpoint path template with `val_loss` and `epoch` placeholders
# (presumably filled in by ModelCheckpoint on each save -- confirm)
model_filename = os.path.join(model_dir, "{val_loss:.2f}-{epoch:02d}.hdf5")
# Careful, TensorBoard callback could OOM with large validation set
tensorboard = TensorBoard(log_dir=log_dir)  #, histogram_freq=1, write_images=True)
checkpointer = ModelCheckpoint(model_filename)
# NOTE: the `callbacks=callbacks` argument of the `fit_generator` call later in
# this notebook is commented out, so this list is currently not used there.
callbacks = [tensorboard, checkpointer]

In [None]:
# Custom final dense layer initializer
def my_init(shape, name=None, dim_ordering='tf'):
    """Gaussian initializer scaled by sqrt(1 / fan_in).

    # Arguments
        shape: shape of the weight tensor to initialize.
        name: optional name forwarded to `normal`.
        dim_ordering: forwarded to `get_fans` when computing the fans.
    # Returns
        Whatever `normal` returns for the given shape and scale.
    """
    # Only the fan-in is needed; the fan-out is discarded
    fan_in, _ = get_fans(shape, dim_ordering=dim_ordering)
    scale = np.sqrt(1. / fan_in)
    return normal(shape, scale, name=name)

In [None]:
# Create a "LeNet"-like model
f = 3  # side length of every convolution kernel
inputs = Input(shape=(SIZE, SIZE, CHANNELS))

# Three conv -> max-pool stages with 32, 64, then 128 filters
x = inputs
for nb_filter in (32, 64, 128):
  x = Convolution2D(nb_filter, f, f, init="he_normal", border_mode="same", activation="relu")(x)
  x = MaxPooling2D()(x)

# Classifier head: flatten, one regularized hidden layer, softmax over classes
x = Flatten()(x)
x = Dense(256, init="he_normal", W_regularizer='l2', activation="relu")(x)
# x = Dropout(0.5)(x)
predictions = Dense(CLASSES, init=my_init, activation="softmax")(x)

# Create overall model
model = Model(input=inputs, output=predictions)

In [None]:
# Show the model's layer summary as a sanity check before training
model.summary()

In [None]:
# Compile model
# Categorical cross-entropy matches the one-hot labels and the softmax output;
# accuracy is tracked as an additional metric. The SGD setup below is a
# disabled alternative to the "adam" optimizer.
# optim = SGD(lr=0.01, momentum=0.5, decay=0.0, nesterov=True)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['accuracy'])

In [None]:
# Train the model (all layers; it was built from scratch above)
# Round both sample counts up to a whole number of batches
train_samples = math.ceil(tc/batch_size) * batch_size
val_samples = math.ceil(vc/batch_size) * batch_size
epochs = 5
model.fit_generator(train_generator, samples_per_epoch=train_samples, nb_epoch=epochs,
                    validation_data=val_generator, nb_val_samples=val_samples,
                    class_weight=class_weights,
#                     max_q_size=10000, # vary the queue size
#                     callbacks=callbacks,
                    nb_worker=1, pickle_safe=True)

In [None]:
# Evaluate model on validation set
raw_metrics = model.evaluate_generator(val_generator, val_samples=val_samples)
# Pair each returned value with its metric name so it can be looked up by key
metrics = dict(zip(model.metrics_names, raw_metrics))
metrics

In [None]:
# Re-display the validation metrics computed in the evaluation cell
metrics

In [None]:
# Save model
# The file name embeds the `acc` and `loss` entries of `metrics`, each formatted
# with `.5` precision, so both keys must be present.
filename = "{acc:.5}_acc_{loss:.5}_loss_model.hdf5".format(**metrics)
model.save(os.path.join(model_dir, filename))

In [None]:
# TODO: Monitor size of input queues with callbacks

In [None]:
# TODO: Expand the size of the Dense layer, and perhaps replace Flatten with GlobalAveragePooling2D