In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

plt.rcParams['figure.figsize'] = (10, 6)

# Read in train & val data

In [None]:
# Load both splits with the same reader call, then display the resulting DataFrames.
train_df, val_df = (
    sqlContext.read.load(path)
    for path in ("data/train_100_grayscale.parquet", "data/val_100_grayscale.parquet")
)
train_df, val_df

In [None]:
# Row counts for the training and validation splits.
tc, vc = train_df.count(), val_df.count()
(tc, vc)  # 100

In [None]:
# Per-class example counts: training split first, then validation split.
(train_df
   .select("tumor_score")
   .groupBy("tumor_score")
   .count()
   .show())
(val_df
   .select("tumor_score")
   .groupBy("tumor_score")
   .count()
   .show())

In [None]:
# Number of input channels.
C = 1
# Length of one flattened sample vector, measured on the first training row.
SIZE = len(train_df.first()["sample"].toArray())
SIZE

# Create batch generator

In [None]:
def gen_batch(rdd, batch_size=32):
  """
  RDD data generator.

  Generator that cycles through the data and yields a
  batch at a time, reinitializing the iterator as needed
  to continue yielding batches.

  Args:
    rdd: A PySpark RDD containing the training data. Each row must
      expose a `sample` vector (with a `toArray()` method) and a
      `tumor_score` label.
    batch_size: Size of batches to return.

  Yields:
    A tuple `(x_batch, y_batch)` where `x_batch` is a float32 array
    with one row per example and `y_batch` is an int32 array of
    labels, both of length `batch_size`.

  Raises:
    ValueError: If the RDD contains no rows, since no batch can ever
      be produced from it.
  """
  iterator = rdd.toLocalIterator()
  while True:
    features = []
    labels = []
    for _ in range(batch_size):
      # Generate batch
      try:
        row = next(iterator)
      except StopIteration:
        # Restart iterator
        iterator = rdd.toLocalIterator()
        # A freshly created iterator that is already exhausted means the
        # RDD is empty; fail loudly rather than leaking StopIteration out
        # of the generator body.
        row = next(iterator, None)
        if row is None:
          raise ValueError("Cannot generate batches from an empty RDD.")
      # Use toArray() (as extract_data does) so that sparse vectors are
      # expanded to their full dense length instead of only their
      # non-zero entries.
      features.append(row.sample.toArray())
      labels.append(row.tumor_score)
    x_batch = np.array(features).astype(np.float32)
    y_batch = np.array(labels).astype(np.int32)
    yield x_batch, y_batch

In [None]:
# Endless batch generator over the training rows.
generator = gen_batch(train_df.rdd, batch_size=32)

In [None]:
# Pull one batch from the training generator and display it.
x, y = next(generator)
x, y

In [None]:
# Sanity-check the batch: shapes and dtypes of the features and labels.
x.shape, x.dtype, y.shape, y.dtype

# Get validation data

In [None]:
# Display the validation DataFrame.
val_df

In [None]:
# Take only a few validation rows for inspection.
rows = val_df.take(3)  # small sample; .collect() would fetch every row

In [None]:
# Convert the sampled rows into a float32 feature matrix and a label vector.
x = np.array([
    row["sample"].toArray().astype(np.float32)
    for row in rows
])
y = np.array([row["tumor_score"] for row in rows])

In [None]:
# Check dtypes and shapes of the converted validation sample.
x.dtype, x.shape, y.dtype, y.shape

In [None]:
def extract_data(df):
  """
  Materialize a PySpark DataFrame as a pair of NumPy arrays.

  All rows are collected to the driver; each row's `sample` vector is
  converted to a float32 array and its `tumor_score` is kept as the label.

  Args:
    df: A PySpark DataFrame whose rows have `sample` and `tumor_score` fields.

  Returns:
    A tuple `(x, y)`: the stacked samples and the corresponding labels.
  """
  samples = []
  scores = []
  for record in df.collect():
    samples.append(record.sample.toArray().astype(np.float32))
    scores.append(record.tumor_score)
  return np.array(samples), np.array(scores)

# Create convnet model graph
Create network:
  conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3 -> affine1 -> relu1 -> dropout -> affine2 -> softmax

In [None]:
# Build the convnet graph from scratch so that re-running this cell does not
# accumulate duplicate ops in the default graph.
tf.reset_default_graph()

# Hyperparams & Settings
classes = 3  # Number of output classes
C = 1  # Number of input channels (dimensionality of input depth)
Hin = 256  # Input height
Win = 256  # Input width
features = C * Hin * Win  # Length of one flattened input example (65536)
Hf = 3  # conv filter height
Wf = 3  # conv filter width
Hfp = 2  # pool filter height
Wfp = 2  # pool filter width
stride = 1  # conv stride
pstride = 2  # pool stride
pad = 1  # For same dimensions, (Hf - stride) / 2  (unused: convs below use padding="SAME")
F1 = 32  # num conv filters in conv1
F2 = 32  # num conv filters in conv2
F3 = 32  # num conv filters in conv3
N1 = 512  # num nodes in affine1
lr = 1e-4 # learning rate

# Inputs
with tf.name_scope("input") as scope:
  x = tf.placeholder(tf.float32, [None, features], name="x")
  # Flattened rows are unpacked as (N,C,H,W) and then moved to channels-last.
  x_image = tf.transpose(tf.reshape(x, [-1, C, Hin, Win]), perm=[0,2,3,1])  # shape (N,H,W,C)
  y_ = tf.placeholder(tf.int64, [None, ], name="y_")
  # Labels are shifted down by one before one-hot encoding.
  # NOTE(review): this assumes tumor_score values are 1..classes -- confirm.
  y_one_hot = tf.one_hot(y_-1, classes)  # or use sparse cross entropy
  tf.summary.image("x", x_image)
  tf.summary.histogram("y", y_)

# Conv layer 1: conv1 -> relu1 -> pool1
with tf.name_scope("conv1") as scope:
  # Weights are scaled by sqrt(2 / fan_in), with fan_in = Hf*Wf*C.
  W = tf.Variable(tf.random_normal([Hf, Wf, C, F1]) * np.sqrt(2.0/(Hf*Wf*C)), name="W")
  b = tf.Variable(tf.zeros([F1]), name="b")
  conv = tf.nn.conv2d(x_image, W, [1,stride,stride,1], padding="SAME") + b
  relu = tf.nn.relu(conv)
  pool = tf.nn.max_pool(relu, ksize=[1,Hfp,Wfp,1], strides=[1,pstride,pstride,1], padding="SAME")
  tf.summary.image("conv1", tf.transpose(W, [3,0,1,2]), max_outputs=F1)  # transpose to [N,H,W,C]

# Conv layer 2: conv2 -> relu2 -> pool2
with tf.name_scope("conv2") as scope:
  W = tf.Variable(tf.random_normal([Hf, Wf, F1, F2]) * np.sqrt(2.0/(Hf*Wf*F1)), name="W")
  b = tf.Variable(tf.zeros([F2]), name="b")
  conv = tf.nn.conv2d(pool, W, [1,stride,stride,1], padding="SAME") + b
  relu = tf.nn.relu(conv)
  pool = tf.nn.max_pool(relu, ksize=[1,Hfp,Wfp,1], strides=[1,pstride,pstride,1], padding="SAME")

# Conv layer 3: conv3 -> relu3 -> pool3
with tf.name_scope("conv3") as scope:
  W = tf.Variable(tf.random_normal([Hf, Wf, F2, F3]) * np.sqrt(2.0/(Hf*Wf*F2)), name="W")
  b = tf.Variable(tf.zeros([F3]), name="b")
  conv = tf.nn.conv2d(pool, W, [1,stride,stride,1], padding="SAME") + b
  relu = tf.nn.relu(conv)
  pool = tf.nn.max_pool(relu, ksize=[1,Hfp,Wfp,1], strides=[1,pstride,pstride,1], padding="SAME")

# Affine layer 1:  affine1 -> relu1 -> dropout
with tf.name_scope("affine1") as scope:
  # Three stride-2 pools shrink each spatial dimension by a factor of 2**3.
  D = F3 * (Hin // 2**3) * (Win // 2**3)
  W = tf.Variable(tf.random_normal([D,N1]) * np.sqrt(2.0/D), name="W")
  b = tf.Variable(tf.zeros([N1]), name="b")
  affine = tf.matmul(tf.reshape(pool, [-1,D]), W) + b
  relu = tf.nn.relu(affine)
  # keep_prob is fed at run time so dropout can be disabled for evaluation.
  keep_prob = tf.placeholder(tf.float32, name="keep_prob")
  dropout = tf.nn.dropout(relu, keep_prob)

# Affine layer 2:  affine2 -> softmax
with tf.name_scope("affine2") as scope:
  W = tf.Variable(tf.random_normal([N1,classes]) * np.sqrt(2.0/N1), name="W")
  b = tf.Variable(tf.zeros([classes]), name="b")
  logits = tf.matmul(dropout, W) + b
  probs = tf.nn.softmax(logits)
  tf.summary.histogram("logits", logits)
  tf.summary.histogram("probs", probs)

# Loss
with tf.name_scope("loss") as scope:
  # Pass labels/logits by keyword: the positional order is not accepted by
  # newer TensorFlow releases, and keywords rule out swapping the two.
  cross_entropy_loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(labels=y_one_hot, logits=logits))
  tf.summary.scalar("loss", cross_entropy_loss)

# Train
# Alternative optimizer:
# train_step = tf.train.GradientDescentOptimizer(lr).minimize(cross_entropy_loss)
train_step = tf.train.AdamOptimizer(lr).minimize(cross_entropy_loss)

# Eval metrics
with tf.name_scope("eval") as scope:
  correct_pred = tf.equal(tf.argmax(logits,1), tf.argmax(y_one_hot,1))
  accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
  tf.summary.scalar("accuracy", accuracy)

In [None]:
# Run `tensorboard --logdir=tf_logs --host=localhost --debug --reload_interval 5`
with tf.Session() as sess:
  # Summaries
  log_dir = "tf_logs"
  summary_op = tf.summary.merge_all()
  # tf.summary.FileWriter replaces the deprecated tf.train.SummaryWriter and
  # belongs to the same tf.summary API used for the summary ops.
  train_writer = tf.summary.FileWriter(log_dir + "/train", sess.graph)
  val_writer = tf.summary.FileWriter(log_dir + "/val")

  try:
    # Data Gen
    train_generator = gen_batch(train_df.rdd, 64)
    x_val, y_val = extract_data(val_df)

    # Train
    sess.run(tf.global_variables_initializer())
    steps = 100
    for i in range(steps):
      xs, ys = next(train_generator)
      sess.run(train_step, feed_dict={x: xs, y_:ys, keep_prob:0.5})
      if i % 10 == 0:
        # train stats
        # NOTE(review): dropout is still active here (keep_prob 0.5), so the
        # train metrics are not directly comparable to the val metrics below.
        summary, train_acc = sess.run([summary_op, accuracy], feed_dict={x: xs, y_:ys, keep_prob:0.5})
        train_writer.add_summary(summary, i)
        # val stats (dropout disabled)
        summary, val_acc = sess.run([summary_op, accuracy], feed_dict={x: x_val, y_:y_val, keep_prob:1})
        val_writer.add_summary(summary, i)
        print("Iter: {}, \t Train Accuracy: {:.4f}, \t Val Accuracy: {:.4f}".format(i, train_acc, val_acc))
  finally:
    # close() flushes pending events and releases the event files, even if
    # training is interrupted or fails part-way through.
    train_writer.close()
    val_writer.close()

In [None]:
# 1. Add TensorBoard summaries and track.
# 2. Plug into larger dataset.
# 3. Run on cluster.
# 4. Explore saving to TFRecord format, then reading from files shared on DFS (gfs).