In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

plt.rcParams['figure.figsize'] = (10, 6)

# Read in train & val data

In [None]:
# Load the grayscale train/validation DataFrames.
# NOTE(review): `sqlContext` is not defined in this notebook -- presumably it is
# injected by the PySpark kernel; confirm before running on a plain kernel.
train_path = "data/train_100_grayscale.parquet"
val_path = "data/val_100_grayscale.parquet"
train_df = sqlContext.read.load(train_path)
val_df = sqlContext.read.load(val_path)
train_df, val_df

In [None]:
# Row counts of the two splits (training first, then validation).
tc, vc = train_df.count(), val_df.count()
tc, vc  # original note: 100

In [None]:
# Class balance: number of rows per `tumor_score`, training split first.
for split_df in (train_df, val_df):
  split_df.select("tumor_score").groupBy("tumor_score").count().show()

In [None]:
# C: number of image channels.
C = 1
# SIZE: length of the flattened `sample` array, measured on the first training row.
first_row = train_df.first()
SIZE = first_row.sample.toArray().shape[0]
SIZE

In [None]:
def extract_data(df):
  """Collect a DataFrame to the driver and return it as NumPy arrays.

  Args:
    df: object whose `collect()` returns rows exposing `sample` (with a
      `toArray()` method) and `tumor_score`.

  Returns:
    Tuple `(x, y)`: `x` stacks every row's sample cast to float32, `y` holds
    the matching `tumor_score` values in the same order.
  """
  samples = []
  labels = []
  for record in df.collect():
    samples.append(record.sample.toArray().astype(np.float32))
    labels.append(record.tumor_score)
  return np.array(samples), np.array(labels)

# Convert data to TFRecords

In [None]:
import os
import uuid

def _bytes_feature(value):
  """Wrap a single bytes value in a `tf.train.Feature`."""
  bytes_list = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """Wrap a single float value in a `tf.train.Feature`."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a single integer value in a `tf.train.Feature`."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def encode_and_write(rows, folder):
  """Serialize an iterable of rows into one new TFRecords file in `folder`.

  Every row becomes a `tf.train.Example` with the features `__INDEX`,
  `slide_num`, `tumor_score` (int64), `molecular_score` (float) and `sample`
  (the sample array cast to float32 and stored as raw bytes).

  Args:
    rows: iterable of rows exposing `row['__INDEX']`, `row.slide_num`,
      `row.tumor_score`, `row.molecular_score` and `row.sample.toArray()`.
    folder: directory in which the `<uuid>.tfrecords` file is created.
  """
  # TODO: A uuid is highly unlikely to overlap, but we should implement a truly
  # unique identifier.  We could possibly `mapPartitions` to lists, then
  # `zipWithUniqueID` on each list, then write each entry, using the ID
  # as the filename.
  filename = os.path.join(folder, str(uuid.uuid4()) + ".tfrecords")
  writer = tf.python_io.TFRecordWriter(filename)
  try:
    for row in rows:
      # `tobytes()` replaces the deprecated `tostring()` alias (same bytes).
      # TODO: should use the uint8 DataFrame instead of casting to float32.
      sample_bytes = row.sample.toArray().astype(np.float32).tobytes()
      example = tf.train.Example(features=tf.train.Features(feature={
            '__INDEX': _int64_feature(row['__INDEX']),
            'slide_num': _int64_feature(row.slide_num),
            'tumor_score': _int64_feature(row.tumor_score),
            'molecular_score': _float_feature(row.molecular_score),
            'sample': _bytes_feature(sample_bytes)
            # TODO: ENCODE SHAPE
          }))
      writer.write(example.SerializeToString())
  finally:
    # Always release the file handle, even if a row fails to serialize.
    writer.close()

In [None]:
def read_and_decode(filename_queue):
  """Read one serialized Example from a TFRecords queue and decode it.

  Args:
    filename_queue: queue of TFRecords filenames handed to `tf.TFRecordReader`.

  Returns:
    Tuple `(index, slide_num, tumor_score, molecular_score, sample)` of
    tensors for a single example; `sample` is the raw bytes decoded as float32.
  """
  feature_spec = {
      '__INDEX': tf.FixedLenFeature([], tf.int64),
      'slide_num': tf.FixedLenFeature([], tf.int64),
      'tumor_score': tf.FixedLenFeature([], tf.int64),
      # Written through a FloatList, which stores 32-bit floats.
      'molecular_score': tf.FixedLenFeature([], tf.float32),
      'sample': tf.FixedLenFeature([], tf.string)
      # TODO: DECODE SHAPE
  }
  reader = tf.TFRecordReader()
  # The key returned by the reader is not needed.
  _, serialized_example = reader.read(filename_queue)
  parsed = tf.parse_single_example(serialized_example, features=feature_spec)
  # TODO: should use the uint8 DataFrame; must match the dtype used when writing.
  sample = tf.decode_raw(parsed["sample"], tf.float32)
  return (parsed["__INDEX"], parsed["slide_num"], parsed["tumor_score"],
          parsed["molecular_score"], sample)

# Convert rdd partitions -> tfrecord files

In [None]:
# Output folders for the TFRecords files, one per split.
base_dir = "data/tf"
train_dir = os.path.join(base_dir, "train")
val_dir = os.path.join(base_dir, "val")
for folder in [train_dir, val_dir]:
  if not os.path.exists(folder):
    # makedirs also creates `base_dir` itself; a plain mkdir fails on a fresh
    # checkout where "data/tf" does not exist yet.
    os.makedirs(folder)
train_dir, val_dir

In [None]:
# Write each split as 3 TFRecords files (one per partition).
# NOTE: every run adds new uuid-named files and nothing here removes earlier
# ones, so re-running this cell leaves duplicate records in the folders.
# TODO: Try `mapPartitionsWithIndex` and force the file-writing side effect by calling `collect()`.
train_parts = train_df.repartition(3)
val_parts = val_df.repartition(3)
train_parts.foreachPartition(lambda rows: encode_and_write(rows, train_dir))
val_parts.foreachPartition(lambda rows: encode_and_write(rows, val_dir))

# Create convnet model graph
Create network:
  conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3 -> affine1 -> relu4 -> dropout -> affine2 -> softmax

In [None]:
# Discard any previously built graph before constructing the model.
tf.reset_default_graph()

# Hyperparams & Settings
# -- input pipeline
folder = train_dir  # directory holding the training TFRecords files
batch_size = 100  #64
min_after_dequeue = 1000
capacity = min_after_dequeue + 4 * batch_size
num_threads = 2
num_epochs = 100

# -- data dimensions
classes = 3
C = 1  # Number of input channels (dimensionality of input depth)
Hin = 256  # Input height
Win = 256  # Input width
features = C * Hin * Win  # 65536 values per flattened sample

# -- conv / pool geometry
Hf = 3  # conv filter height
Wf = 3  # conv filter width
stride = 1  # conv stride
pad = 1  # For same dimensions, (Hf - stride) / 2
Hfp = 2  # pool filter height
Wfp = 2  # pool filter width
pstride = 2  # pool stride

# -- layer sizes
F1 = 32  # num conv filters in conv1
F2 = 32  # num conv filters in conv2
F3 = 32  # num conv filters in conv3
N1 = 512  # num nodes in affine1

# -- optimizer
lr = 1e-4 # learning rate

# Inputs
with tf.name_scope("input") as scope:
  # Input queues.  Only hand real TFRecords files to the reader: os.listdir
  # returns every entry in the folder, including hidden files and
  # sub-directories that cannot be parsed as records.
  filenames = sorted(os.path.join(folder, f) for f in os.listdir(folder)
                     if f.endswith(".tfrecords"))
  if not filenames:
    raise ValueError("No .tfrecords files found in {}".format(folder))
  filename_queue = tf.train.string_input_producer(filenames, num_epochs=num_epochs)
  index, slide_num, tumor_score, molecular_score, sample = read_and_decode(filename_queue)
  # Pin the flattened sample length so it can be batched; includes C so it
  # agrees with the reshape to [-1, C, Hin, Win] below.
  sample.set_shape([C*Hin*Win])
  x, y_ = tf.train.shuffle_batch([sample, tumor_score], batch_size=batch_size, capacity=capacity,
                                 min_after_dequeue=min_after_dequeue, num_threads=num_threads)

  # `x` and `y_` double as feed points: the training cell feeds validation
  # data into them directly.
  x_image = tf.transpose(tf.reshape(x, [-1, C, Hin, Win]), perm=[0,2,3,1])  # shape (N,H,W,C)
  # Shift labels by one before one-hot encoding (assumes tumor_score starts
  # at 1 -- TODO confirm); alternatively use sparse cross entropy.
  y_one_hot = tf.one_hot(y_-1, classes)
  tf.summary.image("x", x_image)
  tf.summary.histogram("y", y_)

def _conv_relu_pool(inputs, in_depth, out_depth):
  """Build one conv -> relu -> max-pool stage inside the current name scope.

  Filter and stride settings (Hf, Wf, stride, Hfp, Wfp, pstride) are read from
  the notebook-level hyperparameters.  Weights are drawn from a normal
  distribution scaled by sqrt(2 / fan_in); biases start at zero.

  Args:
    inputs: tensor passed to `tf.nn.conv2d` (the input block lays images out
      as (N,H,W,C)).
    in_depth: number of input channels.
    out_depth: number of conv filters.

  Returns:
    Tuple `(W, b, conv, relu, pool)` of the created variables and tensors.
  """
  W = tf.Variable(tf.random_normal([Hf, Wf, in_depth, out_depth]) * np.sqrt(2.0/(Hf*Wf*in_depth)), name="W")
  b = tf.Variable(tf.zeros([out_depth]), name="b")
  conv = tf.nn.conv2d(inputs, W, [1,stride,stride,1], padding="SAME") + b
  relu = tf.nn.relu(conv)
  pool = tf.nn.max_pool(relu, ksize=[1,Hfp,Wfp,1], strides=[1,pstride,pstride,1], padding="SAME")
  return W, b, conv, relu, pool

# Conv layer 1: conv1 -> relu1 -> pool1
with tf.name_scope("conv1") as scope:
  W, b, conv, relu, pool = _conv_relu_pool(x_image, C, F1)
  tf.summary.image("conv1", tf.transpose(W, [3,0,1,2]), max_outputs=F1)  # transpose to [F1,H,W,C]

# Conv layer 2: conv2 -> relu2 -> pool2
with tf.name_scope("conv2") as scope:
  W, b, conv, relu, pool = _conv_relu_pool(pool, F1, F2)

# Conv layer 3: conv3 -> relu3 -> pool3
with tf.name_scope("conv3") as scope:
  W, b, conv, relu, pool = _conv_relu_pool(pool, F2, F3)

# Affine layer 1:  affine1 -> relu4 -> dropout
with tf.name_scope("affine1") as scope:
  # Flattened size of the last pooling output.  Read it from the tensor's
  # static shape: SAME-padded pooling rounds odd sizes up, so the closed-form
  # Hin/2**3 only holds when Hin and Win are multiples of 8.
  pool_dims = pool.get_shape().as_list()[1:]
  if None in pool_dims:
    # Static shape unavailable: fall back to the original closed form.
    D = int(F3*(Hin/2**3)*(Win/2**3))
  else:
    D = int(np.prod(pool_dims))
  W = tf.Variable(tf.random_normal([D,N1]) * np.sqrt(2.0/D), name="W")
  b = tf.Variable(tf.zeros([N1]), name="b")
  affine = tf.matmul(tf.reshape(pool, [-1,D]), W) + b
  relu = tf.nn.relu(affine)
  # Fed at run time: the training cell uses 0.5 for training and 1 for validation.
  keep_prob = tf.placeholder(tf.float32, name="keep_prob")
  dropout = tf.nn.dropout(relu, keep_prob)

# Affine layer 2:  affine2 -> softmax
with tf.name_scope("affine2") as scope:
  # Weights scaled by sqrt(2 / fan_in); zero biases.
  W_init = tf.random_normal([N1, classes]) * np.sqrt(2.0 / N1)
  W = tf.Variable(W_init, name="W")
  b_init = tf.zeros([classes])
  b = tf.Variable(b_init, name="b")
  # Unnormalized class scores and their softmax probabilities.
  logits = tf.matmul(dropout, W) + b
  probs = tf.nn.softmax(logits)
  for tag, tensor in (("logits", logits), ("probs", probs)):
    tf.summary.histogram(tag, tensor)

# Loss
with tf.name_scope("loss") as scope:
  # Pass `logits`/`labels` by keyword: the positional order differs between
  # TensorFlow releases, and newer ones reject positional arguments.
  cross_entropy_loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_one_hot))
  tf.summary.scalar("loss", cross_entropy_loss)

# Train
# Alternative: tf.train.GradientDescentOptimizer(lr).minimize(cross_entropy_loss)
train_step = tf.train.AdamOptimizer(lr).minimize(cross_entropy_loss)

# Eval metrics
with tf.name_scope("eval") as scope:
  # A prediction is correct when the arg-max class matches the one-hot label.
  correct_pred = tf.equal(tf.argmax(logits,1), tf.argmax(y_one_hot,1))
  accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
  tf.summary.scalar("accuracy", accuracy)

In [None]:
# Run `tensorboard --logdir=tf_logs --host=localhost --debug --reload_interval 5`
with tf.Session() as sess:
  # Summaries
  log_dir = "tf_logs"
  summary_op = tf.summary.merge_all()
  # tf.summary.FileWriter replaces the deprecated tf.train.SummaryWriter and
  # matches the tf.summary.* calls used when building the graph.
  train_writer = tf.summary.FileWriter(log_dir + "/train", sess.graph)
  val_writer = tf.summary.FileWriter(log_dir + "/val")

  init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
  sess.run(init_op)

  # Validation data, collected before the queue threads are started.
  # NOTE(review): it is fed straight into `x`/`y_` below; `x` appears to carry
  # a static batch dimension of `batch_size`, so this presumably requires
  # len(x_val) == batch_size -- confirm.
  x_val, y_val = extract_data(val_df)

  # Start input queues
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)

  # Train
  try:
    i = 0
    while not coord.should_stop():
      if i % 10 == 0:
        # Train step + train stats in one run, so the reported batch is the
        # batch being trained on and no extra batch is dequeued and discarded.
        _, summary, train_acc = sess.run([train_step, summary_op, accuracy],
                                         feed_dict={keep_prob:0.5})
        train_writer.add_summary(summary, i)
        # val stats (dropout disabled)
        summary, val_acc = sess.run([summary_op, accuracy], feed_dict={x: x_val, y_:y_val, keep_prob:1})
        val_writer.add_summary(summary, i)
        print("Iter: {}, \t Train Accuracy: {:.4f}, \t Val Accuracy: {:.4f}".format(i, train_acc, val_acc))
      else:
        sess.run(train_step, feed_dict={keep_prob:0.5})
      i += 1
  except tf.errors.OutOfRangeError:
    # Raised by the input queue once `num_epochs` passes are exhausted.
    print("Done training!")
  finally:
    try:
      # Ask threads to stop, then wait for them while the session is still open.
      coord.request_stop()
      coord.join(threads)
    finally:
      # close() flushes pending events, so everything is written before exiting.
      train_writer.close()
      val_writer.close()