# Imports

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import math
import os

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import tensorflow as tf

from breast_cancer import input_data

# Default figure size, (width, height), applied to every matplotlib figure below.
plt.rcParams['figure.figsize'] = (10, 6)

In [None]:
# NOTE(review): the cells below use a `spark` session that no visible cell creates;
# presumably the kernel injects it (e.g. a notebook launched through PySpark) — confirm.
# If it is not predefined, uncomment the two lines below to create it.
# from pyspark.sql import SparkSession
# spark = (SparkSession.builder.appName("KerasResNet50").getOrCreate())

# Settings

In [None]:
# Image geometry: each sample is reshaped to (CHANNELS, SIZE, SIZE) before export.
SIZE = 256
CHANNELS = 3
# Length of one flattened image vector.
FEATURES = CHANNELS * SIZE * SIZE
# Presumably the number of tumor-score classes (directories 1..3 are created below) — confirm.
CLASSES = 3

# Fourth argument of read_train_data / read_val_data; presumably the sampling
# fraction for each split — TODO confirm against input_data.
p, val_p = 1, 0.01

# When True, both DataFrames are cached right after loading.
use_caching = False

# Read in train & val data

In [None]:
# Read and sample from full DataFrames
# NOTE(review): `spark` is not defined in any visible cell (the SparkSession cell
# is commented out) — presumably the kernel provides it; confirm before Run All.
# The same keep_class_distribution flag is passed to both the train and val readers.
keep_class_distribution = False
train_df = input_data.read_train_data(spark, SIZE, CHANNELS, p, keep_class_distribution)
val_df = input_data.read_val_data(spark, SIZE, CHANNELS, val_p, keep_class_distribution)

In [None]:
# Optional, disabled by default: write the sampled DataFrames as Parquet under
# data/, with the sampling setting and image size embedded in each filename.
# Uncomment the lines below to enable.
# mode = "error"
# tr_sample_filename = os.path.join("data", "train_{}_sample_{}.parquet".format(p, SIZE))
# val_sample_filename = os.path.join("data", "val_{}_sample_{}.parquet".format(val_p, SIZE))
# train_df.write.mode(mode).save(tr_sample_filename, format="parquet")
# val_df.write.mode(mode).save(val_sample_filename, format="parquet")

In [None]:
# Mark both splits for caching only when the use_caching setting is enabled.
if use_caching:
  for cached_df in (train_df, val_df):
    cached_df.cache()

In [None]:
# Show how many rows each tumor_score value has, first for train and then for val.
for df in (train_df, val_df):
  score_counts = df.select("tumor_score").groupBy("tumor_score").count()
  score_counts.show()

In [None]:
# Total number of rows in the train and val DataFrames, in that order.
tc, vc = train_df.count(), val_df.count()
print(tc, vc)

## Compute image channel means

In [None]:
# Channel means are computed by input_data.compute_channel_means.
# NOTE(review): the means are taken from val_df rather than train_df, and the
# arguments are passed as (CHANNELS, SIZE) while the read_* calls use
# (SIZE, CHANNELS) — confirm both are intended.
means = input_data.compute_channel_means(val_df, CHANNELS, SIZE)
print(means.shape)
print(means)

## Save every image as a JPEG to a distributed filesystem

In [None]:
def array_to_img(x, channels, size):
  """Convert a flat, channel-first pixel array into an RGB PIL image.

  Args:
    x: Array holding channels * size * size values, laid out channel-first.
    channels: Number of colour channels (the 'RGB' mode used below expects 3).
    size: Height and width of the square image.

  Returns:
    The PIL image built from the uint8-cast pixels.
  """
  channel_first = x.reshape((channels, size, size))
  # Move the channel axis last: (C, H, W) -> (H, W, C).
  channel_last = channel_first.transpose((1, 2, 0))
  pixels = channel_last.astype('uint8')
  return Image.fromarray(pixels, 'RGB')

In [None]:
def helper(row, channels, size, save_dir):
  """Save one row's image as a JPEG under ``save_dir/<tumor_score>/``.

  Args:
    row: Row exposing ``tumor_score``, ``slide_num``, ``sample.values`` and an
      ``"__INDEX"`` item.
    channels: Number of image channels, forwarded to ``array_to_img``.
    size: Image side length, forwarded to ``array_to_img``.
    save_dir: Root output directory; the file is written into the
      sub-directory named after the row's tumor score.
  """
  tumor_score = row.tumor_score
  sample = row.sample.values
  img = array_to_img(sample, channels, size)
  # The random suffix only lowers the chance of a filename clash; it is not a
  # hash of the image content.
  filename = '{index}_{slide_num}_{hash}.jpeg'.format(
      index=row["__INDEX"], slide_num=row.slide_num, hash=np.random.randint(10000))
  class_dir = os.path.join(save_dir, str(tumor_score))
  # Create the class directory on demand so saving does not fail when it was
  # not prepared beforehand; a directory that already exists is left untouched.
  os.makedirs(class_dir, exist_ok=True)
  path = os.path.join(class_dir, filename)
  img.save(path)

In [None]:
def show_random_image(save_dir, classes=(1, 2, 3)):
  """Display one randomly chosen image from the class sub-directories of save_dir.

  A class is picked at random among those whose directory exists and holds at
  least one file, then a file is picked at random inside it. Missing or empty
  class directories are skipped instead of causing a crash.

  Args:
    save_dir: Root directory containing one sub-directory per class.
    classes: Class labels whose sub-directories are considered.

  Raises:
    ValueError: If none of the class directories contains a file.
  """
  candidates = []
  for c in classes:
    class_dir = os.path.join(save_dir, str(c))
    if not os.path.isdir(class_dir):
      continue
    files = sorted(os.listdir(class_dir))
    if files:
      candidates.append((class_dir, files))
  if not candidates:
    raise ValueError(
        "no images found under {!r} for classes {}".format(save_dir, list(classes)))
  class_dir, files = candidates[np.random.randint(len(candidates))]
  fname = os.path.join(class_dir, files[np.random.randint(len(files))])
  print(fname)
  img = Image.open(fname)
  plt.imshow(img)

In [None]:
# Output roots are keyed by split name and by that split's p / val_p setting.
save_dir_template = "images/{stage}/{p}"
tr_save_dir = save_dir_template.format(stage="train", p=p)
val_save_dir = save_dir_template.format(stage="val", p=val_p)
print(tr_save_dir, val_save_dir)

In [None]:
%%bash -s "$tr_save_dir" "$val_save_dir"
# Create one sub-directory per class (1, 2, 3) under the train root ($1) and
# the val root ($2). Expansions are quoted so that a path containing spaces or
# glob characters is passed through as a single argument.
for i in 1 2 3
do
  sudo mkdir -p "$1/$i"
  sudo mkdir -p "$2/$i"
done
# NOTE(review): mode 777 makes both trees world-writable — presumably so the
# Spark workers can write into them; confirm a narrower mode is not enough.
sudo chmod 777 -R "$1"
sudo chmod 777 -R "$2"

In [None]:
# Export every row of each split as a JPEG by running helper() on it; foreach is
# used only for this side effect and returns nothing useful.
# NOTE(review): the lambdas presumably run on the Spark workers, so the output
# directories must be writable from wherever those run — confirm.
train_df.rdd.foreach(lambda row: helper(row, CHANNELS, SIZE, tr_save_dir))
val_df.rdd.foreach(lambda row: helper(row, CHANNELS, SIZE, val_save_dir))

In [None]:
# Spot check: display one randomly chosen image from the training output directory.
show_random_image(tr_save_dir)