# Imports

In [None]:
# Load the autoreload extension and reload all modules before each cell runs,
# so edits to project code (e.g. `breast_cancer.input_data`) are picked up
# without restarting the kernel. Figures are rendered inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import math
import os

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import tensorflow as tf
import pyspark.sql.functions as F

from breast_cancer import input_data

# Default figure size (width, height in inches) for all matplotlib plots.
plt.rcParams['figure.figsize'] = (10, 6)

In [None]:
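# NOTE: later cells use a `spark` session that is never created in this
# notebook, so it must already exist in the kernel (e.g. a PySpark-launched
# kernel). If it does not, uncomment the lines below to create one.
# from pyspark.sql import SparkSession
# spark = (SparkSession.builder.appName("KerasResNet50").getOrCreate())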

# Settings

In [None]:
# Side length of each square image; used when reshaping samples and passed
# to the `input_data` readers.
size = 256
# Number of image channels (images are written in 'RGB' mode).
channels = 3
# Number of values in one flattened image sample.
features = size * size * channels
# Number of classes -- presumably the tumor scores 1-3 that the class
# directories below are created for.
classes = 3
# Passed to `read_train_data` -- presumably the fraction of the full training
# set to sample (TODO confirm in `input_data`).
p = 0.01
# Same as `p`, but passed to `read_val_data` for the validation set.
val_p = 0.01
# When True, both DataFrames are cached after being read.
use_caching = False
# Forwarded to the `input_data` readers -- exact semantics not visible here
# (TODO confirm in `input_data`).
normalize_class_distribution = False
# Forwarded to the `input_data` readers -- presumably seeds the sampling.
seed = 123

# Read in train & val data

In [None]:
# Read and sample from full DataFrames
# NOTE: `spark` is not defined in this notebook; it must already exist in the
# kernel (or be created via SparkSession) before this cell is run.
train_df = input_data.read_train_data(spark, size, channels, p, normalize_class_distribution, seed)
val_df = input_data.read_val_data(spark, size, channels, val_p, normalize_class_distribution, seed)

In [None]:
# # Save DataFrames (Optional)
# mode = "error"
# tr_sample_filename = os.path.join("data", "train_{}_sample_{}.parquet".format(p, size))
# val_sample_filename = os.path.join("data", "val_{}_sample_{}.parquet".format(val_p, size))
# train_df.write.mode(mode).save(tr_sample_filename, format="parquet")
# val_df.write.mode(mode).save(val_sample_filename, format="parquet")

In [None]:
# Optionally mark both DataFrames for caching (controlled by the
# `use_caching` setting) so the repeated actions below do not recompute them.
if use_caching:
  train_df.cache()
  val_df.cache()

In [None]:
# Explore class distributions.
# Shows, for the training and then the validation DataFrame, the number of
# rows per `tumor_score` value.
for df in [train_df, val_df]:
  df.select("tumor_score").groupBy("tumor_score").count().show()

In [None]:
# Row counts of the sampled train / val DataFrames; reused by the duplicate
# check in the next cell. The trailing comment records counts from a previous
# run (presumably with different sampling settings -- verify).
tc = train_df.count()
vc = val_df.count()
print(tc, vc)  # 3560187 910918

In [None]:
# Sanity check that there are no duplicates.
# Dropping fully identical rows must leave the counts (`tc`, `vc`) computed
# in the previous cell unchanged.
assert train_df.dropDuplicates().count() == tc
assert val_df.dropDuplicates().count() == vc

## Compute image channel means

In [None]:
# Compute the channel means of the train and val DataFrames with the project
# helper `input_data.compute_channel_means`, then print the shape of the
# training result and both results. The trailing comments record the values
# observed in an earlier run.
tr_means = input_data.compute_channel_means(train_df, channels, size)
val_means = input_data.compute_channel_means(val_df, channels, size)
print(tr_means.shape)
print(tr_means, val_means)
# Train: [ 194.27633667  145.3067627   181.27861023]
# Val: [ 192.92971802  142.83534241  180.18870544]

## Save every image as a JPEG

In [None]:
def array_to_img(x, channels, size):
  """Convert a flattened, channel-first pixel array into a PIL RGB image.

  Args:
    x: NumPy array holding `channels * size * size` values, laid out
      channel-first (C, H, W) once reshaped.
    channels: Number of channels in `x`. The image is always built in 'RGB'
      mode.
    size: Height and width of the (square) image in pixels.

  Returns:
    A PIL image in 'RGB' mode built from the values of `x` cast to uint8.
  """
  channel_first = x.reshape((channels, size, size))
  # PIL expects channel-last data: (C, H, W) -> (H, W, C).
  channel_last = channel_first.transpose((1, 2, 0))
  pixels = channel_last.astype('uint8')
  return Image.fromarray(pixels, 'RGB')

In [None]:
def helper(row, channels, size, save_dir):
  """Save the image stored in one DataFrame row as a JPEG file.

  The file is written to
  `<save_dir>/<tumor_score>/<__INDEX>_<slide_num>_<random int>.jpeg`.

  Args:
    row: Row exposing `tumor_score`, `slide_num`, `sample` (an object whose
      `.values` attribute is the flattened pixel array) and `__INDEX`.
    channels: Number of image channels, forwarded to `array_to_img`.
    size: Side length of the square image, forwarded to `array_to_img`.
    save_dir: Root directory containing one sub-directory per tumor score.
  """
  tumor_score = row.tumor_score
  sample = row.sample.values
  img = array_to_img(sample, channels, size)
  # A random suffix is appended on top of the row index and slide number to
  # further reduce the chance of filename collisions.
  filename = '{index}_{slide_num}_{hash}.jpeg'.format(
      index=row["__INDEX"], slide_num=row.slide_num,
      hash=np.random.randint(10000))
  class_dir = os.path.join(save_dir, str(tumor_score))
  # Do not rely on the class directory having been created beforehand (e.g.
  # an unexpected score, or a machine where the setup cell was never run).
  # `exist_ok` keeps this safe when several workers create it concurrently.
  os.makedirs(class_dir, exist_ok=True)
  path = os.path.join(class_dir, filename)
  img.save(path)

In [None]:
# Root output directories for the saved JPEGs, one per stage, laid out as
# images/<stage>/<p> where <p> is the `p` / `val_p` setting of that stage.
tr_save_dir = "images/{stage}/{p}".format(stage="train", p=p)
val_save_dir = "images/{stage}/{p}".format(stage="val", p=val_p)
print(tr_save_dir, val_save_dir)

In [None]:
%%bash -s "$tr_save_dir" "$val_save_dir"
# Create one sub-directory per class (1, 2, 3) under each save directory
# ($1 = train dir, $2 = val dir) and make both trees world-writable so the
# image-saving step can write into them.
# NOTE(review): mode 777 is very permissive -- tighten if these directories
# live on a shared machine.
set -eu  # abort on the first failing command or on a missing argument
for save_dir in "$1" "$2"
do
  for i in 1 2 3
  do
    # Quoted so that paths containing spaces or glob characters stay intact.
    sudo mkdir -p "$save_dir/$i"
  done
  sudo chmod -R 777 "$save_dir"
done

In [None]:
# Note: Use this if the DataFrame doesn't have an __INDEX column yet.
# `helper` reads row["__INDEX"] to build each output filename, so both
# DataFrames need this column before the images are saved. Per the Spark
# documentation, the generated ids are unique and increasing but not
# consecutive.
train_df = train_df.withColumn("__INDEX", F.monotonically_increasing_id())
val_df = val_df.withColumn("__INDEX", F.monotonically_increasing_id())

In [None]:
# Write every row of each DataFrame to disk as a JPEG by calling `helper` on
# it; `foreach` is used purely for this side effect.
# NOTE(review): on a multi-node cluster the function presumably runs on the
# executors, so files land on each executor's local filesystem -- confirm the
# save directories exist and are writable there.
train_df.rdd.foreach(lambda row: helper(row, channels, size, tr_save_dir))
val_df.rdd.foreach(lambda row: helper(row, channels, size, val_save_dir))

---

In [None]:
def show_random_image(save_dir, num_classes=3):
  """Display a randomly chosen saved image from a random class directory.

  A class `c` is drawn uniformly from 1..num_classes, then a file is drawn
  uniformly from `<save_dir>/<c>`. The chosen path is printed and the image
  is shown with matplotlib.

  Args:
    save_dir: Root directory containing one sub-directory per class.
    num_classes: Number of class directories (named 1..num_classes).

  Raises:
    ValueError: If the selected class directory contains no files.
  """
  c = np.random.randint(1, num_classes + 1)
  class_dir = os.path.join(save_dir, str(c))
  files = os.listdir(class_dir)
  if not files:
    # Without this guard np.random.randint(0, 0) raises an opaque
    # "low >= high" error that hides the actual cause.
    raise ValueError("No images found in {}".format(class_dir))
  i = np.random.randint(0, len(files))
  fname = os.path.join(class_dir, files[i])
  print(fname)
  img = Image.open(fname)
  plt.imshow(img)

In [None]:
# Spot-check the export by displaying one random saved training image.
show_random_image(tr_save_dir)