In [19]:
# personal imports
from dataloader import DataLoader
import utils
from utils import calculate_auc, auc
from callbacks import *

%load_ext autoreload
%autoreload 2

# python stuffs
import os
import numpy as np
import torchvision.models as models
from torchvision import transforms as trn
import skimage.io
import skimage
import torch.utils.model_zoo as model_zoo
import torch

from keras.callbacks import ReduceLROnPlateau
from keras.models import Model, load_model
from keras.optimizers import SGD, Adam
from keras.layers import Dense, Dropout, GlobalAveragePooling2D, GlobalMaxPooling2D, Flatten, Concatenate 
# Conv2D, Input, Flatten, MaxPooling2D, UpSampling2D, concatenate, Cropping2D, Reshape, BatchNormalization
from keras.applications.vgg19 import VGG19


from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 11008254872900864376
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 1109531646718909695
physical_device_desc: "device: XLA_CPU device"
]


## Data loading
Numpy tensors exist in /scratch/users/gmachi/codex/data/train

In [4]:
# "patches-per-batch": one patch per batch, i.e. all 25 slices of a patch at once
ppb = 1

train_loader = DataLoader(utils.train_dir, batch_size=ppb, transfer=True)
val_loader = DataLoader(utils.val_dir, batch_size=ppb, transfer=True)
test_loader = DataLoader(utils.test_dir, batch_size=ppb, transfer=True)

print("begin sanity checks for shapes...\n")

# Peek at the first batch of every split and report its shapes.
shape_checks = (
    ("train <filenames, data batch, labels>:\n", train_loader),
    ("val <filenames, data batch, labels>:\n", val_loader),
    ("test <filenames, data batch, labels>:\n", test_loader),
)
for header, loader in shape_checks:
    for f, d, l in loader:  # filename, batched data, label
        print(header, len(f), d.shape, l.shape)
        break  # only the first batch is needed


begin sanity checks for shapes...

train <filenames, data batch, labels>:
 25 (25, 3, 96, 96) (25,)
val <filenames, data batch, labels>:
 25 (25, 3, 96, 96) (25,)
test <filenames, data batch, labels>:
 25 (25, 3, 96, 96) (25,)


In [6]:
# Get image summary stats

from utils import labels_dict

def count_files(dir):
    """Return the number of regular files directly inside `dir` (sub-directories are not counted)."""
    with os.scandir(dir) as entries:
        return sum(1 for entry in entries if entry.is_file())

def unique_files(dir):
    """Return the set of distinct ids found in the file names of `dir`.

    The id is whatever follows "reg" in the part of the file name before the
    first underscore. A name without "reg" in that part raises IndexError.
    """
    return {
        fname.split("_")[0].split("reg")[1]
        for fname in os.listdir(dir)
    }

def set_splits(dir):
    """Return `(pos, neg)` label counts over all file names in `dir`.

    Each file name is mapped to an id (the text after "reg", before the first
    underscore) and looked up in `labels_dict`; element 1 of the entry is summed
    as the label (presumably binary 0/1 - verify against `utils.labels_dict`).
    """
    region_ids = []
    for fname in os.listdir(dir):
        prefix = fname.split("_")[0]
        region_ids.append(prefix.split("reg")[1])

    labels = [labels_dict[region_id][1] for region_id in region_ids]
    pos = np.sum(labels)
    neg = len(labels) - pos
    return (pos, neg)
    

# Directories of the three splits, in reporting order
split_dirs = (utils.train_dir, utils.val_dir, utils.test_dir)

print("After augmentation/up-sampling, we have...\n------------------------------------------")
for caption, split_dir in zip(("train set size:", "val set size:", "test set size:"), split_dirs):
    print(caption, count_files(split_dir))

print("\nSee composition of patients in sets...\n--------------------------------------")
for caption, split_dir in zip(
    ("train set unique files:", "val set unique files:", "test set unique files:"), split_dirs
):
    print(caption, unique_files(split_dir))

print("\n(+/-) splits in sets...\n-----------------------")
for caption, split_dir in zip(("train set split:", "val set split:", "test set split:"), split_dirs):
    print(caption, set_splits(split_dir))


After augmentation/up-sampling, we have...
------------------------------------------
train set size: 13069
val set size: 4449
test set size: 4507

See composition of patients in sets...
--------------------------------------
train set unique files: {'007', '024', '014', '004', '015', '012', '008', '034', '027', '020'}
val set unique files: {'016', '011', '030', '023'}
test set unique files: {'005', '017', '019', '006'}

(+/-) splits in sets...
-----------------------
train set split: (6897, 6172)
val set split: (3574, 875)
test set split: (3625, 882)


## Model definition - VGG19

In [16]:
# use appropriate device
# If TensorFlow reports a GPU, build a session with an explicit thread/device
# configuration and register it with Keras; otherwise only print a message.

import tensorflow as tf
from keras import backend as K

# The return value of this call is not used in this cell.
# NOTE(review): `_get_available_gpus` is a private Keras helper - confirm the call is still needed.
K.tensorflow_backend._get_available_gpus()


if tf.test.is_gpu_available():
    print("gpu avilable")
    # NOTE(review): the device scope names '/gpu:1' while the config below asks for
    # device_count GPU=1 - confirm which GPU index is actually intended.
    with K.tf.device('/gpu:1'):
        # 4 intra-op / 4 inter-op threads, soft placement enabled
        config = tf.ConfigProto(intra_op_parallelism_threads=4,\
               inter_op_parallelism_threads=4, allow_soft_placement=True,\
               device_count = {'CPU' : 1, 'GPU' : 1})
        session = tf.Session(config=config)
        # hand the configured session to the Keras backend
        K.set_session(session)
        
        # sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
else:
    # no session is configured on this path
    print("gpu NOT avilable")

gpu NOT avilable


In [9]:
# Print the devices TensorFlow can see (same import and call as in the first cell).
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 10048019050400888493
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 6639925133493565300
physical_device_desc: "device: XLA_CPU device"
]


In [20]:
# Headless VGG19 with ImageNet weights, sized for 96x96 RGB slices
vgg19 = VGG19(include_top=False, weights='imagenet', input_shape=(96, 96, 3))
x = vgg19.output
print("out layer shape:", x.shape)

# Expose the final VGG19 feature map as the model output; pooling and the
# classifier head are applied to these features outside of this model
model = Model(inputs=vgg19.input, outputs=x)
model.summary()

# Mark every pre-trained VGG19 layer as non-trainable
for pretrained_layer in vgg19.layers:
    pretrained_layer.trainable = False

out layer shape: (?, 3, 3, 512)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 96, 96, 3)         0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 96, 96, 64)        1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 96, 96, 64)        36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 48, 48, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 48, 48, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 48, 48, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 24, 24, 

In [21]:
# LR schedule callback: halve the learning rate when val_acc plateaus (patience 2, floor 5e-6)
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=2,
    min_lr=0.000005,
    verbose=1,
)

loss = 'binary_crossentropy'
metrics = ['acc', auc]
optimizer = Adam(lr=0.001)
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

## Transfer learning: apply pre-trained weights to 96x96x3 patch-slices 
This step produces slice-level feature tensors, which are then aggregated into a single representation for the whole patch (96x96x75).

In [53]:
# from dataloader import TransferLoader
from transfer_classifier import *
%load_ext autoreload
%autoreload 2
from keras.callbacks import TensorBoard
from keras.optimizers import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
# def write_log(callback, names, logs, batch_no):
#     for name, value in zip(names, logs):
#         summary = tf.Summary()
#         summary_value = summary.value.add()
#         summary_value.simple_value = value
#         summary_value.tag = name
#         callback.writer.add_summary(summary, batch_no)
#         callback.writer.flush()
        

In [59]:
ppb = 1                  # patches per batch
N, H, W, C = x.shape     # shape of the VGG output tensor (batch dim is replaced below)
N = 25 * ppb             # batch size

# Push an all-zero batch through `pool` just to learn the pooled shape
dummy_tensor = np.zeros(shape=(N, H, W, C))
pooled = pool(dummy_tensor)
print(dummy_tensor.shape, "-->", pooled.shape)

# TL model instantiation
#------------------------
model_t = classify_from_pooled(pooled.shape)

# learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', patience=2, verbose=1, factor=0.5, min_lr=0.000005)

learning_rate = 1e-5
loss = 'binary_crossentropy'
metrics = ['acc', auc]
optimizer = Adam(lr=learning_rate)
model_t.compile(optimizer=optimizer, loss=loss, metrics=metrics)

(25, 3, 3, 512) --> (3, 3, 512)


In [60]:
# directory to save the best model
# file_dir = './TransferModel'
# if not os.path.exists(file_dir):
#     os.mkdir(file_dir)
    
# model_name = 'vgg19_transfer'
# network_filepath = os.path.join(file_dir, model_name + '.h5')

# callback = TensorBoard(file_dir)
# callback.set_model(model_t)

In [83]:
def _pooled_patch(model, batch):
    """Turn one loader batch into the (features, label) pair fed to the patch classifier.

    Args:
        model: feature extractor exposing ``predict_on_batch``.
        batch: ``(filenames, data, labels)`` tuple as yielded by ``DataLoader``.

    Returns:
        ``(features, label)``: the pooled slice features with a leading batch axis
        of size 1, and the patch label as an array of shape ``(1, 1)``.
    """
    _, data, labels = batch
    # the loader yields channels-first slices; the Keras model takes channels-last
    slice_features = model.predict_on_batch(x=data.transpose(0, 2, 3, 1))
    features = pool(slice_features)[np.newaxis, :, :, :]
    # all slices of a patch should share one label, so the first one is used
    label = np.array([labels[0]])[np.newaxis, :]
    return features, label


def train_transfer(model, model_t, epochs=10, val_every=10, patience=3):
    """Train the patch-level classifier `model_t` on features extracted by `model`.

    Each training step runs one patch through `model`, pools the slice features
    and calls ``model_t.train_on_batch``. Every `val_every` batches one
    validation patch is evaluated. Training stops early once the validation
    loss has increased on `patience` consecutive validation checks.

    Changes relative to the previous version:
      * the early-stopping counter is only updated when a fresh validation loss
        exists (it used to be reset on every non-validation batch, so the stop
        condition could never trigger);
      * the per-epoch checkpoint is written once per epoch instead of once per batch;
      * the progress line reports the real batch index;
      * validation features are only computed on batches that are validated.

    Args:
        model: feature extractor (only ``predict_on_batch`` is used).
        model_t: compiled classifier trained with ``train_on_batch``.
        epochs: number of passes over the loaders.
        val_every: validate on every `val_every`-th batch (must be >= 1).
        patience: consecutive validation-loss increases tolerated before stopping.

    Returns:
        ``(train_losses, val_losses)``: lists of the values returned by
        ``train_on_batch`` (one per batch) and ``test_on_batch`` (one per check).

    Raises:
        ValueError: if `val_every` is smaller than 1.
    """
    if val_every < 1:
        raise ValueError("val_every must be >= 1")

    # The pooling/label logic treats a whole batch as ONE patch, so keep this at 1.
    ppb = 1

    train_loader = DataLoader(utils.data_dir + 'train/', batch_size=ppb, transfer=True)
    val_loader = DataLoader(utils.data_dir + 'val/', batch_size=ppb, transfer=True)

    i = 0  # global batch number
    train_losses, val_losses = [], []
    prev_val_loss = float('inf')
    consec_increases = 0

    print("printing: loss, accuracy, AUC:\n-----------------------------")

    for e in range(epochs):
        # NOTE(review): zip() ends an epoch when the shorter loader is exhausted,
        # so training sees at most as many batches as the validation loader yields.
        for train_batch, val_batch in zip(train_loader, val_loader):

            # train on one patch
            d_pooled_t, l_pooled_t = _pooled_patch(model, train_batch)
            train_loss = model_t.train_on_batch(d_pooled_t, l_pooled_t)
            train_losses.append(train_loss)

            # periodic validation + early stopping
            if i % val_every == 0:
                d_pooled_v, l_pooled_v = _pooled_patch(model, val_batch)
                val_loss = model_t.test_on_batch(d_pooled_v, l_pooled_v)
                val_losses.append(val_loss)
                print("iter:", i, "train:", train_loss, "val:", val_loss)

                # element 0 of the returned metrics is the loss
                if val_loss[0] > prev_val_loss:
                    consec_increases += 1
                else:
                    consec_increases = 0
                prev_val_loss = val_loss[0]

                if consec_increases >= patience:
                    print('Stopping early due to validation loss increase')
                    model_t.save(utils.model_dir + "transfer_earlystop_epoch{}.h5".format(e))
                    return train_losses, val_losses

            i += 1

        # one checkpoint per finished epoch
        model_t.save(utils.model_dir + "transfer_epoch%s.h5" % e)

    return train_losses, val_losses


In [84]:
# Run the transfer-learning loop; the two lists hold the metric values returned by
# train_on_batch / test_on_batch (printed as loss, accuracy, AUC).
train_losses, val_losses = train_transfer(model, model_t)

printing: loss, accuracy, AUC:
-----------------------------
iter: 0 train: [15.942385, 0.0, 0.3958333] val: [16.118095, 0.0, 0.3611111]
iter: 10 train: [1.1920933e-07, 1.0, 0.43333328] val: [1.1920933e-07, 1.0, 0.44374996]
iter: 20 train: [1.1920933e-07, 1.0, 0.5064935] val: [1.8232073e-05, 1.0, 0.530303]


KeyboardInterrupt: 

In [None]:

#     # cute printout for sanity (~27,000 train)
#     if (i > 0) and ((i+1) % print_every == 0):
#         print("%i patches complete" % ((i+1)*ppb))


# train_names = ['train_loss', 'train_acc', "train_auc"]
# val_names = ['val_loss', 'val_acc', "val_auc"]