**Inference Notebook**

Due to computation time limit on the final submission notebook, inference and training notebooks have been seperated. 

This notebook makes inference using the trained model and calculates the Cohen's Kappa which is the figure of merit for this competition.

In [1]:
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import tensorflow as tf
import pathlib,os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import skimage.io
from sklearn.model_selection import StratifiedKFold
import time
import albumentations
import tensorflow_addons as tfa
import tensorflow_hub as hub
from tensorflow.keras import layers,models

AUTOTUNE = tf.data.experimental.AUTOTUNE

import random as python_random
np.random.seed(123)
python_random.seed(123)
tf.random.set_seed(1234)
os.environ['PYTHONHASHSEED']=str(0)

#from keras import backend as K
#session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
#sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
#K.set_session(sess)

In [2]:
kaggle_data= '/data/users/rpravahan/panda'

data_dir=kaggle_data+'/train_images'
data_simple_dir='/home/emungan/panda/data_simple/images'
ds_dir = pathlib.Path(data_simple_dir)

label_dir=kaggle_data+'/train.csv'
train_labels = pd.read_csv(label_dir).set_index('image_id')


In [3]:
def read_tiff(img_path_tensor,level):      
    
    #read the image 
    img_path = img_path_tensor.numpy().decode("utf-8")     #input needs to be a tensor, skimage wants a string to iterate over
    img      = skimage.io.MultiImage(str(img_path))[level]
    im_ID    = img_path.split(os.sep)[-1].split('.')[0]

    return img

def augment(image_array):
    
    # Augmentations    
    op_train = albumentations.Compose([
        albumentations.VerticalFlip(p=0.5),
        albumentations.HorizontalFlip(p=0.5),
        albumentations.Rotate(limit=90,border_mode=4,p=0.5),    #mode 1 wraps around
        #albumentations.ElasticTransform(alpha=1, sigma=50, alpha_affine=50,p=0.5),
        albumentations.HueSaturationValue(hue_shift_limit=(0,20), sat_shift_limit=0, val_shift_limit=0,p=0.5),
        #albumentations.RGBShift(r_shift_limit=20, g_shift_limit=20, b_shift_limit=20,p=0.5),
        albumentations.GaussianBlur(blur_limit=3,p=0.25)
    ])  
    
    return op_train(image=image_array)['image']        


def tile_tiff(img,level,n_tiles):
    # get the patches with tissue    
    
    if (level==1): tile_size=256                          # tile size depends on the downsampling of the level
    elif (level == 2): tile_size =128
    else: raise Exception("level is not 1 or 2")
    
    mode=0
    sub_imgs=False
    is_rand=False
    
    tiles = []
    h, w, c = img.shape
    pad_h = (tile_size - h % tile_size) % tile_size + ((tile_size * mode) // 2)
    pad_w = (tile_size - w % tile_size) % tile_size + ((tile_size * mode) // 2)

    img2 = np.pad(img,[[pad_h // 2, pad_h - pad_h // 2], [pad_w // 2,pad_w - pad_w//2], [0,0]], constant_values=255)
    img3 = img2.reshape(img2.shape[0] // tile_size,tile_size,img2.shape[1] // tile_size,tile_size, 3)
    
    img3 = img3.transpose(0,2,1,3,4).reshape(-1, tile_size, tile_size,3)
    n_tiles_with_info = (img3.reshape(img3.shape[0],-1).sum(1) < tile_size ** 2 * 3 * 255).sum()
    if len(img) < n_tiles:
        img3 = np.pad(img3,[[0,N-len(img3)],[0,0],[0,0],[0,0]], constant_values=255)
    idxs = np.argsort(img3.reshape(img3.shape[0],-1).sum(-1))[:n_tiles]
    img3 = img3[idxs]
    for i in range(len(img3)):
        tiles.append({'img':img3[i], 'idx':i})
    
    # create the patchwork  
    if is_rand:
        idxes = np.random.choice(list(range(n_tiles)), n_tiles, replace=False)
    else:
        idxes = list(range(n_tiles))
    idxes = np.asarray(idxes) + n_tiles if sub_imgs else idxes

    n_row_tiles = int(np.sqrt(n_tiles))
    images = np.zeros((tile_size * n_row_tiles, tile_size * n_row_tiles, 3))
    for h in range(n_row_tiles):
        for w in range(n_row_tiles):
            i = h * n_row_tiles + w
    
            if len(tiles) > idxes[i]:
               this_img = tiles[idxes[i]]['img']
            else:
                this_img = np.ones((tile_size, tile_size, 3)).astype(np.uint8) * 255
            this_img = 255 - this_img
            h1 = h * tile_size
            w1 = w * tile_size
            images[h1:h1+tile_size, w1:w1+tile_size] = this_img

    images = 255 - images
    images = images.astype(np.float32)
    images /= 255                                              # no need to normalize later
    
    return images
    
def tile_and_aug_tiff(img_path_tensor,level,aug,n_tiles):      # combining three python functions to be wrapped
    
    #read 
    img = read_tiff(img_path_tensor,level)
    
    # Augment the image
    if(aug): img= augment(img)

    # get the patches with tissue    
    images=tile_tiff(img,level,n_tiles)
    
    return images

def tile_aug_tiff(image_path,level,aug,n_tiles):  #inputs should be tensors
    
    [image,] = tf.py_function(tile_and_aug_tiff,[image_path,level,aug,n_tiles],[tf.float32])
    im_shape = image.shape
    image.set_shape(im_shape)
    
    return image  


@tf.autograph.experimental.do_not_convert   # ow gives out a lot of warnings..
def create_batches(path_list, level, subset_ratio, batch_size,n_tiles, tiled_input=1, debug=0, aug=0):
    
    dataset = tf.data.Dataset.from_tensor_slices(tf.convert_to_tensor(path_list,tf.string))
    if (debug): list(dataset.as_numpy_iterator())  
    
    if(tiled_input):
        batches = (
            dataset
            .cache()
            #.shuffle(len(path_list))  
            .take(int(len(path_list)*subset_ratio))
            .map(lambda x: tile_aug_tiff(x,tf.constant(level),tf.constant(aug),tf.constant(n_tiles)),num_parallel_calls=AUTOTUNE )
            .batch(batch_size)
            .prefetch(AUTOTUNE)
        )
    
    return batches   

def path_list(hold_out_DF,test_path,debug):
    
    hold_out_paths = []
    im_ID_ho = hold_out_DF.loc[:,'image_id']
    for ID in im_ID_ho:   hold_out_paths.append(os.path.join(test_path,ID+'.tiff')) # changed from data_simple to data dir
    if(debug): 
        print('holdout')
        for i in hold_out_paths[0:2]: print(i)
        print("Num of samples: ",len(hold_out_paths),"\n")   
    
    return hold_out_paths

In [4]:
def load_keras_model(model_filepath):
    model = tf.keras.models.load_model(
        model_filepath,
        custom_objects=None,
        compile=True
    )
    
    return model

In [12]:
level=2                        
batch_size=16
subset_ratio=1                 # can use a subset of the training data
tiled_input=1                  # image is tiled instead of cropped and padded
n_tiles=25
debug=1
aug=0


if(level==1):    input_dimx= int((n_tiles**0.5)*256)
elif(level==2):  input_dimx= int((n_tiles**0.5)*128)
else:            raise Exception("level is not 1 or 2")
input_dimy=input_dimx

model_filepath ='/home/emungan/panda/saved_models/KFold_fold0'

loaded_model = load_keras_model(model_filepath)

if (debug):
    hold_out_DF = pd.read_csv('{}/train.csv'.format(kaggle_data))
#    hold_out_DF=hold_out_DF[0:1000]
    test_path=data_dir
    print(test_path)
    
else:
    hold_out_DF = pd.read_csv('{}/test.csv'.format(kaggle_data))

print(hold_out_DF.shape)
pred = np.zeros((len(hold_out_DF), 6))

with tf.device('/CPU:0'):
    if os.path.exists(test_path):
        print('Test path exists: ',test_path)
        hold_out_paths = path_list(hold_out_DF,test_path,debug)
        hold_out_batch = create_batches(hold_out_paths, level, subset_ratio, batch_size,n_tiles, tiled_input, debug=debug, aug=0)
        pred = loaded_model.predict(hold_out_batch)
        print('Predict for {} images'.format(len(pred)))
    else:
        print('No test path. Predict zeros')    

if (debug): actuals=np.array(hold_out_DF['isup_grade'])

hold_out_DF['isup_grade'] = np.argmax(pred, axis = 1)
hold_out_DF.drop('data_provider', axis = 1, inplace = True)
hold_out_DF.to_csv('submission.csv', index = False)
!head /home/emungan/panda/codes/submission.csv

if (debug): train_labels[0:10]

/data/users/rpravahan/panda/train_images
(10616, 4)
Test path exists:  /data/users/rpravahan/panda/train_images
holdout
/data/users/rpravahan/panda/train_images/0005f7aaab2800f6170c399693a96917.tiff
/data/users/rpravahan/panda/train_images/000920ad0b612851f8e01bcc880d9b3d.tiff
Num of samples:  10616 

Predict for 10616 images
image_id,isup_grade,gleason_score
0005f7aaab2800f6170c399693a96917,0,0+0
000920ad0b612851f8e01bcc880d9b3d,0,0+0
0018ae58b01bdadc8e347995b69f99aa,4,4+4
001c62abd11fa4b57bf7a6c603a11bb9,5,4+4
001d865e65ef5d2579c190a0e0350d8f,0,0+0
002a4db09dad406c85505a00fb6f6144,1,0+0
003046e27c8ead3e3db155780dc5498e,0,3+3
0032bfa835ce0f43a92ae0bbab6871cb,1,3+3
003a91841da04a5a31f808fb5c21538a,1,3+3


In [13]:
if(debug):
    pred_probs= np.argmax(pred, axis = 1)
    m = tfa.metrics.CohenKappa(weightage='quadratic', num_classes=6,sparse_labels=True)
    m.update_state(actuals, pred_probs)
    print('Cohen\'s Kappa: ', m.result().numpy())

Cohen's Kappa:  0.7503367


In [15]:
if(debug): 
    confusion=tf.math.confusion_matrix(actuals, pred_probs, num_classes=6, weights=None, dtype=tf.dtypes.int32,name=None ).numpy()
    print(confusion)

[[2761   68    2   45    2   14]
 [ 796 1662  125   59    5   19]
 [ 162  538  385  202   24   32]
 [ 141  116  112  586  116  171]
 [ 213   61   44  273  354  304]
 [ 143    9    7  224  108  733]]


In [16]:
confusion_ratio= np.asarray(np.around(confusion/np.sum(confusion,axis=1),2))
print(confusion_ratio)

[[0.95 0.03 0.   0.04 0.   0.01]
 [0.28 0.62 0.09 0.05 0.   0.02]
 [0.06 0.2  0.29 0.16 0.02 0.03]
 [0.05 0.04 0.08 0.47 0.09 0.14]
 [0.07 0.02 0.03 0.22 0.28 0.25]
 [0.05 0.   0.01 0.18 0.09 0.6 ]]


In [17]:
class_prediction_acc=np.diag(confusion_ratio)
for isup in range(6):
    print("Prediction accuracy for class {} is {}".format(isup,class_prediction_acc[isup]))

Prediction accuracy for class 0 is 0.95
Prediction accuracy for class 1 is 0.62
Prediction accuracy for class 2 is 0.29
Prediction accuracy for class 3 is 0.47
Prediction accuracy for class 4 is 0.28
Prediction accuracy for class 5 is 0.6


**The model classifies the healthy samples the best. ISUP grades for 1 and 5 are classified similarly ok but it doesn't work well for grades 2 and 4 at all.**