# Neural Image Compression # 

Using local Data to test functionality with a few WSIs


## Imports ##

In [4]:
# Import NIC to python path
import sys
nic_dir = '/mnt/netcache/pathology/projects/pathology-weakly-supervised-lung-cancer-growth-pattern-prediction/code/neural-image-compression-private'
sys.path.append(nic_dir +'/source')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from tqdm import tqdm
import os, shutil
from os.path import join, dirname, exists
import keras
from gradcam_wsi import gradcam_on_dataset
from preprocessing import data_to_csv, create_csv, generate_csv_files

## Data ##

To demonstrate the functionality of NIC, we will need a set of whole-slide images (WSIs) with their respective slide-level labels. In this case, we will use the WSIs that can be found using the following pattern:

These data were already reorganized; that is, all the TIFF files are contained in one folder per class. 

These are a small version of the TCGA dataset:

`/mnt/netcache/pathology/projects/data/tcga_luad/images_diagnostic`

`/mnt/netcache/pathology/projects/data/tcga_lusc/images_diagnostic`

We are only going to use the **diagnostic** data, not the **tissue** data. The masks are already given, but we will have to implement a script to create these masks, which filter out the background.


Because there is no slide-level CSV file, we have to create one; it will be created once we have the featurized WSIs. The file should be located at:

`/mnt/netcache/pathology/projects/data/slide_original_list_tcga.csv`


In [15]:
# Central path configuration: project/data roots, pipeline folders and CSV lists

# Project root and raw-data archive
root_dir = r'/mnt/netcache/pathology/projects/pathology-weakly-supervised-lung-cancer-growth-pattern-prediction'
data_dir = r'/mnt/netcache/pathology/archives/lung'

# Whole-slide images and their tissue masks, one sub-folder per cohort (LUAD first, LUSC second)
dir_luad_wsi, dir_lusc_wsi = (
    join(data_dir, cohort, 'wsi_diagnostic_tif') for cohort in ('TCGA_LUAD', 'TCGA_LUSC')
)
dir_luad_wsi_mask, dir_lusc_wsi_mask = (
    join(data_dir, cohort, 'tissue_masks_diagnostic') for cohort in ('TCGA_LUAD', 'TCGA_LUSC')
)

# Intermediate outputs of the compression pipeline (vectorized patches, then featurized images)
vectorized_luad_dir, vectorized_lusc_dir = (
    join(root_dir, 'results', cohort, 'vectorized') for cohort in ('tcga_luad', 'tcga_lusc')
)
featurized_luad_dir, featurized_lusc_dir = (
    join(root_dir, 'results', cohort, 'featurized', 'no_augmentations') for cohort in ('tcga_luad', 'tcga_lusc')
)

# Where the trained model and its GradCAM maps are written
result_dir = join(root_dir, 'results', 'baseline_model_full_data')
gradcam_dir = join(result_dir, 'gradcam')

# Pretrained encoder; note this path is relative to the current working directory
model_path = './neural-image-compression-private/models/encoders_patches_pathology/encoder_bigan.h5'

# Slide lists for each data split
csv_train = join(root_dir, 'data', 'train_slide_list_tcga.csv')
csv_val = join(root_dir, 'data', 'validation_slide_list_tcga.csv')
csv_test = join(root_dir, 'data', 'test_slide_list_tcga.csv')

# Slide lists for the original and the compressed whole-slide images
csv_path_wsi = join(root_dir, 'data/slide_original_list_tcga.csv')
csv_path_compressed_wsi = join(root_dir, 'data', 'slide_compressed_list_tcga.csv')

# Local scratch folder for copies of files during I/O operations (useful in the cluster)
cache_dir = '/home/user/'

## 0. Preprocessing

We need to create a CSV file with the file names and labels. In addition, we need to create CSV files with the split file names for training, validation and testing. 

These CSV files must stay fixed, so no changes can be made later. For this reason, it is a good idea to create a CSV containing the partitions.

In [16]:
# Export the slide-level list of the original .tif files (LUAD and LUSC folders)
print('Creating main csv data files from original data...')
create_csv(dir_luad_wsi, dir_lusc_wsi, csv_path_wsi, '.tif')

# Reload the exported file as a sanity check on its shape
df = pd.read_csv(filepath_or_buffer=csv_path_wsi)
print(f'Files were read with shapes: {df.shape}')

100%|██████████| 531/531 [00:00<00:00, 181632.31it/s]
100%|██████████| 506/506 [00:00<00:00, 163531.96it/s]

Creating main csv data files from original data...
Csv file sucessfully exported!
Files were read with shapes: (1037, 2)





In [17]:
# Fix the NumPy seed before the split CSV files are generated
np.random.seed(7)

# Folder and CSV look-ups, grouped by role
featurized_dir = dict(data_dir_luad=featurized_luad_dir, data_dir_lusc=featurized_lusc_dir)
csv_path = dict(csv_train=csv_train, csv_val=csv_val, csv_test=csv_test)

# List every compressed (featurized) slide in a single CSV
print('Creating compressed wsi csv file ...')
create_csv(featurized_luad_dir, featurized_lusc_dir, csv_path_compressed_wsi)

# Derive the train/validation/test lists from that CSV
print('Creating split train/validation/test csv files ...')
generate_csv_files(
    csv_path_compressed_wsi,
    csv_train,
    csv_val,
    csv_test,
    test_size=0.2,
    validation_size=0.3,
)

# Reload the three lists to check their shapes
df_train, df_val, df_test = (pd.read_csv(path) for path in (csv_train, csv_val, csv_test))
print(f'Files were read with shapes: Training: {df_train.shape}, Validation {df_val.shape}, Testing {df_test.shape}')
print(f'Total files: Files were read with shapes: {df_train.shape[0]+df_val.shape[0]+df_test.shape[0]}')

100%|██████████| 1593/1593 [00:00<00:00, 409357.08it/s]

Creating compressed wsi csv file ...



100%|██████████| 1518/1518 [00:00<00:00, 305501.34it/s]


Csv file sucessfully exported!
Creating split train/validation/test csv files ...
Train/validation/test csv files sucessfully exported!
Files were read with shapes: Training: (580, 2), Validation (249, 2), Testing (208, 2)
Total files: Files were read with shapes: 1037


In [18]:
# Tag every row with the split it belongs to (constant column per frame)
df_train['partition'], df_val['partition'], df_test['partition'] = 'training', 'validation', 'testing'


In [21]:
# Stack the three tagged splits and store them as one CSV without the index column
pd.concat([df_train, df_val, df_test]).to_csv(
    path_or_buf=join(root_dir, 'data', 'data.csv'),
    index=False,
)

## 1. Encoder network ##

To perform NIC, we will need an encoder network to transform small image patches into embedding vectors. According to the paper, BiGAN produces the best unsupervised encoder and it is the one we will train here.

Alternatively, a collection of pretrained encoders (the one used in the NIC paper) can be found in 

`./models/encoders_patches_pathology/*.h5`

Remember that these pretrained encoders accept 128x128x3 patches taken at 0.5 um/px resolution (often level 1), except for the BiGAN model that takes 64x64x3 at 1 um/px (often level 2).



In order to train the BiGAN model, we will first extract patches from the slides in the `encoder` partition. We will sample 10K patches per slide, producing ~260K patches in total. We select 96x96 patches to perform crop augmentation during training later.

In [4]:
# # Don't run this now; we will train the encoder later.

# from source.extract_patches import create_patch_dataset

# patches_npy_path = join(root_dir, 'results', 'patches', 'training.npy')

# # Extracts patches from whole-slide images and store them in a numpy array file
# create_patch_dataset(
#     input_dir=slide_dir,
#     csv_path=csv_path,
#     partition_tag='encoder',
#     output_path=patches_npy_path,
#     image_level=2,
#     patch_size=96,
#     n_patches_per_image=10000,
#     cache_dir=join(cache_dir, 'patches')
# )

Once we have extracted the patches, we can proceed to train the BiGAN model. We will use the hyper-parameters described in the NIC paper. 

In [5]:
# from source.train_bigan_model import BiganModel

# model_bigan_dir = join(root_dir, 'results', 'encoders', 'bigan', 'rotterdam1_96_noaug', '0.0001')

# # Trains BiGAN
# bigan = BiganModel(
#     latent_dim=128,
#     n_filters=128,
#     lr=0.0001,
#     patch_size=64,
# )
# bigan.train(
#     x_path=patches_npy_path,
#     output_dir=model_bigan_dir,
#     epochs=400000,
#     batch_size=64,
#     sample_interval=1000,
#     save_models_on_epoch=True
# )

Beware that training this model is highly unstable, thus it can fail or collapse with ease. If this happens, restart the training. Selecting a checkpoint model is a manual procedure: check the generated images and loss values and avoid abnormal results. 

## 2. Compress images ##

Once we have a trained encoder, we can proceed with the WSI compression. I recommend running several `IDLE` instances of the following code in the cluster to speed up the lengthy process.

Before the actual compression, we need to vectorize the WSIs. This process extracts all non-background patches from the slide and stores them in numpy array format for quick access. In this case, we will read 64x64 patches at 1 um/px resolution (level 2).

In [6]:
# Vectorize the LUAD slides using their tissue masks
from vectorize_wsi import vectorize_images

# NOTE(review): the notebook text describes 64x64 patches, while patch_size=128 is
# passed here - confirm which size matches the encoder used for featurization.
vectorize_images(
    image_level=2,
    patch_size=128,
    input_dir=dir_luad_wsi,
    mask_dir=dir_luad_wsi_mask,
    output_dir=vectorized_luad_dir,
    cache_dir=cache_dir,
)

Already existing file TCGA-05-4244-01Z-00-DX1 - 530 images left
Already existing file TCGA-05-4245-01Z-00-DX1 - 529 images left
Already existing file TCGA-05-4249-01Z-00-DX1 - 528 images left
Already existing file TCGA-05-4250-01Z-00-DX1 - 527 images left
Already existing file TCGA-05-4382-01Z-00-DX1 - 526 images left
Already existing file TCGA-05-4395-01Z-00-DX1 - 525 images left
Already existing file TCGA-05-4396-01Z-00-DX1 - 524 images left
Already existing file TCGA-05-4397-01Z-00-DX1 - 523 images left
Already existing file TCGA-05-4398-01Z-00-DX1 - 522 images left
Already existing file TCGA-05-4402-01Z-00-DX1 - 521 images left
Already existing file TCGA-05-4403-01Z-00-DX1 - 520 images left
Already existing file TCGA-05-4405-01Z-00-DX1 - 519 images left
Already existing file TCGA-05-4415-01Z-00-DX1 - 518 images left
Already existing file TCGA-05-4417-01Z-00-DX1 - 517 images left
Already existing file TCGA-05-4418-01Z-00-DX1 - 516 images left
Already existing file TCGA-05-4420-01Z-0

Already existing file TCGA-55-6971-01Z-00-DX1 - 322 images left
Already existing file TCGA-55-6972-01Z-00-DX1 - 321 images left
Already existing file TCGA-55-6975-01Z-00-DX1 - 320 images left
Already existing file TCGA-55-6978-01Z-00-DX1 - 319 images left
Already existing file TCGA-55-6979-01Z-00-DX1 - 318 images left
Already existing file TCGA-55-6980-01Z-00-DX1 - 317 images left
Already existing file TCGA-55-6981-01Z-00-DX1 - 316 images left
Already existing file TCGA-55-6982-01Z-00-DX1 - 315 images left
Already existing file TCGA-55-6983-01Z-00-DX1 - 314 images left
Already existing file TCGA-55-6984-01Z-00-DX1 - 313 images left
Already existing file TCGA-55-6985-01Z-00-DX1 - 312 images left
Already existing file TCGA-55-6986-01Z-00-DX1 - 311 images left
Already existing file TCGA-55-6987-01Z-00-DX1 - 310 images left
Already existing file TCGA-55-7227-01Z-00-DX1 - 309 images left
Already existing file TCGA-55-7281-01Z-00-DX1 - 308 images left
Already existing file TCGA-55-7283-01Z-0

Already existing file TCGA-86-A4JF-01Z-00-DX1 - 101 images left
Already existing file TCGA-86-A4P7-01Z-00-DX1 - 100 images left
Already existing file TCGA-86-A4P8-01Z-00-DX1 - 99 images left
Already existing file TCGA-91-A4BC-01Z-00-DX1 - 98 images left
Already existing file TCGA-91-A4BD-01Z-00-DX1 - 97 images left
Already existing file TCGA-93-7347-01Z-00-DX1 - 96 images left
Already existing file TCGA-93-7348-01Z-00-DX1 - 95 images left
Already existing file TCGA-93-8067-01Z-00-DX1 - 94 images left
Already existing file TCGA-93-A4JN-01Z-00-DX1 - 93 images left
Already existing file TCGA-93-A4JO-01Z-00-DX1 - 92 images left
Already existing file TCGA-93-A4JP-01Z-00-DX1 - 91 images left
Already existing file TCGA-93-A4JQ-01Z-00-DX1 - 90 images left
Already existing file TCGA-95-7039-01Z-00-DX1 - 89 images left
Already existing file TCGA-95-7043-01Z-00-DX1 - 88 images left
Already existing file TCGA-95-7562-01Z-00-DX1 - 87 images left
Already existing file TCGA-95-7567-01Z-00-DX1 - 86 im

In [7]:
# Vectorize the LUSC slides using their tissue masks (same settings as for LUAD)

# NOTE(review): the notebook text describes 64x64 patches, while patch_size=128 is
# passed here - confirm which size matches the encoder used for featurization.
vectorize_images(
    image_level=2,
    patch_size=128,
    input_dir=dir_lusc_wsi,
    mask_dir=dir_lusc_wsi_mask,
    output_dir=vectorized_lusc_dir,
    cache_dir=cache_dir,
)

Already existing file TCGA-18-3406-01Z-00-DX1 - 505 images left
Already existing file TCGA-18-3407-01Z-00-DX1 - 504 images left
Already existing file TCGA-18-3408-01Z-00-DX1 - 503 images left
Already existing file TCGA-18-3409-01Z-00-DX1 - 502 images left
Already existing file TCGA-18-3410-01Z-00-DX1 - 501 images left
Already existing file TCGA-18-3411-01Z-00-DX1 - 500 images left
Already existing file TCGA-18-3412-01Z-00-DX1 - 499 images left
Already existing file TCGA-18-3414-01Z-00-DX1 - 498 images left
Already existing file TCGA-18-3415-01Z-00-DX1 - 497 images left
Already existing file TCGA-18-3416-01Z-00-DX1 - 496 images left
Already existing file TCGA-18-3417-01Z-00-DX1 - 495 images left
Already existing file TCGA-18-3419-01Z-00-DX1 - 494 images left
Already existing file TCGA-18-3421-01Z-00-DX1 - 493 images left
Already existing file TCGA-18-4083-01Z-00-DX1 - 492 images left
Already existing file TCGA-18-4086-01Z-00-DX1 - 491 images left
Already existing file TCGA-18-4721-01Z-0

Already existing file TCGA-56-7222-01Z-00-DX1 - 299 images left
Already existing file TCGA-56-7223-01Z-00-DX1 - 298 images left
Already existing file TCGA-56-7579-01Z-00-DX1 - 297 images left
Already existing file TCGA-56-7580-01Z-00-DX1 - 296 images left
Already existing file TCGA-56-7582-01Z-00-DX1 - 295 images left
Already existing file TCGA-56-7730-01Z-00-DX1 - 294 images left
Already existing file TCGA-56-7731-01Z-00-DX1 - 293 images left
Already existing file TCGA-56-7822-01Z-00-DX1 - 292 images left
Already existing file TCGA-56-7823-01Z-00-DX1 - 291 images left
Already existing file TCGA-56-8082-01Z-00-DX1 - 290 images left
Already existing file TCGA-56-8083-01Z-00-DX1 - 289 images left
Already existing file TCGA-56-8201-01Z-00-DX1 - 288 images left
Already existing file TCGA-56-8304-01Z-00-DX1 - 287 images left
Already existing file TCGA-56-8305-01Z-00-DX1 - 286 images left
Already existing file TCGA-56-8307-01Z-00-DX1 - 285 images left
Already existing file TCGA-56-8308-01Z-0

Already existing file TCGA-85-8580-01Z-00-DX1 - 90 images left
Already existing file TCGA-85-8582-01Z-00-DX1 - 89 images left
Already existing file TCGA-85-8584-01Z-00-DX1 - 88 images left
Already existing file TCGA-85-8664-01Z-00-DX1 - 87 images left
Already existing file TCGA-85-8666-01Z-00-DX1 - 86 images left
Already existing file TCGA-85-A4CL-01Z-00-DX1 - 85 images left
Already existing file TCGA-85-A4CN-01Z-00-DX1 - 84 images left
Already existing file TCGA-85-A4JB-01Z-00-DX1 - 83 images left
Already existing file TCGA-85-A4JC-01Z-00-DX1 - 82 images left
Already existing file TCGA-85-A4PA-01Z-00-DX1 - 81 images left
Already existing file TCGA-85-A4QQ-01Z-00-DX1 - 80 images left
Already existing file TCGA-85-A4QR-01Z-00-DX1 - 79 images left
Already existing file TCGA-85-A50M-01Z-00-DX1 - 78 images left
Already existing file TCGA-85-A50Z-01Z-00-DX1 - 77 images left
Already existing file TCGA-85-A510-01Z-00-DX1 - 76 images left
Already existing file TCGA-85-A511-01Z-00-DX1 - 75 imag

Now we can compress the WSIs. Each WSI (vectorized file) will be processed 8 times due to WSI-level augmentation (rotation and flip). We will use an existing pretrained encoder from the NIC paper.

In [8]:
# Compress (featurize) the vectorized slides with the pretrained encoder
from featurize_wsi import featurize_images

# LUAD cohort
featurize_images(
    model_path=model_path,
    batch_size=32,
    input_dir=vectorized_luad_dir,
    output_dir=featurized_luad_dir,
)

Already existing file TCGA-05-4244-01Z-00-DX1_{item} - 530 images left
Already existing file TCGA-05-4245-01Z-00-DX1_{item} - 529 images left
Already existing file TCGA-05-4249-01Z-00-DX1_{item} - 528 images left
Already existing file TCGA-05-4250-01Z-00-DX1_{item} - 527 images left
Already existing file TCGA-05-4382-01Z-00-DX1_{item} - 526 images left
Already existing file TCGA-05-4395-01Z-00-DX1_{item} - 525 images left
Already existing file TCGA-05-4396-01Z-00-DX1_{item} - 524 images left
Already existing file TCGA-05-4397-01Z-00-DX1_{item} - 523 images left
Already existing file TCGA-05-4398-01Z-00-DX1_{item} - 522 images left
Already existing file TCGA-05-4402-01Z-00-DX1_{item} - 521 images left
Already existing file TCGA-05-4403-01Z-00-DX1_{item} - 520 images left
Already existing file TCGA-05-4405-01Z-00-DX1_{item} - 519 images left
Already existing file TCGA-05-4415-01Z-00-DX1_{item} - 518 images left
Already existing file TCGA-05-4417-01Z-00-DX1_{item} - 517 images left
Alread

Already existing file TCGA-50-6673-01Z-00-DX1_{item} - 341 images left
Already existing file TCGA-50-7109-01Z-00-DX1_{item} - 340 images left
Already existing file TCGA-50-8457-01Z-00-DX1_{item} - 339 images left
Already existing file TCGA-50-8459-01Z-00-DX1_{item} - 338 images left
Already existing file TCGA-53-7624-01Z-00-DX1_{item} - 337 images left
Already existing file TCGA-53-7626-01Z-00-DX1_{item} - 336 images left
Already existing file TCGA-53-7813-01Z-00-DX1_{item} - 335 images left
Already existing file TCGA-53-A4EZ-01Z-00-DX1_{item} - 334 images left
Already existing file TCGA-55-1592-01Z-00-DX1_{item} - 333 images left
Already existing file TCGA-55-1594-01Z-00-DX1_{item} - 332 images left
Already existing file TCGA-55-1595-01Z-00-DX1_{item} - 331 images left
Already existing file TCGA-55-1596-01Z-00-DX1_{item} - 330 images left
Already existing file TCGA-55-5899-01Z-00-DX1_{item} - 329 images left
Already existing file TCGA-55-6543-01Z-00-DX1_{item} - 328 images left
Alread

Already existing file TCGA-78-7153-01Z-00-DX1_{item} - 160 images left
Already existing file TCGA-78-7154-01Z-00-DX1_{item} - 159 images left
Already existing file TCGA-78-7155-01Z-00-DX1_{item} - 158 images left
Already existing file TCGA-78-7156-01Z-00-DX1_{item} - 157 images left
Already existing file TCGA-78-7158-01Z-00-DX1_{item} - 156 images left
Already existing file TCGA-78-7159-01Z-00-DX1_{item} - 155 images left
Already existing file TCGA-78-7160-01Z-00-DX1_{item} - 154 images left
Already existing file TCGA-78-7161-01Z-00-DX1_{item} - 153 images left
Already existing file TCGA-78-7163-01Z-00-DX1_{item} - 152 images left
Already existing file TCGA-78-7166-01Z-00-DX1_{item} - 151 images left
Already existing file TCGA-78-7167-01Z-00-DX1_{item} - 150 images left
Already existing file TCGA-78-7220-01Z-00-DX1_{item} - 149 images left
Already existing file TCGA-78-7535-01Z-00-DX1_{item} - 148 images left
Already existing file TCGA-78-7536-01Z-00-DX1_{item} - 147 images left
Alread

In [9]:
# LUSC cohort: same encoder and batch size as for LUAD
featurize_images(
    model_path=model_path,
    batch_size=32,
    input_dir=vectorized_lusc_dir,
    output_dir=featurized_lusc_dir,
)

Already existing file TCGA-18-3406-01Z-00-DX1_{item} - 505 images left
Already existing file TCGA-18-3407-01Z-00-DX1_{item} - 504 images left
Already existing file TCGA-18-3408-01Z-00-DX1_{item} - 503 images left
Already existing file TCGA-18-3409-01Z-00-DX1_{item} - 502 images left
Already existing file TCGA-18-3410-01Z-00-DX1_{item} - 501 images left
Already existing file TCGA-18-3411-01Z-00-DX1_{item} - 500 images left
Already existing file TCGA-18-3412-01Z-00-DX1_{item} - 499 images left
Already existing file TCGA-18-3414-01Z-00-DX1_{item} - 498 images left
Already existing file TCGA-18-3415-01Z-00-DX1_{item} - 497 images left
Already existing file TCGA-18-3416-01Z-00-DX1_{item} - 496 images left
Already existing file TCGA-18-3417-01Z-00-DX1_{item} - 495 images left
Already existing file TCGA-18-3419-01Z-00-DX1_{item} - 494 images left
Already existing file TCGA-18-3421-01Z-00-DX1_{item} - 493 images left
Already existing file TCGA-18-4083-01Z-00-DX1_{item} - 492 images left
Alread

Already existing file TCGA-56-7222-01Z-00-DX1_{item} - 299 images left
Already existing file TCGA-56-7223-01Z-00-DX1_{item} - 298 images left
Already existing file TCGA-56-7579-01Z-00-DX1_{item} - 297 images left
Already existing file TCGA-56-7580-01Z-00-DX1_{item} - 296 images left
Already existing file TCGA-56-7582-01Z-00-DX1_{item} - 295 images left
Already existing file TCGA-56-7730-01Z-00-DX1_{item} - 294 images left
Already existing file TCGA-56-7731-01Z-00-DX1_{item} - 293 images left
Already existing file TCGA-56-7822-01Z-00-DX1_{item} - 292 images left
Already existing file TCGA-56-7823-01Z-00-DX1_{item} - 291 images left
Already existing file TCGA-56-8082-01Z-00-DX1_{item} - 290 images left
Already existing file TCGA-56-8083-01Z-00-DX1_{item} - 289 images left
Already existing file TCGA-56-8201-01Z-00-DX1_{item} - 288 images left
Already existing file TCGA-56-8304-01Z-00-DX1_{item} - 287 images left
Already existing file TCGA-56-8305-01Z-00-DX1_{item} - 286 images left
Alread

Already existing file TCGA-85-6798-01Z-00-DX1_{item} - 117 images left
Already existing file TCGA-85-7696-01Z-00-DX1_{item} - 116 images left
Already existing file TCGA-85-7697-01Z-00-DX1_{item} - 115 images left
Already existing file TCGA-85-7698-01Z-00-DX1_{item} - 114 images left
Already existing file TCGA-85-7699-01Z-00-DX1_{item} - 113 images left
Already existing file TCGA-85-7710-01Z-00-DX1_{item} - 112 images left
Already existing file TCGA-85-7843-01Z-00-DX1_{item} - 111 images left
Already existing file TCGA-85-7844-01Z-00-DX1_{item} - 110 images left
Already existing file TCGA-85-7950-01Z-00-DX1_{item} - 109 images left
Already existing file TCGA-85-8048-01Z-00-DX1_{item} - 108 images left
Already existing file TCGA-85-8049-01Z-00-DX1_{item} - 107 images left
Already existing file TCGA-85-8052-01Z-00-DX1_{item} - 106 images left
Already existing file TCGA-85-8070-01Z-00-DX1_{item} - 105 images left
Already existing file TCGA-85-8071-01Z-00-DX1_{item} - 104 images left
Alread

## 3. Train CNN on compressed images ##

Once we have compressed the WSIs, we can proceed with the CNN classifier. In this example, we will train a classifier targeting the binary label `HGP_SL` found in the CSV file. We will be training 4 models using cross-validation: in each fold, we will use 2 data partitions for training, 1 for validation and 1 for testing. At the end of model training, we perform inference on the test set, compute metrics, and run GradCAM on the images.



In [11]:
from model_training import train_wsi_classifier, eval_model, compute_metrics
from utils import check_file_exists

def train_model(featurized_dir, csv_path, fold_n, output_dir, cache_dir, batch_size=16, epochs=32,
                images_dir=None, vectorized_dir=None, lr=1e-2, patience=4, delete_folder=False,
                occlusion_augmentation=False, elastic_augmentation=False, shuffle_augmentation=None):
    """
    Trains a CNN using compressed whole-slide images, then evaluates it on the test list.

    The three stages run in order: training (``train_wsi_classifier``), inference on the
    test slides (``eval_model``) and metric computation (``compute_metrics``). A failure
    while computing metrics is reported and does not abort the call.

    :param featurized_dir: dict with the folders containing the compressed (featurized) images.
        It is passed to the trainer as ``data_dir`` and copied, never modified, to build the
        evaluation config.
    :param csv_path: dict with the slide lists; it is passed as a whole to the trainer and its
        ``'csv_test'`` entry selects the slides used for evaluation.
    :param fold_n: not used by the current body; kept so existing calls keep working.
    :param output_dir: destination folder to store results (checkpoint, predictions, metrics).
    :param cache_dir: cache folder forwarded to the trainer. Evaluation runs with no cache.
    :param batch_size: number of samples per batch, for both training and evaluation.
    :param epochs: number of training epochs.
    :param images_dir: not used by the current body; kept so existing calls keep working.
    :param vectorized_dir: not used by the current body; kept so existing calls keep working.
    :param lr: learning rate forwarded to the trainer.
    :param patience: patience value forwarded to the trainer.
    :param delete_folder: if True, ``output_dir`` is removed (with its content) before training.
    :param occlusion_augmentation: forwarded to the trainer.
    :param elastic_augmentation: forwarded to the trainer.
    :param shuffle_augmentation: forwarded to the trainer.
    :return: nothing.
    """

    # Start from a clean destination folder. The folder removed is the one this call
    # writes to (output_dir), not whatever notebook-level variable happens to exist.
    if delete_folder and exists(output_dir):
        shutil.rmtree(output_dir)

    # Train CNN
    train_wsi_classifier(
        data_dir=featurized_dir,
        csv_path=csv_path,
        partitions=None,
        crop_size=400,
        output_dir=output_dir,
        output_units=2,
        cache_dir=cache_dir,
        n_epochs=epochs,
        batch_size=batch_size,
        lr=lr,
        code_size=128,
        workers=1,
        train_step_multiplier=1,
        val_step_multiplier=0.5,
        keep_data_training=1,
        keep_data_validation=1,
        patience=patience,
        occlusion_augmentation=occlusion_augmentation,
        elastic_augmentation=elastic_augmentation,
        shuffle_augmentation=shuffle_augmentation
    )

    # Evaluate CNN

    # Evaluation config = featurized folders + test CSV. Work on a copy so the caller's
    # dict does not silently gain a 'csv_path' key that would leak into later calls.
    data_config = dict(featurized_dir)
    data_config['csv_path'] = csv_path['csv_test']

    eval_dir = join(output_dir, 'eval')
    preds_path = join(eval_dir, 'preds.csv')

    eval_model(
        model_path=join(output_dir, 'checkpoint.h5'),
        data_config=data_config,
        crop_size=400,
        output_path=preds_path,
        cache_dir=None,
        batch_size=batch_size,
        keep_data=1
    )

    # Metrics (best effort: a failure here is reported but must not discard the trained model)
    try:
        compute_metrics(
            input_path=preds_path,
            output_dir=eval_dir
        )
    except Exception as e:
        print('Failed to compute metrics. Exception: {e}'.format(e=e), flush=True)

#     # Apply GradCAM analysis to CNN
#     gradcam_on_dataset(
#         featurized_dir=featurized_dir,
#         csv_path=csv_path,
#         model_path=join(output_dir, 'checkpoint.h5'),
#         partitions=folds[fold_n]['test'],
#         layer_name='separable_conv2d_1',
#         output_unit=1,
#         custom_objects=None,
#         cache_dir=cache_dir,
#         images_dir=images_dir,
#         vectorized_dir=vectorized_dir
#     )

In [None]:
# Train and evaluate the classifier on the featurized slides

# Fresh look-up dicts for this run
featurized_dir = dict(data_dir_luad=featurized_luad_dir, data_dir_lusc=featurized_lusc_dir)
csv_path = dict(csv_train=csv_train, csv_val=csv_val, csv_test=csv_test)

train_model(
    # data and destination
    featurized_dir=featurized_dir,
    csv_path=csv_path,
    fold_n=0,
    output_dir=result_dir,
    cache_dir=None,
    delete_folder=True,
    # optimisation settings
    batch_size=4,
    epochs=2,
    lr=1e-2,
    patience=4,
    # augmentations
    occlusion_augmentation=False,
    elastic_augmentation=False,
    shuffle_augmentation=None,
    # only required for GradCAM
    images_dir=None,
    vectorized_dir=None,
)

Loading training set ...
FeaturizedWsiGenerator data config: {'data_dir_luad': '/mnt/netcache/pathology/projects/pathology-weakly-supervised-lung-cancer-growth-pattern-prediction/results/tcga_luad/featurized/no_augmentations', 'data_dir_lusc': '/mnt/netcache/pathology/projects/pathology-weakly-supervised-lung-cancer-growth-pattern-prediction/results/tcga_lusc/featurized/no_augmentations', 'csv_path': '/mnt/netcache/pathology/archives/lung/train_slide_list_tcga.csv'}
FeaturizedWsiGenerator using 580 samples and 145 batches, distributed in 306 positive and 274 negative samples.
Loading validation set ...
FeaturizedWsiSequence data config: {'data_dir_luad': '/mnt/netcache/pathology/projects/pathology-weakly-supervised-lung-cancer-growth-pattern-prediction/results/tcga_luad/featurized/no_augmentations', 'data_dir_lusc': '/mnt/netcache/pathology/projects/pathology-weakly-supervised-lung-cancer-growth-pattern-prediction/results/tcga_lusc/featurized/no_augmentations', 'csv_path': '/mnt/netcac

## Applying GradCAM

Here we get the folder separately

In [None]:
# Write the list of featurized LUAD slides to a CSV and display it
print('GradCam will be apply to this dataset!')
csv_path_luad_feat = os.path.join(data_dir, 'slide_list_featurized_luad.csv')
data_to_csv(featurized_luad_dir, csv_path_luad_feat)
pd.read_csv(filepath_or_buffer=csv_path_luad_feat)

In [None]:
# Run GradCAM with the trained checkpoint on the LUAD slide list (layer number 1).
# Hint: layer names can be listed with `for layer in model.layers: print(layer.name)`.
gradcam_on_dataset(
    # model and layer
    model_path=join(result_dir, 'checkpoint.h5'),
    layer_number=1,
    custom_objects=None,
    predict_two_output=True,
    # data
    data_dir=[featurized_luad_dir, featurized_lusc_dir],
    csv_path=csv_path_luad_feat,
    partitions=0,
    images_dir=dir_luad_wsi,
    vectorized_dir=vectorized_luad_dir,
    # destination and cache
    output_dir=gradcam_dir,
    cache_dir=cache_dir,
)

In [None]:
def plot_gradcam(gradcam_images):
    """
    Shows GradCAM images in a 3x5 grid.

    At most 15 images (the first ones in the list) are drawn. If fewer are given, the
    remaining panels are left blank instead of raising an IndexError.

    :param gradcam_images: list of image file paths to display.
    :return: nothing.
    """
    rows, columns = 3, 5
    fig, axs = plt.subplots(rows, columns, figsize=(20, 8))
    axs = axs.ravel()

    for idx, ax in enumerate(axs):
        # No image left for this panel: blank it and move on
        if idx >= len(gradcam_images):
            ax.axis('off')
            continue

        image_path = gradcam_images[idx]
        img = plt.imread(image_path)
        if idx == 0:
            print(f'Images shape: {img.shape}')
        ax.imshow(img)

        # Title = first and last '_'-separated parts of the file name
        name_parts = os.path.basename(image_path).split('_')
        ax.set_title(name_parts[0] + '_' + name_parts[-1])

        # Hide ticks and axis labels; only the image and its title are shown
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)

In [None]:
# Collect the GradCAM .png files (as full paths, in sorted order) and plot them
gradcam_images = [
    os.path.join(gradcam_dir, file_name)
    for file_name in os.listdir(gradcam_dir)
    if file_name.endswith('.png')
]
gradcam_images.sort()

plot_gradcam(gradcam_images)