# 20X Dataset
Downsample the 40X ROIs from the NFT detection project to 20X. Then tile the new ROIs into a dataset.

This notebook is final.

In [None]:
# Imports
import sys
sys.path.append('../..')

from pandas import read_csv, concat, DataFrame
from os.path import join, isfile
from shutil import copyfile, rmtree
from tqdm.notebook import tqdm
import cv2 as cv
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import yaml

from neurotk import imread, imwrite
from neurotk.utils import create_dirs, get_filename, im_to_txt_path
from neurotk import tile_roi_with_labels_wrapper

### Global Parameters

In [None]:
# Set to True to regenerate the cached outputs (the downsampled ROIs with
# their rois.csv, and the tiles with their tiles.csv) even when the CSV files
# already exist. With False, the cells below load the existing CSV files
# instead of recomputing them.
OVERWRITE = False

# Root directory of the 20X dataset. The ROI images / labels / boundaries,
# the tiles, the CSV tables, the train / val lists and the dataset YAML are
# all written below this location.
SAVE_DIR = '/jcDataStore/Data/NeuroTK-Dash/nft-detection'

### Downsample 40X ROIs to 20X

In [None]:
# Downsample ROIs to 20X.
#
# For every source ROI image: resize it by a factor of 0.5, save it under
# SAVE_DIR/rois/images, copy its label and boundary text files next to it and
# record the updated metadata. The resulting table is cached in
# SAVE_DIR/rois.csv and only rebuilt when OVERWRITE is set or the CSV file
# does not exist yet.
roi_csv_fp = join(SAVE_DIR, 'rois.csv')

if OVERWRITE or not isfile(roi_csv_fp):
    # Get info for ROIs used for training and validation datasets.
    src_rois_df = concat(
        [
            read_csv('/jcDataStore/Data/nft-ai-project/datasets/'
                    'model-assisted-labeling/rois.csv'),
            read_csv('/jcDataStore/Data/nft-ai-project/datasets/'
                    'model-assisted-labeling/background-rois.csv')
        ],
        ignore_index=True
    )

    # Replace filepaths to be relative to local filepaths.
    # NOTE(review): only the pattern is passed here, there is no replacement
    # value. That is not a prefix substitution, and what pandas does with
    # this call form depends on the pandas version. The intended local
    # prefix cannot be determined from this notebook -- confirm it and pass
    # it as the second argument.
    src_rois_df = src_rois_df.replace(
        '/workspace/data/jcDataStore/Data/nft-ai-project/', regex=True
    )

    # Saving new dataframe with updated paths and magnification.
    rois_df = []

    roi_img_dir = join(SAVE_DIR, 'rois/images')
    roi_label_dir = join(SAVE_DIR, 'rois/labels')
    roi_bound_dir = join(SAVE_DIR, 'rois/boundaries')

    # Start from empty output directories. On a first run the directories do
    # not exist yet, which is fine: only that specific error is ignored so
    # that other failures (e.g. permissions) still surface.
    for out_dir in (roi_img_dir, roi_label_dir, roi_bound_dir):
        try:
            rmtree(out_dir)
        except FileNotFoundError:
            pass

    create_dirs([roi_img_dir, roi_label_dir, roi_bound_dir])

    # Loop through each ROI.
    for _, r in tqdm(src_rois_df.iterrows(), total=len(src_rois_df)):
        src_fp = r['fp']
        fn = get_filename(src_fp)

        new_img_fp = join(roi_img_dir, fn + '.png')

        # Read the image.
        img = imread(src_fp)

        # Resize image to half its width and height.
        img = cv.resize(img, None, fx=0.5, fy=0.5, interpolation=cv.INTER_AREA)

        # Size of the downsampled image, stored in the metadata below.
        h, w = img.shape[:2]

        imwrite(new_img_fp, img)

        # Save the corresponding label and boundary file.
        label_fp = im_to_txt_path(src_fp)
        bound_fp = im_to_txt_path(src_fp, txt_dir='boundaries')

        # The label file is optional; it is copied only when it exists.
        if isfile(label_fp):
            copyfile(label_fp, join(roi_label_dir, fn + '.txt'))

        # The boundary file is mandatory.
        if isfile(bound_fp):
            copyfile(bound_fp, join(roi_bound_dir, fn + '.txt'))
        else:
            raise FileNotFoundError(
                f'Missing ROI boundary file for \"{src_fp}\".'
            )

        # Track the ROI metadata. Item assignment is used on purpose:
        # attribute assignment on a Series only updates an entry that already
        # exists, and would otherwise set a plain Python attribute that never
        # reaches the saved CSV.
        r = r.copy()
        r['mag'] = 20
        r['fp'] = new_img_fp
        r['h'] = h
        r['w'] = w
        r['sf'] = 0.5
        rois_df.append(r)

    rois_df = DataFrame(rois_df)
    rois_df.to_csv(roi_csv_fp, index=False)
else:
    # Reuse the table written by a previous run.
    rois_df = read_csv(roi_csv_fp)

rois_df.head()

### Tile ROIs.

In [None]:
# Tile the 20X ROIs. The tile table is cached in SAVE_DIR/tiles.csv and is
# only rebuilt when OVERWRITE is set or the CSV file does not exist yet.
tiles_csv_fp = join(SAVE_DIR, 'tiles.csv')

if not OVERWRITE and isfile(tiles_csv_fp):
    # Tiles were created by a previous run: load the cached table.
    tiles_df = read_csv(tiles_csv_fp)
else:
    # Note: the most recent version of shapely throws warnings when there is
    # no intersection between two geometries. This is not an issue but the
    # warnings are annoying. May put a catch later to avoid this.
    roi_fps = rois_df.fp.tolist()
    tile_dir = join(SAVE_DIR, 'tiles')

    # nproc=10 is handed to the wrapper so the tiling can run in parallel.
    tiles_df = tile_roi_with_labels_wrapper(
        roi_fps,
        tile_dir,
        tile_size=1280,
        stride=960,
        boundary_thr=0.2,
        nproc=10,
        box_thr=0.5,
        notebook=True
    )

    tiles_df.to_csv(tiles_csv_fp, index=False)

tiles_df.head()

### Create dataset for training and validation.

In [None]:
# The train / validation split is done per WSI, so every tile needs to know
# which WSI it belongs to. Map each ROI filepath to its WSI name and attach
# that name to every tile through the tile's `roi_fp` column.
roi_to_wsi_map = dict(zip(rois_df.fp, rois_df.wsi_name))

# The column is built in one pass and assigned positionally (instead of
# writing row by row through `.loc`), so the result does not depend on
# `tiles_df` having a unique index. A tile whose `roi_fp` is missing from the
# map still raises a KeyError.
tiles_df['wsi_name'] = [roi_to_wsi_map[roi_fp] for roi_fp in tiles_df.roi_fp]

In [None]:
# Split the WSI names 90:10 into train and validation, with a fixed random
# state so that the split is reproducible. Splitting on WSI names keeps all
# tiles of one WSI in the same subset.
wsi_names = sorted(tiles_df.wsi_name.unique())
train_wsis, val_wsis = train_test_split(
    wsi_names, train_size=0.9, random_state=64
)

train_mask = tiles_df.wsi_name.isin(train_wsis)
val_mask = tiles_df.wsi_name.isin(val_wsis)
train_tiles = tiles_df[train_mask]
val_tiles = tiles_df[val_mask]

# Write the tile filepaths of each subset to its own text file, one filepath
# per line, with surrounding whitespace stripped from the whole text.
for list_fn, subset in (('20X-train.txt', train_tiles),
                        ('20X-val.txt', val_tiles)):
    with open(join(SAVE_DIR, list_fn), 'w') as fh:
        fh.write(''.join(f'{fp}\n' for fp in subset.fp).strip())

# Dataset description: class names, number of classes, dataset root and the
# two list files written above (given relative to the root).
yaml_dict = dict(
    names=['Pre-NFT', 'iNFT'],
    nc=2,
    path=SAVE_DIR,
    train='20X-train.txt',
    val='20X-val.txt',
)

with open(join(SAVE_DIR, '20X-dataset.yaml'), 'w') as fh:
    yaml.dump(yaml_dict, fh)