# 20X Dataset
Downsample the 40X ROIs from the NFT detection project to 20X. Then tile the new ROIs into a dataset.

This notebook is final.

In [19]:
# Imports
import sys
sys.path.append('../..')

from pandas import read_csv, concat, DataFrame
from os.path import join, isfile, isdir
from shutil import copyfile, rmtree
from tqdm.notebook import tqdm
import cv2 as cv
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import yaml

from neurotk import imread, imwrite
from neurotk.utils import create_dirs, get_filename, im_to_txt_path
from neurotk import tile_roi_with_labels_wrapper

### Global Parameters

In [7]:
# Root directory of the source (40X) NFT project data.
SRC_DIR = '/jcDataStore/Data/nft-ai-project/'

# Root directory where the 20X dataset produced by this notebook is saved.
SAVE_DIR = '/jcDataStore/Data/NeuroTK-Dash/nft-detection'

# Cache switch. When False, a step whose output CSV already exists is skipped
# and the CSV is loaded instead; set to True to regenerate the outputs even
# if they already exist.
OVERWRITE = False

### Downsample 40X ROIs to 20X

In [20]:
# Downsample the checked 40X ROIs (plus the background ROIs) to 20X and save
# them, together with their label / boundary files, under SAVE_DIR/rois.
# The updated ROI metadata is cached in rois.csv; when that file exists and
# OVERWRITE is False the whole step is skipped and the CSV is loaded instead.
roi_csv_fp = join(SAVE_DIR, 'rois.csv')

if OVERWRITE or not isfile(roi_csv_fp):
    mal_dir = join(SRC_DIR, 'datasets/model-assisted-labeling')

    # Read the ROI iterations.
    mal_df = read_csv(join(mal_dir, 'model-assisted-labeling.csv'))
    src_rois_df = read_csv(join(mal_dir, 'rois.csv'))

    # Include only ROIs that have been checked.
    src_rois_df = src_rois_df[
        src_rois_df.fp.isin(mal_df[mal_df.checked].fp.tolist())
    ]

    # Concatenate the background ROIs.
    src_rois_df = concat(
        [src_rois_df, read_csv(join(mal_dir, 'background-rois.csv'))],
        ignore_index=True
    )

    # Replace filepaths to be relative to local filepaths. The local root is
    # taken from SRC_DIR (join(..., '') guarantees the trailing separator) so
    # the remapped paths always agree with where the CSVs above were read.
    src_rois_df = src_rois_df.replace(
        '/workspace/data/',
        join(SRC_DIR, ''),
        regex=True
    )

    roi_img_dir = join(SAVE_DIR, 'rois/images')
    roi_label_dir = join(SAVE_DIR, 'rois/labels')
    roi_bound_dir = join(SAVE_DIR, 'rois/boundaries')
    roi_dirs = [roi_img_dir, roi_label_dir, roi_bound_dir]

    # Start from empty output directories so stale files from a previous run
    # cannot leak into the new dataset.
    for roi_dir in roi_dirs:
        if isdir(roi_dir):
            rmtree(roi_dir)
    create_dirs(roi_dirs)

    # One metadata row (with updated path and magnification) per saved ROI.
    roi_records = []

    # Loop through each ROI.
    for _, r in tqdm(src_rois_df.iterrows(), total=len(src_rois_df)):
        fn = get_filename(r.fp)
        new_img_fp = join(roi_img_dir, fn + '.png')

        # Read the image and halve both dimensions (40X -> 20X).
        img = imread(r.fp)
        img = cv.resize(img, None, fx=0.5, fy=0.5, interpolation=cv.INTER_AREA)
        h, w = img.shape[:2]

        imwrite(new_img_fp, img)

        # Save the corresponding label and boundary file. They are copied
        # unchanged.
        # NOTE(review): presumably the coordinates are normalized and so stay
        # valid after resizing - confirm.
        label_fp = im_to_txt_path(r.fp)
        bound_fp = im_to_txt_path(r.fp, txt_dir='boundaries')

        # A label file is optional, a boundary file is required.
        if isfile(label_fp):
            copyfile(label_fp, join(roi_label_dir, fn + '.txt'))

        if isfile(bound_fp):
            copyfile(bound_fp, join(roi_bound_dir, fn + '.txt'))
        else:
            raise FileNotFoundError(
                f'Missing ROI boundary file for \"{r.fp}\".'
            )

        # Track the ROI metadata. Label-based assignment is used on purpose:
        # attribute assignment (e.g. ``r.h = h``) only updates a field that
        # already exists and otherwise sets a plain Python attribute, which
        # would silently drop the value from the saved CSV.
        roi_meta = r.copy()
        roi_meta['mag'] = 20
        roi_meta['fp'] = new_img_fp
        roi_meta['h'] = h
        roi_meta['w'] = w
        roi_meta['sf'] = 0.5
        roi_records.append(roi_meta)

    rois_df = DataFrame(roi_records)
    rois_df.to_csv(roi_csv_fp, index=False)
else:
    rois_df = read_csv(roi_csv_fp)

rois_df.head()

  0%|          | 0/302 [00:00<?, ?it/s]

Unnamed: 0,fp,x,y,mag,group,sf,w,h,wsi_name,wsi_id,case,region,Braak_stage
0,/jcDataStore/Data/NeuroTK-Dash/nft-detection/r...,56607,69379,20,ROIv3,0.5,5412,5414,OS03-163_1B_TAU.svs,638147667f8a5e686a52efa4,A03-74,Amygdala,6
1,/jcDataStore/Data/NeuroTK-Dash/nft-detection/r...,58000,54422,20,ROIv1,0.5,2478,2812,OS03-163_1B_TAU.svs,638147667f8a5e686a52efa4,A03-74,Amygdala,6
2,/jcDataStore/Data/NeuroTK-Dash/nft-detection/r...,26754,51330,20,ROIv1,0.5,2536,2798,OS03-163_1B_TAU.svs,638147667f8a5e686a52efa4,A03-74,Amygdala,6
3,/jcDataStore/Data/NeuroTK-Dash/nft-detection/r...,40972,21598,20,ROIv3,0.5,5108,3592,OS03-163_1D_TAU.svs,6381476c7f8a5e686a5336e2,A03-74,Temporal cortex,6
4,/jcDataStore/Data/NeuroTK-Dash/nft-detection/r...,29095,57709,20,ROIv3,0.5,5591,4342,OS03-163_1D_TAU.svs,6381476c7f8a5e686a5336e2,A03-74,Temporal cortex,6


### Tile ROIs.

In [21]:
# Tile every 20X ROI using multiple processes and cache the resulting tile
# metadata in tiles.csv.
tiles_csv_fp = join(SAVE_DIR, 'tiles.csv')
use_cached_tiles = isfile(tiles_csv_fp) and not OVERWRITE

if use_cached_tiles:
    tiles_df = read_csv(tiles_csv_fp)
else:
    # Note: the most recent version of shapely throws a warning whenever two
    # geometries do not intersect. This is not an issue, the warnings are
    # just noisy; they are not suppressed here.
    tiling_kwargs = dict(
        tile_size=640,
        stride=480,
        boundary_thr=0.2,
        nproc=10,
        box_thr=0.5,
        notebook=True,
    )

    tiles_df = tile_roi_with_labels_wrapper(
        rois_df.fp.tolist(),
        join(SAVE_DIR, 'tiles'),
        **tiling_kwargs
    )
    tiles_df.to_csv(tiles_csv_fp, index=False)

tiles_df.head()

  0%|          | 0/302 [00:00<?, ?it/s]

  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **kwargs)
  return lib.intersection(a, b, **

Unnamed: 0,fp,roi_fp,x,y,tile_size
0,/jcDataStore/Data/NeuroTK-Dash/nft-detection/t...,/jcDataStore/Data/NeuroTK-Dash/nft-detection/r...,0,1440,640
1,/jcDataStore/Data/NeuroTK-Dash/nft-detection/t...,/jcDataStore/Data/NeuroTK-Dash/nft-detection/r...,0,1920,640
2,/jcDataStore/Data/NeuroTK-Dash/nft-detection/t...,/jcDataStore/Data/NeuroTK-Dash/nft-detection/r...,480,960,640
3,/jcDataStore/Data/NeuroTK-Dash/nft-detection/t...,/jcDataStore/Data/NeuroTK-Dash/nft-detection/r...,480,1440,640
4,/jcDataStore/Data/NeuroTK-Dash/nft-detection/t...,/jcDataStore/Data/NeuroTK-Dash/nft-detection/r...,480,1920,640


### Create dataset for training and validation.

In [None]:
# The train / val split is done by the WSI each image belongs to. To support
# that, attach the WSI name to every tile by looking up the tile's ROI
# (roi_fp) in the ROI table.
roi_to_wsi_map = dict(zip(rois_df.fp, rois_df.wsi_name))

# Fail loudly, as a plain dict lookup would, if a tile references an ROI that
# is not listed in rois_df.
unknown_roi = ~tiles_df.roi_fp.isin(rois_df.fp)
if unknown_roi.any():
    raise KeyError(tiles_df.roi_fp[unknown_roi].iloc[0])

# Vectorized lookup: one pass over the column instead of a per-row
# iterrows() + .loc assignment.
tiles_df['wsi_name'] = tiles_df.roi_fp.map(roi_to_wsi_map)

In [23]:
# Collect the unique WSIs that contain annotated ROIs (groups ROIv1 / ROIv3).
# These are the WSIs that get split into train and val; the background ROIs
# are only used for training and are therefore left out here.
annotated_mask = rois_df.group.isin(('ROIv1', 'ROIv3'))
wsis = rois_df.loc[annotated_mask, 'wsi_name'].unique().tolist()

In [26]:
def _write_tile_list(txt_fp, tile_fps):
    """Write the tile filepaths to txt_fp, one per line, without a trailing
    newline."""
    with open(txt_fp, 'w') as fh:
        fh.write(''.join(f'{fp}\n' for fp in tile_fps).strip())


# Split the WSIs (sorted, so the split is reproducible) into train and val at
# a 90:10 ratio.
train_wsis, val_wsis = train_test_split(
    sorted(wsis), train_size=0.9, random_state=64
)

# Background ROIs always go to train; annotated ROIs follow their WSI.
is_background_roi = rois_df.group == 'background-roi'
is_annotated_roi = rois_df.group.isin(('ROIv1', 'ROIv3'))

background_fps = rois_df[is_background_roi].fp.tolist()
annotated_df = rois_df[is_annotated_roi]

in_train_wsi = annotated_df.wsi_name.isin(train_wsis)
in_val_wsi = annotated_df.wsi_name.isin(val_wsis)

train_fps = annotated_df[in_train_wsi].fp.tolist()
val_fps = annotated_df[in_val_wsi].fp.tolist()

# A tile belongs to the split of the ROI it was cut from.
train_tiles = tiles_df[tiles_df.roi_fp.isin(train_fps + background_fps)]
val_tiles = tiles_df[tiles_df.roi_fp.isin(val_fps)]

# Create train.txt, val.txt and the dataset.yaml file.
_write_tile_list(join(SAVE_DIR, 'train-20X-640.txt'), train_tiles.fp)
_write_tile_list(join(SAVE_DIR, 'val-20X-640.txt'), val_tiles.fp)

# NOTE(review): 'test-20X-640.txt' is referenced below but is not written by
# this cell - confirm it is created elsewhere.
yaml_dict = dict(
    names=['Pre-NFT', 'iNFT'],
    nc=2,
    path=SAVE_DIR,
    train='train-20X-640.txt',
    val='val-20X-640.txt',
    test='test-20X-640.txt',
)

with open(join(SAVE_DIR, 'dataset-20X-640.yaml'), 'w') as fh:
    yaml.dump(yaml_dict, fh)