In [76]:
# Hacky way to resolve project paths
import sys
import os
from pathlib import Path

sys.path.append(str(Path(os.getcwd()).parents[0]))
sys.path.append(str(Path(os.getcwd()).parents[1]))

import json
from ubteacher import add_ubteacher_config
from detectron2.config import get_cfg
from typing import Dict, Set, List, Tuple, Iterator
import matplotlib.pyplot as plt
import tifffile as tf
from detectron2.engine import default_argument_parser, default_setup, launch

Plan: use qupath_annotations_latest instead of TissueAnnotator outputs for loading -- consistency across datasets and easier to use.
1. Load only information relevant to the task at hand from json
2. Load npy files for each image from dir, with option to create
3. Register the dataset
4. Idk

In [77]:
from ubteacher.utils.train2_utils import (get_scaling, ParseFromQuPath)

In [106]:
import os
import numpy as np
from typing import Dict, Tuple, List, Set, Iterator
import json
import glob
from detectron2.data import transforms as T
import tifffile as tf


def select_annotypes(anno_dir: str) -> List[str]:
    """
    Select annotation types to include
    """
    annotypes = []
    possible_tissues = []
    for f in glob.glob(os.path.join(anno_dir, '*.json')):
        with open(f, 'r') as f:
            data = json.load(f)
        for i in data:
            try:
                if i['geometry']['type'] == 'Polygon':
                    possible_tissues += [next(search_recursive(i, 'name'))]
                    possible_tissues = list(set(t.split(' ')[0] for t in possible_tissues))
            except:
                pass
    print(f'Found {set(possible_tissues)} tissue types with valid annotations')
    selected_tissues = input('Select tissue types to train on (comma separated)')
    tissue_types = selected_tissues.split(',')
    print(f'Selected tissue types: {tissue_types}')
    annotypes.extend(tissue_types)
    return annotypes
        
def find_anno_dir(parent_dir: str) -> List[str]:
    """
    Find qupath exported annotations directory
    """
    
    if os.path.exists(os.path.join(parent_dir, 'xupath_annotations_latest')):
        return os.path.join(parent_dir, 'qupath_annotations_latest')
    else:
        anno_dirs = []
        for root, dirs, files in os.walk(parent_dir):
            for d in dirs:
                if 'annotations' in d:
                    anno_dirs.append(os.path.join(root, d))
        # user chooses if there are multiple annotation folders
        print('Found multiple annotation folders:')
        for i, anno_dir in enumerate(anno_dirs):
            print(f'{i}: {os.path.relpath(anno_dir, parent_dir)}')
        choice = input('Choose annotation folder index')
        if choice.isdigit() and int(choice) < len(anno_dirs):
            return anno_dirs[int(choice)]    
        else:
            raise ValueError('Annotation folder not found')
        
def find_img_dir(parent_dir: str) -> List[str]:
    """
    Find npy image directory for training
    """
    img_dirs = []
    for root, dirs, files in os.walk(parent_dir):
        for d in dirs:
            if glob.glob(os.path.join(root, d, '*.npy')):
                img_dirs.append(os.path.join(root, d))
    # user chooses if there are multiple img folders
    for i, img_dir in enumerate(img_dirs):
        print(f'{i}: {os.path.relpath(img_dir, parent_dir)}')
    choice = input('Choose image folder index')
    if choice.isdigit() and int(choice) < len(img_dirs):
        return img_dirs[int(choice)]
    else:
        raise ValueError('Image folder not found')
        
def search_recursive(d: Dict, key: str) -> Iterator:
        """Helper function for finding which level of json annotations has
        the matching key.
        """
        for k, v in d.items():
            if isinstance(v, Dict):
                for match in search_recursive(v, key):
                    yield match
            if k == key:
                # generator function - saves in memory until called
                # (use for loop to call)
                yield v
    
def get_scaling(original_file, output_file):
        with tf.TiffFile(original_file) as tiff:
            # get base size
            base_dim = tiff.pages[0].shape[:2]
            print(f'Image size: {base_dim}') #TODO: remove
    
        f = np.load(output_file)
        target_dim = f.shape[:2]
        del f # use del instead of with because numpy version issue
        print(f'Image size: {target_dim}') #TODO: remove
        return base_dim, target_dim

class ParseFromQuPath:
    
    def __init__(self, ref_dim, target_dim, tissue_types):
        self.anno_dir = '/mnt/RSX/Datasets_pathology/SRI_OSCC_lymph_labeled/qupath_annotations_latest'
        self.img_dir = '/mnt/RSX/Datasets_pathology/GT_2023/TissueFinderV2/SRI_OSCC'
        self.ref_dim = ref_dim
        self.target_dim = target_dim
        self.tissue_types = tissue_types
            
    def scale_bboxes_qupath(self, anno):
        x_scale = self.ref_dim[1] / self.target_dim[1]
        y_scale = self.ref_dim[0] / self.target_dim[0]
        for i in anno:
            [coords] = i['coordinates']
            # First, build XYXY
            x0 = int(coords[0][0] / x_scale)
            y0 = int(coords[0][1] / y_scale)
            x1 = int(coords[2][0] / x_scale)
            y1 = int(coords[2][1] / y_scale)
            i['bbox'] = [x0, y0, x1, y1]
            del i['coordinates']
            # make it as 'bbox': asdjhkf
        return anno
        
    def get_boxes(self, json_file):
        
        with open(json_file, 'r') as f:
            data = json.load(f)
        tissue_data = []
            
        for i in data:
            if any(tissue in list(search_recursive(i, 'name')) for tissue in self.tissue_types):
                tissue_data.append(i)
        cat_map = {tissue: i for i, tissue in enumerate(self.tissue_types)}
        coords = []
        for k in tissue_data:
            ## add names to k 
            k['geometry']['category_id'] = cat_map[next(search_recursive(k, 'name'))]
            del k['geometry']['type']
            k['geometry']['bbox_mode'] = 0
            coords.append(next(search_recursive(k, 'geometry')))
        
        out = self.scale_bboxes_qupath(coords)
        
        return out, cat_map

    def get_coco_format(self, json_file):
        
        """
        Get coco format for detectron2
        """
        ## Determine image format
        img_base = os.path.basename(os.path.splitext(json_file)[0])
        img_fname = os.path.join(self.img_dir, img_base) + '.npy'
        
        ## Get annotation data
        
        annotation_dicts, cat_map = self.get_boxes(json_file)
        
        ## Fill remaining fields
        
        dataset_dicts = [{'file_name': img_fname,
                        'height': self.target_dim[0],
                        'width': self.target_dim[1],
                        'image_id': img_base,
                        'annotations': annotation_dicts}
                        ]  

        return dataset_dicts, cat_map

In [103]:
classes = select_annotypes('/mnt/RSX/Datasets_pathology/SRI_OSCC_lymph_labeled/qupath_annotations_latest')

Found {'ROI_neoplastic', 'Primary', 'multinucleated_cells', 'Lymphoid', 'no', 'No', 'keratin', 'Necrosis', 'Parotid', 'ROI', 'non-neoplastic', 'non-lymph', 'Non-neoplastic', 'primary', 'skeletal', 'oral', 'Salivary', 'parotid', 'Tumor', 'Keratin', 'Submandibular', 'ROI_non-neoplastic', 'lymph'} tissue types with valid annotations
Selected tissue types: ['non-lymph']


In [107]:
original_file = '/mnt/RSX/Datasets_pathology/SRI_OSCC_lymph_labeled/images/Case 1 G7.svs'
output_file = '/mnt/RSX/Datasets_pathology/GT_2023/TissueFinderV2/SRI_OSCC_lymph_labeled/Case 1 G7.npy'
json_file = '/mnt/RSX/Datasets_pathology/SRI_OSCC_lymph_labeled/qupath_annotations_latest/Case 1 G7.json'

base_dim, target_dim = get_scaling(original_file, output_file)
dataset_dicts, cat_map = ParseFromQuPath(base_dim, target_dim, classes).get_coco_format(json_file)
print(dataset_dicts)

Image size: (33709, 39839)
Image size: (2166, 2560)
[{'type': 'Feature', 'id': 'aa9492b8-58bb-490f-acf4-52625979f76a', 'geometry': {'type': 'Polygon', 'coordinates': [[[14701, 1589], [39083, 1589], [39083, 27357], [14701, 27357], [14701, 1589]]]}, 'properties': {'objectType': 'annotation', 'name': 'non-lymph', 'classification': {'name': 'oral', 'color': [176, 50, 232]}}}, {'type': 'Feature', 'id': '328e55f2-31a1-4135-bce1-442798f103be', 'geometry': {'type': 'Polygon', 'coordinates': [[[1629, 10865], [21905, 10865], [21905, 32764], [1629, 32764], [1629, 10865]]]}, 'properties': {'objectType': 'annotation', 'name': 'non-lymph', 'classification': {'name': 'oral', 'color': [176, 50, 232]}}}]
{'non-lymph': 0}


UnboundLocalError: local variable 'k' referenced before assignment

In [17]:
print(dataset_dicts)

[{'file_name': '/mnt/RSX/Datasets_pathology/GT_2023/TissueFinderV2/SRI_OSCC/Case 1 G7.npy', 'height': 2166, 'width': 2560, 'image_id': 'Case 1 G7', 'annotations': [{'category_id': 1, 'bbox_mode': 0, 'bbox': [944, 102, 2511, 1757]}, {'category_id': 1, 'bbox_mode': 0, 'bbox': [104, 698, 1407, 2105]}]}]


In [8]:
print(cat_map)

{'lymph': 0, 'non-lymph': 1}


In [75]:
def setup(args):
    """
    Create configs and perform basic setups.
    """
    cfg = get_cfg()
    cfg.set_new_allowed(True) #allows custom cfg keys
    add_ubteacher_config(cfg)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    default_setup(cfg, args)
    
def main(args):
    
    cfg = setup(args)
    anno_dir = find_anno_dir(cfg.ANNO_DIR)
    img_dir = find_img_dir(cfg.IMG_DIR)
    try:
        classes = cfg.DATASET.CLASSES
    except:
        classes = select_annotypes(anno_dir)
    
if __name__ == "__main__":
    args = default_argument_parser().parse_args()

    print("Command Line Args:", args)
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )
