## Step 1: Data Preparation

In [None]:
# install pathology-whole-slide-data
!pip3 install git+https://github.com/DIAGNijmegen/pathology-whole-slide-data@main

In [None]:
import os
from utils.dot2polygon import dot2polygon
import glob
import yaml

In [None]:
# Make a folder for the converted annotations.
# makedirs(..., exist_ok=True) also creates a missing parent folder (./data) and
# does nothing when the folder already exists, so this cell is safe to re-run.
output_root = r'./data/annotations_polygon'
os.makedirs(output_root, exist_ok=True)

Convert the dot annotations to polygon bounding boxes.

In [None]:
# Convert the dot annotations of a single image to polygon annotations.

# Box parameters: half of the bbox size (in um) drawn around each annotated dot.
lymphocyte_half_box_size = 4.5   # lymphocytes
monocytes_half_box_size = 11.0   # monocytes
min_spacing = 0.25

# Input dot annotation and the polygon annotation file to produce.
xml_path = r'./data/annotations/DI_S02_P000001_combined.xml'
output_path = r'./data/annotations_polygon/DI_S02_P000001_combined_polygon.xml'

dot2polygon(
    xml_path,
    lymphocyte_half_box_size,
    monocytes_half_box_size,
    min_spacing,
    output_path,
)

In [None]:
# Optional: convert every dot annotation (*.xml) in a folder to polygons.
# This cell is disabled; uncomment the lines below to run it instead of the
# single-image conversion. Each output file is named '<input name>_polygon<input extension>'.

# annotation_dir = r'./data/annotations'
# annotation_polygon_dir = r'./data/annotations_polygon'
# annotation_list = glob.glob(os.path.join(annotation_dir,'*.xml'))
# for xml_path in annotation_list:
#     print(xml_path)
#     output_path = os.path.join(annotation_polygon_dir,os.path.splitext(os.path.basename(xml_path))[0] + '_polygon' + os.path.splitext(os.path.basename(xml_path))[1])
#     print(output_path)
#     lymphocyte_half_box_size = 4.5 # the size of half of the bbox around the lymphocyte dot in um
#     monocytes_half_box_size = 11.0 # the size of half of the bbox around the monocytes dot in um
#     min_spacing = 0.25
#     dot2polygon(xml_path, lymphocyte_half_box_size, monocytes_half_box_size, min_spacing, output_path)

Create the YAML data config file used for training.

In [None]:
def folders_to_yml(wsi_dir: str, 
                   wsa_dir: str,
                   output_dir: str,
                   output_name: str,
                   wsi_suffix: str = "_PAS_CPG.tif") -> None:

    """
    Generate a yaml file to be used as WSD dataconfig from a folder of slides and a
    set of annotation or mask files.

    Every annotation file is paired with the slide ``<name><wsi_suffix>`` inside
    ``wsi_dir``. ``<name>`` is the annotation file name without the literal text that
    follows the ``*`` of the pattern (``P1_polygon.xml`` matched by ``*_polygon.xml``
    gives ``P1``). When the pattern has no such text, the file extension is dropped
    instead. Annotations without a matching slide are reported and skipped.

    Args:
        wsi_dir: folder that contains the slides.
        wsa_dir: glob pattern (not a plain folder) selecting the annotation files,
            e.g. ``./data/annotations_polygon/*_polygon.xml``.
        output_dir: folder the yaml file is written to; created when missing.
        output_name: file name of the yaml file inside ``output_dir``.
        wsi_suffix: text appended to ``<name>`` to build the slide file name.

    Returns:
        None. The result is the yaml file ``output_dir/output_name`` with a single
        ``training`` list of ``{"wsa": {"path": ...}, "wsi": {"path": ...}}`` entries.
    """

    # Literal text after the first wildcard of the pattern, e.g. '_polygon.xml'.
    # Empty when the pattern has no wildcard or ends with it.
    pattern_parts = os.path.basename(wsa_dir).split('*')
    wsa_suffix = pattern_parts[1] if len(pattern_parts) > 1 else ''

    # To also write a validation split, add a 'validation' list here and fill it
    # with entries of the same shape as the 'training' ones.
    yaml_dict = {'training': []}

    # glob returns files in arbitrary order; sort so the config is reproducible.
    for wsa in sorted(glob.glob(wsa_dir)):
        wsa_name = os.path.basename(wsa)
        if wsa_suffix and wsa_name.endswith(wsa_suffix):
            # Strip the suffix from the END only, so a name that happens to
            # contain the same text earlier is not truncated.
            patient_name = wsa_name[:-len(wsa_suffix)]
        else:
            patient_name = os.path.splitext(wsa_name)[0]

        wsi = os.path.join(wsi_dir, patient_name + wsi_suffix)
        if os.path.isfile(wsi):
            print('match found:    ' , patient_name)
            yaml_dict['training'].append(
                    {"wsa": {"path": str(wsa)}, "wsi": {"path": str(wsi)}})
        else:
            print('no match found:    ' , patient_name)

    # Make the output folder (including missing parents); no-op if it exists.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(output_dir, output_name), "w") as file:
        yaml.safe_dump(yaml_dict, file)

In [None]:
# Inputs: the slide folder and a glob pattern selecting the polygon annotations.
wsi_dir = r'./data/images'
wsa_dir = r'./data/annotations_polygon/*_polygon.xml'

# Output: folder and file name of the generated yaml config.
output_dir = r'./configs'
output_name = 'training_sample.yml'

folders_to_yml(
    wsi_dir=wsi_dir,
    wsa_dir=wsa_dir,
    output_dir=output_dir,
    output_name=output_name,
)