In [1]:
from pathgen.data.datasets import camelyon16

# Load the `training_small` Camelyon16 dataset and display its table of
# slide paths, annotation paths and labels.
training = camelyon16.training_small()
training.paths

Unnamed: 0,slide,annotation,label,tags
23,tumor/tumor_024.tif,lesion_annotations/tumor_024.xml,tumor,
37,tumor/tumor_038.tif,lesion_annotations/tumor_038.xml,tumor,
53,tumor/tumor_054.tif,lesion_annotations/tumor_054.xml,tumor,
62,tumor/tumor_063.tif,lesion_annotations/tumor_063.xml,tumor,
64,tumor/tumor_065.tif,lesion_annotations/tumor_065.xml,tumor,
75,tumor/tumor_076.tif,lesion_annotations/tumor_076.xml,tumor,
88,tumor/tumor_089.tif,lesion_annotations/tumor_089.xml,tumor,
124,normal/normal_014.tif,,normal,
148,normal/normal_038.tif,,normal,
209,normal/normal_100.tif,,normal,


## Preprocessing
- Create the patch index
- Split into training and validation sets
- Save the patches out

In [2]:
from pathgen.utils.seeds import set_seed
from pathgen.preprocess.patching.patch_finder import GridPatchFinder
from pathgen.preprocess.patching.make_index import make_index
from pathgen.preprocess.tissue_detector import TissueDetectorOTSU

# Tissue detector handed to make_index below (Otsu-based, going by the class name).
tissue_detector = TissueDetectorOTSU()

# Seed used for this notebook; it is applied before the patch index is built.
global_seed = 123

set_seed(global_seed)
# index all the patches for the camelyon16 dataset
train_data = camelyon16.training_small()
# NOTE(review): the positional arguments are not documented in this notebook.
# The resulting index reports level 0 and 256x256 patches, so the trailing
# 0, 256, 256 presumably relate to patch level and patch size; the meaning of
# the leading 6 cannot be told from here - confirm against GridPatchFinder.
patch_finder = GridPatchFinder(6, 0, 256, 256)
train_patches = make_index(train_data, tissue_detector, patch_finder)
# Display the list of patch sets held by the index (one per indexed slide).
train_patches.patches

indexing tumor_024.tif
indexing tumor_038.tif
indexing tumor_054.tif
indexing tumor_063.tif
indexing tumor_065.tif
indexing tumor_076.tif
indexing tumor_089.tif
indexing normal_014.tif
indexing normal_038.tif
indexing normal_100.tif


[<pathgen.preprocess.patching.patchset.SimplePatchSet at 0x7f08aa52e860>,
 <pathgen.preprocess.patching.patchset.SimplePatchSet at 0x7f08a8b96a90>,
 <pathgen.preprocess.patching.patchset.SimplePatchSet at 0x7f08a9199a58>,
 <pathgen.preprocess.patching.patchset.SimplePatchSet at 0x7f08a914ca58>,
 <pathgen.preprocess.patching.patchset.SimplePatchSet at 0x7f08a91964a8>,
 <pathgen.preprocess.patching.patchset.SimplePatchSet at 0x7f08aa4b8048>,
 <pathgen.preprocess.patching.patchset.SimplePatchSet at 0x7f08a930bb38>,
 <pathgen.preprocess.patching.patchset.SimplePatchSet at 0x7f08a930b7f0>,
 <pathgen.preprocess.patching.patchset.SimplePatchSet at 0x7f08a930b6a0>,
 <pathgen.preprocess.patching.patchset.SimplePatchSet at 0x7f08a930b4e0>]

In [3]:
# Show the number of patches per label (background / normal / tumor) for each slide.
train_patches.summary()

Unnamed: 0,background,normal,tumor
0,0.0,50101,31
1,0.0,5283,736
2,0.0,25753,22335
3,0.0,45720,49
4,0.0,12051,71
5,0.0,15591,37151
6,0.0,44378,29157
7,0.0,8002,0
8,0.0,3365,0
9,0.0,24909,0


In [4]:
import pandas as pd

# Turn every patch set into a DataFrame and stack them into one frame
# with a fresh, continuous row index.
training_dfs = [slide_patches.as_df() for slide_patches in train_patches]
combined = pd.concat(training_dfs, ignore_index=True)

# Compare the label codes actually present with the dataset's name -> code mapping.
print(combined['label'].unique())
print(train_data.labels)
combined


[1 2]
{'background': 0, 'normal': 1, 'tumor': 2}


Unnamed: 0,x,y,width,height,level,slide_idx,dataset_name,label
0,54272,9984,256,256,0,0,camelyon16.training_small,1
1,54272,10240,256,256,0,0,camelyon16.training_small,1
2,68352,10240,256,256,0,0,camelyon16.training_small,1
3,68608,10240,256,256,0,0,camelyon16.training_small,1
4,54272,10496,256,256,0,0,camelyon16.training_small,1
...,...,...,...,...,...,...,...,...
324678,72704,188928,256,256,0,9,camelyon16.training_small,1
324679,72960,188928,256,256,0,9,camelyon16.training_small,1
324680,73216,188928,256,256,0,9,camelyon16.training_small,1
324681,73472,188928,256,256,0,9,camelyon16.training_small,1


In [5]:
# Split the combined index by class. The integer codes are looked up in the
# dataset's name -> code mapping instead of being hard-coded as 1 and 2, so the
# selection stays correct if the mapping ever changes.
label_codes = train_data.labels
tumor_patches = combined[combined['label'] == label_codes['tumor']]
normal_patches = combined[combined['label'] == label_codes['normal']]
tumor_patches, normal_patches

(            x       y  width  height  level  slide_idx  \
 31054   24832  126720    256     256      0          0   
 31055   25088  126720    256     256      0          0   
 31056   25344  126720    256     256      0          0   
 31353   24832  126976    256     256      0          0   
 31354   25088  126976    256     256      0          0   
 ...       ...     ...    ...     ...    ...        ...   
 264309  81152   68608    256     256      0          6   
 264310  81408   68608    256     256      0          6   
 264311  81664   68608    256     256      0          6   
 264455  80640   68864    256     256      0          6   
 264456  80896   68864    256     256      0          6   
 
                      dataset_name  label  
 31054   camelyon16.training_small      2  
 31055   camelyon16.training_small      2  
 31056   camelyon16.training_small      2  
 31353   camelyon16.training_small      2  
 31354   camelyon16.training_small      2  
 ...                      

In [6]:
from pathgen.preprocess.sampling.sampler import simple_random

# Request the same number of patches from each class so the two sets are balanced.
samples_per_class = 20000
tumor_samples, normal_samples = (
    simple_random(class_patches, samples_per_class)
    for class_patches in (tumor_patches, normal_patches)
)
tumor_samples, normal_samples

(            x       y  width  height  level  slide_idx  \
 63742   37376   79872    256     256      0          2   
 198205  25856   44800    256     256      0          5   
 218315  47360   19456    256     256      0          6   
 66557   68864   92416    256     256      0          2   
 61981   48896   75008    256     256      0          2   
 ...       ...     ...    ...     ...    ...        ...   
 225001  47104   28160    256     256      0          6   
 170618  97024   16640    256     256      0          5   
 204615  45824   54016    256     256      0          5   
 260964  82432   64256    256     256      0          6   
 69540   65536  101632    256     256      0          2   
 
                      dataset_name  label  
 63742   camelyon16.training_small      2  
 198205  camelyon16.training_small      2  
 218315  camelyon16.training_small      2  
 66557   camelyon16.training_small      2  
 61981   camelyon16.training_small      2  
 ...                      

In [7]:
from pathgen.data.datasets.registry import get_dataset
from pathgen.data.slides.region import Region
from pathgen.utils.paths import project_root
import cv2
import numpy as np

experiment_name = "one"
experiment_root = project_root() / "experiments" / experiment_name

# Pool the per-class samples, then split them 70/30 into training and validation.
# ignore_index=True gives every row a unique index label, so dropping the
# sampled labels leaves exactly the remaining rows.
balanced_patches = pd.concat([tumor_samples, normal_samples], ignore_index=True)
# Pin the split to the notebook seed so that re-running this cell on its own
# reproduces the same partition instead of depending on the global RNG state.
training_patches = balanced_patches.sample(frac=0.7, random_state=global_seed)
validation_patches = balanced_patches.drop(training_patches.index)
assert len(training_patches) + len(validation_patches) == len(balanced_patches)

print(len(training_patches))
print(len(validation_patches))

# Write every training patch to <experiment_root>/training_patches/<label>/ as a PNG.
# NOTE: the validation cell repeats this loop with a different frame and
# directory; the two are candidates for a shared helper function.
output_dir = experiment_root / "training_patches"
for dataset_name, dataset_group in training_patches.groupby('dataset_name'):
    dataset = get_dataset(dataset_name)
    # invert name -> code once per dataset so integer labels map back to names
    labels = {v: k for k, v in dataset.labels.items()}

    for slide_idx, slide_group in dataset_group.groupby('slide_idx'):
        print(f'{dataset_name}, {slide_idx}: {len(slide_group)} patches')

        # file name prefix, computed once per slide: the relative slide path
        # with its last four characters (the extension) removed and '/' -> '-'
        slide_path = dataset.get_slide_path(slide_idx)
        rel_slide_path = dataset.to_rel_path(slide_path)
        slide_name_str = str(rel_slide_path)[:-4].replace('/', '-')

        # ensure the output directory exists for every label on this slide
        for code in slide_group['label'].unique():
            (output_dir / labels[code]).mkdir(parents=True, exist_ok=True)

        with dataset.open_slide(slide_idx) as slide:
            for row in slide_group.itertuples():
                # read in the region
                # NOTE(review): row.height is never passed; presumably
                # Region.make takes a single (square) size - confirm.
                region = Region.make(row.x, row.y, row.width, row.level)
                image = slide.read_region(region)

                # get the patch label as a string
                label = labels[row.label]

                # write out the patch
                # NOTE(review): cv2.imwrite treats colour arrays as BGR; if
                # read_region returns RGB(A) data the channels are saved
                # swapped - confirm what read_region yields.
                patch_filename = f"{slide_name_str}-{row.x}-{row.y}.png"
                image_path = output_dir / label / patch_filename
                # imwrite signals failure through its return value only
                if not cv2.imwrite(str(image_path), np.array(image)):
                    raise IOError(f"could not write patch image {image_path}")

28000
12000
camelyon16.training_small, 0 {2975}
camelyon16.training_small, 1 {402}
camelyon16.training_small, 2 {4989}
camelyon16.training_small, 3 {2736}
camelyon16.training_small, 4 {779}
camelyon16.training_small, 5 {6679}
camelyon16.training_small, 6 {7277}
camelyon16.training_small, 7 {473}
camelyon16.training_small, 8 {208}
camelyon16.training_small, 9 {1482}


In [9]:
# Write every validation patch to <experiment_root>/validation_patches/<label>/ as a PNG.
# NOTE: this mirrors the training export loop in the previous cell; the two are
# candidates for a shared helper function.
output_dir = experiment_root / "validation_patches"
for dataset_name, dataset_group in validation_patches.groupby('dataset_name'):
    dataset = get_dataset(dataset_name)
    # invert name -> code once per dataset so integer labels map back to names
    labels = {v: k for k, v in dataset.labels.items()}

    for slide_idx, slide_group in dataset_group.groupby('slide_idx'):
        print(f'{dataset_name}, {slide_idx}: {len(slide_group)} patches')

        # file name prefix, computed once per slide: the relative slide path
        # with its last four characters (the extension) removed and '/' -> '-'
        slide_path = dataset.get_slide_path(slide_idx)
        rel_slide_path = dataset.to_rel_path(slide_path)
        slide_name_str = str(rel_slide_path)[:-4].replace('/', '-')

        # ensure the output directory exists for every label on this slide
        for code in slide_group['label'].unique():
            (output_dir / labels[code]).mkdir(parents=True, exist_ok=True)

        with dataset.open_slide(slide_idx) as slide:
            for row in slide_group.itertuples():
                # read in the region
                # NOTE(review): row.height is never passed; presumably
                # Region.make takes a single (square) size - confirm.
                region = Region.make(row.x, row.y, row.width, row.level)
                image = slide.read_region(region)

                # get the patch label as a string
                label = labels[row.label]

                # write out the patch
                # NOTE(review): cv2.imwrite treats colour arrays as BGR; if
                # read_region returns RGB(A) data the channels are saved
                # swapped - confirm what read_region yields.
                patch_filename = f"{slide_name_str}-{row.x}-{row.y}.png"
                image_path = output_dir / label / patch_filename
                # imwrite signals failure through its return value only
                if not cv2.imwrite(str(image_path), np.array(image)):
                    raise IOError(f"could not write patch image {image_path}")

camelyon16.training_small, 0 {1289}
camelyon16.training_small, 1 {194}
camelyon16.training_small, 2 {2253}
camelyon16.training_small, 3 {1120}
camelyon16.training_small, 4 {337}
camelyon16.training_small, 5 {2862}
camelyon16.training_small, 6 {3043}
camelyon16.training_small, 7 {195}
camelyon16.training_small, 8 {85}
camelyon16.training_small, 9 {622}
