# Dataset Preprocessing
In this notebook we are going to preprocess the slides to generate our sets of patches for training, validation, and testing.

In [1]:
from pathgen.utils.seeds import set_seed

# Fixed seed for the whole notebook, passed to set_seed below.
# NOTE(review): set_seed presumably seeds the random number generators used by
# pathgen so that runs are repeatable — confirm in pathgen.utils.seeds.
global_seed = 13371
set_seed(global_seed)

In [2]:
from pathgen.utils.paths import project_root

# Directory for this experiment: <project_root()>/experiments/<experiment_name>.
# The slide indexes built in this notebook are saved under it.
experiment_name = "paper"
experiment_root = project_root() / "experiments" / experiment_name

## 1. Audit the dataset
Let's find out how many patches there are in each slide in the Camelyon16 dataset, so we can select from them in a principled way.

In [3]:
import pathgen.data.datasets.camelyon16 as camelyon16

# Create the Camelyon16 training and testing sets used throughout the notebook.
# NOTE(review): presumably each entry describes one slide — confirm in
# pathgen.data.datasets.camelyon16.
train = camelyon16.training()
test = camelyon16.testing()

# Report the size of each set as a slide count.
print(f'Training slides: {len(train)}')
print(f'Testing slides: {len(test)}')

Training slides: 270
Testing slides: 129


In [4]:
from pathgen.preprocess.patching import make_index
from pathgen.preprocess.tissue_detection import TissueDetectorOTSU
from pathgen.preprocess.patching import GridPatchFinder

# Components handed to make_index for each slide set.
# NOTE(review): TissueDetectorOTSU presumably separates tissue from background
# with Otsu thresholding — confirm in pathgen.preprocess.tissue_detection.
tissue_detector = TissueDetectorOTSU()
# NOTE(review): the positional arguments are unnamed here; presumably
# (labels_level=6, patch_level=0, patch_size=256, stride=256), i.e.
# non-overlapping 256x256 patches — confirm against GridPatchFinder's signature.
patch_finder = GridPatchFinder(6, 0, 256, 256)

# Build the patch index for the training set; the captured output shows one
# "indexing <slide>" line per slide processed.
train_index = make_index(train, tissue_detector, patch_finder)

indexing tumor_001.tif
indexing tumor_002.tif
indexing tumor_003.tif
indexing tumor_004.tif
indexing tumor_005.tif
indexing tumor_006.tif
indexing tumor_007.tif
indexing tumor_008.tif
indexing tumor_009.tif
indexing tumor_010.tif
indexing tumor_011.tif
indexing tumor_012.tif
indexing tumor_013.tif
indexing tumor_014.tif
indexing tumor_015.tif
indexing tumor_016.tif
indexing tumor_017.tif
indexing tumor_018.tif
indexing tumor_019.tif
indexing tumor_020.tif
indexing tumor_021.tif
indexing tumor_022.tif
indexing tumor_023.tif
indexing tumor_024.tif
indexing tumor_025.tif
indexing tumor_026.tif
indexing tumor_027.tif
indexing tumor_028.tif
indexing tumor_029.tif
indexing tumor_030.tif
indexing tumor_031.tif
indexing tumor_032.tif
indexing tumor_033.tif
indexing tumor_034.tif
indexing tumor_035.tif
indexing tumor_036.tif
indexing tumor_037.tif
indexing tumor_038.tif
indexing tumor_039.tif
indexing tumor_040.tif
indexing tumor_041.tif
indexing tumor_042.tif
indexing tumor_043.tif
indexing tu

In [5]:
# Index the testing set with the same tissue detector and patch finder that
# were used for the training set, so both indexes are built the same way.
test_index = make_index(test, tissue_detector, patch_finder)

indexing test_001.tif
indexing test_002.tif
indexing test_003.tif
indexing test_004.tif
indexing test_005.tif
indexing test_006.tif
indexing test_007.tif
indexing test_008.tif
indexing test_009.tif
indexing test_010.tif
indexing test_011.tif
indexing test_012.tif
indexing test_013.tif
indexing test_014.tif
indexing test_015.tif
indexing test_016.tif
indexing test_017.tif
indexing test_018.tif
indexing test_019.tif
indexing test_020.tif
indexing test_021.tif
indexing test_022.tif
indexing test_023.tif
indexing test_024.tif
indexing test_025.tif
indexing test_026.tif
indexing test_027.tif
indexing test_028.tif
indexing test_029.tif
indexing test_030.tif
indexing test_031.tif
indexing test_032.tif
indexing test_033.tif
indexing test_034.tif
indexing test_035.tif
indexing test_036.tif
indexing test_037.tif
indexing test_038.tif
indexing test_039.tif
indexing test_040.tif
indexing test_041.tif
indexing test_042.tif
indexing test_043.tif
indexing test_044.tif
indexing test_045.tif
indexing t

In [6]:
def print_totals(index):
    """Print the total normal and tumor patch counts recorded in an index.

    Args:
        index: Object whose ``summary()`` returns a table that can be indexed
            by the column names ``'normal'`` and ``'tumor'``, where each
            column supports ``.sum()``.

    Returns:
        None. Two lines are written to standard output.
    """
    counts = index.summary()
    # Both columns are summed (normal first, then tumor) before anything is
    # printed, so a missing column raises without leaving partial output.
    total_normal, total_tumor = (counts[column].sum() for column in ('normal', 'tumor'))
    print(
        f"Total normal patches: {total_normal}",
        f"Total tumor patches: {total_tumor}",
        sep="\n",
    )
    
# Report the normal/tumor patch totals of each index under its own heading,
# with a blank line separating the two reports.
print("Train Index")
print_totals(train_index)
print("")
print("Test Index")
print_totals(test_index)

Train Index
Total normal patches: 7438602
Total tumor patches: 307594

Test Index
Total normal patches: 3410036
Total tumor patches: 244550


In [7]:
# Persist both indexes under <experiment_root>/index, one location per split,
# so they can be reloaded with SlidesIndex.load instead of re-indexing the slides.
# NOTE(review): whether save() creates missing parent directories is not
# visible here — confirm before running against a fresh experiment root.
train_index.save(experiment_root / 'index' / 'train')
test_index.save(experiment_root / 'index' / 'test')

In [9]:
from pathgen.preprocess.patching import SlidesIndex

# Round-trip check: reload the training index that was saved above and display
# its length, which should match the number of training slides (270).
loaded_index = SlidesIndex.load(experiment_root / 'index' / 'train')
len(loaded_index)

270