In [None]:
# default_exp data

# BreaKHis Data Processing

> The BreaKHis dataset is organized by benign and malignant tumors, and then by the specific tumor types. The functions provided here will help to quickly (lazily) process the data for usage in training image classification models, while maintaining the additional information that might not be necessary. For example, if training on benign/malignant labels, the information about which specific tumor is present will still be available in the dataset definition. The data is anonymized, so there's no possibility of splitting at the patient level. Instead, we leave dataset splitting up to the user, but provide some utility functions to reproduce the results obtained in initial development.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import numpy as np
import os
import torch

from PIL import Image
from torchvision import transforms

# Seed the global NumPy and PyTorch RNGs so that random operations (such as the
# `np.random.permutation` used when splitting the dataset) are repeatable.
np.random.seed(31)
torch.manual_seed(31);

For reproducibility, the random seeds for both NumPy and PyTorch are set to 31.

In [None]:
#hide
# Development scratch cell (hidden from the docs): walks the BreaKHis directory
# tree and builds `all_data`, a list of
# ((tumor, subtumor, slide_id, {magnification: file_record}), label) entries.
# NOTE(review): `data_dir` is a machine-specific absolute path; point it at your
# local copy of the dataset before running this cell.
data_dir = '/share/nikola/export/dt372/BreaKHis_v1/'
label = 'tumor_type'
_data_dir = os.path.join(data_dir, 'histology_slides/breast/')
benign_dir = os.path.join(_data_dir, 'benign/SOB')
benign_subtumors = ['adenosis', 'fibroadenoma', 'phyllodes_tumor', 'tubular_adenoma']
benign_subdirs = [('benign', subtumor, os.path.join(benign_dir, subtumor)) for subtumor in benign_subtumors]
malignant_dir = os.path.join(_data_dir, 'malignant/SOB')
malignant_subtumors = ['ductal_carcinoma', 'lobular_carcinoma', 'mucinous_carcinoma', 'papillary_carcinoma']
malignant_subdirs = [('malignant', subtumor, os.path.join(malignant_dir, subtumor)) for subtumor in malignant_subtumors]
flatten = lambda l: [item for sublist in l for item in sublist]
benign_subsubdirs = flatten([
    [(tumor, subtumor, subsubdir, os.path.join(subdir, subsubdir)) for subsubdir in os.listdir(subdir)]
    for tumor, subtumor, subdir in benign_subdirs
])
malignant_subsubdirs = flatten([
    [(tumor, subtumor, subsubdir, os.path.join(subdir, subsubdir)) for subsubdir in os.listdir(subdir)]
    for tumor, subtumor, subdir in malignant_subdirs
])
# Validate *every* slide directory. Iterating over zip(benign, malignant) would
# stop at the shorter list and leave the tail of the longer one unchecked.
for _tumor, _subtumor, _slide_number, _subsubdir in benign_subsubdirs + malignant_subsubdirs:
    assert os.path.isdir(_subsubdir), "%s is not a valid directory" % _subsubdir
magnifications = ['40X', '100X', '200X', '400X']


def _index_slides(subsubdirs):
    """Return one (tumor, subtumor, slide_id, data_mapping) tuple per slide directory.

    `data_mapping` maps sequence_id -> {magnification: file_record}, where
    file_record is (path, tumor, subtumor, magnification, slide_id, sequence_id).
    """
    indexed = []
    for tumor, subtumor, slide_number, subsubdir in subsubdirs:
        data_mapping = {}
        # The slide ID is the last '-'-separated token of the last '_'-separated token.
        slide_id = slide_number.split('_')[-1].split('-')[-1]
        for magnification in magnifications:
            magnification_subsubdir = os.path.join(subsubdir, magnification)
            for slide in os.listdir(magnification_subsubdir):
                # The sequence ID is the trailing '-'-separated token of the file stem.
                sequence_id = int(slide.split('.')[0].split('-')[-1])
                data_mapping.setdefault(sequence_id, {})[magnification] = (
                    os.path.join(magnification_subsubdir, slide),
                    tumor, subtumor, magnification, slide_id, sequence_id
                )
        indexed.append((tumor, subtumor, slide_id, data_mapping))
    return indexed


benign_data = _index_slides(benign_subsubdirs)
malignant_data = _index_slides(malignant_subsubdirs)

all_data = []
total = 0
for tumor, subtumor, slide_id, data_mapping in benign_data + malignant_data:
    for cell_id, file_mapping in data_mapping.items():
        total += len(file_mapping)
        all_data.append((
            (tumor, subtumor, slide_id, file_mapping),
            subtumor if label == 'tumor_type' else tumor
        ))
# 7909 is the image count this notebook expects for the complete dataset.
assert total == 7909, "Some images might be missing."

In [None]:
#export
class BreaKHisDataset(torch.utils.data.Dataset):
    """PyTorch dataset definition of the BreaKHis dataset.

    Construction of the dataset objects should be done using this class's
    method `initialize` (the historical misspelling `initalize` is kept as a
    backward-compatible alias). Simply providing the data directory where the
    data was downloaded is sufficient.

    Each element of `self.dataset` is a pair `(file_record, label)` where
    `file_record` is `(path, tumor, subtumor, magnification, slide_id, sequence_id)`
    and `label` is the integer class index taken from `label_mapping`.
    """

    label_mapping = {
        'tumor_class': {'benign': 0, 'malignant': 1},
        'tumor_type': {
            subtumor: i for i, subtumor in enumerate([
                'adenosis', 'fibroadenoma', 'phyllodes_tumor', 'tubular_adenoma',
                'ductal_carcinoma', 'lobular_carcinoma', 'mucinous_carcinoma', 'papillary_carcinoma'
            ])
        }
    }
    # Inverse of `label_mapping`: integer index -> human-readable name.
    index_mapping = {
        k: {iv: kv for (kv, iv) in v.items()} for (k, v) in label_mapping.items()
    }

    # Magnification sub-directories expected under every slide directory.
    _magnifications = ('40X', '100X', '200X', '400X')
    # Tumor-type sub-directories expected under each tumor-class directory.
    _subtumors = {
        'benign': ('adenosis', 'fibroadenoma', 'phyllodes_tumor', 'tubular_adenoma'),
        'malignant': ('ductal_carcinoma', 'lobular_carcinoma', 'mucinous_carcinoma', 'papillary_carcinoma'),
    }
    # Number of images `initialize` expects to find in a complete download.
    _expected_image_count = 7909
    # Tolerance absorbing float round-off when turning cumulative split
    # fractions into integer indices (e.g. (0.7 + 0.2) * 10 == 8.999999999999998).
    _boundary_eps = 1e-9

    def __init__(self, dataset, transform=None):
        self.dataset = dataset
        self.transform = transform

    def __getitem__(self, index):
        """Load the image at `index` and return `(data, label)`.

        `data` is the transformed image when a transform was supplied and the
        raw PIL image otherwise; `label` is a 0-dim `torch.long` tensor.
        """
        # Only the file path and the label are needed here; the remaining
        # fields of the record are metadata kept for the user's benefit.
        (fp, tumor, subtumor, magnification, slide_id, sequence_id), label = self.dataset[index]
        # Force the (lazy) decode inside the context manager so the file handle
        # is released deterministically even when no transform touches the image.
        with Image.open(fp) as image:
            image.load()
        # The guard must test the *transform*: it is None by default, and
        # calling None would raise a TypeError.
        data = self.transform(image) if self.transform is not None else image
        return data, torch.tensor(label, dtype=torch.long)

    def __len__(self):
        return len(self.dataset)

    @classmethod
    def _split(
        cls, dataset, split={'train': 0.8, 'val': 0.2}, criterion=['tumor_class'],
        split_transforms={'train': None, 'val': None}
    ):
        """ Splits `dataset` according to `split` percentages and `criterion`

        Samples are grouped by the requested criteria and every group is
        shuffled and partitioned independently, so each split receives the
        requested share of every group.

        Arguments:
            `dataset`: list of `((tumor_class, tumor_type, slide_id, {magnification: file_record}), label)`
                entries, as assembled by `initialize`.
            `split` (Dict[str, float]): a mapping of strings to floats corresponding to the percentage
                in each split of the dataset; must add up to 1 (within floating-point tolerance).
            `criterion` (List[str]): at most one of 'tumor_class' (benign/malignant) or
                                     'tumor_type' (e.g. adenosis), and/or 'magnification'
            `split_transforms` (Dict[str, torchvision.transforms]): a mapping of split IDs to the
                corresponding transforms.
        Returns:
            data_partitioned (Dict[str, BreaKHisDataset]): each split ID given in `split` maps to a
                `BreaKHisDataset` holding that split's `(file_record, label)` pairs.
        Raises:
            AssertionError: if the percentages do not sum to 1, if no recognised criterion is
                given, or if both 'tumor_class' and 'tumor_type' are requested.
        """
        # Exact float equality would reject valid inputs such as 0.7 + 0.2 + 0.1.
        assert np.isclose(sum(split.values()), 1.0), 'Please specify proper split percentages to sum to 1.'
        assert len(criterion) > 0, "Must specify at least 1 criterion to split on"
        split_by_tumor_class = 'tumor_class' in criterion
        split_by_tumor_type = 'tumor_type' in criterion
        split_by_magnification = 'magnification' in criterion
        assert not (split_by_tumor_class and split_by_tumor_type), "Please only specify 1 of tumor type and class"
        assert split_by_tumor_class or split_by_tumor_type or split_by_magnification, (
            "Must specify at least 1 criterion to split on"
        )

        # Group samples. Values are either a flat list of samples or, when also
        # splitting by magnification, a dict {magnification: list of samples}.
        data_split = {}
        for (tumor_class, tumor_type, slide_id, slide_data_mapping), label in dataset:
            if split_by_tumor_class or split_by_tumor_type:
                split_key = tumor_class if split_by_tumor_class else tumor_type
                if split_key not in data_split:
                    data_split[split_key] = (
                        {magnification: [] for magnification in cls._magnifications}
                        if split_by_magnification else []
                    )
                for magnification, fp in slide_data_mapping.items():
                    queue = (
                        data_split[split_key][magnification] if split_by_magnification
                        else data_split[split_key]
                    )
                    queue.append((fp, label))
            else:
                # Magnification is the only criterion.
                for magnification, fp in slide_data_mapping.items():
                    data_split.setdefault(magnification, []).append((fp, label))

        last_position = len(split) - 1

        def partition(samples):
            """Shuffle `samples` and cut them into consecutive chunks, one per split ID."""
            n = len(samples)
            permutation = np.random.permutation(range(n))
            partitions = []
            cumulative = 0.0
            start_index = 0
            for position, (split_id, split_percentage) in enumerate(split.items()):
                cumulative += split_percentage
                # The final split always runs to the end so that accumulated
                # float error can never silently drop samples.
                end_index = n if position == last_position else int(cumulative * n + cls._boundary_eps)
                partitions.append(
                    (split_id, [samples[i] for i in permutation[start_index:end_index]])
                )
                start_index = end_index
            return partitions

        # Partition every group (in insertion order) and pool the chunks per split ID.
        data_partitioned = {split_id: [] for split_id in split}
        for data_wrap in data_split.values():
            groups = data_wrap.values() if isinstance(data_wrap, dict) else [data_wrap]
            for samples in groups:
                for split_id, subset in partition(samples):
                    data_partitioned[split_id].extend(subset)

        return {k: cls(data_subset, split_transforms[k]) for k, data_subset in data_partitioned.items()}

    @classmethod
    def _index_tumor_class(cls, tumor, tumor_dir):
        """Index every slide under `tumor_dir` for the tumor class `tumor`.

        Returns a list of `(tumor, subtumor, slide_id, data_mapping)` tuples, one per slide
        directory, where `data_mapping` maps sequence_id -> {magnification: file_record}.
        Directory listings are sorted because `os.listdir` returns entries in arbitrary
        order, which would otherwise make the seeded split differ between machines.
        """
        slides = []
        for subtumor in cls._subtumors[tumor]:
            subdir = os.path.join(tumor_dir, subtumor)
            for slide_number in sorted(os.listdir(subdir)):
                slide_dir = os.path.join(subdir, slide_number)
                assert os.path.isdir(slide_dir), "%s is not a valid directory" % slide_dir
                # The slide ID is the last '-'-separated token of the last '_'-separated token.
                slide_id = slide_number.split('_')[-1].split('-')[-1]
                data_mapping = {}
                for magnification in cls._magnifications:
                    magnification_dir = os.path.join(slide_dir, magnification)
                    for slide in sorted(os.listdir(magnification_dir)):
                        # The sequence ID is the trailing '-'-separated token of the file stem.
                        sequence_id = int(slide.split('.')[0].split('-')[-1])
                        data_mapping.setdefault(sequence_id, {})[magnification] = (
                            os.path.join(magnification_dir, slide),
                            tumor, subtumor, magnification, slide_id, sequence_id
                        )
                slides.append((tumor, subtumor, slide_id, data_mapping))
        return slides

    @classmethod
    def initialize(
        cls, data_dir, label='tumor_class', split={'train': 0.8, 'val': 0.2}, criterion=['tumor_class'],
        split_transforms={'train': None, 'val': None}
    ):
        """Initializes PyTorch dataset objects for the data contained in `data_dir`.

        Arguments:
            `data_dir` (str): the directory where the BreaKHis dataset was downloaded
            `label` (str): the label to use for the dataset (either 'tumor_class' or 'tumor_type')
            `split` (Dict[str, float]): a mapping of strings to floats corresponding to the percentage
                in each split of the dataset; must add up to 1.
            `criterion` (List[str]): at most one of 'tumor_class' (benign/malignant) or
                'tumor_type' (e.g. adenosis), and/or 'magnification'
            `split_transforms` (Dict[str, torchvision.transforms]): a mapping of split IDs to the
                corresponding transforms.
        Returns:
            data_partitioned (Dict[str, BreaKHisDataset]): each split ID given in `split` maps to a
                `BreaKHisDataset` holding the proper percentage of the data.
        Raises:
            AssertionError: on an unknown `label`, a split ID without a transform entry, an
                unexpected non-directory entry among the slide directories, or an image count
                different from the expected total.
        """
        assert label in ['tumor_class', 'tumor_type'], "Please properly specify the label for this dataset."
        for split_id in split:
            assert split_id in split_transforms, """Split ID '%s' is not included in split_transforms""" % split_id

        base_dir = os.path.join(data_dir, 'histology_slides/breast/')
        slides = (
            cls._index_tumor_class('benign', os.path.join(base_dir, 'benign/SOB'))
            + cls._index_tumor_class('malignant', os.path.join(base_dir, 'malignant/SOB'))
        )

        all_data = []
        total = 0
        for tumor, subtumor, slide_id, data_mapping in slides:
            label_index = cls.label_mapping[label][subtumor if label == 'tumor_type' else tumor]
            for file_mapping in data_mapping.values():
                total += len(file_mapping)
                all_data.append(((tumor, subtumor, slide_id, file_mapping), label_index))
        assert total == cls._expected_image_count, "Some images might be missing."

        return cls._split(
            all_data, split=split, criterion=criterion, split_transforms=split_transforms
        )

    @classmethod
    def initalize(
        cls, data_dir, label='tumor_class', split={'train': 0.8, 'val': 0.2}, criterion=['tumor_class'],
        split_transforms={'train': None, 'val': None}
    ):
        """Backward-compatible alias for `initialize` (original, misspelled name)."""
        return cls.initialize(
            data_dir, label=label, split=split, criterion=criterion, split_transforms=split_transforms
        )
        

In [None]:
#export
def initialize_datasets(
    data_dir, label='tumor_class', split={'train': 0.8, 'val': 0.2}, criterion=['tumor_class'],
    split_transforms={'train': None, 'val': None}
):
    """Builds the split `BreaKHisDataset` objects for the data contained in `data_dir`.

    Thin module-level wrapper that forwards every argument unchanged to
    `BreaKHisDataset.initalize`.

    Arguments:
        `data_dir` (str): the directory where the BreaKHis dataset was downloaded
        `label` (str): the label to use for the dataset (either 'tumor_class' or 'tumor_type')
        `split` (Dict[str, float]): mapping of split IDs to the fraction of the data in each split
        `criterion` (List[str]): the criteria to split on ('tumor_class' or 'tumor_type',
            and/or 'magnification')
        `split_transforms` (Dict[str, torchvision.transforms]): mapping of split IDs to transforms
    Returns:
        Dict[str, BreaKHisDataset]: one dataset per split ID given in `split`
        (e.g. `result['train']`, `result['val']`).
    """
    return BreaKHisDataset.initalize(
        data_dir, label=label, criterion=criterion, split=split, split_transforms=split_transforms
    )

In [None]:
#hide
BreaKHisDataset.index_mapping

{'tumor_class': {0: 'benign', 1: 'malignant'},
 'tumor_type': {0: 'adenosis',
  1: 'fibroadenoma',
  2: 'phyllodes_tumor',
  3: 'tubular_adenoma',
  4: 'ductal_carcinoma',
  5: 'lobular_carcinoma',
  6: 'mucinous_carcinoma',
  7: 'papillary_carcinoma'}}

To create the datasets, you only need one function call, `initialize_datasets`. Within this function call:
* You can specify the label type by passing `label` to `initialize_datasets`
    * It must be one of 'tumor_class' or 'tumor_type'
* You can make arbitrary splits of the data (within reason) via the `split` argument, which maps split names to fractions that sum to 1
* You can make sure the data is split equally across various criteria using `criterion`, which can include tumor class or tumor type, and magnification.
    * You cannot split equally by both tumor class and tumor type (an error will be thrown if attempted).
* You can use different transforms for different splits using `split_transforms`.

In [None]:
# Training pipeline: random augmentations, then conversion to a normalized tensor.
# NOTE(review): the mean/std values below look like the commonly used CIFAR-10
# statistics rather than values computed on BreaKHis images — confirm.
train_transform = transforms.Compose([
    transforms.RandomRotation(90),
    transforms.RandomHorizontalFlip(0.8),
    transforms.RandomResizedCrop(224),
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),
                         (0.2023, 0.1994, 0.2010)),
])

# Validation pipeline: no random augmentation, only a center crop followed by
# the same tensor conversion and normalization constants as training.
val_transform = transforms.Compose([
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),
                        (0.2023, 0.1994, 0.2010)),
])

In [None]:
# Build the train/val datasets labelled by tumor type, splitting evenly within
# each (tumor type, magnification) group; the default 80/20 split is used.
# NOTE(review): the absolute path below is machine-specific — replace it with
# the location of your local BreaKHis download.
ds_mapping = initialize_datasets(
    '/share/nikola/export/dt372/BreaKHis_v1/',
    label='tumor_type', criterion=['tumor_type', 'magnification'],
    split_transforms={'train': train_transform, 'val': val_transform}
)

In [None]:
tr_ds, val_ds = ds_mapping['train'], ds_mapping['val']

In [None]:
tr_ds[0]

(tensor([[[-2.4291, -2.4291, -2.4291,  ...,  0.8082,  0.7888,  0.8082],
          [-2.4291, -2.4291, -2.4291,  ...,  0.8082,  0.7888,  0.7888],
          [-2.4291, -2.4291, -2.4291,  ...,  0.8082,  0.8082,  0.7501],
          ...,
          [ 0.6725,  0.6144,  0.2654,  ...,  0.4399,  0.3624,  0.0522],
          [ 0.4981,  0.6338,  0.5174,  ...,  0.3236,  0.3817,  0.1297],
          [ 0.3430,  0.3624,  0.4593,  ...,  0.3624,  0.3042,  0.0522]],
 
         [[-2.4183, -2.4183, -2.4183,  ...,  0.9251,  0.9251,  0.9251],
          [-2.4183, -2.4183, -2.4183,  ...,  0.9251,  0.9054,  0.9251],
          [-2.4183, -2.4183, -2.4183,  ...,  0.9251,  0.9251,  0.9054],
          ...,
          [ 0.2564,  0.2564, -0.0189,  ...,  0.5121,  0.4531,  0.0598],
          [ 0.0598,  0.1778,  0.1581,  ...,  0.3744,  0.4138,  0.1188],
          [-0.1566, -0.0976,  0.0204,  ...,  0.4531,  0.3548,  0.0991]],
 
         [[-2.2214, -2.2214, -2.2214,  ...,  0.8027,  0.8027,  0.8027],
          [-2.2214, -2.2214,

From here, it is very simple to create the dataloaders for use in training.

In [None]:
# Only the training loader requests shuffling; both loaders use batches of 32.
tr_dl = torch.utils.data.DataLoader(tr_ds, batch_size=32, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=32)

In [None]:
x, y = next(iter(tr_dl))
x.shape, y.shape

(torch.Size([32, 3, 224, 224]), torch.Size([32]))

## Appendix

All images in this dataset are captured from an ROI determined by a professional pathologist, so all images are assumed to have *a* tumor.

### Samples

* Samples are generated from breast tissue biopsy slides, stained with hematoxylin and eosin (HE).
* Prepared for histological study and labelled by pathologists of the P&D Lab
* Breast tumor specimens assessed by Immunohistochemistry (IHC)
* Core Needle Biopsy (CNB) and Surgical Open Biopsy (SOB)
* Section of ~3µm thickness

### Image acquisition
* Olympus BX-50 system microscope with a relay lens with magnification of 3.3× coupled to a Samsung digital color camera SCC-131AN
* Magnification 40×, 100×, 200×, and 400× (objective lens 4×, 10×, 20×, and 40× with ocular lens 10×)
* Camera pixel size 6.5 µm
* Raw images without normalization or color standardization
* Resulting images saved in 3-channel RGB, 8-bit depth in each channel, PNG format