## Set-up

In [1]:
# Mount Google Drive at /content/drive so the project files stored there are
# reachable through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
import os

# Absolute path to the project on the mounted Drive. A path relative to the
# current directory only resolves while the working directory is still
# /content, so re-running this cell after the first chdir would fail.
os.chdir('/content/drive/MyDrive/Colab Notebooks/histopathology-cancer-classification')

# Make the project's local modules importable. Guarded so that executing the
# cell more than once does not stack duplicate entries on sys.path.
if 'src' not in sys.path:
    sys.path.append('src')

In [3]:
!ls

 data
 deep-set-and-gnn.ipynb
 deep-set-end-to-end.ipynb
 deep-set-from-extracted-features.ipynb
 deep-set.ipynb
 deep-set-multiple.ipynb
 feature_extraction.ipynb
 feature_extraction_train.ipynb
 Graph_MIL.ipynb
 model_checkpoint.pt
 README.md
'Sean-testing-Copy of graph-neural-network.ipynb'
 src


In [4]:
# Pinned to the version this notebook was last run with; %pip installs into the kernel's environment.
%pip install -q torchinfo==0.0.9

Collecting torchinfo
  Downloading https://files.pythonhosted.org/packages/58/de/c2bb79cd6bb57e63feed501943cf55a197f25ae25f883ec19901843e466b/torchinfo-0.0.9-py3-none-any.whl
Installing collected packages: torchinfo
Successfully installed torchinfo-0.0.9


In [5]:
import time
import copy
import pickle
import math
import random
import shutil
from PIL import Image
from pathlib import Path
from collections import Counter

import pandas as pd
import numpy as np
import seaborn as sns
from tqdm.notebook import tqdm
from matplotlib import pylab as plt
from sklearn.model_selection import train_test_split

import torch
import torch.nn.functional as F
from torch import optim, nn
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
from torchinfo import summary

## Utility

In [6]:
def get_device():
    """
    Choose the compute device, report it on stdout and return it.

    @return: torch.device for CUDA when torch.cuda.is_available() is true,
             otherwise the CPU device
    """
    if torch.cuda.is_available():
        backend = 'cuda'
    else:
        backend = 'cpu'
    selected = torch.device(backend)
    print("Current device: {:s}".format(selected.type))
    return selected

In [7]:
def set_seed(seed):
    """
    Seed the random number generators used in this notebook: Python's
    `random`, NumPy's global generator and PyTorch (including CUDA devices).

    @param seed: Seed value handed unchanged to every generator
    """
    seeders = (
        random.seed,                 # Python standard library
        np.random.seed,              # NumPy global state
        torch.manual_seed,           # PyTorch default generator
        torch.cuda.manual_seed_all,  # all CUDA devices
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [8]:
def get_subdirs(dir, name_only=True):
    """
    List the immediate sub-directories of a directory.

    @param dir: Directory to scan (str or Path)
    @param name_only: If True return bare directory names, otherwise the
                      path strings formed from `dir` and each name
    @return: List of strings in sorted order
    """
    # iterdir() yields entries in arbitrary, filesystem-dependent order; sort
    # so the result (and anything indexed by it) is reproducible across runs.
    subdirs = sorted(d for d in Path(dir).iterdir() if d.is_dir())
    if name_only:
        # Path.name is separator-agnostic, unlike slicing the string on '/'.
        return [d.name for d in subdirs]
    return [str(d) for d in subdirs]

In [9]:
def get_files(dir, name_only=True):
    """
    List the regular files directly inside a directory.

    @param dir: Directory to scan (str or Path)
    @param name_only: If True return file names with the directory prefix and
                      the last extension removed, otherwise full path strings
    @return: List of strings in sorted order
    """
    # iterdir() order is arbitrary; sort for a reproducible listing.
    files = sorted(f for f in Path(dir).iterdir() if f.is_file())
    if name_only:
        # Path.stem strips only a genuine trailing suffix. Slicing with
        # f[:f.rfind('.')] would cut the last character off any name that
        # contains no dot, because rfind returns -1 in that case.
        return [f.stem for f in files]
    return [str(f) for f in files]

## Global

In [10]:
# Seed passed to set_seed() before feature extraction is run.
seed = 647

In [11]:
# Channels, width and height used to build the input size for the model summary.
img_c, img_w, img_h = 3, 256, 256

In [12]:
# Per-channel (mean, std) pair, unpacked into transforms.Normalize below.
norm = (
    [0.485, 0.456, 0.406],
    [0.229, 0.224, 0.225],
)

# One preprocessing pipeline per split, keyed by split name.
data_transforms = {}

# Training split: augmentations are currently disabled, so only tensor
# conversion and normalisation are applied.
data_transforms['train'] = transforms.Compose([
    # transforms.RandomResizedCrop(size=256, scale=(0.95, 1.05), ratio=(1.0, 1.0)),
    # transforms.RandomAffine(degrees=5, translate=(0.05, 0.05)),
    # transforms.RandomVerticalFlip(),
    # transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(*norm),
])

# Dev split: tensor conversion and normalisation only.
data_transforms['dev'] = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(*norm),
])

In [13]:
# Root directory of each data split, given relative to the working directory.
data_roots = dict([
    ('train', 'data/train'),
    ('dev', 'data/dev'),
])

In [14]:
# Directory names per split; presumably one directory per class.
data_dirs = {}
data_dirs['train'] = ['LUAD_256', 'LUSC_256', 'MESO_256']
data_dirs['dev'] = ['LUAD_DEV_256', 'LUSC_DEV_256', 'MESO_DEV_256']

In [15]:
# Resolve the compute device once; the model and the input tensors are moved to it later.
device = get_device()

Current device: cuda


## Dataset

In [16]:
class HistoDataset(Dataset):
    """
    Dataset over the WSI folders (presumably whole-slide images) found in one
    directory.

    Layout read by this class: <root>/<dir>/<wsi>/<tile files>. Each item is
    one WSI folder with all of its tile files loaded and preprocessed.
    """

    def __init__(self, root, dir, preprocess):
        """
        @param root: Root directory
        @param dir: Parent directory
        @param preprocess: Image transformations. Called with an open PIL image;
                           it must read the pixel data before returning (as
                           ToTensor does) because the file is closed right after
        """
        self.root = root # data/train or data/dev
        self.dir = dir # x_256
        self.wsis = get_subdirs("{}/{}".format(root, dir), name_only=True)
        self.preprocess = preprocess

    def __len__(self):
        """Number of WSI folders found under <root>/<dir>."""
        return len(self.wsis)

    def __getitem__(self, index):
        """
        Load and preprocess every tile of the WSI at the given position.

        @param index: Position in self.wsis
        @return: Tuple (wsi, files, image_tensors): the folder name, the tile
                 file names without their directory, and the preprocessed
                 tiles in the same order as the file names
        @raise FileExistsError: If the WSI folder contains no files
        """
        wsi = self.wsis[index]
        wsi_dir = "{}/{}/{}".format(self.root, self.dir, wsi)
        # Sorted so tiles come back in a reproducible order.
        paths = sorted(get_files(dir=wsi_dir, name_only=False))
        if len(paths) == 0:
            # Exception type kept as-is so existing handlers keep working;
            # the message now says which folder was empty.
            raise FileExistsError("No tile files found in '{}'".format(wsi_dir))
        image_tensors = []
        for path in paths:
            # Open, preprocess and close one tile at a time. Opening every
            # file up front keeps all handles alive at once and can exhaust
            # the process's file-descriptor limit on folders with many tiles.
            with Image.open(path) as img:
                image_tensors.append(self.preprocess(img))
        files = [Path(p).name for p in paths] # exclude prefix
        return wsi, files, image_tensors

## Feature Extractor

In [17]:
class FeatureExtractor(nn.Module):
    """
    Pre-trained DenseNet-121 backbone that turns an image batch into one
    feature vector per image via global average pooling.
    """

    def __init__(self, requires_grad):
        """
        @param requires_grad: Value assigned to requires_grad on every
                              parameter of the backbone (False freezes it)
        """
        super(FeatureExtractor, self).__init__()
        # Download pre-trained Densenet121
        base_model = models.densenet121(pretrained=True)
        # Keep every child module except the last one (the classifier in
        # torchvision's DenseNet). The children stay wrapped in an outer
        # Sequential so the module nesting and parameter names are unchanged.
        self.features = nn.Sequential(*list(base_model.children())[:-1])
        # Set grad
        for param in self.parameters():
            param.requires_grad = requires_grad

    def forward(self, x):
        """
        @param x: Image batch; NOTE(review): presumably (N, 3, H, W) - confirm
        @return: Tensor of shape (N, C), where C is the number of channels the
                 backbone outputs
        """
        # Run through sequential layers
        x = self.features(x)
        # Global average pooling: the kernel spans the whole spatial extent
        x = F.avg_pool2d(x, kernel_size=x.size()[2:])
        # Drop the 1x1 spatial dimensions -> (N, C)
        x = x.view(-1, x.size()[1])
        # ReLU
        # NOTE(review): torchvision's DenseNet.forward applies ReLU before
        # pooling; here it is applied after. Left as-is because features
        # already extracted depend on this order - confirm it is intended.
        x = torch.relu(x)
        return x

In [18]:
# Frozen feature extractor, moved to the selected device and put in eval mode.
model_fe = FeatureExtractor(requires_grad=False).to(device).eval()

Downloading: "https://download.pytorch.org/models/densenet121-a639ec97.pth" to /root/.cache/torch/hub/checkpoints/densenet121-a639ec97.pth


HBox(children=(FloatProgress(value=0.0, max=32342954.0), HTML(value='')))




In [19]:
# PyTorch image batches are laid out (N, C, H, W), so height precedes width.
summary(model_fe, input_size=(1, img_c, img_h, img_w))

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [1, 1024, 8, 8]           --
|    └─Sequential: 2-1                   [1, 1024, 8, 8]           --
|    |    └─Conv2d: 3-1                  [1, 64, 128, 128]         (9,408)
|    |    └─BatchNorm2d: 3-2             [1, 64, 128, 128]         (128)
|    |    └─ReLU: 3-3                    [1, 64, 128, 128]         --
|    |    └─MaxPool2d: 3-4               [1, 64, 64, 64]           --
|    |    └─_DenseBlock: 3-5             [1, 256, 64, 64]          (335,040)
|    |    └─_Transition: 3-6             [1, 128, 32, 32]          (33,280)
|    |    └─_DenseBlock: 3-7             [1, 512, 32, 32]          (919,680)
|    |    └─_Transition: 3-8             [1, 256, 16, 16]          (132,096)
|    |    └─_DenseBlock: 3-9             [1, 1024, 16, 16]         (2,837,760)
|    |    └─_Transition: 3-10            [1, 512, 8, 8]            (526,336)
|    |    └─_DenseBlock: 3-11     

## Extract Features

In [20]:
def extract_features(dataset):
    """
    Run every tile of every WSI in `dataset` through the global `model_fe`
    and pickle the resulting feature tensors.

    The output file is <dataset.root>/<dataset.dir>_features_avgpool.pickle
    and holds a dict {wsi: {tile_file: feature_tensor}} with CPU tensors.

    @param dataset: Sized iterable yielding (wsi, files, image_tensors) and
                    exposing `root` and `dir` attributes (e.g. HistoDataset)
    """
    # All outputs
    all_outputs = {}
    # Inference only: no_grad keeps autograd from recording a graph even if
    # the model was built with trainable parameters.
    with torch.no_grad(), tqdm(total=len(dataset)) as pbar:
        # Iterate through each WSI
        for wsi, files, image_tensors in dataset:
            # Outputs for this WSI
            outputs = {}
            # Iterate through each tensor
            for f, tensor in zip(files, image_tensors):
                # Add a batch dimension of 1 and move the tile to the device
                batch = tensor.unsqueeze(0).to(device)
                # Strip the batch dimension again and keep the result on the CPU
                outputs[f] = model_fe(batch).squeeze(0).detach().cpu()
            # Add to all outputs
            all_outputs[wsi] = outputs
            # Update progress bar
            pbar.update(1)
    # Save
    with open("{}/{}_features_avgpool.pickle".format(dataset.root, dataset.dir), 'wb') as out_file:
        pickle.dump(all_outputs, out_file, pickle.HIGHEST_PROTOCOL)

In [21]:
# Tiles processed in this run: the MESO_256 directory under data/train.
dataset = HistoDataset(root='data/train', dir='MESO_256', preprocess=data_transforms['train'])

In [22]:
# Seed the random number generators first so the run starts from a known state.
set_seed(seed)

# Extract features for every WSI in `dataset` and write them to a pickle file.
extract_features(dataset)

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))


