In [1]:
# Make sure relative paths are resolved from the project folder.

from pathlib import Path
import os

print('Starting path:' + os.getcwd())

# When the working directory does not end with 'VESUVIUS_Challenge',
# move one level up to its parent directory.
if not os.getcwd().endswith('VESUVIUS_Challenge'):
    PATH = Path().resolve().parents[0]
    os.chdir(PATH)

# Check the printed path: it is expected to be the VESUVIUS_Challenge folder.
print('Current path:' + os.getcwd())

In [2]:
import torch
import torch.utils.data as data
from torch.utils.data import ConcatDataset, DataLoader, Dataset, ConcatDataset
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import glob
import PIL.Image as Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from tqdm import tqdm
from ipywidgets import interact, fixed
from torchvision import transforms
import torchvision.models as models

# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torchvision converter from a tensor / ndarray to a PIL image.
transform = transforms.ToPILImage()

# Dataset Modules

### CONFIGS

In [3]:

# Root folder of the competition data, relative to the working directory.
# Scrolls_Dataset joins it as f"{PATH}/train/<fragment>/..." when loading files.
# This assignment replaces any PATH value set by the path-setup cell.
PATH = 'kaggle/input/vesuvius-challenge/'
# Same device selection as in the imports cell, repeated here.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Image sizes noted for the three scrolls (axis order is not stated here):
# scroll_1 size = 8181, 6330
# scroll_2 size = 14830, 9506
# scroll_3 size = 7606, 5249

### Base_Dataset class 
- Due to multiprocessing issues in IPython, the class is imported with `from Data_Modules.Base_Dataset import Base_Dataset`; the definition below is shown for reference.

class Base_Dataset(Dataset):
    """Map-style dataset yielding one sub-volume and its ink label per pixel.

    Every sample is centred on one ``(y, x)`` entry of ``pixels``: a square
    window of ``2 * buffer + 1`` pixels per side is cut out of ``image_stack``
    across its whole first axis, and the value of ``label`` at the centre
    pixel is returned with it.
    """

    def __init__(self, image_stack, label, pixels, buffer, z_dim):
        # image_stack is indexed as [slice, y, x] and label as [y, x] below.
        self.image_stack = image_stack
        self.label = label
        # Sequence of (y, x) centre coordinates, one per sample.
        self.pixels = pixels
        # Half-size of the window around each centre pixel.
        self.buffer = buffer
        # Number of slices the returned sub-volume is reshaped to.
        self.z_dim = z_dim

    def __len__(self):
        """Return the number of centre pixels, i.e. the number of samples."""
        return len(self.pixels)

    def __getitem__(self, index: int):
        """Return ``(subvolume, inklabel)`` for the ``index``-th centre pixel.

        ``subvolume`` has shape ``(1, z_dim, 2 * buffer + 1, 2 * buffer + 1)``
        and ``inklabel`` has shape ``(1,)``.
        """
        row, col = self.pixels[index]
        side = self.buffer * 2 + 1
        rows = slice(row - self.buffer, row + self.buffer + 1)
        cols = slice(col - self.buffer, col + self.buffer + 1)
        window = self.image_stack[:, rows, cols]
        # Add a leading axis of size 1 in front of the slice axis.
        subvolume = window.view(1, self.z_dim, side, side)
        inklabel = self.label[row, col].view(1)
        return subvolume, inklabel
    



### Scrolls_Dataset wrapper

In [4]:
from Data_Modules.Base_Dataset import Base_Dataset
class Scrolls_Dataset(pl.LightningDataModule):
    """Data module that turns scroll fragments into per-pixel sub-volume datasets.

    For every requested fragment the selected surface-volume slices, the ink
    labels and the fragment mask are loaded and resized to ``shared_height``.
    The fragments are then concatenated along the width axis, and the masked
    pixels are split into a training set and a validation set (the pixels
    inside ``validation_rect``). Samples are produced by ``Base_Dataset``.

    Args:
        buffer: Half-size of a patch; a sample spans ``2 * buffer + 1`` pixels
            per side. Centres closer than ``buffer`` to the image border are
            never used.
        z_start: Index of the first slice taken from the sorted ``*.tif`` list.
        z_dim: Number of consecutive slices to load.
        validation_rect: ``(x, y, width, height)`` of the validation rectangle,
            in pixels of the resized, concatenated image. Both end
            coordinates are inclusive.
        shared_height: Height in pixels every image is resized to (the aspect
            ratio is kept).
        downsampling: Stored on the instance; not used by this class.
        scroll_fragments: Fragment ids; each id is a sub-folder of ``train``.
        stage: ``'train'`` or ``'test'`` (case-insensitive).
        shuffle: Whether the training dataloader shuffles its samples.
        batch_size: Batch size of every dataloader.
        num_workers: Number of worker processes of every dataloader.
        on_gpu: Passed to the dataloaders as ``pin_memory``.
    """

    def __init__(self,
                 buffer = 30,
                 z_start = 27,
                 z_dim = 10,
                 validation_rect = (1100, 3500, 700, 950),
                 shared_height = 8000,
                 downsampling = None,
                 scroll_fragments = [1,2,3],
                 stage = 'train',
                 shuffle = True,
                 batch_size = 8,
                 num_workers = 4,
                 on_gpu = False,
                 ):
        # Let LightningDataModule set up its own internal state first.
        super().__init__()

        self.buffer = buffer
        self.z_start = z_start
        self.z_dim = z_dim
        self.validation_rect = validation_rect
        self.shared_height = shared_height
        self.downsampling = downsampling
        # Copy so the shared default list can never be modified through an instance.
        self.scroll_fragments = list(scroll_fragments)
        self.stage = stage
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.on_gpu = on_gpu

    def prepare_data(self, *args, **kwargs):
        """Load the images for ``self.stage`` and build the datasets.

        In the ``'train'`` stage this sets ``self.mask``, ``self.data_train``
        and ``self.data_val``. The ``'test'`` stage is not finished yet and
        only collects the slice paths.

        Raises:
            FileNotFoundError: If no slice file is found for a fragment.
        """
        stage = self.stage.lower()

        if stage == 'train':
            images, labels, masks = [], [], []
            # Results are appended in fragment order, so any subset of ids
            # (e.g. [2, 3]) works, not only ids that start at 1.
            for fragment in self.scroll_fragments:
                pattern = f"{PATH}/{'train'}/{fragment}/surface_volume/*.tif"
                z_slices = sorted(glob.glob(pattern))[self.z_start:self.z_start + self.z_dim]
                if not z_slices:
                    raise FileNotFoundError(
                        f"No slices found for pattern '{pattern}' "
                        f"(z_start={self.z_start}, z_dim={self.z_dim})"
                    )
                images.append(self.load_slices(z_slices))
                labels.append(self.load_labels('train', fragment))
                masks.append(self.load_mask('train', fragment))

            # Concatenate the fragments side by side (along the width axis);
            # this works because every image shares the same height.
            images_tensors = torch.cat(images, dim=-1)
            label_tensors = torch.cat(labels, dim=-1)
            mask_tensors = np.concatenate(masks, axis=-1)
            # Drop the per-fragment copies; the concatenated arrays replace them.
            del images, labels, masks

            # Obtain the centre pixels of the training and validation samples.
            train_pixels, val_pixels = self.split_train_val(mask_tensors)
            self.mask = mask_tensors

            # Both datasets share the same image and label tensors and differ
            # only in the list of centre pixels.
            self.data_train = Base_Dataset(image_stack=images_tensors, label=label_tensors,
                                           pixels=train_pixels, buffer=self.buffer, z_dim=self.z_dim)
            self.data_val = Base_Dataset(image_stack=images_tensors, label=label_tensors,
                                         pixels=val_pixels, buffer=self.buffer, z_dim=self.z_dim)

        # TODO: finish the same for test, note paths are different
        elif stage == 'test':
            # Collect the slice paths of the two test fragments.
            z_slices = [[], []]
            for i, l in enumerate(['a', 'b']):
                z_slices[i] = sorted(glob.glob(f"{PATH}/{'test'}/{l}/surface_volume/*.tif"))[self.z_start:self.z_start + self.z_dim]

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        """Build the dataloader over the training samples.

        Samples are shuffled according to the ``shuffle`` constructor
        argument (``True`` by default).
        """
        return DataLoader(
            self.data_train,
            shuffle=self.shuffle,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=self.on_gpu,
        )

    def val_dataloader(self, *args, **kwargs):
        """Build the (unshuffled) dataloader over the validation samples."""
        return DataLoader(
            self.data_val,
            shuffle=False,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=self.on_gpu,
        )

    def test_dataloader(self, *args, **kwargs):
        """Build the (unshuffled) dataloader over the test samples.

        Raises:
            RuntimeError: If no test dataset has been built; the ``'test'``
                stage of ``prepare_data`` does not create one yet.
        """
        data_test = getattr(self, 'data_test', None)
        if data_test is None:
            raise RuntimeError(
                "No test dataset available: the 'test' stage of prepare_data() "
                "is not implemented yet."
            )
        # No custom collate function: the class does not define one, and the
        # train / validation loaders use the default collation as well.
        return DataLoader(
            data_test,
            shuffle=False,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=self.on_gpu,
        )

    def load_slices(self, z_slices_fnames):
        """Load the given slice files into one tensor on ``DEVICE``.

        Every slice is resized to ``shared_height`` and divided by 65535.0.
        The result is stacked along a new first axis, one entry per file.
        """
        images = []
        for filename in tqdm(z_slices_fnames):
            # resize() returns a new image, so the file can be closed right away.
            with Image.open(filename) as img:
                resized = self.resize(img)
            images.append(np.array(resized, dtype="float32") / 65535.0)
        return torch.stack([torch.from_numpy(image) for image in images], dim=0).to(DEVICE)

    def load_mask(self, split, index):
        """Load ``mask.png`` of a fragment as a resized 1-bit image array."""
        with Image.open(f"{PATH}/{split}/{index}/mask.png") as img:
            resized = self.resize(img.convert('1'))
        return np.array(resized)

    def load_labels(self, split, index):
        """Load ``inklabels.png`` of a fragment as a 0/1 float tensor on ``DEVICE``."""
        with Image.open(f"{PATH}/{split}/{index}/inklabels.png") as img:
            resized = self.resize(img)
        # Every value above zero counts as ink.
        return torch.from_numpy(np.array(resized)).gt(0).float().to(DEVICE)

    def resize(self, img):
        """Resize a PIL image to ``shared_height``, keeping its aspect ratio."""
        current_width, current_height = img.size
        aspect_ratio = current_width / current_height
        new_width = int(self.shared_height * aspect_ratio)
        return img.resize((new_width, self.shared_height))

    def split_train_val(self, mask):
        """Split the usable pixels into training and validation centres.

        A pixel is usable when it is set in ``mask`` and lies at least
        ``buffer`` pixels away from every image border. Usable pixels inside
        ``validation_rect`` become validation centres, all others training
        centres.

        Returns:
            ``(train_pixels, val_pixels)``, two arrays of ``(y, x)`` rows.
        """
        rect = self.validation_rect
        x0, y0, width, height = rect[0], rect[1], rect[2], rect[3]
        mask = np.asarray(mask, dtype=bool)

        # Pixels far enough from the border to cut a full patch around them.
        usable = np.zeros(mask.shape, dtype=bool)
        usable[self.buffer:mask.shape[0] - self.buffer, self.buffer:mask.shape[1] - self.buffer] = True
        usable &= mask

        in_rect = np.zeros(mask.shape, dtype=bool)
        in_rect[y0:y0 + height + 1, x0:x0 + width + 1] = True

        # The validation set is restricted to usable pixels as well; before,
        # every pixel of the rectangle was returned, masked or not.
        pixels_inside_rect = np.argwhere(usable & in_rect)
        pixels_outside_rect = np.argwhere(usable & ~in_rect)
        return pixels_outside_rect, pixels_inside_rect


In [5]:
# Initiating Dataset with parameters

# buffer =   -- half-size of the x,y training patch; each patch is (2 * buffer + 1) pixels per side
# z_start =  -- offset of the first slice in the z direction
# z_dim =    -- number of slices in the z direction. Max value is (64 - z_start)
# validation_rect =  -- rectangle (x, y, width, height) removed from the training set and used for validation
# shared_height = -- height all scrolls are resized to
# scroll_fragments = -- scrolls to be used

# Data module for this run: 63 x 63 patches (buffer = 31) over 24 slices starting at slice 20.
dataset = Scrolls_Dataset(
    buffer=31,
    z_start=20,
    z_dim=24,
    validation_rect=(1100, 3500, 700, 950),
    shared_height=5000,
    downsampling=None,
    scroll_fragments=[1, 2, 3],
    stage='train',
    shuffle=True,
    batch_size=8,
    num_workers=4,
    on_gpu=False,
)



In [6]:
# Prepare the data: load and resize the slices, labels and masks, then build
# the train / validation datasets. The dataloaders below need this call,
# because it is what creates dataset.data_train and dataset.data_val.
dataset.prepare_data()

24it [00:06,  3.82it/s]
24it [00:13,  1.74it/s]
24it [00:04,  4.95it/s]


### Dataloaders

In [29]:
# Pull one batch from the training loader and keep it for the experiments below.
dataloader = iter(dataset.train_dataloader())
train_tensor, train_label = None, None
for i in range(1):
    # Each next() call yields the following (subvolume, inklabel) batch;
    # raise the range above to step further through the data.
    subvolume, inklabel = next(dataloader)
    train_tensor, train_label = subvolume, inklabel
    print('subvolume shape:',subvolume.shape)
    print('inklabel shape:',inklabel.shape)
    

subvolume shape: torch.Size([8, 1, 24, 63, 63])
inklabel shape: torch.Size([8, 1])


### MODEL

In [8]:
# three backbone options (also try U-NET and V-Net)
# Active choice: torchvision's r3d_18 video model, built without pretrained weights.
# NOTE(review): newer torchvision releases replace `pretrained` with `weights` — confirm against the installed version.
r3d_18 = models.video.r3d_18(pretrained=False)

# Alternatives from the same torchvision module, currently disabled:
#r2plus1d_18 = models.video.r2plus1d_18(pretrained=False)
#mc3_18 = models.video.mc3_18(pretrained=False)



In [9]:
import math
from typing import Union
import json
import torch
import torch.nn as nn
import torchvision.models
from torch import Tensor
from Models.positional_encoding import PositionalEncoding1D, PositionalEncoding2D



# Default hyper-parameters of ResNetTransformer (used as its keyword defaults).
TF_DIM = 256    # default embedding_dim (the old note said 128; the value in use is 256)
TF_FC_DIM = 512 # default decoder_fc, the decoder feed-forward width (the old note said 256; the value in use is 512)
TF_DROPOUT = 0.4 # default decoder_dropout
TF_LAYERS = 6   # default decoder_layers
TF_NHEAD = 8    # default decoder_heads
RESNET_DIM = 512  # input channels of the bottleneck convolution; hard-coded




# TODO: Pass parameters from the dataset

# self.d_model = TF_DIM
# self.max_output_len = dataset.max_label_length
# dim_feedforward = TF_FC_DIM


class ResNetTransformer(nn.Module):
    """3D ResNet-18 encoder followed by a Transformer decoder.

    The architecture is adapted from an image-to-TeX OCR model. The encoder
    is torchvision's ``r3d_18`` (stem plus all four residual layers) followed
    by a 1x1x1 convolution that maps ``RESNET_DIM`` channels to
    ``embedding_dim``. The decoder is an ``nn.TransformerDecoder`` with a
    linear layer producing ``num_classes`` logits per output position.

    NOTE(review): ``decode`` calls ``self.word_positional_encoder`` and
    ``predict`` reads ``self.pad_index``, ``self.sos_index`` and
    ``self.eos_index``. None of these is created by this class, and the
    visible code does not say how they should be built, so they must be
    provided before ``forward`` / ``predict`` can run.

    Args:
        num_classes: Number of output classes (size of the label vocabulary).
        max_label_length: Maximum length of the decoded output sequence.
        embedding_dim: Width of the decoder and of the encoded features.
        decoder_heads: Number of attention heads per decoder layer.
        decoder_layers: Number of decoder layers.
        decoder_dropout: Dropout used inside the decoder layers.
        decoder_fc: Width of the decoder feed-forward network.
    """

    def __init__(
        self,
        num_classes = 2,
        max_label_length = 1,
        embedding_dim = TF_DIM,
        decoder_heads = TF_NHEAD,
        decoder_layers = TF_LAYERS,
        decoder_dropout = TF_DROPOUT,
        decoder_fc = TF_FC_DIM,
    ) -> None:

        super().__init__()

        self.embedding_dim = embedding_dim
        self.max_output_len = max_label_length
        self.num_classes = num_classes

        ### Encoder ###
        r3d_18 = torchvision.models.video.r3d_18(pretrained=False)
        self.backbone = nn.Sequential(
            r3d_18.stem,
            r3d_18.layer1,
            r3d_18.layer2,
            r3d_18.layer3,
            r3d_18.layer4,
        )
        self.bottleneck = nn.Conv3d(RESNET_DIM, self.embedding_dim, 1)  # in channels, out channels, kernel size

        ### Decoder ###
        # Label embedding used by decode() and _init_weights(); it was
        # referenced there but never created, so construction failed.
        self.embedding = nn.Embedding(self.num_classes, self.embedding_dim)

        self.y_mask = generate_square_subsequent_mask(self.max_output_len)

        transformer_decoder_layer = nn.TransformerDecoderLayer(self.embedding_dim, decoder_heads, decoder_fc, decoder_dropout)
        self.transformer_decoder = nn.TransformerDecoder(transformer_decoder_layer, decoder_layers)
        self.fc = nn.Linear(self.embedding_dim, self.num_classes)

        # It is empirically important to initialize weights properly
        self._init_weights()

    def _init_weights(self) -> None:
        """Initialize the embedding, the output layer and the bottleneck."""
        init_range = 0.1
        self.embedding.weight.data.uniform_(-init_range, init_range)
        self.fc.bias.data.zero_()
        self.fc.weight.data.uniform_(-init_range, init_range)

        nn.init.kaiming_normal_(
            self.bottleneck.weight.data,
            a=0,
            mode="fan_out",
            nonlinearity="relu",
        )
        if self.bottleneck.bias is not None:
            _, fan_out = nn.init._calculate_fan_in_and_fan_out(self.bottleneck.weight.data)
            bound = 1 / math.sqrt(fan_out)
            # (-bound, bound) is a range, so draw uniformly from it. The
            # previous normal_(-bound, bound) call treated the two values as
            # mean and standard deviation and gave the bias a negative mean.
            nn.init.uniform_(self.bottleneck.bias, -bound, bound)

    def forward(self, x: Tensor, y: Tensor) -> Tensor:
        """Forward pass.

        Args:
            x: (B, C, D, H, W) input volumes; C is 1 or 3
            y: (B, Sy) with elements in (0, num_classes - 1)

        Returns:
            (B, num_classes, Sy) logits
        """
        encoded_x = self.encode(x)  # (Sx, B, E)
        output = self.decode(y, encoded_x)  # (Sy, B, num_classes)
        output = output.permute(1, 2, 0)  # (B, num_classes, Sy)
        return output

    def encode(self, x: Tensor) -> Tensor:
        """Encode input volumes into a sequence of feature vectors.

        Args:
            x: (B, C, D, H, W) input volumes; C is 1 or 3

        Returns:
            (Sx, B, E) with Sx the number of positions left after the backbone
        """
        # The ResNet expects 3 channels but the training volumes have one
        if x.shape[1] == 1:   # (B, 1, z_dim, r, r) where r = buffer * 2 + 1
            x = x.repeat(1, 3, 1, 1, 1)
        # (B, RESNET_DIM, D', H', W'); the notebook's own experiment maps a
        # (24, 63, 63) volume to (3, 4, 4)
        x = self.backbone(x.float())

        x = self.bottleneck(x)  # (B, E, D', H', W'); E := embedding dim

        # x = x * math.sqrt(self.embedding_dim)   # This prevented any learning

        x = x.flatten(start_dim=2)  # (B, E, D' * H' * W')
        x = x.permute(2, 0, 1)  # (Sx, B, E); Sx = D' * H' * W'
        return x

    def decode(self, y: Tensor, encoded_x: Tensor) -> Tensor:
        """Decode encoded inputs with teacher-forcing.

        Args:
            encoded_x: (Sx, B, E)
            y: (B, Sy) with elements in (0, num_classes - 1)

        Returns:
            (Sy, B, num_classes) logits
        """
        y = y.permute(1, 0)  # (Sy, B)
        y = self.embedding(y) * math.sqrt(self.embedding_dim)  # (Sy, B, E)
        # NOTE(review): word_positional_encoder is not created in __init__.
        y = self.word_positional_encoder(y)  # (Sy, B, E)
        Sy = y.shape[0]
        y_mask = self.y_mask[:Sy, :Sy].type_as(encoded_x)  # (Sy, Sy)
        output = self.transformer_decoder(y, encoded_x, y_mask)  # (Sy, B, E)
        output = self.fc(output)  # (Sy, B, num_classes)
        return output

    def predict(self, x: Tensor) -> Tensor:
        """Make predictions at inference time.

        Args:
            x: (B, C, D, H, W). Input volumes.

        Returns:
            (B, max_output_len) with elements in (0, num_classes - 1).
        """
        B = x.shape[0]
        S = self.max_output_len

        encoded_x = self.encode(x)  # (Sx, B, E)

        # NOTE(review): pad_index, sos_index and eos_index are not set in __init__.
        output_indices = torch.full(size=(B, S), fill_value=self.pad_index).type_as(x).long()
        output_indices[:, 0] = self.sos_index
        # One flag per sequence, set once it has produced the end token. Built
        # as a boolean directly: the old `.to(torch.bool)` result was discarded.
        has_ended = torch.zeros(B, dtype=torch.bool, device=output_indices.device)

        for Sy in range(1, S):
            y = output_indices[:, :Sy]  # (B, Sy)
            logits = self.decode(y, encoded_x)  # (Sy, B, num_classes)
            # Select the token with the highest conditional probability
            output = torch.argmax(logits, dim=-1)  # (Sy, B)
            # Keep only the newest token; output[-1] is (B,) for every batch size
            output_indices[:, Sy] = output[-1]

            # Early stopping of prediction loop to speed up prediction
            has_ended |= output_indices[:, Sy] == self.eos_index
            if torch.all(has_ended):
                break

        # Set all tokens after end token to be padding
        eos_positions = find_first(output_indices, self.eos_index)
        for i in range(B):
            j = int(eos_positions[i].item()) + 1
            output_indices[i, j:] = self.pad_index

        return output_indices



def generate_square_subsequent_mask(size: int) -> Tensor:
    """Build the (size, size) causal attention mask.

    Entry ``[i, j]`` is ``0.0`` for ``j <= i`` and ``-inf`` for ``j > i``, so
    position ``i`` can only attend to itself and to earlier positions.
    """
    blocked = torch.full((size, size), float("-inf"), dtype=torch.float32)
    # Keep -inf strictly above the diagonal; triu sets everything else to zero.
    return torch.triu(blocked, diagonal=1)


def find_first(x: Tensor, element: Union[int, float], dim: int = 1) -> Tensor:
    """Locate the first occurrence of ``element`` along ``dim``.

    Args:
        x: Tensor to search.
        element: Value to look for.
        dim: Dimension that is reduced by the search.

    Returns:
        Index of the first occurrence of ``element`` along ``dim``. Where the
        element does not occur, the length of ``x`` along ``dim`` is returned.

    Usage:
        >>> find_first(torch.tensor([[1, 2, 3], [2, 3, 3], [1, 1, 1]]), 3)
        tensor([2, 1, 3])

    Reference:
        https://discuss.pytorch.org/t/first-nonzero-index/24769/9

        Unlike the referenced snippet, an element sitting at index 0 is
        reported as 0 rather than as "not found".
    """
    hits = x == element
    # A position is the first hit when it is a hit and the running hit count is 1.
    is_first_hit = hits & (hits.cumsum(dim) == 1)
    found, indices = is_first_hit.max(dim)
    # Without any hit the arg-max is 0; replace it by the length along dim.
    not_found = ~found
    indices[not_found & (indices == 0)] = x.shape[dim]
    return indices






ModuleNotFoundError: No module named 'Models.positional_encoding'

In [11]:
import torch.nn as nn

In [13]:
# Stand-alone encoder for shape experiments: only the stem and the four
# residual layers of r3d_18, chained in order.
backbone = nn.Sequential(
    *(getattr(r3d_18, part) for part in ("stem", "layer1", "layer2", "layer3", "layer4"))
)
# 1x1x1 convolution reducing the 512 backbone channels to 128.
bottleneck = nn.Conv3d(in_channels=512, out_channels=128, kernel_size=1)

In [93]:
train_tensor.shape

torch.Size([8, 3, 24, 63, 63])

torch.Size([8, 1, 24, 63, 63])

In [92]:
# The backbone is fed 3 channels, so a single-channel batch is tiled 3 times
# along the channel axis before the forward pass.
if train_tensor.size(1) == 1:
    train_tensor = train_tensor.repeat(1, 3, 1, 1, 1)
output = backbone(train_tensor)

In [94]:
# Shape notes for the backbone (the batch is tiled to 3 channels before the call).
# Recorded in this notebook: [8, 3, 24, 63, 63] goes to [8, 512, 3, 4, 4].
# Earlier notes taken with 10 slices (not re-checked here):
# [8, 1, 10, H, W] goes to [8, RESNET_DIM of 512, 2, roughly H/16, roughly W/16]
# [8, 1, 10, 65, 65] goes to [8, 512, 2, 5, 5]
# [8, 1, 10, 63, 63] goes to [8, 512, 2, 5, 5]  # NOTE(review): the recorded 24-slice run gives 4 x 4 for 63 x 63 — confirm
# [8, 1, 10, 61, 61]  goes to [8, 512, 2, 4, 4]
output.shape

torch.Size([8, 512, 3, 4, 4])

In [95]:
out = bottleneck(output)

In [96]:
out.shape

torch.Size([8, 128, 3, 4, 4])

In [100]:
out =out.flatten(start_dim=1)

In [101]:
out.shape

torch.Size([8, 6144])

In [102]:
# Two-class linear head. Its input width is read from the flattened features
# (128 * 3 * 4 * 4 = 6144 for the batch above) instead of being hard-coded,
# so the cell keeps working when buffer, z_dim or the bottleneck width change.
fc = nn.Linear(out.shape[1], 2)

In [103]:
res = fc(out)

In [104]:
res.shape

torch.Size([8, 2])

In [107]:
train_label.squeeze(1).long()

tensor([0, 1, 0, 0, 0, 0, 0, 0])

In [108]:
train_label.squeeze(1).shape

torch.Size([8])

In [109]:
loss_fn = nn.CrossEntropyLoss()


In [110]:
los = loss_fn(res, train_label.squeeze(1).long())

In [112]:
los

tensor(0.6174, grad_fn=<NllLossBackward0>)

In [86]:
# Example of target with class indices
# Stand-alone CrossEntropyLoss demo: scores of shape (N, C) against N class indices.
# NOTE(review): `input` shadows the Python builtin, and `output` reuses the
# name that holds the backbone features in another cell.
import torch
import torch.nn as nn

input = torch.rand(3, 5)  # (N=3, C=5) random scores
target = torch.empty(3, dtype = torch.long).random_(5)  # 3 class indices drawn from 0..4
print(target.shape)

loss = nn.CrossEntropyLoss()
output = loss(input, target)  # a single loss value for the whole batch

print('input: ', input)
print('target: ', target)
print('Cross Entropy Loss: ', output)

torch.Size([3])
input:  tensor([[0.1593, 0.3356, 0.4385, 0.7986, 0.1470],
        [0.5411, 0.6084, 0.8018, 0.6210, 0.1916],
        [0.5296, 0.5167, 0.5870, 0.0325, 0.8659]])
target:  tensor([2, 3, 1])
Cross Entropy Loss:  tensor(1.5901)


In [89]:
target.shape

torch.Size([3])

In [88]:
input.shape

torch.Size([3, 5])