Loading annotations exported from Label Studio as a json file.

In [3]:
import json

with open('../data/sample/sample.json') as f:
    data = json.load(f)

In [6]:
len(data)

35

35 images were annotated.

In [9]:
data[0].keys()

dict_keys(['id', 'annotations', 'predictions', 'file_upload', 'data', 'meta', 'created_at', 'updated_at', 'project'])

An annotation for a single image is a dictionary. Everything which is interesting for us is in the `annotations` value.

In [47]:
data[0]['annotations'][0].keys()

dict_keys(['id', 'completed_by', 'result', 'was_cancelled', 'ground_truth', 'created_at', 'updated_at', 'lead_time', 'prediction', 'result_count', 'task'])

And specifically in the `result` value inside.

Below functions to format annotations before implementing the `Dataset` class.

In [121]:
from typing import Dict

def format_image_annotations(image_annotations: Dict):
    """
    Formats the annotations for a single image.
    """
    words = []
    boxes = []
    labels = []
    
    for annotation in image_annotations['result']:
        value = annotation['value']
        if 'rectanglelabels' not in value.keys():
            continue
        # x, y, width, height are already normalized and in 0-100
        # For LayoutLMv2 we want them in 0-1000
        x = value['x']
        y = value['y']
        width = value['width']
        height = value['height']
        rotation = value['rotation']

        # ignoring the rotation parameter for now
        #  [x1, y1, x3, y3] format
        x1 = 10 * x
        y1 = 10 * (100 - y - height)
        x3 = 10 * (x + width)
        y3 = 10 * (100 - y)

        boxes.append([x1, y1, x3, y3])
        words.append(annotation['meta']['text'][0])
        labels.append(value['rectanglelabels'][0])
    
    return words, labels, boxes

In [122]:
from typing import List

def format_annotations(annotations: List):
    """
    Format all annotations.
    """
    words = []
    boxes = []
    labels = []
    
    for image_full_annotations in annotations:
        image_annotations = image_full_annotations['annotations'][0]
        image_words, image_labels, image_boxes = format_image_annotations(image_annotations)
        words.append(image_words)
        boxes.append(image_boxes)
        labels.append(image_labels)
    
    return words, labels, boxes

We format all annotations using the `format_annotations` function.

In [123]:
formatted_annotations = format_annotations(data)

Now we implement the `Dataset` class.

In [124]:
def get_image_paths(annotations: List):
    """
    Gets image paths from annotations.
    """
    image_paths = []
    for image_annotations in annotations:
        image_paths.append(image_annotations['data']['image'])
    return image_paths

image_paths = get_image_paths(data)

In [125]:
image_names = [path[21:] for path in image_paths]

In [126]:
image_paths = ['../data/sample/' + name for name in image_names]
image_paths

['../data/sample/2022-02-02_repas_les jardins de presbourg.jpg',
 '../data/sample/2022-01-25_repas_la cantine de belleville.jpg',
 '../data/sample/2022-02-15_achat_la poste.jpg',
 '../data/sample/2022-03-28_repas_la comete.jpg',
 '../data/sample/20221108_230713.jpg',
 '../data/sample/20221019_091203.jpg',
 '../data/sample/20221030_123439.jpg',
 '../data/sample/2022-03-22_achat_la poste.jpg',
 '../data/sample/20221108_230510.jpg',
 '../data/sample/20221105_133703.jpg',
 '../data/sample/20221019_091208.jpg',
 '../data/sample/20221108_230524.jpg',
 '../data/sample/2022-03-10_achat_la poste.jpg',
 '../data/sample/IMG_20221107_211244.jpg',
 '../data/sample/2022-03-06_achat_darty.jpg',
 '../data/sample/20221108_230653.jpg',
 '../data/sample/20221030_123424.jpg',
 '../data/sample/2022-03-06_achat_le divan.jpg',
 '../data/sample/20221117_120342.jpg',
 '../data/sample/20221108_230535.jpg',
 '../data/sample/20221018_185931.jpg',
 '../data/sample/20221105_133551.jpg',
 '../data/sample/20221105_22

In [127]:
all_labels = [item for sublist in formatted_annotations[1] for item in sublist]
labels = list(set(all_labels))
label2id = {label: idx for idx, label in enumerate(labels)}

label2id

{'date': 0,
 'total_price': 1,
 'item_name': 2,
 'item_total_price': 3,
 'code_tva': 4,
 'item_quantity': 5,
 'item_unit_price': 6,
 'taux_tva': 7,
 'magasin': 8}

In [132]:
from os import listdir
from torch.utils.data import Dataset
import torch
from PIL import Image

class TicketsDataset(Dataset):
    """Tickets dataset."""

    def __init__(self, annotations, image_paths: List[str], processor=None, max_length=512):
        """
        Args:
            annotations (List[List]): List of lists containing the word-level annotations (words, labels, boxes).
            image_paths (string): Directory with all the document images.
            processor (LayoutLMv2Processor): Processor to prepare the text + image.
        """
        self.words, self.labels, self.boxes = annotations
        self.image_paths = image_paths
        self.processor = processor

    def __len__(self):
        return len(self.image_file_names)

    def __getitem__(self, idx):
        # first, take an image
        path = self.image_paths[idx]
        image = Image.open(path).convert("RGB")

        # get word-level annotations 
        words = self.words[idx]
        boxes = self.boxes[idx]
        word_labels = self.labels[idx]

        assert len(words) == len(boxes) == len(word_labels)

        word_labels = [label2id[label] for label in word_labels]
        # use processor to prepare everything
        encoded_inputs = self.processor(image, words, boxes=boxes, word_labels=word_labels, 
                                        padding="max_length", truncation=True, 
                                        return_tensors="pt")
        
        # remove batch dimension
        for k,v in encoded_inputs.items():
            encoded_inputs[k] = v.squeeze()

        assert encoded_inputs.input_ids.shape == torch.Size([512])
        assert encoded_inputs.attention_mask.shape == torch.Size([512])
        assert encoded_inputs.token_type_ids.shape == torch.Size([512])
        assert encoded_inputs.bbox.shape == torch.Size([512, 4])
        assert encoded_inputs.image.shape == torch.Size([3, 224, 224])
        assert encoded_inputs.labels.shape == torch.Size([512]) 
      
        return encoded_inputs

In [117]:
!pip install transformers



In [133]:
from transformers import LayoutLMv2Processor

processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
train_dataset = TicketsDataset(annotations=formatted_annotations,
                            image_paths=image_paths, 
                            processor=processor)

In [134]:
train_dataset

<__main__.TicketsDataset at 0x7fe66838c820>

On vérifie un exemple.

In [135]:
encoding = train_dataset[0]
encoding.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'labels', 'image'])