<a href="https://colab.research.google.com/github/gabrielmelo00/BenchmarkDonutTransformer/blob/main/DonutEncoderOnlyForClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q git+https://github.com/huggingface/transformers.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [3]:
!pip install -q datasets sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 wh

# Dataset

Load the dataset created by Niels Rogge [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Donut/RVL-CDIP/Preparing_an_image_classification_dataset_for_Donut.ipynb). This is a small subset of the entire  RVL-CDIP.

In [4]:
from datasets import load_dataset

dataset = load_dataset("nielsr/rvl_cdip_10_examples_per_class_donut")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading metadata:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/160 [00:00<?, ? examples/s]

In [5]:
id2label = {id: label for id, label in enumerate(dataset['train'].features['label'].names)}
print(id2label)

{0: 'letter', 1: 'form', 2: 'email', 3: 'handwritten', 4: 'advertisement', 5: 'scientific report', 6: 'scientific publication', 7: 'specification', 8: 'file folder', 9: 'news article', 10: 'budget', 11: 'invoice', 12: 'presentation', 13: 'questionnaire', 14: 'resume', 15: 'memo'}


# Model

Here is the most important part, we created the model for the encoder only donut with classification head in the end. Basically, the encoder will work as feature extraction step. Its pooled output will pass through a dropout and a classifier layer, which maps from number of features to number of classes.

In [6]:
from transformers import DonutSwinModel, DonutSwinPreTrainedModel
from torch import nn
import torch

class DonutForImageClassification(DonutSwinPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.swin = DonutSwinModel(config)
        self.dropout = nn.Dropout(0.5)
        self.classifier = nn.Linear(self.swin.num_features, config.num_labels)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        outputs = self.swin(pixel_values)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [20]:
from transformers import VisionEncoderDecoderConfig, DonutProcessor, VisionEncoderDecoderModel
image_size = [1280, 960]
config = VisionEncoderDecoderConfig.from_pretrained("nielsr/donut-base")
config.encoder.image_size = image_size

processor = DonutProcessor.from_pretrained("nielsr/donut-base")
donut_model = VisionEncoderDecoderModel.from_pretrained("nielsr/donut-base", config=config)

processor.feature_extractor.size = image_size[::-1] # should be (width, height)
processor.feature_extractor.do_align_long_axis = False

donut_model.encoder.save_pretrained("donut_encoder")

In [21]:
model = DonutForImageClassification.from_pretrained("donut_encoder", num_labels=16)

Some weights of DonutForImageClassification were not initialized from the model checkpoint at donut_encoder and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Now we can prepare the dataset to be loaded using PyTorch

In [22]:
import os
from PIL import Image
from torch.utils.data import Dataset

class DocumentDataset(Dataset):
    def __init__(self, dataset_name_or_path, split):
        self.split = split
        self.dataset = load_dataset(dataset_name_or_path, split=self.split)
        print(self.dataset)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample = self.dataset[idx]

        pixel_values = processor(sample["image"].convert("RGB"), random_padding=self.split == "train", return_tensors="pt").pixel_values
        pixel_values = pixel_values.squeeze()
        label = sample['label']

        encoding = dict(pixel_values=pixel_values,
                        label=label)
        return encoding

In [23]:
train_dataset = DocumentDataset("nielsr/rvl_cdip_10_examples_per_class_donut", split="train")

Dataset({
    features: ['image', 'label', 'ground_truth'],
    num_rows: 160
})


In [24]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

In [25]:
batch = next(iter(train_dataloader))
print(batch.keys())

dict_keys(['pixel_values', 'label'])


# Training

In [26]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

DonutForImageClassification(
  (swin): DonutSwinModel(
    (embeddings): DonutSwinEmbeddings(
      (patch_embeddings): DonutSwinPatchEmbeddings(
        (projection): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): DonutSwinEncoder(
      (layers): ModuleList(
        (0): DonutSwinStage(
          (blocks): ModuleList(
            (0-1): 2 x DonutSwinLayer(
              (layernorm_before): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
              (attention): DonutSwinAttention(
                (self): DonutSwinSelfAttention(
                  (query): Linear(in_features=128, out_features=128, bias=True)
                  (key): Linear(in_features=128, out_features=128, bias=True)
                  (value): Linear(in_features=128, out_features=128, bias=True)
                  (dropout): Dropout(p=0.0, inplace=False)
            

To be able to benchmark it later, we saved the model after 5, 10, 15, 20 epochs trained. In this way, we have more detail on the convergence and learning of the model.

In [27]:
import torch
from tqdm.auto import tqdm


optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

for epoch in range(20):
    print("Epoch:", epoch+1)
    model.train()
    for i, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        pixel_values = batch["pixel_values"]
        labels = batch["label"]
        optimizer.zero_grad()
        outputs = model(pixel_values=pixel_values)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print("Loss:", loss.item())

    if epoch + 1 in {5, 10, 15, 20}:
        checkpoint_path = f"model_epoch_{epoch+1}.pth"
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Model saved at {checkpoint_path}")

Epoch: 1


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 3.70231556892395
Loss: 3.024982452392578
Epoch: 2


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 2.541522264480591
Loss: 2.860440969467163
Epoch: 3


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 2.630465269088745
Loss: 2.1937198638916016
Epoch: 4


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 2.7742600440979004
Loss: 0.9532672166824341
Epoch: 5


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 0.4238550662994385
Loss: 4.296530246734619
Model saved at model_epoch_5.pth
Epoch: 6


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 3.739987373352051
Loss: 0.6943356990814209
Epoch: 7


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 1.7329750061035156
Loss: 1.1023378372192383
Epoch: 8


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 1.6882001161575317
Loss: 0.9208144545555115
Epoch: 9


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 2.4631640911102295
Loss: 0.4820936322212219
Epoch: 10


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 0.005439955275505781
Loss: 1.6626732349395752
Model saved at model_epoch_10.pth
Epoch: 11


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 0.2733348309993744
Loss: 1.2818022966384888
Epoch: 12


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 0.03281886503100395
Loss: 0.05230311304330826
Epoch: 13


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 0.001836162875406444
Loss: 0.2656080424785614
Epoch: 14


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 0.12347044050693512
Loss: 3.2924883365631104
Epoch: 15


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 0.09693533182144165
Loss: 2.4978222846984863
Model saved at model_epoch_15.pth
Epoch: 16


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 0.12904950976371765
Loss: 0.14737243950366974
Epoch: 17


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 0.05589354410767555
Loss: 0.07741816341876984
Epoch: 18


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 1.6614376306533813
Loss: 0.007626701612025499
Epoch: 19


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 0.0004589696181938052
Loss: 1.743046522140503
Epoch: 20


  0%|          | 0/160 [00:00<?, ?it/s]

Loss: 0.1126399114727974
Loss: 2.5018186569213867
Model saved at model_epoch_20.pth


# Evaluate and Benchmark

Here we simply run the same cell 4 times for each model saved, getting their average inference time and accuracy.

In [31]:
# Evaluation loop
import time
import numpy as np

model.load_state_dict(torch.load('/content/model_epoch_20.pth'))
model.eval()
correct = 0
total = 0
dataset = load_dataset("nielsr/rvl_cdip_10_examples_per_class_donut", split="test")
inference_time = []

with torch.no_grad():
    for sample in dataset:
        pixel_values = processor(sample["image"].convert("RGB"), return_tensors="pt").pixel_values
        pixel_values = pixel_values.to(device)
        labels = sample["label"]
        start_time = time.time()
        outputs = model(pixel_values)
        end_time = time.time()
        _, predicted = torch.max(outputs.data, 1)
        total += 1
        correct += (predicted == labels).sum().item()
        inference_time.append(end_time - start_time)


accuracy = 100 * correct / total
print(f'Accuracy: {accuracy}%')
print(f'Predicted {correct} correctly out of {total}!')
print(f"Average Inference time: {np.mean(inference_time):.6f} seconds")

Accuracy: 51.875%
Predicted 83 correctly out of 160!
Average Inference time: 0.029348 seconds
