In [9]:
from pathlib import Path
import kagglehub

# Download latest version
path = kagglehub.dataset_download("changheonkim/iam-trocr")
path = Path(path)/"IAM"
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'iam-trocr' dataset.
Path to dataset files: /kaggle/input/iam-trocr/IAM


In [10]:

# Assuming 'path' variable holds the base directory from kagglehub.dataset_download
# If not, please replace 'path' with the correct directory string, e.g., '/content/IAM'
if 'path' in globals():
    print(f"Listing directories in: {path}")
    # Use a shell command to list only directories recursively, and sort them
    !ls {path/"image"}


Listing directories in: /kaggle/input/iam-trocr/IAM
c04-110-00.jpg	e06-070-02.jpg	 g07-000b-00.jpg  n02-157-05.jpg
c04-110-01.jpg	e06-070-03.jpg	 g07-000b-01.jpg  n02-157-06.jpg
c04-110-02.jpg	e06-070-04.jpg	 g07-000b-02.jpg  n02-157-07.jpg
c04-110-03.jpg	e06-070-05.jpg	 g07-000b-03.jpg  n02-157-08.jpg
c04-116-00.jpg	e06-070-06.jpg	 g07-000b-04.jpg  n03-038-00.jpg
c04-116-01.jpg	e06-070-07.jpg	 g07-000b-05.jpg  n03-038-01.jpg
c04-116-02.jpg	e06-070-08.jpg	 g07-000b-06.jpg  n03-038-02.jpg
c04-116-03.jpg	e06-070-09.jpg	 g07-000b-07.jpg  n03-038-03.jpg
c04-134-00.jpg	f04-032-00.jpg	 g07-000b-08.jpg  n03-038-04.jpg
c04-134-01.jpg	f04-032-01.jpg	 g07-000b-09.jpg  n03-038-05.jpg
c04-134-02.jpg	f04-032-02.jpg	 g07-079a-00.jpg  n03-038-06.jpg
c04-134-03.jpg	f04-032-03.jpg	 g07-079a-01.jpg  n03-064-00.jpg
c04-134-04.jpg	f04-032-04.jpg	 g07-079a-02.jpg  n03-064-01.jpg
c04-134-05.jpg	f04-032-05.jpg	 g07-079a-03.jpg  n03-064-02.jpg
c04-134-06.jpg	f04-032-06.jpg	 g07-079a-04.jpg  n03-064-03.jpg
c04

In [11]:
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
import torch

# Load the model
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-handwritten")
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-handwritten")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Loading weights:   0%|          | 0/360 [00:00<?, ?it/s]

VisionEncoderDecoderModel LOAD REPORT from: microsoft/trocr-small-handwritten
Key                         | Status  | 
----------------------------+---------+-
encoder.pooler.dense.bias   | MISSING | 
encoder.pooler.dense.weight | MISSING | 

Notes:
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


preprocessor_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

The image processor of type `DeiTImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


tokenizer_config.json:   0%|          | 0.00/327 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

VisionEncoderDecoderModel(
  (encoder): DeiTModel(
    (embeddings): DeiTEmbeddings(
      (patch_embeddings): DeiTPatchEmbeddings(
        (projection): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): DeiTEncoder(
      (layer): ModuleList(
        (0-11): 12 x DeiTLayer(
          (attention): DeiTAttention(
            (attention): DeiTSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
            )
            (output): DeiTSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): DeiTIntermediate(
            (dense): Linear(in_features=384, out_features=1536, bias=True)
        

In [12]:
from torch.utils.data import Dataset
from PIL import Image

class IAMDataset(Dataset):
    def __init__(self, image_dir, annotation_file, processor):
        self.image_dir = image_dir
        self.processor = processor
        self.samples = []

        with open(annotation_file, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                image_name, text = line.split("\t")
                self.samples.append((image_name, text))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        image_name, text = self.samples[idx]

        image = Image.open(self.image_dir / image_name).convert("RGB")

        pixel_values = self.processor(
            images=image,
            return_tensors="pt"
        ).pixel_values.squeeze()

        labels = self.processor.tokenizer(
            text,
            padding="max_length",
            max_length=8,
            truncation=True,
            return_tensors="pt"
        ).input_ids.squeeze()

        return {
            "pixel_values": pixel_values,
            "labels": labels
        }


In [13]:
from torch.utils.data import DataLoader

image_dir = path / "image"
annotation_file = path / "gt_test.txt"

dataset = IAMDataset(image_dir, annotation_file, processor)

train_loader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)


In [14]:
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size



In [15]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)


In [16]:
model.train()

# Demora ~2 minutos por epoch
for epoch in range(3):
    print(f"\nEpoch {epoch+1}")

    for batch in train_loader:
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            pixel_values=pixel_values,
            labels=labels
        )

        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print("Loss:", loss.item())



Epoch 1
Loss: 14.539320945739746
Loss: 10.565622329711914
Loss: 8.013885498046875
Loss: 8.161620140075684
Loss: 8.390067100524902
Loss: 8.364788055419922
Loss: 7.723751544952393
Loss: 7.589291572570801
Loss: 7.676917552947998
Loss: 7.4461798667907715
Loss: 7.866739273071289
Loss: 7.322050094604492
Loss: 7.149470806121826
Loss: 7.846858024597168
Loss: 7.825939655303955
Loss: 7.191524505615234
Loss: 7.207895755767822
Loss: 7.69523811340332
Loss: 6.915716171264648
Loss: 6.353573799133301
Loss: 7.333039283752441
Loss: 6.2736711502075195
Loss: 6.2706298828125
Loss: 6.429291248321533
Loss: 5.2521796226501465
Loss: 6.464487075805664
Loss: 6.830649375915527
Loss: 5.374419212341309
Loss: 5.627019882202148
Loss: 6.083999156951904
Loss: 6.905313968658447
Loss: 6.29903507232666
Loss: 6.425820350646973
Loss: 6.163638114929199
Loss: 5.930673122406006
Loss: 5.908890724182129
Loss: 6.1557464599609375
Loss: 6.370401859283447
Loss: 7.01793098449707
Loss: 5.657199382781982
Loss: 5.0663161277771
Loss: 6.