In [1]:
import numpy as np
import pandas as pd
import os

import torch
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from torch.optim import AdamW
from PIL import Image
from torch.cuda.amp import autocast, GradScaler

import time
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = pd.read_csv("dataset.csv")
for index, row in dataset.iterrows():
    img_path = os.path.join("dataset_creator/images", row["img_name"])
    assert os.path.exists(img_path), f"Image {img_path} not found"

In [3]:
class CodeOCRDataset(Dataset):
    def __init__(self, csv_file, img_dir, processor, max_target_length=256):
        self.data = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.data.iloc[idx]["img_name"])
        text = self.data.iloc[idx]["ground_truth"]
        image = Image.open(img_path).convert("RGB")
        encoding = self.processor(
            images=image,
            text=text,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_target_length,
            truncation=True
        )
        return {
            "pixel_values": encoding["pixel_values"].squeeze(0),
            "labels": encoding["labels"].squeeze(0)
        }

In [4]:
def custom_collate_fn(batch):
    pixel_values = torch.stack([item["pixel_values"] for item in batch])
    labels = torch.stack([item["labels"] for item in batch])
    return {
        "pixel_values": pixel_values,
        "labels": labels
    }

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Load processor and model
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten", do_rescale=False)
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
model.to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transfor

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (i

In [7]:
model.config.decoder_start_token_id = 2
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.bos_token_id = processor.tokenizer.bos_token_id
model.config.eos_token_id = processor.tokenizer.eos_token_id

# Enable gradient checkpointing
model.gradient_checkpointing_enable()

# Move model to device
try:
    model.to(device)
except RuntimeError as e:
    print(f"Error moving model to GPU: {e}")
    print("Falling back to CPU...")
    device = torch.device("cpu")
    model.to(device)

# Compute max_target_length
dataset = pd.read_csv("dataset.csv")
max_length = max(len(processor.tokenizer.encode(text)) for text in dataset["ground_truth"])
print(f"Max token length: {max_length}")
max_target_length = min(max_length + 10, 256)  # Cap at 256 to save memory

# Create train and validation datasets
train_dataset = CodeOCRDataset(
    csv_file="dataset.csv",
    img_dir="dataset_creator/images",
    processor=processor,
    max_target_length=max_target_length
)
indices = np.arange(len(dataset))
np.random.seed(42)
np.random.shuffle(indices)
train_size = int(0.8 * len(dataset))  # 80% train, 20% validation
train_indices, val_indices = indices[:train_size], indices[train_size:]

# Create train and validation loaders
train_loader = DataLoader(
    Subset(train_dataset, train_indices),
    batch_size=2,
    shuffle=True,
    collate_fn=custom_collate_fn
)
val_loader = DataLoader(
    Subset(train_dataset, val_indices),
    batch_size=2,
    shuffle=False,
    collate_fn=custom_collate_fn
)

# Optimizer and scaler
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler() if device.type == "cuda" else None

# Early stopping parameters
patience = 10
best_val_loss = float("inf")
patience_counter = 0
best_model_path = "best_trocr_model"

# Training loop with early stopping
for epoch in range(200):
    # Training
    model.train()
    total_train_loss = 0
    train_pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/200 [Train]')
    epoch_start_time = time.time()

    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        if device.type == "cuda":
            with autocast():
                outputs = model(**batch)
                loss = outputs.loss
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
        total_train_loss += loss.item()
        train_pbar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    avg_train_loss = total_train_loss / len(train_loader)

    # Validation
    model.eval()
    total_val_loss = 0
    val_pbar = tqdm(val_loader, desc=f'Epoch {epoch+1}/200 [Val]')

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            if device.type == "cuda":
                with autocast():
                    outputs = model(**batch)
                    loss = outputs.loss
            else:
                outputs = model(**batch)
                loss = outputs.loss
            total_val_loss += loss.item()
            val_pbar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    avg_val_loss = total_val_loss / len(val_loader)
    epoch_time = time.time() - epoch_start_time

    print(f"\nEpoch {epoch+1}/200:")
    print(f"  Time: {epoch_time:.2f}s")
    print(f"  Train Loss: {avg_train_loss:.4f}")
    print(f"  Val Loss: {avg_val_loss:.4f}")
    print()

    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # Save best model
        model.save_pretrained(best_model_path)
        processor.save_pretrained(best_model_path)
        print(f"Saved best model at epoch {epoch} with Val Loss: {best_val_loss:.4f}")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping triggered after {patience} epochs with no improvement.")
            break

# Load best model for final use
model = VisionEncoderDecoderModel.from_pretrained(best_model_path)
processor = TrOCRProcessor.from_pretrained(best_model_path)
model.to(device)

  scaler = GradScaler() if device.type == "cuda" else None


Max token length: 221


  with autocast():
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
  with autocast():



Epoch 1/200:
  Time: 274.22s
  Train Loss: 6.0668
  Val Loss: 2.8992

Saved best model at epoch 0 with Val Loss: 2.8992



Epoch 1/200 [Train]:   0%|          | 0/29 [04:44<?, ?it/s, loss=3.2235]
  with autocast():

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Epoch 1/200 [Val]:   0%|          | 0/8 [04:46<?, ?it/s, loss=1.6566]
  with autocast():
Epoch 2/200 [Val]:   0%|          | 0/8 [00:15<?, ?it/s, loss=1.9153]


Epoch 2/200:
  Time: 276.41s
  Train Loss: 3.0242
  Val Loss: 2.9612



Epoch 2/200 [Train]:   0%|          | 0/29 [04:36<?, ?it/s, loss=5.0331]

Epoch 2/200 [Val]:   0%|          | 0/8 [04:36<?, ?it/s, loss=1.9153]

[A
[A
[A
[A
[A
[A
[A
[A


Epoch 3/200:
  Time: 276.73s
  Train Loss: 2.8920
  Val Loss: 2.6332

Saved best model at epoch 2 with Val Loss: 2.6332


Epoch 3/200 [Train]:   0%|          | 0/29 [04:53<?, ?it/s, loss=3.8731]
  with autocast():
Epoch 3/200 [Val]:   0%|          | 0/8 [04:53<?, ?it/s, loss=1.5165]27]
  with autocast():



Epoch 4/200:
  Time: 276.36s
  Train Loss: 2.7592
  Val Loss: 2.5812

Saved best model at epoch 3 with Val Loss: 2.5812



Epoch 4/200 [Train]:   0%|          | 0/29 [04:45<?, ?it/s, loss=2.6127]
  with autocast():

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Epoch 4/200 [Val]:   0%|          | 0/8 [04:45<?, ?it/s, loss=1.5003]
  with autocast():
Epoch 5/200 [Val]:   0%|          | 0/8 [00:16<?, ?it/s, loss=1.6370]


Epoch 5/200:
  Time: 276.31s
  Train Loss: 2.6712
  Val Loss: 2.6634



Epoch 5/200 [Train]:   0%|          | 0/29 [04:36<?, ?it/s, loss=1.9615]

Epoch 5/200 [Val]:   0%|          | 0/8 [04:36<?, ?it/s, loss=1.6370]

[A
[A
[A
[A
[A
[A
[A
[A


Epoch 6/200:
  Time: 276.21s
  Train Loss: 2.5785
  Val Loss: 2.5987



Epoch 6/200 [Train]:   0%|          | 0/29 [04:36<?, ?it/s, loss=1.5413]
Epoch 6/200 [Val]:   0%|          | 0/8 [04:36<?, ?it/s, loss=1.5349]69]



Epoch 7/200:
  Time: 276.19s
  Train Loss: 2.5603
  Val Loss: 2.5588

Saved best model at epoch 6 with Val Loss: 2.5588



Epoch 7/200 [Train]:   0%|          | 0/29 [04:44<?, ?it/s, loss=1.7669]
  with autocast():

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Epoch 7/200 [Val]:   0%|          | 0/8 [04:44<?, ?it/s, loss=1.4998]
  with autocast():
Epoch 8/200 [Val]:   0%|          | 0/8 [00:15<?, ?it/s, loss=1.5819]


Epoch 8/200:
  Time: 276.18s
  Train Loss: 2.5308
  Val Loss: 2.6204



Epoch 8/200 [Train]:   0%|          | 0/29 [04:36<?, ?it/s, loss=1.1308]

Epoch 8/200 [Val]:   0%|          | 0/8 [04:36<?, ?it/s, loss=1.5819]

[A
[A
[A
[A
[A
[A
[A
[A


Epoch 9/200:
  Time: 276.06s
  Train Loss: 2.5943
  Val Loss: 2.5904



Epoch 9/200 [Train]:   0%|          | 0/29 [04:36<?, ?it/s, loss=4.4950]
Epoch 9/200 [Val]:   0%|          | 0/8 [04:36<?, ?it/s, loss=1.5801]263]



Epoch 10/200:
  Time: 276.09s
  Train Loss: 2.5865
  Val Loss: 2.7292




Epoch 10/200 [Train]:   0%|          | 0/29 [04:36<?, ?it/s, loss=4.3263]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Epoch 10/200 [Val]:   0%|          | 0/8 [04:36<?, ?it/s, loss=1.7337]
Epoch 11/200 [Val]:   0%|          | 0/8 [00:15<?, ?it/s, loss=1.5785]


Epoch 11/200:
  Time: 276.07s
  Train Loss: 2.5115
  Val Loss: 2.6555



Epoch 11/200 [Train]:   0%|          | 0/29 [04:36<?, ?it/s, loss=1.8291]

Epoch 11/200 [Val]:   0%|          | 0/8 [04:36<?, ?it/s, loss=1.5785]

[A
[A
[A
[A
[A
[A
[A
[A


Epoch 12/200:
  Time: 276.03s
  Train Loss: 2.5624
  Val Loss: 2.5761



Epoch 12/200 [Train]:   0%|          | 0/29 [04:36<?, ?it/s, loss=1.8185]
Epoch 12/200 [Val]:   0%|          | 0/8 [04:36<?, ?it/s, loss=1.5116]56]



Epoch 13/200:
  Time: 276.04s
  Train Loss: 2.5686
  Val Loss: 2.5922




Epoch 13/200 [Train]:   0%|          | 0/29 [04:36<?, ?it/s, loss=4.6856]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Epoch 13/200 [Val]:   0%|          | 0/8 [04:36<?, ?it/s, loss=1.5225]
Epoch 14/200 [Val]:   0%|          | 0/8 [00:16<?, ?it/s, loss=1.5807]


Epoch 14/200:
  Time: 276.08s
  Train Loss: 2.5228
  Val Loss: 2.6020



Epoch 14/200 [Train]:   0%|          | 0/29 [04:36<?, ?it/s, loss=2.6215]

Epoch 14/200 [Val]:   0%|          | 0/8 [04:36<?, ?it/s, loss=1.5807]

[A
[A
[A
[A
[A
[A
[A
[A


Epoch 15/200:
  Time: 276.11s
  Train Loss: 2.4785
  Val Loss: 2.5736



Epoch 15/200 [Train]:   0%|          | 0/29 [04:36<?, ?it/s, loss=2.0751]
Epoch 15/200 [Val]:   0%|          | 0/8 [04:36<?, ?it/s, loss=1.5279]85]



Epoch 16/200:
  Time: 276.04s
  Train Loss: 2.5017
  Val Loss: 2.6139




Epoch 16/200 [Train]:   0%|          | 0/29 [04:36<?, ?it/s, loss=2.4785]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Epoch 16/200 [Val]:   0%|          | 0/8 [04:36<?, ?it/s, loss=1.5351]
Epoch 17/200 [Val]:   0%|          | 0/8 [00:15<?, ?it/s, loss=1.5254]Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'tra


Epoch 17/200:
  Time: 276.03s
  Train Loss: 2.4673
  Val Loss: 2.6000

Early stopping triggered after 10 epochs with no improvement.


VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (i

In [11]:
def predict_code(img_path, model, processor, device):
    image = Image.open(img_path).convert("RGB")
    pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)
    with autocast() if device.type == "cuda" else torch.no_grad():
        outputs = model.generate(pixel_values)
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]

img_path = "dataset_creator/images/1.png"
print(f"Predicted text: {predict_code(img_path, model, processor, device)}")

  with autocast() if device.type == "cuda" else torch.no_grad():


Predicted text: 


In [14]:
for img_path in ("dataset_creator/images/2.png", "dataset_creator/images/Screenshot 2025-04-11 172314.png", "dataset_creator/images/Screenshot 2025-04-11 172402.png"):
    print(f"Predicted text: {predict_code(img_path, model, processor, device)}")

  with autocast() if device.type == "cuda" else torch.no_grad():


Predicted text: 
Predicted text: 
Predicted text: 
