In [2]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from transformers import TrOCRProcessor, VisionEncoderDecoderModel

In [18]:
BASE_DIR = "/kaggle/input/imgur5k/cropped/"
TRAIN_DIR = os.path.join(BASE_DIR, "train")
VAL_DIR = os.path.join(BASE_DIR, "val")
TEST_DIR = os.path.join(BASE_DIR, "test")

In [5]:
def load_json_to_df(json_path):
    df = pd.read_json(json_path)
    return df

In [82]:
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-large-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-handwritten')

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 1024,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_

In [83]:
class Imgur5KDataset(Dataset):
    def __init__(self, image_dir, json_file, processor, transform):
        self.image_dir = image_dir
        self.df = load_json_to_df(image_dir+json_file)
        self.df["image_name"] = self.df["image_name"] + ".png"
        self.processor = processor
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        image_path = os.path.join(self.image_dir, self.df.iloc[idx]["image_name"])
        text = self.df.iloc[idx]["text"]

        # Open image
        image = Image.open(image_path).convert("RGB")
        image = self.transform(image)

        # Process image with TrOCR processor
        pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.squeeze()

        # Process text with TrOCR tokenizer
        labels = self.processor.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True).input_ids.squeeze()

        return {
            "pixel_values": pixel_values,
            "labels": labels
        }

In [84]:
# Define the image transformation pipeline
image_transform = transforms.Compose([
    transforms.ToTensor(),
])

In [85]:
train_dataset = Imgur5KDataset(
    TRAIN_DIR, 
    "/words.json", 
    processor, 
    image_transform
)

In [86]:
val_dataset = Imgur5KDataset(
    VAL_DIR, 
    "/words.json", 
    processor, 
    image_transform
)

In [87]:
test_dataset = Imgur5KDataset(
    TEST_DIR, 
    "/words.json", 
    processor, 
    image_transform
)

In [101]:
train_loader = DataLoader(train_dataset, batch_size=3,shuffle=True,num_workers=4,pin_memory= True)
val_loader = DataLoader(val_dataset, batch_size=3,num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=3,num_workers=4)

In [89]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-23): 24 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=False)
              (key): Linear(in_features=1024, out_features=1024, bias=False)
              (value): Linear(in_features=1024, out_features=1024, bias=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=1024, out_features=4096, bias=True)
    

In [90]:
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast

In [91]:
# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)

# Set up gradient scaler for mixed-precision training
scaler = torch.amp.GradScaler()

# Number of epochs for fine-tuning
epochs = 10

In [102]:
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler

# Initialize scaler
scaler = torch.cuda.amp.GradScaler('cuda')

# Set the decoder_start_token_id and pad_token_id before training
if model.config.decoder_start_token_id is None:
    model.config.decoder_start_token_id = processor.tokenizer.cls_token_id  # or processor.tokenizer.bos_token_id

# Set the pad_token_id if it's not set
if model.config.pad_token_id is None:
    model.config.pad_token_id = processor.tokenizer.pad_token_id  # Typically, the [PAD] token ID

# Training loop
def train(model, train_loader, val_loader, optimizer, scaler, epochs=10):
    for epoch in range(epochs):
        model.train()  # Set model to training mode
        epoch_loss = 0

        # Loop over training batches
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Mixed-precision context
            with autocast():
                outputs = model(pixel_values=pixel_values, labels=labels)
                loss = outputs.loss

            # Scaled loss backpropagation
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            epoch_loss += loss.item()

        print(f"Epoch {epoch+1} Loss: {epoch_loss / len(train_loader)}")
        
        # Validate after each epoch
        evaluate(model, val_loader)
    
    # Save the fine-tuned model after training
    model.save_pretrained("/kaggle/working/trocr_finetuned")
    print("Fine-tuned model saved.")

# Call the training function
train(model, train_loader, val_loader, optimizer, scaler, epochs=epochs)


  scaler = torch.cuda.amp.GradScaler('cuda')
Epoch 1/10:   0%|          | 0/39774 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got 

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 39.12 MiB is free. Process 5618 has 15.85 GiB memory in use. Of the allocated memory 15.43 GiB is allocated by PyTorch, and 125.33 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from datasets import load_metric
import numpy as np

# Load evaluation metrics
cer_metric = load_metric("cer")
wer_metric = load_metric("wer")

# Evaluation function
def evaluate(model, data_loader):
    model.eval()  # Set model to evaluation mode

    all_predictions = []
    all_labels = []

    for batch in tqdm(data_loader, desc="Evaluating"):
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        # Inference (disable gradient calculation)
        with torch.no_grad():
            generated_ids = model.generate(pixel_values)

        # Decode predictions and labels
        decoded_preds = processor.batch_decode(generated_ids, skip_special_tokens=True)
        decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)

        all_predictions.extend(decoded_preds)
        all_labels.extend(decoded_labels)

    # Calculate CER and WER
    cer_score = cer_metric.compute(predictions=all_predictions, references=all_labels)
    wer_score = wer_metric.compute(predictions=all_predictions, references=all_labels)

    print(f"CER: {cer_score:.4f}, WER: {wer_score:.4f}")
    return cer_score, wer_score

In [None]:
# Evaluate on the validation set
print("Evaluating on Validation Set...")
val_cer, val_wer = evaluate(model, val_loader)

# Evaluate on the test set
print("Evaluating on Test Set...")
test_cer, test_wer = evaluate(model, test_loader)

In [None]:
# Save the fine-tuned model to disk
model.save_pretrained("/kaggle/working/trocr_finetuned")