In [52]:
!pip install torch transformers datasets pillow fastapi uvicorn python-multipart




In [4]:
import kagglehub
# Download the "urbikn/sroie-datasetv2" dataset from Kaggle via kagglehub.
# `path` holds the value returned by dataset_download (presumably the local
# directory containing the extracted files -- TODO confirm).
# NOTE(review): the training cell in this notebook reads FUNSD data from
# /content/funsd/dataset/training_data and does not use `path`; confirm
# whether this SROIE download is still needed.
path = kagglehub.dataset_download("urbikn/sroie-datasetv2")

Downloading from https://www.kaggle.com/api/v1/datasets/download/urbikn/sroie-datasetv2?dataset_version_number=4...


100%|██████████| 834M/834M [00:11<00:00, 75.5MB/s]

Extracting files...





In [51]:
import os, json, torch
from PIL import Image
from torch.utils.data import Dataset
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification, TrainingArguments, Trainer

# The words and boxes passed to the processor come from the annotation files,
# so its built-in OCR step is disabled.
processor = LayoutLMv3Processor.from_pretrained(
    "microsoft/layoutlmv3-base",
    apply_ocr=False,
)

# Tag vocabulary for token classification; a tag's position in the list is its
# integer class id. Any annotation label not listed here is mapped to id 0
# ("O") by the example loader.
label_list = ["O", "QUESTION", "ANSWER", "HEADER"]
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

def normalize_box(box, w, h):
    """Scale a pixel-space ``[x1, y1, x2, y2]`` box onto the 0-1000 grid.

    Args:
        box: Sequence whose first four items are x1, y1, x2, y2 in pixels.
        w: Image width in pixels (divisor for the x coordinates).
        h: Image height in pixels (divisor for the y coordinates).

    Returns:
        A list of four ints ``[x1, y1, x2, y2]``, each within ``0..1000``.

    Raises:
        ZeroDivisionError: If ``w`` or ``h`` is zero.
    """
    def _scale(coord, size):
        # LayoutLMv3 expects box coordinates in 0..1000; annotations that
        # spill slightly past the image edge would otherwise produce
        # out-of-range values, so clamp after scaling.
        return min(1000, max(0, int(1000 * coord / size)))

    return [_scale(box[0], w), _scale(box[1], h), _scale(box[2], w), _scale(box[3], h)]

def load_funsd_example(img_path, ann_path):
    """Load one image/annotation pair and encode it with ``processor``.

    The annotation file is JSON with a top-level ``"form"`` list; each entry
    has a ``"label"`` and a ``"words"`` list whose items carry ``"text"`` and
    a pixel-space ``"box"`` of four numbers. Every kept word receives the
    label id of its enclosing entry (labels are upper-cased and looked up in
    ``label2id``; unknown labels fall back to id 0).

    Args:
        img_path: Path of the image file.
        ann_path: Path of the matching annotation JSON file.

    Returns:
        A dict with the processor's output tensors, each with the leading
        batch axis removed.
    """
    # Close the file handle promptly; convert() yields an independent copy.
    with Image.open(img_path) as raw_image:
        image = raw_image.convert("RGB")
    w, h = image.size

    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(ann_path, encoding="utf-8") as f:
        data = json.load(f)

    words, boxes, labels = [], [], []
    for item in data["form"]:
        label_id = label2id.get(item["label"].upper(), 0)
        for wbox in item["words"]:
            text = wbox["text"]
            # Words with no visible text carry no content; skip them so they
            # do not take up positions in the (truncated) token sequence.
            if not text.strip():
                continue
            x1, y1, x2, y2 = wbox["box"]
            words.append(text)
            boxes.append(normalize_box([x1, y1, x2, y2], w, h))
            labels.append(label_id)

    enc = processor(
        image,
        words,
        boxes=boxes,
        word_labels=labels,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    # squeeze(0) removes only the batch axis; a bare squeeze() would also
    # collapse any other axis that happens to have length 1.
    return {k: v.squeeze(0) for k, v in enc.items()}

class FUNSDDataset(Dataset):
    """Map-style dataset over a directory of PNG images and JSON annotations.

    Each image ``<stem>.png`` in ``img_dir`` is paired with ``<stem>.json`` in
    ``ann_dir``; items are produced by ``load_funsd_example``.
    """

    def __init__(self, img_dir, ann_dir):
        """Index the PNG files in ``img_dir`` in sorted order.

        Args:
            img_dir: Directory containing the image files.
            ann_dir: Directory containing the annotation JSON files.
        """
        # Keep only PNG files so stray entries (hidden files, sub-directories,
        # other formats) are not treated as training examples.
        self.imgs = sorted(
            name for name in os.listdir(img_dir) if name.lower().endswith(".png")
        )
        self.img_dir, self.ann_dir = img_dir, ann_dir

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.imgs)

    def __getitem__(self, idx):
        """Return the encoded example for the image at position ``idx``."""
        img_name = self.imgs[idx]
        # Swap only the extension; str.replace would also rewrite a ".png"
        # that appears earlier in the file name.
        ann_name = os.path.splitext(img_name)[0] + ".json"
        return load_funsd_example(
            os.path.join(self.img_dir, img_name),
            os.path.join(self.ann_dir, ann_name),
        )

# Location of the FUNSD training split (images and per-image annotations).
IMG_DIR = "/content/funsd/dataset/training_data/images"
ANN_DIR = "/content/funsd/dataset/training_data/annotations"
dataset = FUNSDDataset(IMG_DIR, ANN_DIR)
# Fail early with a clear message instead of starting a training run on nothing.
assert len(dataset) > 0, f"no training images found in {IMG_DIR}"

# No explicit .cuda(): Trainer moves the model to its training device, so this
# cell also runs on machines without a GPU.
model = LayoutLMv3ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

args = TrainingArguments(
    output_dir="./layoutlmv3_funsd",
    per_device_train_batch_size=1,
    # 1 example per step x 4 accumulated steps = effective batch size of 4.
    gradient_accumulation_steps=4,
    num_train_epochs=4,
    # Mixed precision is only enabled when CUDA is present; requesting it on a
    # CPU-only machine makes TrainingArguments raise.
    fp16=torch.cuda.is_available(),
    logging_steps=20,
    save_steps=200,
    report_to="none",
)

trainer = Trainer(model=model, args=args, train_dataset=dataset)
trainer.train()

# Store the fine-tuned weights and the processor files side by side so the
# directory can be loaded with from_pretrained later.
model.save_pretrained("layoutlmv3_funsd_model")
processor.save_pretrained("layoutlmv3_funsd_model")


Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  gc.collect()


Step,Training Loss
20,0.983
40,0.7058
60,0.5286
80,0.5287
100,0.3924
120,0.3173
140,0.2776


[]

In [53]:
# Write the model and the processor into the "layoutlmv3_funsd_model" directory.
# NOTE(review): these two calls repeat the save performed at the end of the
# training cell; confirm whether this extra cell is still needed.
model.save_pretrained("layoutlmv3_funsd_model")
processor.save_pretrained("layoutlmv3_funsd_model")


[]