# Setup

## Load Dataset

In [1]:
datasets_root = 'datasets'
dataset_id = 'FUNSD_polygon_augmented'

In [2]:
from datasets import Dataset
from pathlib import Path
import json
from transformers import LayoutLMv3FeatureExtractor, AutoTokenizer, LayoutLMv3Processor
from datasets import Features, Sequence, ClassLabel, Value, Array2D
from PIL import Image
from functools import partial

### Load dataset
Generate a Dataset object from a generator function that contains the paths of the images and annotations.

In [3]:
def generate_samples(folder):
    images_path = Path(folder) / 'images'
    annotations_path = Path(folder) / 'annotations'
    
    images = list(images_path.glob('*.png'))
    
    for image_path in images:
        image_id = image_path.stem
        annotation_path = annotations_path / f'{image_id}.json'
        yield {
            'image_path': str(image_path),
            'annotation_path': str(annotation_path)
        }

def get_features_type():
    return Features({
        "image_path": Value("string"),
        "annotation_path": Value("string")
    })

def get_dataset_folder(split='train'):
    if split == 'train':
        path = Path(datasets_root) / dataset_id / 'dataset' / 'training_data'
    elif split == 'test':
        path = Path(datasets_root) / dataset_id / 'dataset' / 'testing_data'
    else:
        raise ValueError('split must be either "train" or "test"')
    
    return path



train_folder = get_dataset_folder()
train_dataset = Dataset.from_generator(partial(generate_samples, train_folder), features=get_features_type())
test_folder = get_dataset_folder('test')
test_dataset = Dataset.from_generator(partial(generate_samples, test_folder), features=get_features_type())

In [4]:
def get_unique_labels(folder):
    annotations_path = Path(folder) / 'annotations'
    unique_labels = set()

    for annotation_file in annotations_path.glob('*.json'):
        with open(annotation_file, 'r') as f:
            annotations = json.load(f)['form']
            for annotation in annotations:
                unique_labels.add(annotation['label'])

    return sorted(list(unique_labels))

labels = get_unique_labels(train_folder)
num_labels = len(labels)
label_to_id = {label: idx for idx, label in enumerate(labels)}
id_to_label = {idx: label for idx, label in enumerate(labels)}

### Preprocess dataset
Load the images and annotations, and encode them using the LayoutLMv3 processor.

In [16]:
# Initialize processor
model_id = "SCUT-DLVCLab/lilt-roberta-en-base"
feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
tokenizer = AutoTokenizer.from_pretrained(model_id)
processor = LayoutLMv3Processor(feature_extractor, tokenizer)

# Custom features
features = Features(
    {
        "input_ids": Sequence(feature=Value(dtype="int64")),
        "attention_mask": Sequence(feature=Value(dtype="int64")),
        "polygon": Array2D(dtype="int64", shape=(512, 8)),
        "labels": Sequence(feature=ClassLabel(names=labels)),
    }
)

# Preprocess function
def process(sample, processor=None):
    # Assume sample["annotation_path"] is the path to the JSON file containing your annotations
    with open(sample["annotation_path"], "r") as f:
        annotations = json.load(f)['form']

    # Extract sequence and bounding polygons from annotations
    sequences = [ann["text"] for ann in annotations]
    polygons = [ann["polygon"] for ann in annotations]  # Modify this if your "polygon" is not actually a bbox
    
    max_length = 512
    padding_length = max_length - len(sequences)

    # Custom Padding for polygons
    polygons = polygons + [[0, 0, 0, 0, 0, 0, 0, 0] for _ in range(padding_length)]
    
    # Custom Padding for sequence
    sequences = sequences + ["[PAD]" for _ in range(padding_length)]
    
    # Load image
    image = Image.open(sample["image_path"]).convert("RGB")
    
    # Extract labels for each sequence from annotations
    sequence_labels = [label_to_id[ann['label']] for ann in annotations]    
    sequence_labels = sequence_labels + [-100 for _ in range(padding_length)]

    # Encoding without 
    encoding = processor(
        image,
        sequences,
        word_labels=sequence_labels,
        boxes=polygons,
        padding="max_length",
        truncation=True,
    )
    # Manually insert polygons (since processor can't handle 8-coordinates)
    encoding['polygon'] = polygons
    
    del encoding["pixel_values"]
    del encoding['bbox']

    return encoding

# Process your dataset (replace `dataset` with the actual dataset object)
train_dataset = train_dataset.map(
    partial(process, processor=processor),
    remove_columns=["image_path", "annotation_path"],
    features=features,
).with_format("torch")
test_dataset = test_dataset.map(
    partial(process, processor=processor),
    remove_columns=["image_path", "annotation_path"],
    features=features,
).with_format("torch")

Map:   0%|          | 0/14900 [00:00<?, ? examples/s]

torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([512, 8])
torch.Size([5

KeyboardInterrupt: 

In [37]:
from datasets import DatasetDict

dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})
dataset.save_to_disk(Path(datasets_root) / dataset_id / 'huggingface_dataset')

Saving the dataset (0/2 shards):   0%|          | 0/14900 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

# Training

## Load HF dataset

In [5]:
from datasets import DatasetDict

dataset = DatasetDict.load_from_disk(f'{datasets_root}/{dataset_id}/huggingface_dataset')
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'polygon', 'labels'],
        num_rows: 14900
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'polygon', 'labels'],
        num_rows: 5000
    })
})

## Create model

In [6]:
from model.lilt import  LiltForTokenClassification
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    'SCUT-DLVCLab/lilt-roberta-en-base',
    num_labels=num_labels,
    label2id=label_to_id,
    id2label=id_to_label,
    problem_type="single_label_classification",
)
# Overriding specific configurations
# config.hidden_size = 768
# config.max_2d_position_embeddings = 1024
# config.channel_shrink_ratio = 2
model = LiltForTokenClassification(config)

## Prepare metrics

In [7]:
import evaluate
import numpy as np

# load seqeval metric
metric = evaluate.load("seqeval")

# labels of the model
class_labels = [config.id2label[i] for i in range(num_labels)]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    all_predictions = []
    all_labels = []
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(class_labels[predicted_idx])
            all_labels.append(class_labels[label_idx])
    return metric.compute(predictions=[all_predictions], references=[all_labels])


## Prepare trainer & train

In [8]:
from transformers import Trainer, TrainingArguments

# hugging face parameter
repository_id = "lilt-polygon"

# Define training args
training_args = TrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,
    learning_rate=5e-5,
    max_steps=1000,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=200,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="overall_f1",
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

In [9]:
trainer.train()

Step,Training Loss,Validation Loss,Eader,Nswer,Ther,Uestion,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
200,0.7326,1.436065,"{'precision': 0.14129736673089274, 'recall': 0.11458333333333333, 'f1': 0.1265458728789186, 'number': 9600}","{'precision': 0.15097515097515096, 'recall': 0.22591755860894042, 'f1': 0.1809954751131222, 'number': 27001}","{'precision': 0.14991561181434598, 'recall': 0.2307142857142857, 'f1': 0.1817391304347826, 'number': 15400}","{'precision': 0.16042545710267228, 'recall': 0.2393521099052428, 'f1': 0.1920976803547228, 'number': 30499}",0.153783,0.218824,0.180627,0.601074




KeyboardInterrupt: 