In [1]:
import os
import json
import pandas as pd
import numpy as np
import json
from tqdm import tqdm

from shapely.geometry import Polygon
import glob
from pytesseract import pytesseract
from lxml import etree
import ast
import torch
from PIL import ImageDraw, ImageFont

from sklearn.model_selection import train_test_split
from datasets import Dataset,Features, ClassLabel, Sequence, Value, Image, load_dataset
# Location of the Tesseract executable used by pytesseract.
# The path can be overridden with the TESSERACT_CMD environment variable so the
# notebook is not tied to one machine; the original Windows install path is
# kept as the default, so existing setups behave exactly as before.
pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)

In [15]:
# Relative paths to the Arrow shard of each split.
train_file_path = './bill_dataset/train/data-00000-of-00001.arrow'
test_file_path = './bill_dataset/test/data-00000-of-00001.arrow'
# Load both shards with the generic 'arrow' builder; the result is indexed by
# split name ('train' / 'test') in the cells below.
dataset = load_dataset('arrow', data_files={'train': train_file_path, 'test': test_file_path})

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [17]:
from transformers import AutoProcessor
# apply_ocr=False: the processor is not asked to run OCR itself; the words and
# bounding boxes are passed in explicitly by prepare_examples.
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

In [18]:
from datasets.features import ClassLabel

# Schema and column names of the raw training split, captured before preprocessing.
features = dataset["train"].features
column_names = dataset["train"].column_names
# Names of the raw columns read by the label-mapping and preprocessing code.
image_column_name = "image"
text_column_name = "words"
boxes_column_name = "bboxes"
label_column_name = "ner_tags"

# In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
# unique labels.
def get_label_list(labels):
    """Return every distinct label found in ``labels``, sorted ascending.

    Args:
        labels: An iterable of label sequences (one sequence per example).
            The labels must be hashable and mutually comparable.

    Returns:
        A list containing each distinct label exactly once, in sorted order.
    """
    distinct = {tag for sequence in labels for tag in sequence}
    return sorted(distinct)

# Build the label vocabulary and the id <-> label lookup tables handed to the model.
if isinstance(features[label_column_name].feature, ClassLabel):
    # The labels are already ints; the ClassLabel feature carries their names.
    label_list = features[label_column_name].feature.names
else:
    # Otherwise collect the distinct labels from the training split.
    # Fix: this branch used to subscript `train_test_split`, which is the
    # sklearn function imported at the top of the notebook and would raise a
    # TypeError; the data actually lives in `dataset`.
    label_list = get_label_list(dataset["train"][label_column_name])

# The two mappings are built the same way in both cases, so build them once.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in enumerate(label_list)}
num_labels = len(label_list)

In [19]:
def prepare_examples(examples):
    """Encode a batch of raw examples with the module-level ``processor``.

    Args:
        examples: A batch mapping that provides the image, words, boxes and
            label columns named by the ``*_column_name`` globals.

    Returns:
        The encoding produced by ``processor``, truncated and padded to
        "max_length".
    """
    # If the image column holds paths rather than images, the images must be
    # read in here before they are handed to the processor.
    return processor(
        examples[image_column_name],
        examples[text_column_name],
        boxes=examples[boxes_column_name],
        word_labels=examples[label_column_name],
        truncation=True,
        padding="max_length",
    )

In [20]:
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D

# we need to define custom features for `set_format` (used later on) to work properly
# NOTE: this rebinds `features`, which previously held the raw training-split
# schema read by the label-mapping cell; re-running that cell after this one
# would see this processed schema instead.
features = Features({
    # Fixed-shape image tensor.
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    # 512 rows of 4 values — presumably one box per token after padding; TODO confirm.
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(feature=Value(dtype='int64')),
})

In [21]:
# Encode both splits with prepare_examples. The raw columns are dropped via
# remove_columns and the output is declared with the custom `features` schema.
train_sample = dataset["train"]
train_dataset = train_sample.map(
    prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)

# Same preprocessing for the held-out split.
eval_dataset = dataset["test"].map(
    prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [22]:
# Change dataset to torch format
# (only train_dataset is switched here; eval_dataset keeps its default format)
train_dataset.set_format("torch")
import torch

# Sanity check: print the shape of every field of the first encoded training example.
example = train_dataset[0]
for k,v in example.items():
    print(k,v.shape)

pixel_values torch.Size([3, 224, 224])
input_ids torch.Size([512])
attention_mask torch.Size([512])
bbox torch.Size([512, 4])
labels torch.Size([512])


In [23]:
from datasets import load_metric

# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
# The seqeval metric object; compute_metrics calls metric.compute(...) on it.
metric = load_metric("seqeval")

In [25]:
return_entity_level_metrics = False


def compute_metrics(p):
    """Score a batch of token-classification predictions with ``metric``.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores (argmax is taken over axis 2); ``labels`` holds the
            reference label ids, where -100 marks positions to ignore.

    Returns:
        When ``return_entity_level_metrics`` is true, every entry of the
        metric result with nested dictionaries flattened into
        ``"<key>_<subkey>"`` entries. Otherwise a dictionary holding only
        the overall precision, recall, f1 and accuracy.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only positions whose reference label is not the ignored index
    # (special tokens), and map the surviving ids to label names.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        kept_pairs = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept_pairs])
        true_labels.append([label_list[gold_id] for _, gold_id in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    if not return_entity_level_metrics:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    # Unpack nested dictionaries into flat "<key>_<subkey>" entries.
    flattened = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                flattened[f"{key}_{sub_key}"] = sub_value
        else:
            flattened[key] = value
    return flattened

In [26]:
from transformers import LayoutLMv3ForTokenClassification

# Start from the base checkpoint and attach the id <-> label mappings built
# earlier in the notebook.
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base",
                                                         id2label=id2label,
                                                         label2id=label2id)

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
from transformers import TrainingArguments, Trainer
from transformers.data.data_collator import default_data_collator

# Training configuration: 1000 steps at batch size 2, evaluating every 100
# steps. load_best_model_at_end with metric_for_best_model="f1" selects on the
# "f1" key returned by compute_metrics. Checkpoints are written under "test".
training_args = TrainingArguments(output_dir="test",
                                  max_steps=1000,
                                  per_device_train_batch_size=2,
                                  per_device_eval_batch_size=2,
                                  learning_rate=1e-5,
                                  evaluation_strategy="steps",
                                  eval_steps=100,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1")

# Initialize our Trainer
# The processor is passed in the `tokenizer` slot.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

In [28]:
# Fine-tune, then run one more evaluation pass on eval_dataset; the metrics
# dictionary returned by evaluate() is displayed as the cell output.
trainer.train()
trainer.evaluate()

  0%|          | 0/1000 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.376425176858902, 'eval_precision': 0.6507936507936508, 'eval_recall': 0.5616438356164384, 'eval_f1': 0.6029411764705883, 'eval_accuracy': 0.9395017793594306, 'eval_runtime': 0.2219, 'eval_samples_per_second': 13.52, 'eval_steps_per_second': 9.014, 'epoch': 25.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.29795125126838684, 'eval_precision': 0.8064516129032258, 'eval_recall': 0.684931506849315, 'eval_f1': 0.7407407407407408, 'eval_accuracy': 0.9537366548042705, 'eval_runtime': 0.2186, 'eval_samples_per_second': 13.724, 'eval_steps_per_second': 9.15, 'epoch': 50.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.29292139410972595, 'eval_precision': 0.8253968253968254, 'eval_recall': 0.7123287671232876, 'eval_f1': 0.7647058823529411, 'eval_accuracy': 0.9608540925266904, 'eval_runtime': 0.2258, 'eval_samples_per_second': 13.287, 'eval_steps_per_second': 8.858, 'epoch': 75.0}


  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.31140685081481934, 'eval_precision': 0.7727272727272727, 'eval_recall': 0.6986301369863014, 'eval_f1': 0.7338129496402879, 'eval_accuracy': 0.9555160142348754, 'eval_runtime': 0.233, 'eval_samples_per_second': 12.878, 'eval_steps_per_second': 8.585, 'epoch': 100.0}
{'loss': 0.2169, 'learning_rate': 5e-06, 'epoch': 125.0}


  0%|          | 0/2 [00:00<?, ?it/s]

Checkpoint destination directory test\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.32098305225372314, 'eval_precision': 0.7428571428571429, 'eval_recall': 0.7123287671232876, 'eval_f1': 0.7272727272727273, 'eval_accuracy': 0.9555160142348754, 'eval_runtime': 0.2068, 'eval_samples_per_second': 14.509, 'eval_steps_per_second': 9.673, 'epoch': 125.0}




  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.31782692670822144, 'eval_precision': 0.7647058823529411, 'eval_recall': 0.7123287671232876, 'eval_f1': 0.7375886524822695, 'eval_accuracy': 0.9590747330960854, 'eval_runtime': 0.2275, 'eval_samples_per_second': 13.186, 'eval_steps_per_second': 8.791, 'epoch': 150.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.3255780041217804, 'eval_precision': 0.7647058823529411, 'eval_recall': 0.7123287671232876, 'eval_f1': 0.7375886524822695, 'eval_accuracy': 0.9572953736654805, 'eval_runtime': 0.2224, 'eval_samples_per_second': 13.486, 'eval_steps_per_second': 8.991, 'epoch': 175.0}


  0%|          | 0/2 [00:00<?, ?it/s]



{'eval_loss': 0.3289702236652374, 'eval_precision': 0.7647058823529411, 'eval_recall': 0.7123287671232876, 'eval_f1': 0.7375886524822695, 'eval_accuracy': 0.9572953736654805, 'eval_runtime': 0.2254, 'eval_samples_per_second': 13.312, 'eval_steps_per_second': 8.875, 'epoch': 200.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.32697466015815735, 'eval_precision': 0.7647058823529411, 'eval_recall': 0.7123287671232876, 'eval_f1': 0.7375886524822695, 'eval_accuracy': 0.9590747330960854, 'eval_runtime': 0.225, 'eval_samples_per_second': 13.333, 'eval_steps_per_second': 8.889, 'epoch': 225.0}
{'loss': 0.0115, 'learning_rate': 0.0, 'epoch': 250.0}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.3257977068424225, 'eval_precision': 0.7878787878787878, 'eval_recall': 0.7123287671232876, 'eval_f1': 0.7482014388489209, 'eval_accuracy': 0.9590747330960854, 'eval_runtime': 0.2061, 'eval_samples_per_second': 14.559, 'eval_steps_per_second': 9.706, 'epoch': 250.0}

Checkpoint destination directory test\checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.



{'train_runtime': 805.443, 'train_samples_per_second': 2.483, 'train_steps_per_second': 1.242, 'train_loss': 0.11424336576461792, 'epoch': 250.0}




  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3257977068424225,
 'eval_precision': 0.7878787878787878,
 'eval_recall': 0.7123287671232876,
 'eval_f1': 0.7482014388489209,
 'eval_accuracy': 0.9590747330960854,
 'eval_runtime': 0.2115,
 'eval_samples_per_second': 14.182,
 'eval_steps_per_second': 9.455,
 'epoch': 250.0}

In [None]:
from transformers import AutoModelForTokenClassification

# Reload weights from the step-1000 checkpoint directory under "test".
# NOTE(review): this rebinds `model` and hard-codes the checkpoint path —
# presumably the final checkpoint of the run above; confirm it is the one
# wanted for inference.
model = AutoModelForTokenClassification.from_pretrained('./test/checkpoint-1000')
# Place the reloaded model on the CPU.
model.to('cpu')