# Import Lib

In [1]:
!nvidia-smi

Sat Mar 25 03:08:07 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:00:06.0 Off |                  N/A |
| 27%   41C    P8    19W / 250W |      1MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:00:0A.0 Off |                  N/A |
| 68%   68C    P2   252W / 250W |  10118MiB / 11019MiB |     67%      Default |
|       

In [2]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/data/tungtx2/tmp/transformers_hub'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
import os
import json
from pathlib import Path
import numpy as np
from PIL import Image
import torch

torch.__version__

  from .autonotebook import tqdm as notebook_tqdm


'1.13.1+cu117'

# Prepare Data

In [4]:
import unidecode

def normalize_bbox(bbox, width, height):
     return [
         int(1000 * (bbox[0] / width)),
         int(1000 * (bbox[1] / height)),
         int(1000 * (bbox[2] / width)),
         int(1000 * (bbox[3] / height)),
     ]

def gen_annotations(dir, max_sample=1e9, block_type=None):
  ls_words, ls_boxes, ls_labels, ls_img_fp = [], [], [], []
  cnt = 0
  for json_fp in Path(dir).rglob('*.json'):
    if block_type != None and block_type not in str(json_fp):
      continue
    img_fp = json_fp.with_suffix('.jpg')

    words, boxes, labels = [], [], []
    with open(json_fp) as f:
      json_data = json.load(f)
    
    img_h, img_w = json_data['imageHeight'], json_data['imageWidth']
    for i, shape in enumerate(json_data['shapes']):
      words.append(unidecode.unidecode(shape['text'].lower()))
      # words.append(shape['text'].lower())
      labels.append(shape['label'])
      pts = [coord for pt in shape['points'] for coord in pt]
      xmin = min(pts[0::2])
      xmax = max(pts[0::2])
      ymin = min(pts[1::2])
      ymax = max(pts[1::2])

      xmin = max(xmin, 0)
      ymin = max(ymin, 0)
      xmax = min(img_w, xmax)
      ymax = min(img_h, ymax)

      boxes.append(normalize_bbox((xmin, ymin, xmax, ymax), img_w, img_h))
    ls_words.append(words)
    ls_boxes.append(boxes)
    ls_labels.append(labels)
    ls_img_fp.append(str(img_fp))

    cnt += 1
    if cnt >= max_sample:
      break

  return ls_words, ls_boxes, ls_labels, ls_img_fp

In [5]:
train_dir = 'fake_data_24032023/masked/train'
val_dir = 'fake_data_24032023/masked/val'

ls_words_train, ls_boxes_train, ls_labels_train, ls_img_train = gen_annotations(train_dir, max_sample=1e9)
ls_words_val, ls_boxes_val, ls_labels_val, ls_img_val = gen_annotations(val_dir, max_sample=1e9)

print(ls_words_train[0][:10])
print(ls_boxes_train[0][:10])
print(ls_labels_train[0][:10])
print(ls_img_train[0])

[':kibggbe1hya', 'no', 'swift', 'overijssel', 'almelo', '7607dh', ':telshoek', 'add', '18', 'nigeria']
[[390, 829, 496, 846], [364, 824, 385, 850], [329, 822, 364, 850], [587, 801, 676, 821], [521, 799, 583, 822], [466, 799, 516, 822], [360, 799, 444, 822], [330, 799, 356, 824], [447, 799, 463, 822], [807, 764, 871, 787]]
['swift_code', 'marker_swift_code', 'marker_swift_code', 'bank_address', 'bank_address', 'bank_address', 'bank_address', 'marker_bank_address', 'bank_address', 'bank_name']
fake_data_24032023/masked/train/fake_8124.jpg


In [6]:
print(len(ls_words_train))
print(len(ls_words_val))

len(ls_boxes_train)

777
111


777

In [7]:
from collections import Counter

train_annotations = [ls_words_train, ls_boxes_train, ls_labels_train, ls_img_train]
val_annotations = [ls_words_val, ls_boxes_val, ls_labels_val, ls_img_val]
# test = [ls_words_test, ls_boxes_test, ls_labels_test, ls_img_test]
all_labels = [item for sublist in train_annotations[2] for item in sublist] + [item for sublist in val_annotations[2] for item in sublist]
Counter(all_labels)

Counter({'swift_code': 1380,
         'marker_swift_code': 2840,
         'bank_address': 7410,
         'marker_bank_address': 865,
         'bank_name': 8343,
         'marker_bank_name': 2548,
         'account_number': 2233,
         'marker_account_number': 3389,
         'marker_company_name': 4118,
         'text': 20917,
         'tax': 1504,
         'marker_tax': 2196,
         'phone': 2215,
         'marker_phone': 2194,
         'represented_name': 6526,
         'marker_represented_name': 2028,
         'marker_fax': 2158,
         'fax': 2218,
         'company_address': 10512,
         'marker_company_address': 1264,
         'company_name': 10798})

In [8]:
labels = list(set(all_labels))
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for idx, label in enumerate(labels)}

print(labels)
print(label2id)
print(id2label)

label_list = labels

['text', 'marker_tax', 'marker_company_address', 'represented_name', 'bank_name', 'phone', 'marker_swift_code', 'company_name', 'marker_phone', 'marker_fax', 'bank_address', 'account_number', 'marker_account_number', 'marker_represented_name', 'swift_code', 'tax', 'company_address', 'fax', 'marker_bank_address', 'marker_bank_name', 'marker_company_name']
{'text': 0, 'marker_tax': 1, 'marker_company_address': 2, 'represented_name': 3, 'bank_name': 4, 'phone': 5, 'marker_swift_code': 6, 'company_name': 7, 'marker_phone': 8, 'marker_fax': 9, 'bank_address': 10, 'account_number': 11, 'marker_account_number': 12, 'marker_represented_name': 13, 'swift_code': 14, 'tax': 15, 'company_address': 16, 'fax': 17, 'marker_bank_address': 18, 'marker_bank_name': 19, 'marker_company_name': 20}
{0: 'text', 1: 'marker_tax', 2: 'marker_company_address', 3: 'represented_name', 4: 'bank_name', 5: 'phone', 6: 'marker_swift_code', 7: 'company_name', 8: 'marker_phone', 9: 'marker_fax', 10: 'bank_address', 11: 

# Data Loader

In [9]:
from os import listdir
from torch.utils.data import Dataset
import torch
from PIL import Image

class CORDDataset(Dataset):
    """CORD dataset."""

    def __init__(self, annotations, processor=None, max_length=512):
        """
        Args:
            annotations (List[List]): List of lists containing the word-level annotations (words, labels, boxes).
            image_dir (string): Directory with all the document images.
            processor (LayoutLMv2Processor): Processor to prepare the text + image.
        """
        self.words, self.boxes, self.labels, self.img_paths = annotations
        self.processor = processor
        self.multi_split = {}

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, index):
        # first, take an image
        item = self.img_paths[index]
        image = Image.open(item).convert("RGB")

        # get word-level annotations 
        words = self.words[index]
        boxes = self.boxes[index]
        word_labels = self.labels[index]

        word_labels = [label2id[label] for label in word_labels]

        encoded_inputs = processor(image, words, boxes=boxes, word_labels=word_labels, truncation=True, stride =128, 
                            padding="max_length", max_length=512, return_overflowing_tokens=True, return_offsets_mapping=True, return_tensors="pt")  
        
        # print(encoded_inputs.keys())
        overflow_to_sample_mapping = encoded_inputs.pop('overflow_to_sample_mapping')
        offset_mapping = encoded_inputs.pop('offset_mapping')
        # print('overflow_to_sample_mapping: ', overflow_to_sample_mapping)
        # print('offset_mapping: ', offset_mapping)

        # remove batch dimension
        idx = np.random.randint(0, len(encoded_inputs['pixel_values']))
        for k, v in encoded_inputs.items():
            encoded_inputs[k] = v[idx]
      
        return encoded_inputs

In [10]:
from transformers import LayoutLMv3Processor, LayoutXLMTokenizerFast, LayoutLMv2FeatureExtractor

# feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
# tokenizer = LayoutXLMTokenizerFast.from_pretrained('microsoft/layoutxlm-base')
# tokenizer.only_label_first_subword = False
# processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor = LayoutLMv3Processor.from_pretrained('microsoft/layoutlmv3-base', apply_ocr=False)
processor.tokenizer.only_label_first_subword = False

print(processor.tokenizer)
print(processor.tokenizer.only_label_first_subword)
print(processor.feature_extractor)

LayoutLMv3TokenizerFast(name_or_path='microsoft/layoutlmv3-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})
False
LayoutLMv3ImageProcessor {
  "apply_ocr": false,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "feature_ex



In [11]:
train_dataset = CORDDataset(annotations=train_annotations,
                            processor=processor)
val_dataset = CORDDataset(annotations=val_annotations,
                            processor=processor)
# test_dataset = CORDDataset(annotations=test,
#                             processor=processor)
print(len(train_dataset))
print(len(val_dataset))

777
111


In [12]:
encoding= train_dataset[0]
for k,v in encoding.items():
  print(k, v.shape)

input_ids torch.Size([512])
attention_mask torch.Size([512])
bbox torch.Size([512, 4])
labels torch.Size([512])
pixel_values torch.Size([3, 224, 224])


In [13]:
ls_token = [processor.tokenizer.decode(input_id) for input_id in encoding['input_ids']]
ls_label = [id2label[int(label_id)] if label_id != -100 else 'SPECIAL' for label_id in encoding['labels'] ]
ls_bb = list(encoding['bbox'])
for item in zip(ls_token, ls_label, ls_bb):
  print(item)
  # break

('<s>', 'SPECIAL', tensor([0, 0, 0, 0]))
(' :', 'swift_code', tensor([390, 829, 496, 846]))
('k', 'swift_code', tensor([390, 829, 496, 846]))
('ib', 'swift_code', tensor([390, 829, 496, 846]))
('gg', 'swift_code', tensor([390, 829, 496, 846]))
('be', 'swift_code', tensor([390, 829, 496, 846]))
('1', 'swift_code', tensor([390, 829, 496, 846]))
('hya', 'swift_code', tensor([390, 829, 496, 846]))
(' no', 'marker_swift_code', tensor([364, 824, 385, 850]))
(' swift', 'marker_swift_code', tensor([329, 822, 364, 850]))
(' over', 'bank_address', tensor([587, 801, 676, 821]))
('ij', 'bank_address', tensor([587, 801, 676, 821]))
('s', 'bank_address', tensor([587, 801, 676, 821]))
('sel', 'bank_address', tensor([587, 801, 676, 821]))
(' al', 'bank_address', tensor([521, 799, 583, 822]))
('mel', 'bank_address', tensor([521, 799, 583, 822]))
('o', 'bank_address', tensor([521, 799, 583, 822]))
(' 7', 'bank_address', tensor([466, 799, 516, 822]))
('607', 'bank_address', tensor([466, 799, 516, 822]))


In [14]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=True)
# test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

for item in train_dataloader:
  for k, v in item.items():
    print(k, v.shape)
  break

input_ids torch.Size([4, 512])
attention_mask torch.Size([4, 512])
bbox torch.Size([4, 512, 4])
labels torch.Size([4, 512])
pixel_values torch.Size([4, 3, 224, 224])


# Model

In [15]:
from transformers import LayoutLMv3ForTokenClassification, AdamW
import torch
from tqdm.notebook import tqdm

model = LayoutLMv3ForTokenClassification.from_pretrained('microsoft/layoutlmv3-base',
                                                         id2label=id2label,
                                                         label2id=label2id)

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Hugging Face Trainer

In [16]:
label_list

['text',
 'marker_tax',
 'marker_company_address',
 'represented_name',
 'bank_name',
 'phone',
 'marker_swift_code',
 'company_name',
 'marker_phone',
 'marker_fax',
 'bank_address',
 'account_number',
 'marker_account_number',
 'marker_represented_name',
 'swift_code',
 'tax',
 'company_address',
 'fax',
 'marker_bank_address',
 'marker_bank_name',
 'marker_company_name']

In [17]:
import evaluate

metric = evaluate.load("seqeval")

import numpy as np
from seqeval.metrics import classification_report

return_entity_level_metrics = False

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    if return_entity_level_metrics:
        # Unpack nested dictionaries
        final_results = {}
        for key, value in results.items():
            if isinstance(value, dict):
                for n, v in value.items():
                    final_results[f"{key}_{n}"] = v
            else:
                final_results[key] = value
        return final_results
    else:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

In [18]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="ckpt/layoutlmv3-fake-masked-24032023",
                                  num_train_epochs=50,
                                  learning_rate=5e-5,
                                  evaluation_strategy="steps",
                                  save_strategy='steps',
                                  eval_steps=250,
                                  save_steps=500,
                                  save_total_limit=5,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  warmup_ratio = 0.1,
                                  do_eval=True)

In [19]:
from transformers.data.data_collator import default_data_collator

class CustomTrainer(Trainer):
  def get_train_dataloader(self):
    return train_dataloader

  def get_eval_dataloader(self, eval_dataset = None):
    return val_dataloader

# Initialize our Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor.tokenizer,
    compute_metrics=compute_metrics,
)

In [20]:
trainer.train()



Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
250,No log,0.654484,0.621272,0.618487,0.619876,0.851249
500,1.165800,0.110945,0.929937,0.944351,0.937089,0.981697
750,1.165800,0.076799,0.946824,0.95761,0.952186,0.985221
1000,0.075600,0.037533,0.968912,0.977778,0.973325,0.992987
1250,0.075600,0.039524,0.968605,0.973669,0.971131,0.991173
1500,0.037100,0.033312,0.97668,0.985434,0.981037,0.995245
1750,0.037100,0.023261,0.981658,0.979458,0.980557,0.995245
2000,0.022400,0.016164,0.977247,0.978525,0.977886,0.994766
2250,0.022400,0.016013,0.983445,0.987302,0.985369,0.996989
2500,0.013900,0.030974,0.980561,0.979645,0.980103,0.995279


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=9750, training_loss=0.06942500471820434, metrics={'train_runtime': 5489.7093, 'train_samples_per_second': 7.077, 'train_steps_per_second': 1.776, 'total_flos': 1.03128379780608e+16, 'train_loss': 0.06942500471820434, 'epoch': 50.0})

In [21]:
trainer.evaluate()

{'eval_loss': 0.012266192585229874,
 'eval_precision': 0.9889925373134328,
 'eval_recall': 0.9899159663865547,
 'eval_f1': 0.9894540363975735,
 'eval_accuracy': 0.9979815258296271,
 'eval_runtime': 8.5351,
 'eval_samples_per_second': 13.005,
 'eval_steps_per_second': 1.64,
 'epoch': 50.0}

In [22]:
model.save_pretrained('ckpt/masked/best_model_at_end')

In [23]:
! mv ckpt/layoutlmv3-fake-masked-24032023/runs ckpt/masked

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
!rm -rf ckpt/layoutlmv3-fake-masked-24032023

In [24]:
loaded_model = LayoutLMv3ForTokenClassification.from_pretrained('ckpt/masked/best_model_at_end').to('cuda')
trainer.model = loaded_model
trainer.evaluate()

Exception in thread Thread-7:
Traceback (most recent call last):
  File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/data/tungtx2/env_ocr/lib/python3.7/site-packages/tensorboard/summary/writer/event_file_writer.py", line 233, in run
    self._record_writer.write(data)
  File "/data/tungtx2/env_ocr/lib/python3.7/site-packages/tensorboard/summary/writer/record_writer.py", line 40, in write
    self._writer.write(header + header_crc + data + footer_crc)
  File "/data/tungtx2/env_ocr/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 766, in write
    self.fs.append(self.filename, file_content, self.binary_mode)
  File "/data/tungtx2/env_ocr/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 160, in append
    self._write(filename, file_content, "ab" if binary_mode else "a")
  File "/data/tungtx2/env_ocr/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 

KeyboardInterrupt: 

In [27]:
loaded_model

LayoutLMv3ForTokenClassification(
  (layoutlmv3): LayoutLMv3Model(
    (embeddings): LayoutLMv3TextEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (x_position_embeddings): Embedding(1024, 128)
      (y_position_embeddings): Embedding(1024, 128)
      (h_position_embeddings): Embedding(1024, 128)
      (w_position_embeddings): Embedding(1024, 128)
    )
    (patch_embed): LayoutLMv3PatchEmbeddings(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
    (encoder): LayoutLMv3Encoder

In [28]:
!cp -r ckpt/layoutlmv3-fake-masked-24032023/checkpoint-7500 ckpt/masked

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [29]:
!rm -rf ckpt/layoutlmv3-fake-masked-24032023

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
