# Import Lib

In [1]:
!nvidia-smi

Sun Mar 26 07:40:06 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:00:06.0 Off |                  N/A |
| 27%   39C    P8    20W / 250W |      1MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:00:0A.0 Off |                  N/A |
| 27%   39C    P8    21W / 250W |      1MiB / 11019MiB |      0%      Default |
|       

In [2]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/data/tungtx2/tmp/transformers_hub'
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
import os
import json
from pathlib import Path
import numpy as np
from PIL import Image
import torch

torch.__version__

  from .autonotebook import tqdm as notebook_tqdm


'1.13.1+cu117'

# Prepare Data

In [15]:
train_dir = 'real_data/train_labeled_ocred'
val_dir = 'real_data/val_labeled_ocred'

def find_all_labels(data_dir, disable_marker=False):
    labels = {}
    for jp in Path(data_dir).rglob('*.json'):
        data = json.load(open(jp))
        for shape in data['shapes']:
            if disable_marker and 'marker' in shape['label']:
                label = 'text'
            else:
                label = shape['label']
                
            if label in labels:
                labels[label] += 1
            else:
                labels[label] = 1
                
    return labels

train_labels = find_all_labels(train_dir, disable_marker=True)
val_labels = find_all_labels(val_dir, disable_marker=True)
assert set(train_labels.keys()) == set(val_labels.keys())
for k, v in train_labels.items():
    print(k, ':', v)

text : 70061
account_number : 419
swift_code : 235
bank_name : 1493
fax : 566
phone : 963
company_address : 7566
company_name : 3916
bank_address : 1228
represented_position : 464
represented_name : 1076
tax : 57


In [16]:
label_list = list(set(train_labels.keys()))
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for idx, label in enumerate(label_list)}

print(label2id)
print(id2label)

{'bank_name': 0, 'fax': 1, 'phone': 2, 'swift_code': 3, 'company_name': 4, 'represented_position': 5, 'tax': 6, 'company_address': 7, 'represented_name': 8, 'text': 9, 'account_number': 10, 'bank_address': 11}
{0: 'bank_name', 1: 'fax', 2: 'phone', 3: 'swift_code', 4: 'company_name', 5: 'represented_position', 6: 'tax', 7: 'company_address', 8: 'represented_name', 9: 'text', 10: 'account_number', 11: 'bank_address'}


# Data Loader

In [6]:
from transformers import LayoutLMv3Processor, LayoutXLMTokenizerFast, LayoutLMv2FeatureExtractor

# feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
# tokenizer = LayoutXLMTokenizerFast.from_pretrained('microsoft/layoutxlm-base')
# tokenizer.only_label_first_subword = False
# processor = LayoutXLMProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor = LayoutLMv3Processor.from_pretrained('microsoft/layoutlmv3-base', apply_ocr=False)
processor.tokenizer.only_label_first_subword = False

print(processor.tokenizer)
print(processor.tokenizer.only_label_first_subword)
print(processor.image_processor)

LayoutLMv3TokenizerFast(name_or_path='microsoft/layoutlmv3-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})
False
LayoutLMv3ImageProcessor {
  "apply_ocr": false,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "feature_ex

In [17]:
from os import listdir
from torch.utils.data import Dataset
import torch
from PIL import Image
import unidecode
from PIL import Image, ImageDraw, ImageFont
import pdb
import xml.etree.ElementTree as ET
from shapely.geometry import Polygon
import cv2


def normalize_bbox(bbox, width, height):
     return [
         int(1000 * (bbox[0] / width)),
         int(1000 * (bbox[1] / height)),
         int(1000 * (bbox[2] / width)),
         int(1000 * (bbox[3] / height)),
     ]
    
    
def parse_xml(xml_path):
    root = ET.parse(xml_path).getroot()
    objs = root.findall('object')
    boxes, obj_names = [], []
    for obj in objs:
        obj_name = obj.find('name').text
        box = obj.find('bndbox')
        xmin = int(box.find('xmin').text)
        ymin = int(box.find('ymin').text)
        xmax = int(box.find('xmax').text)
        ymax = int(box.find('ymax').text)
        boxes.append([xmin, ymin, xmax, ymax])
        obj_names.append(obj_name)
    return boxes, obj_names


def widen_box(box, percent_x, percent_y):
        xmin, ymin, xmax, ymax = box
        w = xmax - xmin
        h = ymax - ymin
        xmin -= w * percent_x
        ymin -= h * percent_y
        xmax += w * percent_x
        ymax += h * percent_y
        return (int(xmin), int(ymin), int(xmax), int(ymax))

    
def draw_json_on_img(img, json_data):
    labels = list(set(shape['label'] for shape in json_data['shapes']))
    color = {}
    for i in range(len(labels)):
        color[labels[i]] = (np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255))
        
    img = img.copy()
    draw = ImageDraw.Draw(img)
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_size = 0.5# Draw the text on the image
    # font = ImageFont.truetype(font.font.family, font_size)
    for i, shape in enumerate(json_data['shapes']):
        polys = shape['points']
        polys = [(int(pt[0]), int(pt[1])) for pt in polys]
        label = shape['label']
        draw.polygon(polys, outline=color[label], width=2)
        # Draw the text on the image
        img = np.array(img)
        cv2.putText(img, shape['label'], (polys[0][0], polys[0][1]-5), font, font_size, color[label], thickness=1)
        img = Image.fromarray(img)
        draw = ImageDraw.Draw(img)
    return img
    
    
def mask_image(img, boxes, json_data, widen_range_x, widen_range_y):
    # widen block
    if isinstance(widen_range_x, list) and isinstance(widen_range_y, list):
        boxes = [widen_box(box, np.random.uniform(widen_range_x[0], widen_range_x[1]), np.random.uniform(widen_range_y[0], widen_range_y[1])) for box in boxes]
    else:
        boxes = [widen_box(box, widen_range_x, widen_range_y) for box in boxes]
        
    
    ls_polys2keep = []
    ls_area2keep = []
    iou_threshold = 0.
    for box_idx, box in enumerate(boxes):
        xmin, ymin, xmax, ymax = box
        box_pts = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)]
        p_box = Polygon(box_pts)
        for shape_idx, shape in enumerate(json_data['shapes']):
            if shape_idx in ls_polys2keep:
                continue
            pts = shape['points']
            p_shape = Polygon(pts)
            intersect_area = p_box.intersection(p_shape).area
            if intersect_area / p_shape.area > iou_threshold:
                ls_polys2keep.append(shape_idx)
                pts = [coord for pt in pts for coord in pt]
                poly_xmin = min(pts[::2])
                poly_ymin = min(pts[1::2])
                poly_xmax = max(pts[::2])
                poly_ymax = max(pts[1::2])
                ls_area2keep.append((poly_xmin, poly_ymin, poly_xmax, poly_ymax))

    # mask white all area of image that is not in block
    mask = np.zeros(img.shape[:2], dtype=np.uint8)
    for box in boxes:
        xmin, ymin, xmax, ymax = box
        xmin = max(0, xmin)
        ymin = max(0, ymin)
        xmax = min(img.shape[1], xmax)
        ymax = min(img.shape[0], ymax)
        mask[ymin:ymax, xmin:xmax] = 255

    for area2keep in ls_area2keep:
        xmin, ymin, xmax, ymax = area2keep
        xmin = int(max(0, xmin))
        ymin = int(max(0, ymin))
        xmax = int(min(img.shape[1], xmax))
        ymax = int(min(img.shape[0], ymax))
        mask[ymin:ymax, xmin:xmax] = 255

    # mask white
    img[mask == 0] = 255

    # delete all poly that is not in block
    ls_idx2del = [idx for idx, shape in enumerate(json_data['shapes']) if idx not in ls_polys2keep]
    for idx in sorted(ls_idx2del, reverse=True):
        del json_data['shapes'][idx]

    return img, json_data
        

def gen_annotation_for_img(img_fp, xml_fp, json_fp, masked=False, widen_range_x=[0.1, 0.2], widen_range_y=[0.1, 0.25], disable_marker=False):
    img = Image.open(img_fp).convert("RGB")
    json_data = json.load(open(json_fp))
    
    if masked:
        block_boxes, obj_names = parse_xml(xml_fp)
        img, json_data = mask_image(np.array(img), boxes=block_boxes, json_data=json_data, widen_range_x=widen_range_x, widen_range_y=widen_range_y)
        img = Image.fromarray(img)
    # pdb.set_trace()
        
    words, orig_polys, normalized_boxes, labels = [], [], [], []
    img_h, img_w = json_data['imageHeight'], json_data['imageWidth']
    for i, shape in enumerate(json_data['shapes']):
        if disable_marker and 'marker' in shape['label']:
            current_label = 'text'
        else:
            current_label = shape['label']
            
        words.append(unidecode.unidecode(shape['text'].lower()))
        # words.append(shape['text'].lower())
        labels.append(current_label)
        pts = [coord for pt in shape['points'] for coord in pt]
        xmin = min(pts[0::2])
        xmax = max(pts[0::2])
        ymin = min(pts[1::2])
        ymax = max(pts[1::2])

        xmin = max(xmin, 0)
        ymin = max(ymin, 0)
        xmax = min(img_w, xmax)
        ymax = min(img_h, ymax)

        normalized_boxes.append(normalize_bbox((xmin, ymin, xmax, ymax), img_w, img_h))
        orig_polys.append(tuple([tuple(pt) for pt in shape['points']]))

    return img, words, orig_polys, normalized_boxes, labels


class CORDDataset(Dataset):
    """CORD dataset."""

    def __init__(self, file_paths, processor=None, max_length=512, masked=False, widen_range_x=[0.1, 0.2], widen_range_y=[0.1, 0.25], disable_marker=False):
        """
        Args:
            annotations (List[List]): List of lists containing the word-level annotations (words, labels, boxes).
            image_dir (string): Directory with all the document images.
            processor (LayoutLMv2Processor): Processor to prepare the text + image.
        """
        self.ls_img_fp, self.ls_xml_fp, self.ls_json_fp = file_paths
        assert len(self.ls_img_fp) == len(self.ls_json_fp) == len(self.ls_xml_fp)
        self.processor = processor
        self.masked = masked
        self.widen_range_x = widen_range_x
        self.widen_range_y = widen_range_y
        self.disable_marker = disable_marker

    def __len__(self):
        return len(self.ls_img_fp)

    def __getitem__(self, index):
        # first, take an image
        img_fp = self.ls_img_fp[index]
        xml_fp = self.ls_xml_fp[index]
        json_fp = self.ls_json_fp[index]
        
        img, words, _, boxes, text_labels = gen_annotation_for_img(img_fp, xml_fp, json_fp, masked=self.masked, widen_range_x=self.widen_range_x, widen_range_y=self.widen_range_y, disable_marker=self.disable_marker)
        idx_labels = [label2id[label] for label in text_labels]

        encoded_inputs = self.processor(img, words, boxes=boxes, word_labels=idx_labels, truncation=True, stride =128, 
                            padding="max_length", max_length=512, return_overflowing_tokens=True, return_offsets_mapping=True, return_tensors="pt")  
        
        # print(encoded_inputs.keys())
        overflow_to_sample_mapping = encoded_inputs.pop('overflow_to_sample_mapping')
        offset_mapping = encoded_inputs.pop('offset_mapping')
        # print('overflow_to_sample_mapping: ', overflow_to_sample_mapping)
        # print('offset_mapping: ', offset_mapping)

        # remove batch dimension
        idx = np.random.randint(0, len(encoded_inputs['pixel_values']))
        for k, v in encoded_inputs.items():
            encoded_inputs[k] = v[idx]
      
        return encoded_inputs

In [18]:
def get_file_paths(data_dir):
    ls_img_fp, ls_xml_fp, ls_json_fp = [], [], []
    for img_fp in Path(data_dir).rglob('*.jpg'):
        json_fp = img_fp.with_suffix('.json')
        xml_fp = img_fp.with_suffix('.xml')
        
        ls_img_fp.append(str(img_fp))
        ls_xml_fp.append(str(xml_fp))
        ls_json_fp.append(str(json_fp))
    
    return ls_img_fp, ls_xml_fp, ls_json_fp


train_file_paths = get_file_paths(train_dir)
val_file_paths = get_file_paths(val_dir)

widen_range_x = [0.1, 0.2]
widen_range_y = [0.1, 0.25]
disable_marker = True
train_dataset = CORDDataset(file_paths=train_file_paths, processor=processor, masked=False, 
                            widen_range_x=widen_range_x, widen_range_y=widen_range_y, disable_marker=disable_marker)
val_dataset = CORDDataset(file_paths=val_file_paths, processor=processor, masked=False, 
                          widen_range_x=0.1, widen_range_y=0.15, disable_marker=disable_marker)

print(len(train_dataset))
print(len(val_dataset))

303
33


In [21]:
encoding = val_dataset[9]
for k,v in encoding.items():
  print(k, v.shape)

input_ids torch.Size([512])
attention_mask torch.Size([512])
bbox torch.Size([512, 4])
labels torch.Size([512])
pixel_values torch.Size([3, 224, 224])


In [22]:
ls_token = [processor.tokenizer.decode(input_id) for input_id in encoding['input_ids']]
ls_label = [id2label[int(label_id)] if label_id != -100 else 'SPECIAL' for label_id in encoding['labels'] ]
ls_bb = list(encoding['bbox'])
for item in zip(ls_token, ls_label, ls_bb):
  print(item)
  # break

('<s>', 'SPECIAL', tensor([0, 0, 0, 0]))
(' city', 'bank_address', tensor([386, 324, 416, 338]))
(',', 'bank_address', tensor([386, 324, 416, 338]))
(' min', 'bank_address', tensor([355, 324, 388, 338]))
('h', 'bank_address', tensor([355, 324, 388, 338]))
(' chi', 'bank_address', tensor([333, 324, 357, 340]))
(' as', 'bank_name', tensor([182, 324, 210, 338]))
('ia', 'bank_name', tensor([182, 324, 210, 338]))
(' at', 'text', tensor([165, 326, 178, 338]))
(' ho', 'bank_address', tensor([313, 324, 333, 338]))
(' 19', 'account_number', tensor([218, 313, 262, 322]))
('199', 'account_number', tensor([218, 313, 262, 322]))
('66', 'account_number', tensor([218, 313, 262, 322]))
(' no', 'text', tensor([186, 309, 215, 324]))
('.:', 'text', tensor([186, 309, 215, 324]))
(' a', 'text', tensor([165, 309, 188, 324]))
('/', 'text', tensor([165, 309, 188, 324]))
('c', 'text', tensor([165, 309, 188, 324]))
(' -', 'represented_position', tensor([401, 298, 457, 310]))
('director', 'represented_position',

In [23]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=True)
# test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

for item in train_dataloader:
  for k, v in item.items():
    print(k, v.shape)
  break

input_ids torch.Size([4, 512])
attention_mask torch.Size([4, 512])
bbox torch.Size([4, 512, 4])
labels torch.Size([4, 512])
pixel_values torch.Size([4, 3, 224, 224])


# Model

In [24]:
from transformers import LayoutLMv3ForTokenClassification, AdamW
import torch
from tqdm.notebook import tqdm

model = LayoutLMv3ForTokenClassification.from_pretrained('ckpt/nonmasked/real_data/layoutlmv3_pretrained_fake_data/checkpoint-11000-best_f1-0.988')
# model = LayoutLMv3ForTokenClassification.from_pretrained('microsoft/layoutlmv3-base', id2label=id2label)

print(model)

LayoutLMv3ForTokenClassification(
  (layoutlmv3): LayoutLMv3Model(
    (embeddings): LayoutLMv3TextEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (x_position_embeddings): Embedding(1024, 128)
      (y_position_embeddings): Embedding(1024, 128)
      (h_position_embeddings): Embedding(1024, 128)
      (w_position_embeddings): Embedding(1024, 128)
    )
    (patch_embed): LayoutLMv3PatchEmbeddings(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
    (encoder): LayoutLMv3Encoder

In [26]:
label_list

['bank_name',
 'fax',
 'phone',
 'swift_code',
 'company_name',
 'represented_position',
 'tax',
 'company_address',
 'represented_name',
 'text',
 'account_number',
 'bank_address']

In [25]:
import torch.nn as nn

model.classifier.out_proj = nn.Linear(in_features=768, out_features=len(label_list), bias=True)
model.config.id2label = id2label
model.config.label2id = label2id
model.num_labels = len(label_list)

print(model)
print()
print(model.config)

LayoutLMv3ForTokenClassification(
  (layoutlmv3): LayoutLMv3Model(
    (embeddings): LayoutLMv3TextEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (x_position_embeddings): Embedding(1024, 128)
      (y_position_embeddings): Embedding(1024, 128)
      (h_position_embeddings): Embedding(1024, 128)
      (w_position_embeddings): Embedding(1024, 128)
    )
    (patch_embed): LayoutLMv3PatchEmbeddings(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
    (encoder): LayoutLMv3Encoder

In [27]:
len(label_list)

12

# Hugging Face Trainer

In [28]:
label_list

['bank_name',
 'fax',
 'phone',
 'swift_code',
 'company_name',
 'represented_position',
 'tax',
 'company_address',
 'represented_name',
 'text',
 'account_number',
 'bank_address']

In [29]:
import evaluate

metric = evaluate.load("seqeval")

import numpy as np
from seqeval.metrics import classification_report

return_entity_level_metrics = False

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    if return_entity_level_metrics:
        # Unpack nested dictionaries
        final_results = {}
        for key, value in results.items():
            if isinstance(value, dict):
                for n, v in value.items():
                    final_results[f"{key}_{n}"] = v
            else:
                final_results[key] = value
        return final_results
    else:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

In [31]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="ckpt/nonmasked/real_data/layoutlmv3_disable_marker_pretrained_real_data",
                                  num_train_epochs=100,
                                  learning_rate=5e-5,
                                  evaluation_strategy="steps",
                                  save_strategy='steps',
                                  eval_steps=250,
                                  save_steps=250,
                                  save_total_limit=15,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  warmup_ratio = 0.1,
                                  do_eval=True)

In [32]:
from transformers.data.data_collator import default_data_collator

class CustomTrainer(Trainer):
  def get_train_dataloader(self):
    return train_dataloader

  def get_eval_dataloader(self, eval_dataset = None):
    return val_dataloader

# Initialize our Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor.tokenizer,
    compute_metrics=compute_metrics,
)

In [33]:
trainer.train()



Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
250,No log,0.010004,0.971429,0.977011,0.974212,0.996526
500,0.344600,0.005118,0.970443,0.975248,0.97284,0.997174
750,0.344600,0.020842,0.970238,0.971684,0.970961,0.996999
1000,0.010600,0.031682,0.949695,0.962906,0.956255,0.995011
1250,0.010600,0.029029,0.96063,0.959119,0.959874,0.995576
1500,0.007200,0.015756,0.96897,0.971711,0.970339,0.995702
1750,0.007200,0.022821,0.970543,0.978125,0.974319,0.997677
2000,0.004100,0.049145,0.971768,0.971768,0.971768,0.996981
2250,0.004100,0.007198,0.982249,0.983704,0.982976,0.998446
2500,0.002600,0.028875,0.98415,0.982734,0.983441,0.998398




TrainOutput(global_step=7600, training_loss=0.024756601003219227, metrics={'train_runtime': 4449.3985, 'train_samples_per_second': 6.81, 'train_steps_per_second': 1.708, 'total_flos': 8042573046988800.0, 'train_loss': 0.024756601003219227, 'epoch': 100.0})

In [18]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sat Mar 25 15:10:47 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:00:06.0 Off |                  N/A |
| 27%   39C    P8    20W / 250W |   8505MiB / 11019MiB |      0%      Default |
|                               |            

In [19]:
model.save_pretrained('ckpt/nonmasked/real_data/layoutlmv3_pretrained_fake_data/best_model_at_end')

In [20]:
trainer.evaluate()



{'eval_loss': 0.024930810555815697,
 'eval_precision': 0.9762470308788599,
 'eval_recall': 0.9797377830750894,
 'eval_f1': 0.9779892920880429,
 'eval_accuracy': 0.9976113530982156,
 'eval_runtime': 3.2547,
 'eval_samples_per_second': 10.139,
 'eval_steps_per_second': 1.536,
 'epoch': 100.0}

In [22]:
trainer.save_model('ckpt/nonmasked/real_data/layoutlmv3_pretrained_fake_data/best_model')

In [27]:
model_dir = 'ckpt/nonmasked/real_data/layoutlmv3_disable_marker_pretrained_real_data'
max_f1, max_acc = 0, 0
best_f1_model, best_acc_model = None, None
for model_fn in os.listdir(model_dir):
    if model_fn == 'runs':
        continue
    model_fp = os.path.join(model_dir, model_fn)
    loaded_model = LayoutLMv3ForTokenClassification.from_pretrained(model_fp).to('cuda')
    trainer.model = loaded_model
    res = trainer.evaluate()
    if res['eval_f1'] > max_f1:
        best_f1_model = model_fn
        max_f1 = res['eval_f1']
    if res['eval_accuracy'] > max_acc:
        best_acc_model = model_fn
        max_acc = res['eval_accuracy']

print(f'Best f1 model: {best_f1_model} - {max_f1}')
print(f'Best acc model: {best_acc_model} - {max_acc}')



OSError: ckpt/nonmasked/real_data/layoutlmv3_pretrained_fake_data/runs does not appear to have a file named config.json. Checkout 'https://huggingface.co/ckpt/nonmasked/real_data/layoutlmv3_pretrained_fake_data/runs/None' for available files.

In [28]:
print(f'Best f1 model: {best_f1_model} - {max_f1}')
print(f'Best acc model: {best_acc_model} - {max_acc}')

Best f1 model: checkpoint-11000 - 0.9887399463806971
Best acc model: checkpoint-14000 - 0.9987273945077026


In [34]:
# delete model
import shutil

model_dir = 'ckpt/nonmasked/real_data/layoutlmv3_disable_marker_pretrained_real_data'
for model_fn in os.listdir(model_dir):
    if model_fn == 'runs':
        continue
    model_fp = os.path.join(model_dir, model_fn)
    if model_fn != 'checkpoint-4250':
        shutil.rmtree(model_fp)
        print(f'removed {model_fn}')

removed checkpoint-7000
removed checkpoint-5250
removed checkpoint-6250
removed checkpoint-5500
removed checkpoint-6000
removed checkpoint-4500
removed checkpoint-4750
removed checkpoint-7250
removed checkpoint-5000
removed checkpoint-6750
removed checkpoint-4000
removed checkpoint-5750
removed checkpoint-6500
removed checkpoint-7500


In [24]:
! rm -rf ckpt/nonmasked/real_data/layoutlmv3_pretrained_fake_data/best_model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Inference

In [11]:
from transformers import LayoutLMv3ForTokenClassification, LayoutLMv3Processor

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = LayoutLMv3ForTokenClassification.from_pretrained('ckpt/nonmasked/real_data/layoutlmv3_pretrained_fake_data/checkpoint-11000-best_f1-0.988').eval().to(device)
processor = LayoutLMv3Processor.from_pretrained('microsoft/layoutlmv3-base', apply_ocr=False)
processor.tokenizer.only_label_first_subword = False

print(model)
print(processor)

LayoutLMv3ForTokenClassification(
  (layoutlmv3): LayoutLMv3Model(
    (embeddings): LayoutLMv3TextEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (x_position_embeddings): Embedding(1024, 128)
      (y_position_embeddings): Embedding(1024, 128)
      (h_position_embeddings): Embedding(1024, 128)
      (w_position_embeddings): Embedding(1024, 128)
    )
    (patch_embed): LayoutLMv3PatchEmbeddings(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
    (encoder): LayoutLMv3Encoder

In [12]:
id2label = model.config.id2label
label2id = model.config.label2id

In [13]:
import pdb
from transformers import LayoutXLMProcessor
from collections import Counter
from transformers import LayoutXLMProcessor, LayoutXLMTokenizerFast, LayoutLMv2FeatureExtractor, LayoutLMv3Processor



def denormalize(bb, img_w, img_h):
  return (
      int(bb[0] / 1000 * img_w),
      int(bb[1] / 1000 * img_h),
      int(bb[2] / 1000 * img_w),
      int(bb[3] / 1000 * img_h),
  )


def predict(img, words, boxes, model, processor):
    assert len(words) == len(boxes)
    preds_val = None
    img_w, img_h = img.size
    
    # encode input for model
    encoded_inputs = processor(img, words, boxes=boxes, truncation=True, stride=128,
                               padding="max_length", max_length=512, return_overflowing_tokens=True, return_offsets_mapping=True, return_tensors="pt")
    encoded_inputs.pop('overflow_to_sample_mapping')
    encoded_inputs.pop('offset_mapping')

    n = len(encoded_inputs['bbox'])
    print(f'{n} split')

    wordidx2label = {}
    for idx, image in enumerate(encoded_inputs['pixel_values']):
        # prepare input to model
        input_ids = encoded_inputs['input_ids'][idx].unsqueeze(0).to(device)
        bbox = encoded_inputs['bbox'][idx].unsqueeze(0).to(device)
        image = encoded_inputs['pixel_values'][idx].unsqueeze(0).to(device)
        attention_mask = encoded_inputs['attention_mask'][idx].unsqueeze(0).to(device)

        # forward
        outputs = model(input_ids=input_ids, bbox=bbox, pixel_values=image, attention_mask=attention_mask)

        # process output
        preds_val = outputs.logits.detach().cpu().numpy()[0].tolist()
        words_idx = encoded_inputs.words(idx)
        for i, (pred, wordidx) in enumerate(zip(preds_val, words_idx)):
            if wordidx is None:
                continue
            if wordidx not in wordidx2label:
              wordidx2label[wordidx] = [np.argmax(pred)]
            else:
              wordidx2label[wordidx].append(np.argmax(pred))

    wordidx2label = {wordidx: Counter(label).most_common(1)[0][0] for wordidx, label in wordidx2label.items()}
    return wordidx2label, encoded_inputs

In [49]:
def predict_dir(data_dir):
    result_dict = {}
    for jp in Path(data_dir).rglob('*.json'):
        img, words, orig_polys, normalized_boxes, labels = gen_annotation_for_img(img_fp=jp.with_suffix('.jpg'), 
                                                           json_fp=jp, 
                                                           xml_fp=jp.with_suffix('.xml'), 
                                                           masked=False)
        res, encoding = predict(img, words, normalized_boxes, model, processor)
        res = {tuple(orig_polys[wordidx]): (words[wordidx], pred_label, label2id[labels[wordidx]]) for wordidx, pred_label in res.items()}
        result_dict[jp.with_suffix('.jpg')] = res
        print(f'Done {jp}')
    return result_dict
    
data_dir = 'ocr2/extracted_test'
result_dict = predict_dir(data_dir)

3 split
Done ocr2/extracted_test/CTR 970_0.json
1 split




Done ocr2/extracted_test/CTR 957_0.json
1 split
Done ocr2/extracted_test/CTR 978_1.json
2 split
Done ocr2/extracted_test/CTR 973_0.json
1 split
Done ocr2/extracted_test/CTR 604_0.json
2 split
Done ocr2/extracted_test/CTR 822_0.json
2 split
Done ocr2/extracted_test/CTR 959_0.json
2 split
Done ocr2/extracted_test/CTR 825_0.json
1 split
Done ocr2/extracted_test/CTR 823_0.json
1 split
Done ocr2/extracted_test/CTR 971_0.json
2 split
Done ocr2/extracted_test/CTR 965_0.json
2 split
Done ocr2/extracted_test/CTR 967_0.json
1 split
Done ocr2/extracted_test/CTR 821_0.json
2 split
Done ocr2/extracted_test/CTR 963_0.json
2 split
Done ocr2/extracted_test/CTR 958_0.json
2 split
Done ocr2/extracted_test/CTR 966_0.json
2 split
Done ocr2/extracted_test/CTR 968_0.json
2 split
Done ocr2/extracted_test/CTR 974_0.json
1 split
Done ocr2/extracted_test/CTR60_0.json
2 split
Done ocr2/extracted_test/CTR 978_0.json
2 split
Done ocr2/extracted_test/CTR 964_0.json
1 split
Done ocr2/extracted_test/CTR 972_0.json
2 

In [24]:
!rm -rf model_output/val_labeled_ocred_layoutlmv3_nonmasked_new_infer_func

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [55]:
import shutil

out_dir = 'model_output/extracted_test_layoutlmv3_nonmasked_new_infer_func'
os.makedirs(out_dir, exist_ok=True)

for img_fp, prediction in result_dict.items():
    data = json.load(open(img_fp.with_suffix('.json')))
    new_shapes = []
    for poly, (text, pred_label, real_label) in prediction.items():
        print('pred: ', pred_label)
        shape = {
            'label': id2label[pred_label],
            'points': poly,
            'shape_type': 'polygon',
            'text': text,
            'flags': {}
        }
        new_shapes.append(shape)
    data['shapes'] = new_shapes

    with open(os.path.join(out_dir, img_fp.with_suffix('.json').name), 'w') as f:
        json.dump(data, f)
    shutil.copy(img_fp, out_dir)
    print(f'Done {img_fp}')

pred:  17
pred:  15
pred:  15
pred:  10
pred:  8
pred:  8
pred:  11
pred:  11
pred:  11
pred:  18
pred:  18
pred:  11
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pred:  13
pr

In [56]:
!zip -r model_output/extracted_test_layoutlmv3_nonmasked_new_infer_func.zip model_output/extracted_test_layoutlmv3_nonmasked_new_infer_func


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: model_output/extracted_test_layoutlmv3_nonmasked_new_infer_func/ (stored 0%)
  adding: model_output/extracted_test_layoutlmv3_nonmasked_new_infer_func/CTR 970_0.json (deflated 84%)
  adding: model_output/extracted_test_layoutlmv3_nonmasked_new_infer_func/CTR 973_0.jpg (deflated 6%)
  adding: model_output/extracted_test_layoutlmv3_nonmasked_new_infer_func/CTR 957_0.json (deflated 84%)
  adding: model_output/extracted_test_layoutlmv3_nonmasked_new_infer_func/CTR 978_1.json (deflated 85%)
  adding: model_output/extracted_test_layoutlmv3_nonmasked_new_infer_func/CTR 973_0.json (deflated 85%)
  adding: model_output/extracted_test_layoutlmv3_nonmasked_new_infer_func/CTR 604_0.json (deflated 85%)
  adding

In [109]:
!du -h model_output/val_labeled_ocred_layoutlmv3_nonmasked_pred_new_infer_func.zip

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
920K	model_output/val_labeled_ocred_layoutlmv3_nonmasked_pred_new_infer_func.zip


# Evaluate

In [54]:
id2label[17]

'swift_code'

In [52]:
for img_fp, predictions in result_dict.items():
    for polys, (text, pred, true) in predictions.items():
        print(text, pred, true)

rikbjpj 17 13
code: 15 13
swift 15 13
1-10-1,keya,fukui-shffuuuljann 10 13
address: 8 13
bank 8 13
branch 11 13
jriku.bank,ltd 11 13
the1... 11 13
name: 18 13
bank 18 13
klda 11 13
len 13 13
seller: 13 13
ofthe 13 13
favor 13 13
in.the 13 13
va 13 13
:o 13 13
chitky 13 13
nv: 13 13
goods 13 13
months 13 13
the 13 13
recavine 13 13
after 13 13
12 13 13
withln 13 13
t/t 13 13
ngav:... 13 13
jx 13 13
.....ll.20 13 13
payment 13 13
of 13 13
terms 13 13
articla.5: 13 13
i 13 13
doi 13 13
da 13 13
chieu 13 13
ban 13 13
chinh 13 13
destinallon: 13 13
minh 13 13
ho-chl 13 13
pen 13 13
of 13 13
:fukul 13 13
add 13 13
loadlng:japan 13 13
or 13 13
place 13 13
shlpment.: 13 13
not.allowad 13 13
partial 13 13
slgned. 13 13
contract 13 13
afler 13 13
2-3weeks 13 13
dellvery 13 13
shipment 13 13
article.a: 13 13
(medin.co.,ltd) 13 13
dinh.company.limited 13 13
marklng: 13 13
cass 13 13
standards 13 13
export 13 13
per 13 13
packing 13 13
as 13 13
article 13 13
marking 13 13
case 13 13
packing 13 13
7

In [41]:
trues = []
preds = []
for img_fp, predictions in result_dict.items():
    for poly, (text, pred_label, real_label) in predictions.items():
        preds.append(pred_label)
        trues.append(real_label)

for true, pred in zip(trues, preds):
    print(true, pred)

13 13
16 16
5 5
5 5
5 5
6 6
6 6
6 6
6 6
6 6
6 6
14 14
13 13
14 14
21 21
21 21
21 21
21 21
21 21
21 21
21 21
21 21
12 12
12 12
1 1
19 19
19 19
17 17
15 15
15 15
13 13
10 10
10 10
10 10
10 10
10 10
10 10
13 13
10 10
10 10
8 8
8 8
13 13
8 8
11 11
18 18
18 18
11 11
11 11
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13
13 13


In [42]:
import evaluate

metric = evaluate.load("seqeval")
results = metric.compute(predictions=preds, references=trues)

ValueError: Predictions and/or references don't match the expected format.
Expected format: {'predictions': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence')},
Input predictions: [13, 16, 5, ..., 13, 13, 13],
Input references: [13, 16, 5, ..., 13, 13, 13]

In [43]:
0 in trues

True

In [44]:
id2label

{0: 'marker_represented_position',
 1: 'phone',
 2: 'tax',
 3: 'marker_represented_name',
 4: 'fax',
 5: 'marker_account_number',
 6: 'company_address',
 7: 'represented_position',
 8: 'marker_bank_address',
 9: 'represented_name',
 10: 'bank_address',
 11: 'bank_name',
 12: 'marker_company_name',
 13: 'text',
 14: 'marker_company_address',
 15: 'marker_swift_code',
 16: 'account_number',
 17: 'swift_code',
 18: 'marker_bank_name',
 19: 'marker_phone',
 20: 'marker_tax',
 21: 'company_name',
 22: 'marker_fax'}

In [45]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(trues, preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        98
           2       1.00      1.00      1.00        10
           3       1.00      0.99      0.99        68
           4       1.00      1.00      1.00        57
           5       1.00      1.00      1.00        59
           6       1.00      1.00      1.00       810
           7       1.00      1.00      1.00        34
           8       1.00      1.00      1.00        31
           9       0.99      1.00      0.99       134
          10       0.98      1.00      0.99       203
          11       1.00      0.99      0.99       155
          12       1.00      1.00      1.00       148
          13       1.00      1.00      1.00      8388
          14       0.98      1.00      0.99        47
          15       0.98      1.00      0.99        40
          16       1.00      0.95      0.98        43
          17       1.00    