In [1]:
from transformers import LayoutXLMProcessor, LayoutLMv2ForTokenClassification
from data_loader_coco_image import DocumentLayoutAnalysisDataset, unnormalize_bbox, color_map
import torch
import os

# os.environ['CUDA_VISIBLE_DEVICES']='1'

processor = LayoutXLMProcessor.from_pretrained(
    "microsoft/layoutxlm-base",
    apply_ocr=False,
    only_label_first_subword=False,
    is_split_into_words=True)

anno_file = "/home/tiendq/PycharmProjects/DeepLearningDocReconstruction/0_data_repository/1000DataForOCR_fineLabel_dataset_coco_v1.1_titleNsuptitle.json"
image_root_folder = "/home/tiendq/Desktop/DocRec/2_data_preparation/2_selected_sample"
torch_dataset = DocumentLayoutAnalysisDataset(image_root_folder, anno_file)

# model = LayoutLMv2ForTokenClassification.from_pretrained(
#     '/home/tiendq/Desktop/DocRec/3_model_checkpoint/GPU-4_0_model_repository/1_update_titleandsupertitle',
#     num_labels=len(torch_dataset.label_list),
#     id2label=torch_dataset.id2label,
#     label2id=torch_dataset.label2id)



loading annotations into memory...
Done (t=0.34s)
creating index...
index created!


In [2]:
from PIL import Image

image_column_name = "image"
text_column_name = "words"
boxes_column_name = "boxes"
label_column_name = "labels_id"

examples = torch_dataset[300]

images = Image.open(examples['image_path']).convert("RGB")
words = examples[text_column_name]
boxes = examples[boxes_column_name]
word_labels = examples[label_column_name]

In [26]:
encoding = processor(images, words, boxes=boxes, word_labels=word_labels, truncation=True, stride =128,
         padding="max_length", max_length=512, return_overflowing_tokens=True, return_offsets_mapping=True, return_tensors='pt')

offset_mapping = encoding.pop('offset_mapping')

overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')

In [29]:
encoding['labels'].shape

torch.Size([2, 512])

In [18]:
tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])

# Create .txt file
with open("test.txt", "w") as f:
    f.write(" ".join(tokens))

# Create .ann file
with open("test.ann", "w") as f:
    entity_id = 1
    for i, label in enumerate(word_labels):
        if label:
            start_offset = encoding["offset_mapping"][0][i][0]
            end_offset = encoding["offset_mapping"][0][i][1]
            entity_text = " ".join(tokens[i].split("##")) # Account for sub-word tokens
            f.write(f"T{entity_id}\t{label} {start_offset} {end_offset}\t{entity_text}\n")
            entity_id += 1

In [4]:
encoding['labels']

tensor([[-100,    8,    8,  ...,    8,    8, -100],
        [-100,    2,    2,  ..., -100, -100, -100]])

In [9]:
token_boxes = encoding.bbox.squeeze().tolist()

In [14]:
for i in range(0, len(token_boxes)):
    for j in range(0, len(token_boxes[i])):
        if token_boxes[i][j] == [0,0,0,0] or token_boxes[i][j] == [1000,1000,1000,1000]:
            continue

        print("provided label is: {}, bbox is: {} and the text is: {}".format(torch_dataset.id2label[int(encoding['labels'][i][j])], token_boxes[i][j],  processor.tokenizer.decode(encoding["input_ids"][i][j])))

        print(f'FULL BOX text is: {words[encoding.token_to_word(i,j)]}')


provided label is: starting, bbox is: [183, 118, 377, 140] and the text is: (
FULL BOX text is: (Đề thi có 05 trang)
provided label is: starting, bbox is: [183, 118, 377, 140] and the text is: Đ
FULL BOX text is: (Đề thi có 05 trang)
provided label is: starting, bbox is: [183, 118, 377, 140] and the text is: ề
FULL BOX text is: (Đề thi có 05 trang)
provided label is: starting, bbox is: [183, 118, 377, 140] and the text is: thi
FULL BOX text is: (Đề thi có 05 trang)
provided label is: starting, bbox is: [183, 118, 377, 140] and the text is: có
FULL BOX text is: (Đề thi có 05 trang)
provided label is: starting, bbox is: [183, 118, 377, 140] and the text is: 05
FULL BOX text is: (Đề thi có 05 trang)
provided label is: starting, bbox is: [183, 118, 377, 140] and the text is: trang
FULL BOX text is: (Đề thi có 05 trang)
provided label is: starting, bbox is: [183, 118, 377, 140] and the text is: )
FULL BOX text is: (Đề thi có 05 trang)
provided label is: answer, bbox is: [159, 344, 659, 365]

In [18]:
del encoding['labels']

In [19]:
# import numpy as np
x = []
for i in range(0, len(encoding['image'])):
     x.append(encoding['image'][i])
x = torch.stack(x)
encoding['image'] = x

In [20]:
for k,v in encoding.items():
  print(k,type(v), v.shape, sep="\t\t\t\t")

input_ids				<class 'torch.Tensor'>				torch.Size([2, 512])
attention_mask				<class 'torch.Tensor'>				torch.Size([2, 512])
bbox				<class 'torch.Tensor'>				torch.Size([2, 512, 4])
image				<class 'torch.Tensor'>				torch.Size([2, 3, 224, 224])


In [28]:
with torch.no_grad():
  outputs = model(**encoding)

# The model outputs logits of shape (batch_size, seq_len, num_labels).
logits = outputs.logits
print(logits.shape)

# We take the highest score for each token, using argmax. This serves as the predicted label for each token.
predictions = logits.argmax(-1).squeeze().tolist()
token_boxes = encoding.bbox.squeeze().tolist()
# original_words = encoding.word_ids()
if (len(token_boxes) == 512):
  predictions = [predictions]
  token_boxes = [token_boxes]

torch.Size([2, 512, 9])


TokenClassifierOutput(loss=None, logits=tensor([[[-1.3280, -1.9619, -0.7244,  ..., -1.3870, -1.5125, 11.2898],
         [-1.3148, -1.9169, -0.7642,  ..., -1.3464, -1.4291, 11.3161],
         [-1.3102, -1.7967, -0.4242,  ..., -1.4428, -1.6221, 10.9942],
         ...,
         [-1.0703, -1.6430, -1.1171,  ..., -0.8316, -1.7563, 11.3303],
         [-1.0469, -1.6576, -1.1222,  ..., -0.8664, -1.6909, 11.4204],
         [ 1.8050, -2.9063,  4.9642,  ..., -1.9465, -2.1395,  4.7034]],

        [[-1.6905, -2.0083, 11.1895,  ..., -1.7055, -1.8943, -1.7163],
         [-1.4106, -1.9546, 11.3582,  ..., -1.8947, -2.1416, -1.8549],
         [-1.6100, -1.9521, 11.1932,  ..., -1.8369, -2.0665, -1.6602],
         ...,
         [-1.3872, -0.4273, -1.5600,  ..., -1.3751, -1.0653, -0.7592],
         [-1.3995, -0.5256, -1.6096,  ..., -1.3290, -1.0919, -0.7561],
         [-1.3771, -0.4597, -1.5576,  ..., -1.3910, -1.0577, -0.7153]]]), hidden_states=None, attentions=None)

In [51]:
words[0]

'(Đề thi có 05 trang)'

In [8]:
len(token_boxes), token_boxes[0]

(2,
 [[0, 0, 0, 0],
  [183, 118, 377, 140],
  [183, 118, 377, 140],
  [183, 118, 377, 140],
  [183, 118, 377, 140],
  [183, 118, 377, 140],
  [183, 118, 377, 140],
  [183, 118, 377, 140],
  [183, 118, 377, 140],
  [159, 344, 659, 365],
  [159, 344, 659, 365],
  [159, 344, 659, 365],
  [159, 344, 659, 365],
  [159, 344, 659, 365],
  [159, 344, 659, 365],
  [159, 344, 659, 365],
  [159, 344, 659, 365],
  [159, 344, 659, 365],
  [159, 344, 659, 365],
  [159, 344, 659, 365],
  [159, 344, 659, 365],
  [159, 344, 659, 365],
  [159, 344, 659, 365],
  [159, 344, 659, 365],
  [485, 118, 931, 140],
  [485, 118, 931, 140],
  [485, 118, 931, 140],
  [485, 118, 931, 140],
  [485, 118, 931, 140],
  [485, 118, 931, 140],
  [485, 118, 931, 140],
  [485, 118, 931, 140],
  [485, 118, 931, 140],
  [485, 118, 931, 140],
  [485, 118, 931, 140],
  [485, 118, 931, 140],
  [669, 138, 748, 159],
  [669, 138, 748, 159],
  [630, 180, 774, 198],
  [630, 180, 774, 198],
  [630, 180, 774, 198],
  [630, 180, 774, 19

In [10]:
for i in range(0, len(token_boxes)):
    for j in range(0, len(token_boxes[i])):
        if token_boxes[i][j] == [0,0,0,0] or token_boxes[i][j] == [1000,1000,1000,1000]:
            continue
        print("prediction label is: {}, bbox is: {} and the text is: {}".format(torch_dataset.id2label[predictions[i][j]], token_boxes[i][j],  processor.tokenizer.decode(encoding["input_ids"][i][j])))

        print(f'original text is: {words[encoding.token_to_word(i,j)]}')


NameError: name 'predictions' is not defined

In [70]:
instance_dict = {}

# iterate batch sample
for i in range(0, len(token_boxes)):
    # iterate token in batch
    for j in range(0, len(token_boxes[i])):
        box = tuple(token_boxes[i][j])
        if box == (0,0,0,0) or box == (1000,1000,1000,1000):
            continue
        if box not in instance_dict:
            instance_dict[box] = {"token_list":[(processor.decode(encoding["input_ids"][i][j]),predictions[i][j])],
                                  "original_string": words[encoding.token_to_word(i,j)]}
        else:
            instance_dict[box]['token_list'].append((processor.decode(encoding["input_ids"][i][j]),predictions[i][j]))


In [71]:
instance_dict

{(183,
  118,
  377,
  140): {'token_list': [('(', 8),
   ('Đ', 8),
   ('ề', 8),
   ('thi', 8),
   ('có', 8),
   ('05', 8),
   ('trang', 8),
   (')', 8)], 'original_string': '(Đề thi có 05 trang)'},
 (159,
  344,
  659,
  365): {'token_list': [('D', 2),
   ('.', 2),
   ('Phân', 2),
   ('chia', 2),
   ('phạm', 2),
   ('vi', 2),
   ('ảnh', 2),
   ('hưởng', 2),
   ('ở', 2),
   ('châu', 2),
   ('Âu', 2),
   (',', 2),
   ('châu', 2),
   ('Á', 2),
   ('.',
    2)], 'original_string': 'D. Phân chia phạm vi ảnh hưởng ở châu Âu, châu Á.'},
 (485,
  118,
  931,
  140): {'token_list': [('Thời', 8),
   ('gian', 8),
   ('làm', 8),
   ('bài', 8),
   (':', 8),
   ('50', 8),
   ('phút', 8),
   (',', 8),
   ('không', 8),
   ('kể', 8),
   ('thời', 8),
   ('gian',
    8)], 'original_string': 'Thời gian làm bài: 50 phút, không kể thời gian'},
 (669, 138, 748, 159): {'token_list': [('phát', 8), ('để', 8)],
  'original_string': 'phát để'},
 (630,
  180,
  774,
  198): {'token_list': [('Mã', 8),
   ('đề', 8)

In [13]:
# sample_box = instance_dict.popitem()


In [63]:
def majority_voting_label(token_list):
    vote_counts = {}
    for token in token_list:
        vote = token[1]
        if vote in vote_counts.keys():
            vote_counts[vote] += 1
        else:
            vote_counts[vote] = 1

    # winners = []
    max_count = max(vote_counts.values())
    for vote, count in vote_counts.items():
        if count == max_count:
            # winners.append(vote)

            return vote

    # if ties, then the result would be random accordingly to the order of vote

# majority_voting_label(sample_box[1])

In [72]:
for box, box_info in instance_dict.items():
    box_info['box_label'] = majority_voting_label(box_info['token_list'])



In [74]:
import pickle

with open("example_output_for_an_image.bin", 'wb') as f:
    p = pickle.Pickler(f)
    p.dump(instance_dict)

In [73]:
instance_dict

{(183,
  118,
  377,
  140): {'token_list': [('(', 8),
   ('Đ', 8),
   ('ề', 8),
   ('thi', 8),
   ('có', 8),
   ('05', 8),
   ('trang', 8),
   (')', 8)], 'original_string': '(Đề thi có 05 trang)', 'box_label': 8},
 (159,
  344,
  659,
  365): {'token_list': [('D', 2),
   ('.', 2),
   ('Phân', 2),
   ('chia', 2),
   ('phạm', 2),
   ('vi', 2),
   ('ảnh', 2),
   ('hưởng', 2),
   ('ở', 2),
   ('châu', 2),
   ('Âu', 2),
   (',', 2),
   ('châu', 2),
   ('Á', 2),
   ('.',
    2)], 'original_string': 'D. Phân chia phạm vi ảnh hưởng ở châu Âu, châu Á.', 'box_label': 2},
 (485,
  118,
  931,
  140): {'token_list': [('Thời', 8),
   ('gian', 8),
   ('làm', 8),
   ('bài', 8),
   (':', 8),
   ('50', 8),
   ('phút', 8),
   (',', 8),
   ('không', 8),
   ('kể', 8),
   ('thời', 8),
   ('gian',
    8)], 'original_string': 'Thời gian làm bài: 50 phút, không kể thời gian', 'box_label': 8},
 (669, 138, 748, 159): {'token_list': [('phát', 8), ('để', 8)],
  'original_string': 'phát để',
  'box_label': 8},
 (

In [15]:
def containment(box, point):
    if (point[0] > box[0] and point[0] < box[2]) and \
       (point[1] > box[1] and point[1] < box[3]):
        return True

    return False

In [16]:
def search_box(x, y):
    for box, list_token in instance_dict.items():
        if containment(box, (x,y)):
            return box

In [17]:
search_box(200,816)

(114, 815, 926, 836)

In [19]:
def box_divide(box, token_list):
    num_token = len(token_list)
    x0,y0,x1,y1 = box
    sub_boxes = []
    unit = (x1-x0)/num_token
    for i in range(num_token):
        sub_boxes.append((int(x0+i*unit), y0, int(x0+(1+i)*unit), y1))

    return  sub_boxes

box_divide(sample_box[0], sample_box[1])

[(518, 949, 559, 970),
 (559, 949, 600, 970),
 (600, 949, 641, 970),
 (641, 949, 682, 970),
 (682, 949, 723, 970),
 (723, 949, 764, 970),
 (764, 949, 805, 970)]

In [20]:
# def draw_image_with_component()
import numpy as np
import cv2

image = images
width = image.width
height = image.height


image = np.array(image)
# image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
for bbox, token_list in instance_dict.items():

    # todo:
    #   xác định major label of each box
    #   parted color field area for each box
    #   return its original

    major_label_id = majority_voting_label(token_list)

    # bbox = boxes[i]
    bbox = unnormalize_bbox(bbox, width, height)
    #
    label = torch_dataset.id2label[major_label_id]
    color = color_map.get(label)[0]
    print(label, color)
    #
    cv2.rectangle(image, (int(bbox[0]), int(bbox[1])),
                  (int(bbox[2]), int(bbox[3])),
                  color, thickness=2)

    label = torch_dataset.id2label[major_label_id]
    cv2.putText(image, label, (int(bbox[0]), int(bbox[1] - 2)), cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=0.25, color=(0, 0, 255), thickness=1)

cv2.imshow("Image with Annotations", image)
cv2.waitKey(0)

starting (211, 63, 69)
answer (236, 126, 237)
starting (211, 63, 69)
starting (211, 63, 69)
starting (211, 63, 69)
starting (211, 63, 69)
starting (211, 63, 69)
title (148, 102, 168)
title (148, 102, 168)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
title (148, 102, 168)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
title (148, 102, 168)
answer (236, 126, 237)
title (148, 102, 168)
answer (236, 126, 237)
answer (236, 126, 237)
title (148, 102, 168)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
starting (211, 63, 69)
starting (211, 63, 69)
starting (211, 63, 69)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
starting (211, 63, 69)
title (148, 102, 168)
starting (211, 63, 69)
answer (236, 126, 237)
starting (211, 63, 69)
answer (236, 126, 237)
answer (236, 126, 

QObject::moveToThread: Current thread (0x6efff90) is not the object's thread (0x1b9cca40).
Cannot move to target thread (0x6efff90)

QObject::moveToThread: Current thread (0x6efff90) is not the object's thread (0x1b9cca40).
Cannot move to target thread (0x6efff90)

QObject::moveToThread: Current thread (0x6efff90) is not the object's thread (0x1b9cca40).
Cannot move to target thread (0x6efff90)

QObject::moveToThread: Current thread (0x6efff90) is not the object's thread (0x1b9cca40).
Cannot move to target thread (0x6efff90)

QObject::moveToThread: Current thread (0x6efff90) is not the object's thread (0x1b9cca40).
Cannot move to target thread (0x6efff90)

QObject::moveToThread: Current thread (0x6efff90) is not the object's thread (0x1b9cca40).
Cannot move to target thread (0x6efff90)

QObject::moveToThread: Current thread (0x6efff90) is not the object's thread (0x1b9cca40).
Cannot move to target thread (0x6efff90)

QObject::moveToThread: Current thread (0x6efff90) is not the object's

27

[30mhello[0m black
[30mhello[0m grey
[31mhello[0m red
[32mhello[0m green
[33mhello[0m yellow
[34mhello[0m blue
[35mhello[0m magenta
[36mhello[0m cyan
[37mhello[0m light_grey
[90mhello[0m dark_grey
[91mhello[0m light_red
[92mhello[0m light_green
[93mhello[0m light_yellow
[94mhello[0m light_blue
[95mhello[0m light_magenta
[96mhello[0m light_cyan
[97mhello[0m white
