In [1]:
from transformers import LayoutXLMProcessor, LayoutLMv2ForTokenClassification
from data_loader_coco_image import DocumentLayoutAnalysisDataset, unnormalize_bbox, color_map
import torch

processor = LayoutXLMProcessor.from_pretrained(
    "microsoft/layoutxlm-base",
    apply_ocr=False,
    only_label_first_subword=False,
    is_split_into_words=True)

anno_file = "/home/tiendq/PycharmProjects/DeepLearningDocReconstruction/0_data_repository/1000DataForOCR_fineLabel_dataset_coco.json"
image_root_folder = "/home/tiendq/Desktop/DocRec/2_data_preparation/2_selected_sample"
torch_dataset = DocumentLayoutAnalysisDataset(image_root_folder, anno_file)

model = LayoutLMv2ForTokenClassification.from_pretrained(
    '/home/tiendq/Desktop/DocRec/3_model_checkpoint/0_model_repository',
    num_labels=len(torch_dataset.label_list),
    id2label=torch_dataset.id2label,
    label2id=torch_dataset.label2id)



loading annotations into memory...
Done (t=0.29s)
creating index...
index created!


In [2]:
from PIL import Image

image_column_name = "image"
text_column_name = "words"
boxes_column_name = "boxes"
label_column_name = "labels_id"

examples = torch_dataset[300]

images = Image.open(examples['image_path']).convert("RGB")
words = examples[text_column_name]
boxes = examples[boxes_column_name]
word_labels = examples[label_column_name]

In [3]:
encoding = processor(images, words, boxes=boxes, word_labels=word_labels, truncation=True, stride =128,
         padding="max_length", max_length=512, return_overflowing_tokens=True, return_offsets_mapping=True, return_tensors='pt')

offset_mapping = encoding.pop('offset_mapping')

overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')

In [4]:
del encoding['labels']

In [5]:
# import numpy as np
x = []
for i in range(0, len(encoding['image'])):
     x.append(encoding['image'][i])
x = torch.stack(x)
encoding['image'] = x

In [6]:
for k,v in encoding.items():
  print(k,type(v), v.shape, sep="\t\t\t\t")

input_ids				<class 'torch.Tensor'>				torch.Size([2, 512])
attention_mask				<class 'torch.Tensor'>				torch.Size([2, 512])
bbox				<class 'torch.Tensor'>				torch.Size([2, 512, 4])
image				<class 'torch.Tensor'>				torch.Size([2, 3, 224, 224])


In [7]:
with torch.no_grad():
  outputs = model(**encoding)

# The model outputs logits of shape (batch_size, seq_len, num_labels).
logits = outputs.logits
print(logits.shape)

# We take the highest score for each token, using argmax. This serves as the predicted label for each token.
predictions = logits.argmax(-1).squeeze().tolist()
token_boxes = encoding.bbox.squeeze().tolist()

if (len(token_boxes) == 512):
  predictions = [predictions]
  token_boxes = [token_boxes]

torch.Size([2, 512, 9])


In [8]:
len(token_boxes), token_boxes[0]

(2,
 [[0, 0, 0, 0],
  [74, 949, 378, 969],
  [74, 949, 378, 969],
  [74, 949, 378, 969],
  [74, 949, 378, 969],
  [74, 949, 378, 969],
  [74, 949, 378, 969],
  [518, 949, 805, 970],
  [518, 949, 805, 970],
  [518, 949, 805, 970],
  [518, 949, 805, 970],
  [518, 949, 805, 970],
  [518, 949, 805, 970],
  [518, 949, 805, 970],
  [342, 0, 835, 20],
  [342, 0, 835, 20],
  [342, 0, 835, 20],
  [342, 0, 835, 20],
  [342, 0, 835, 20],
  [342, 0, 835, 20],
  [342, 0, 835, 20],
  [342, 0, 835, 20],
  [342, 0, 835, 20],
  [342, 0, 835, 20],
  [342, 0, 835, 20],
  [342, 0, 835, 20],
  [342, 0, 835, 20],
  [342, 0, 835, 20],
  [75, 0, 266, 19],
  [75, 0, 266, 19],
  [75, 0, 266, 19],
  [75, 0, 266, 19],
  [75, 0, 266, 19],
  [75, 0, 266, 19],
  [75, 0, 266, 19],
  [114, 688, 932, 709],
  [114, 688, 932, 709],
  [114, 688, 932, 709],
  [114, 688, 932, 709],
  [114, 688, 932, 709],
  [114, 688, 932, 709],
  [114, 688, 932, 709],
  [114, 688, 932, 709],
  [114, 688, 932, 709],
  [114, 688, 932, 709],


In [9]:
for i in range(0, len(token_boxes)):
      for j in range(0, len(token_boxes[i])):
             print("label is: {}, bbox is: {} and the text is: {}".format(torch_dataset.id2label[predictions[i][j]], token_boxes[i][j],  processor.tokenizer.decode(encoding["input_ids"][i][j])))

label is: footer, bbox is: [0, 0, 0, 0] and the text is: <s>
label is: footer, bbox is: [74, 949, 378, 969] and the text is: Facebook
label is: footer, bbox is: [74, 949, 378, 969] and the text is: :
label is: footer, bbox is: [74, 949, 378, 969] and the text is: Học
label is: footer, bbox is: [74, 949, 378, 969] and the text is: cùng
label is: footer, bbox is: [74, 949, 378, 969] and the text is: Viet
label is: footer, bbox is: [74, 949, 378, 969] and the text is: jack
label is: footer, bbox is: [518, 949, 805, 970] and the text is: Youtube
label is: footer, bbox is: [518, 949, 805, 970] and the text is: :
label is: footer, bbox is: [518, 949, 805, 970] and the text is: H
label is: footer, bbox is: [518, 949, 805, 970] and the text is: ọc
label is: footer, bbox is: [518, 949, 805, 970] and the text is: cùng
label is: footer, bbox is: [518, 949, 805, 970] and the text is: Viet
label is: footer, bbox is: [518, 949, 805, 970] and the text is: jack
label is: header, bbox is: [342, 0, 835,

In [20]:
instance_dict = {}

for i in range(0, len(token_boxes)):
    for j in range(0, len(token_boxes[i])):
        box = tuple(token_boxes[i][j])
        if box == (0,0,0,0):
            continue
        if box not in instance_dict:
            instance_dict[box] = [(encoding["input_ids"][i][j],predictions[i][j])]
        else:
            instance_dict[box].append((encoding["input_ids"][i][j],predictions[i][j]))

In [11]:
sample_box = instance_dict.popitem()


In [16]:
sample_box

((114, 815, 926, 836),
 [(tensor(81000), 0),
  (tensor(2289), 0),
  (tensor(46158), 0),
  (tensor(2735), 0),
  (tensor(8061), 0),
  (tensor(925), 0),
  (tensor(3042), 0),
  (tensor(37409), 0),
  (tensor(80570), 0),
  (tensor(15195), 0),
  (tensor(28588), 0),
  (tensor(44565), 0),
  (tensor(5890), 0),
  (tensor(5893), 0),
  (tensor(4), 0),
  (tensor(100801), 0),
  (tensor(27517), 0),
  (tensor(276), 0),
  (tensor(71479), 0),
  (tensor(66), 0),
  (tensor(550), 0)])

In [14]:
def containment(box, point):
    if (point[0] > box[0] and point[0] < box[2]) and \
       (point[1] > box[1] and point[1] < box[3]):
        return True

    return False

In [21]:
def search_box(x, y):
    for box, list_token in instance_dict.items():
        if containment(box, (x,y)):
            return box

In [22]:
search_box(200,816)

(114, 815, 926, 836)

In [13]:
def majority_voting_label(token_list):
    # def majority_vote(l):
    vote_counts = {}
    for token in token_list:
        vote = token[1]
        if vote in vote_counts.keys():
            vote_counts[vote] += 1
        else:
            vote_counts[vote] = 1

    # winners = []
    max_count = max(vote_counts.values())
    for vote, count in vote_counts.items():
        if count == max_count:
            # winners.append(vote)

            return vote

    # if ties, then the result would be random accordingly to the order of vote

majority_voting_label(sample_box[1])

0

In [14]:
def box_divide(box, token_list):
    num_token = len(token_list)
    x0,y0,x1,y1 = box
    sub_boxes = []
    unit = (x1-x0)/num_token
    for i in range(num_token):
        sub_boxes.append((int(x0+i*unit), y0, int(x0+(1+i)*unit), y1))

    return  sub_boxes

box_divide(sample_box[0], sample_box[1])

[(114, 815, 152, 836),
 (152, 815, 191, 836),
 (191, 815, 230, 836),
 (230, 815, 268, 836),
 (268, 815, 307, 836),
 (307, 815, 346, 836),
 (346, 815, 384, 836),
 (384, 815, 423, 836),
 (423, 815, 462, 836),
 (462, 815, 500, 836),
 (500, 815, 539, 836),
 (539, 815, 578, 836),
 (578, 815, 616, 836),
 (616, 815, 655, 836),
 (655, 815, 694, 836),
 (694, 815, 732, 836),
 (732, 815, 771, 836),
 (771, 815, 810, 836),
 (810, 815, 848, 836),
 (848, 815, 887, 836),
 (887, 815, 926, 836)]

In [None]:
# def draw_image_with_component()
import numpy as np
import cv2

image = images
width = image.width
height = image.height


image = np.array(image)
# image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
for bbox, token_list in instance_dict.items():

    # todo:
    #   xác định major label of each box
    #   parted color field area for each box
    #   return its original

    major_label_id = majority_voting_label(token_list)

    # bbox = boxes[i]
    bbox = unnormalize_bbox(bbox, width, height)
    #
    label = torch_dataset.id2label[major_label_id]
    color = color_map.get(label)[0]
    print(label, color)
    #
    cv2.rectangle(image, (int(bbox[0]), int(bbox[1])),
                  (int(bbox[2]), int(bbox[3])),
                  color, thickness=2)

    label = torch_dataset.id2label[major_label_id]
    cv2.putText(image, label, (int(bbox[0]), int(bbox[1] - 2)), cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=0.25, color=(0, 0, 255), thickness=1)

cv2.imshow("Image with Annotations", image)
cv2.waitKey(0)

footer (82, 82, 82)
footer (82, 82, 82)
header (124, 125, 121)
header (124, 125, 121)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
title (148, 102, 168)
title (148, 102, 168)
answer (236, 126, 237)
starting (211, 63, 69)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
title (148, 102, 168)
answer (236, 126, 237)
starting (211, 63, 69)
starting (211, 63, 69)
title (148, 102, 168)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
starting (211, 63, 69)
starting (211, 63, 69)
answer (236, 126, 237)
starting (211, 63, 69)
answer (236, 126, 237)
starting (211, 63, 69)
starting (211, 63, 69)
starting (211, 63, 69)
answer (236, 126, 237)
title (148, 102, 168)
starting (211, 63, 69)
title (148, 102, 168)
starting (211, 63, 69)
starting (211, 63, 69)
title (148, 102, 168)
answer (236, 126, 237)
answer (236, 126, 237)
answer (236, 126, 237)
starting (211, 63, 69)
answer (236, 126, 237)
answer (236, 126, 237)
a

QObject::moveToThread: Current thread (0x2244b4a0) is not the object's thread (0x36825340).
Cannot move to target thread (0x2244b4a0)

QObject::moveToThread: Current thread (0x2244b4a0) is not the object's thread (0x36825340).
Cannot move to target thread (0x2244b4a0)

QObject::moveToThread: Current thread (0x2244b4a0) is not the object's thread (0x36825340).
Cannot move to target thread (0x2244b4a0)

QObject::moveToThread: Current thread (0x2244b4a0) is not the object's thread (0x36825340).
Cannot move to target thread (0x2244b4a0)

QObject::moveToThread: Current thread (0x2244b4a0) is not the object's thread (0x36825340).
Cannot move to target thread (0x2244b4a0)

QObject::moveToThread: Current thread (0x2244b4a0) is not the object's thread (0x36825340).
Cannot move to target thread (0x2244b4a0)

QObject::moveToThread: Current thread (0x2244b4a0) is not the object's thread (0x36825340).
Cannot move to target thread (0x2244b4a0)

QObject::moveToThread: Current thread (0x2244b4a0) is n

In [None]:
from termcolor import colored
import termcolor

for color in  termcolor.COLORS:
    print(colored('hello', color), color)
