In [1]:
import os
import argparse
import json
import pickle
from PIL import Image, ImageDraw
import torch
from paddleocr import PaddleOCR

from bs4 import BeautifulSoup as bs
from IPython.core.display import display, HTML

from donut_layoutLMv3_3 import DonutConfig, DonutModel

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
# Emulated command-line arguments: written as flag -> value pairs, then
# flattened into the flat token list that argparse consumes.
_cli_options = {
    "--input_dir": "example",
    "--output_dir": "predicts",
    "--model_path": "model/swin-en_pubtabnet_200-400",
}
args_list = [token for option in _cli_options.items() for token in option]

In [23]:
# Parser for the three required string options (input images, output folder, model folder).
parser = argparse.ArgumentParser()
for _option in ("--input_dir", "--output_dir", "--model_path"):
    parser.add_argument(_option, type=str, required=True)

# Parse the notebook-defined token list (not sys.argv) and display the resulting namespace.
args = parser.parse_args(args_list)
args

Namespace(input_dir='example', output_dir='predicts', model_path='model/swin-en_pubtabnet_200-400')

In [40]:
# Choose the model implementation from the checkpoint directory name.
# normpath removes a trailing "/" so that "model/swin-en_x/" is still recognised
# as a swin checkpoint instead of silently falling through to the other branch.
_use_swin_encoder = os.path.basename(os.path.normpath(args.model_path)).startswith("swin-en")

if _use_swin_encoder:
    from donut_swin import DonutConfig, DonutModel
else:
    from donut_layoutLMv3_3 import DonutConfig, DonutModel


def preprocessing_data(model, data):
    """Run the preprocessing function that matches the selected encoder.

    The encoder-specific functions are looked up when this wrapper is called,
    not when this cell runs, so the cell works on a fresh kernel even though
    ``preprocessing_data_swin`` / ``preprocessing_data_layoutlm`` are defined
    in later cells.

    Args:
        model: Model object passed through to the encoder-specific function.
        data: List of sample dicts passed through to the encoder-specific function.

    Returns:
        Whatever the selected preprocessing function returns.
    """
    if _use_swin_encoder:
        return preprocessing_data_swin(model, data)
    return preprocessing_data_layoutlm(model, data)
    

In [30]:
# Create the output directory and its two result sub-directories
# (exist_ok=True: no error when they are already present).
for _suffix in ("", "/ocr_results", "/model_predictions"):
    os.makedirs(args.output_dir + _suffix, exist_ok=True)

def collate_fn(batch):
    """Merge per-sample encoder inputs into batched tensors.

    Args:
        batch: Sequence of sample dicts. Each one provides an ``"image_path"``
            entry and an ``"encoding_inputs"`` mapping holding tensors under
            ``"input_ids"``, ``"bbox"``, ``"attention_mask"`` and ``"pixel_values"``.

    Returns:
        Tuple ``(encoder_inputs, image_paths)``: a dict with the four tensors
        concatenated across samples (``torch.cat`` default dimension), and the
        list of image paths in batch order.
    """
    tensor_keys = ("input_ids", "bbox", "attention_mask", "pixel_values")
    encoder_inputs = {
        name: torch.cat([item["encoding_inputs"][name] for item in batch])
        for name in tensor_keys
    }
    image_paths = [item["image_path"] for item in batch]
    return encoder_inputs, image_paths

In [31]:
# OCR engine used by the image loop below to obtain text lines and their boxes.
# Configured for English with the angle classifier turned off (use_angle_cls=False).
ocr = PaddleOCR(use_angle_cls=False, lang='en')

[2023/11/16 15:07:46] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/hikaru-si/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/hikaru-si/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, r

In [32]:
def visualize_bbox(img, boxes, texts):
    """Draw every box and its text onto ``img`` and return that same image.

    Args:
        img: Image to draw on; it is modified in place.
        boxes: Iterable of ``[x0, y0, x1, y1]`` given as fractions of the image
            width/height; they are scaled back to pixels before drawing.
        texts: Iterable of labels, paired with ``boxes`` by position.

    Returns:
        The ``img`` object that was passed in, now annotated.
    """
    width, height = img.size
    canvas = ImageDraw.Draw(img)
    for (left, top, right, bottom), label in zip(boxes, texts):
        left_px, top_px = left * width, top * height
        right_px, bottom_px = right * width, bottom * height
        canvas.rectangle([left_px, top_px, right_px, bottom_px], fill=None, outline=(225, 0, 100))
        # The label is anchored at the box's top-left corner.
        canvas.text((left_px, top_px), label, fill=(225, 0, 225))
    return img

In [33]:
def build_model(model_path):
    """Create a DonutModel from a checkpoint directory and load its trained weights.

    Args:
        model_path: Directory containing ``model_config.json`` and ``best_model.cpt``.

    Returns:
        The DonutModel built from the JSON config, with the checkpoint's state
        dict loaded into it.
    """
    with open(f"{model_path}/model_config.json", "r") as f:
        config = json.load(f)
    donut_config = DonutConfig.from_dict(config)
    model = DonutModel(donut_config)
    # map_location="cpu": the checkpoint can be read on machines without a GPU,
    # and tensors saved from a GPU are not materialised on the GPU a second time.
    # load_state_dict copies the values into the model's own parameters.
    trained_state_dict = torch.load(f"{model_path}/best_model.cpt", map_location="cpu")
    model.load_state_dict(trained_state_dict)
    return model

In [34]:
# Run OCR on every image in the input directory, save an annotated copy, and
# collect the recognised texts with boxes normalised to the [0, 1] range.
data = []
# sorted(): os.listdir returns entries in arbitrary order, and later cells pick
# results by position, so a stable order keeps the notebook reproducible.
for image_name in sorted(os.listdir(args.input_dir)):
    image_path = f"{args.input_dir}/{image_name}"
    if not os.path.isfile(image_path):
        # Skip sub-directories (for example .ipynb_checkpoints) that cannot be opened as images.
        continue
    boxes = []
    texts = []
    # The context manager closes the image file once the annotated copy is saved.
    with Image.open(image_path) as img:
        w, h = img.size
        results = ocr.ocr(image_path, cls=False)
        result = results[0]
        # result is None when nothing was recognised; the sample is then kept with empty lists.
        if result is not None:
            for line in result:
                # line[0] holds the four corner points, line[1][0] the recognised text.
                top_left, top_right, bottom_right, bottom_left = line[0]
                x0, y0 = top_left
                x1, y1 = bottom_right
                # Store coordinates as fractions of the image size.
                boxes.append([x0 / w, y0 / h, x1 / w, y1 / h])
                texts.append(line[1][0])
        draw_img = visualize_bbox(img, boxes, texts)
        draw_img.save(f"{args.output_dir}/ocr_results/{image_name}")
    data.append({"image_path": image_path, "image_name": image_name, "texts": texts, "boxes": boxes})

[2023/11/16 15:07:52] ppocr DEBUG: dt_boxes num : 153, elapsed : 0.0766901969909668


[2023/11/16 15:07:53] ppocr DEBUG: rec_res num  : 153, elapsed : 0.2129216194152832
[2023/11/16 15:07:53] ppocr DEBUG: dt_boxes num : 16, elapsed : 0.015017032623291016
[2023/11/16 15:07:53] ppocr DEBUG: rec_res num  : 16, elapsed : 0.024315834045410156


In [35]:
### Build the model from the checkpoint directory given by --model_path.
# NOTE(review): the log printed below mentions facebook/mbart-large-50, so base
# weights are presumably fetched/initialised inside DonutModel as well — confirm.
model = build_model(args.model_path)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'XLMRobertaTokenizer'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


init weight of <facebook/mbart-large-50>


Some weights of the model checkpoint at facebook/mbart-large-50 were not used when initializing MBartForCausalLM: ['model.encoder.layers.6.self_attn.q_proj.bias', 'model.encoder.layers.1.self_attn.k_proj.bias', 'model.encoder.layers.0.fc1.bias', 'model.encoder.layers.4.fc1.bias', 'model.encoder.layernorm_embedding.bias', 'model.encoder.layers.3.self_attn.k_proj.bias', 'model.encoder.layers.9.fc2.bias', 'model.encoder.layers.9.self_attn.k_proj.weight', 'model.encoder.layers.10.self_attn.v_proj.weight', 'model.encoder.layers.6.final_layer_norm.weight', 'model.encoder.layers.5.self_attn.v_proj.bias', 'model.encoder.layers.4.self_attn_layer_norm.weight', 'model.encoder.layers.3.self_attn.q_proj.bias', 'model.encoder.layers.4.fc2.bias', 'model.encoder.layers.10.self_attn_layer_norm.bias', 'model.encoder.layers.2.final_layer_norm.weight', 'model.encoder.layers.0.fc2.bias', 'model.encoder.layers.9.self_attn.q_proj.bias', 'model.encoder.layers.4.self_attn.v_proj.weight', 'model.encoder.layers.

model.decoder.embed_positions.weight


In [36]:
# Sanity check: number of entries reported by the decoder's tokenizer.
len(model.decoder.tokenizer)

250055

In [37]:
# Use the GPU when one is available; otherwise stay on the CPU instead of raising.
# eval() puts dropout/normalisation layers into inference mode so generation
# does not depend on training-time randomness.
model = model.to("cuda" if torch.cuda.is_available() else "cpu").eval()

In [38]:
# Confirm which device the model now lives on.
model.device

device(type='cuda', index=0)

In [17]:
def preprocessing_data_layoutlm(model, data):
    """Attach encoder inputs (image + OCR texts + boxes) to every sample.

    Args:
        model: Model whose ``encoder.prepare_input`` builds the encoder inputs.
        data: List of sample dicts with ``"image_path"``, ``"texts"`` and
            ``"boxes"`` entries. The dicts are updated in place.

    Returns:
        The same ``data`` list, each sample now carrying ``"encoding_inputs"``.
    """
    encoder_processor = model.encoder.prepare_input
    for sample in data:
        # Context manager: the image file is closed as soon as the encoding is built.
        with Image.open(sample["image_path"]) as image:
            encoder_encoding = encoder_processor(image, sample["texts"], bboxes=sample["boxes"])
        # Boxes are rescaled by 1000 and stored as int32.
        # NOTE(review): presumably they arrive as fractions in [0, 1] — confirm against prepare_input.
        encoder_encoding["bbox"] = (encoder_encoding["bbox"] * 1000).to(torch.int32)
        sample["encoding_inputs"] = encoder_encoding

    return data

In [39]:
def preprocessing_data_swin(model, data):
    """Attach image-only encoder inputs to every sample.

    Args:
        model: Model whose ``encoder.prepare_input`` converts an image into
            the encoder input.
        data: List of sample dicts with an ``"image_path"`` entry. The dicts
            are updated in place.

    Returns:
        The same ``data`` list, each sample now carrying ``"encoding_inputs"``.
    """
    encoder_processor = model.encoder.prepare_input
    for sample in data:
        # Context manager: the image file is closed as soon as the encoding is built.
        with Image.open(sample["image_path"]) as image:
            sample["encoding_inputs"] = encoder_processor(image)

    return data

In [41]:
# Add "encoding_inputs" to every sample. Both preprocessing functions update the
# sample dicts in place and return the list they were given, so `dataset` and
# `data` refer to the same list.
dataset = preprocessing_data(model, data)

In [42]:
# Inspect the preprocessed samples.
# NOTE(review): this displays every sample in full; consider showing a subset instead.
dataset

[{'image_path': 'example/100.png',
  'image_name': '100.png',
  'texts': ['Subsidiary',
   'Medtronic',
   'Medtronic,',
   'Medtronic',
   'Non-',
   'Consolidating',
   'plc',
   'Inc.',
   'Luxco',
   'guarantors',
   'Adjustments',
   'Total',
   'Net sales',
   '$',
   '1,261',
   'S',
   '$',
   '20,261',
   '$',
   '(1,261) $',
   '20,261',
   'Costs and expenses:',
   'Cost of products sold',
   '895',
   '6,659',
   '{1,245)',
   '6,309',
   'Research and development expense',
   '552',
   '1,088',
   '1,640',
   'Selling, general, and administrative expense',
   '1',
   '857',
   '6,046',
   '6,904',
   'Special charge (gain), net',
   '100',
   '(138)',
   '(38)',
   'Restructuring charges, net',
   '7',
   '230',
   '237',
   'Certain litigation charges',
   '42',
   '42',
   'Acquisition-related items',
   '312',
   '238',
   ' 550',
   'Amortization of intangible assets',
   '11',
   '722',
   '733',
   'Other expense (income), net',
   '103',
   '(1,618)',
   '1,633',
  

In [50]:
# Generate one prediction per preprocessed sample.
generations = []
# no_grad: this is inference only, so no autograd state needs to be recorded.
with torch.no_grad():
    for sample in dataset:
        # Add a leading batch dimension of size 1.
        # NOTE(review): .unsqueeze requires "encoding_inputs" to be a single tensor,
        # as stored by preprocessing_data_swin; preprocessing_data_layoutlm indexes
        # it by key instead — confirm before using this loop with that model.
        inputs = sample["encoding_inputs"].unsqueeze(0)
        generate_text = model.inference(image_tensors=inputs)["predictions"]
        # Only the first prediction of the single-image batch is kept.
        generations.append({"image_name": sample["image_name"], "generation": generate_text[0]})

In [51]:
# Debug check: `inputs` is whatever the last iteration of the generation loop left
# behind; this shows its shape with one more leading dimension added.
inputs.unsqueeze(0).shape

torch.Size([1, 1, 3, 448, 896])

In [52]:
# Write one text file per image, named "<image_name>.txt", under model_predictions/.
for sample in generations:
    prediction_path = f"{args.output_dir}/model_predictions/{sample['image_name']}.txt"
    # encoding="utf-8": the generated HTML can contain non-ASCII characters, which
    # the platform default encoding may be unable to write.
    with open(prediction_path, "w", encoding="utf-8") as f:
        f.write(str(sample["generation"]))

In [16]:
# Pick the prediction for the second image.
# The generation loop stores a single prediction per image, so indexing it with
# [0] again would return only its first character when it is a string. A
# list/tuple is still accepted, in which case its first element is used.
_generation = generations[1]["generation"]
html = _generation if isinstance(_generation, str) else _generation[0]
html

'<table><thead><tr><td><b><i>α</i></b></td><td><b>1</b></td><td><b>12</b></td><td><b>14</b></td></tr></thead><tbody><tr><td><b>precision</b><b>(%)</b></td><td>72.92</td><td>65.96</td><td>47.92</td></tr><tr><td><b>recall</b><b>(%)</b></td><td>85.37</td><td>96.88</td><td>100.00</td></tr><tr><td><b><i><b>f</i></b><b>-measure</b><b>(%)</b></td><td>84.34</td><td>78.48</td><td>64.79</td></tr></tbody></table>'

In [45]:
# Render the generated markup as a table in the notebook.
HTML(html)

α,1,2,1.1
precision (%),72.92,65.96,47.92
recall (%),85.37,96.88,100.0
f-measure(%),84.34,78.48,64.79


In [33]:
# NOTE(review): format_html is neither imported nor defined in the visible part of
# this notebook — confirm where it comes from, otherwise this raises NameError on
# a fresh kernel.
html_table  = HTML(format_html(html))

In [34]:
# Debug check of the object type; can be removed from a finished notebook.
type(html_table)

IPython.core.display.HTML

In [35]:
# Display the formatted table.
html_table

Unnamed: 0,"Mediatronic,","Medtronic,","Medtronic,.1",Semiary,Consoliddating,Consolidating,Total
Costs sold,1,1261,5.0,0659,1245.0,1245,1360.0
Research and development expenses,1,1,552.0,1586,1044.0,1645,1900.0
Acquisition-related items,"Other expense (income), net",1034,145.0,(56),3734.0,(16),7726.0
Indirected inactivity,1,1,,,,,

Unnamed: 0,"Mediatronic,","Medtronic,","Medtronic,.1",Semiary,Consoliddating,Consolidating,Total
Costs sold,1,1261,5.0,0659,1245.0,1245,1360.0
Research and development expenses,1,1,552.0,1586,1044.0,1645,1900.0
Acquisition-related items,"Other expense (income), net",1034,145.0,(56),3734.0,(16),7726.0
Indirected inactivity,1,1,,,,,


In [37]:
# Show the raw generated markup string for comparison with the rendered table.
html

'<table><thead><tr><td></td><td><b>Mediatronic,</b></td><td><b>Medtronic,</b></td><td><b>Medtronic,</b></td><td><b>Semiary</b></td><td><b>Consoliddating</b></td><td><b>Consolidating</b></td><td><b>Total</b></td></tr></thead><tbody><tr><td>Costs sold</td><td>1</td><td>1,261</td><td>5</td><td>0,659</td><td>1,245</td><td>1,245</td><td>1,360</td></tr><tr><td>Research and development expenses</td><td>1</td><td>1</td><td>552</td><td>1,586</td><td>1,044</td><td>1,645</td><td>1,900</td></tr><tr><td>Acquisition-related items</td><td>Other expense (income), net</td><td>1034</td><td>145</td><td>(56)</td><td>3,734</td><td>(16)</td><td>7,726</td></tr><tr><td>Indirected inactivity</td><td>1</td><td>1</'