In [1]:
from layoutlm import FunsdDataset, LayoutlmConfig, LayoutlmForTokenClassification
from transformers import BertTokenizer, AutoTokenizer
from layoutlm.data.funsd import read_examples_from_file, convert_examples_to_features

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from tqdm import tqdm, trange
import numpy as np

# Utility functions

In [2]:
def get_labels(path: str) -> list:
    """Read the tag vocabulary from a text file, one label per line.

    Args:
        path: Location of the labels file.

    Returns:
        The labels in file order. If the file does not contain the outside
        tag "O", it is inserted at the front so that it gets index 0.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the label set does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        labels = f.read().splitlines()
    if "O" not in labels:
        labels = ["O"] + labels
    return labels

# Prepare data

In [3]:
# The tokenizer is loaded from the local checkpoint directory.
MODEL_DIR = 'model/layoutlm-base-uncased/'
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

In [4]:
# Tag vocabulary for the inference data; its length is the number of classes.
LABELS_PATH = 'data/infer_data/labels.txt'
labels = get_labels(LABELS_PATH)
num_labels = len(labels)

# Positions that carry no label are tagged with the loss function's ignore index.
pad_token_label_id = CrossEntropyLoss().ignore_index

In [5]:
# Read the examples for the 'test' mode from the inference data directory.
INFER_DATA_DIR = 'data/infer_data/'
examples = read_examples_from_file(INFER_DATA_DIR, 'test')

In [6]:
# Turn the parsed examples into model-ready features.
# The third positional argument (512) is presumably the maximum sequence
# length - confirm against convert_examples_to_features.
features = convert_examples_to_features(
    examples,
    labels,
    512,
    tokenizer,
    # XLNet places its CLS token at the end; that layout is not used here.
    cls_token_at_end=False,
    cls_token=tokenizer.cls_token,
    cls_token_segment_id=0,
    sep_token=tokenizer.sep_token,
    # RoBERTa inserts an extra separator between sentence pairs, cf.
    # github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
    sep_token_extra=False,
    # XLNet pads on the left; padding on the left is disabled here.
    pad_on_left=False,
    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
    pad_token_segment_id=0,
    pad_token_label_id=pad_token_label_id,
)

In [7]:
def _as_long_tensor(rows):
    """Build a single int64 tensor from a list of per-example rows."""
    return torch.tensor(rows, dtype=torch.long)


# One row per entry of `features`, in the same order.
all_input_ids = _as_long_tensor([feat.input_ids for feat in features])
all_input_mask = _as_long_tensor([feat.input_mask for feat in features])
all_segment_ids = _as_long_tensor([feat.segment_ids for feat in features])
all_label_ids = _as_long_tensor([feat.label_ids for feat in features])
all_bboxes = _as_long_tensor([feat.boxes for feat in features])

# Model for inference

In [8]:
# Size the classification head from the label vocabulary so that the logits
# line up with `labels`; otherwise the head size comes from whatever the
# checkpoint's config specifies.
# NOTE: the load warning below reports that some weights are newly
# initialised, so predictions are only meaningful if this directory holds a
# fine-tuned token-classification checkpoint.
model = LayoutlmForTokenClassification.from_pretrained(
    'model/layoutlm-base-uncased/', num_labels=num_labels
)
# Prefer the GPU, but fall back to the CPU so the cell also runs without CUDA.
model.to('cuda' if torch.cuda.is_available() else 'cpu')
# Inference mode for layers such as dropout; the semicolon hides the module repr.
model.eval();

Some weights of the model checkpoint at model/layoutlm-base-uncased/ were not used when initializing LayoutlmForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing LayoutlmForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutlmForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LayoutlmForTokenClassification were not initialized from the model checkpoint at model/layoutlm-base-uncased/ and are newly i

In [9]:
# Batch of one built from the first example. The tensors are moved to the
# device that holds the model's parameters rather than a hard-coded 'cuda',
# so the inputs always match the model (CPU-only machine, other GPU index, ...).
model_device = next(model.parameters()).device
inputs = {
    'input_ids': all_input_ids[0].unsqueeze(0).to(model_device),
    'attention_mask': all_input_mask[0].unsqueeze(0).to(model_device),
    'token_type_ids': all_segment_ids[0].unsqueeze(0).to(model_device),
    'bbox': all_bboxes[0].unsqueeze(0).to(model_device)
}

In [15]:
# Forward pass only; gradient tracking is switched off for inference.
with torch.no_grad():
    outputs = model(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        token_type_ids=inputs['token_type_ids'],
        bbox=inputs['bbox'],
    )

In [33]:
# Input names, in the positional order in which the sample tensors are passed
# to the model during export.
onnx_input_names = ['input_ids', 'bbox', 'attention_mask', 'token_type_ids']
sample_inputs = tuple(inputs[name] for name in onnx_input_names)

# Only axis 0 (the batch dimension) is exported as a variable-length axis,
# for every input and for the single output.
onnx_dynamic_axes = {
    name: {0: 'batch_size'} for name in onnx_input_names + ['output']
}

torch.onnx.export(
    model,
    sample_inputs,
    'model/model.onnx',
    export_params=True,
    opset_version=11,
    do_constant_folding=True,
    input_names=onnx_input_names,
    output_names=['output'],
    dynamic_axes=onnx_dynamic_axes,
)

  assert all(


# Post-processing

In [47]:
# outputs[0] is presumably the per-token class scores (confirm against the
# model's return value); drop the batch axis and move it to NumPy.
token_scores = outputs[0].squeeze(0).detach().cpu().numpy()
# Highest-scoring class index for each position.
preds = token_scores.argmax(axis=1)
# Label ids of the same (first) example, for comparison.
out_label_ids = all_label_ids[0].numpy()

In [29]:
# Class index -> label string, following the order of `labels`.
label_map = dict(enumerate(labels))

In [50]:
# One inner list per position: it holds the label string when the position's
# label id differs from pad_token_label_id, and stays empty otherwise.
n_positions = out_label_ids.shape[0]
is_labelled = [
    out_label_ids[pos] != pad_token_label_id for pos in range(n_positions)
]

out_label_list = [
    [label_map[out_label_ids[pos]]] if is_labelled[pos] else []
    for pos in range(n_positions)
]
preds_list = [
    [label_map[preds[pos]]] if is_labelled[pos] else []
    for pos in range(n_positions)
]

In [51]:
# Display the label list per position; an empty inner list marks a position
# whose label id equals pad_token_label_id.
out_label_list

[[],
 ['S-QUESTION'],
 ['S-QUESTION'],
 ['S-QUESTION'],
 ['S-QUESTION'],
 ['S-QUESTION'],
 ['O'],
 ['O'],
 ['O'],
 ['S-QUESTION'],
 ['S-QUESTION'],
 ['S-QUESTION'],
 [],
 [],
 ['S-QUESTION'],
 ['S-QUESTION'],
 ['S-ANSWER'],
 [],
 ['S-ANSWER'],
 [],
 ['O'],
 [],
 [],
 [],
 [],
 ['O'],
 ['O'],
 ['B-QUESTION'],
 ['E-QUESTION'],
 ['B-QUESTION'],
 ['E-QUESTION'],
 ['B-HEADER'],
 ['E-HEADER'],
 [],
 [],
 [],
 ['O'],
 ['O'],
 ['O'],
 ['O'],
 ['O'],
 ['O'],
 ['O'],
 ['O'],
 ['O'],
 ['O'],
 [],
 [],
 [],
 ['O'],
 [],
 ['O'],
 [],
 ['O'],
 [],
 ['O'],
 [],
 [],
 ['B-ANSWER'],
 ['I-ANSWER'],
 [],
 ['E-ANSWER'],
 ['B-ANSWER'],
 ['I-ANSWER'],
 ['I-ANSWER'],
 ['I-ANSWER'],
 [],
 ['I-ANSWER'],
 ['I-ANSWER'],
 [],
 [],
 ['E-ANSWER'],
 [],
 ['B-QUESTION'],
 ['E-QUESTION'],
 ['B-ANSWER'],
 ['I-ANSWER'],
 ['E-ANSWER'],
 ['B-ANSWER'],
 [],
 [],
 ['I-ANSWER'],
 ['E-ANSWER'],
 ['B-ANSWER'],
 [],
 ['I-ANSWER'],
 [],
 ['E-ANSWER'],
 [],
 [],
 ['B-ANSWER'],
 ['E-ANSWER'],
 ['B-QUESTION'],
 ['E-QUESTION'],
 ['B