<a href="https://colab.research.google.com/github/duncansamuelgeorgefreeman/colab/blob/master/LayoutLM_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! rm -r unilm
! git clone -b remove_torch_save https://github.com/NielsRogge/unilm.git
! cd unilm/layoutlm
! pip install unilm/layoutlm

In [None]:
! rm -r transformers
! git clone https://github.com/huggingface/transformers.git
! cd transformers
! pip install ./transformers

In [None]:
! wget https://guillaumejaume.github.io/FUNSD/dataset.zip
! unzip dataset.zip && mv dataset data && rm -rf dataset.zip __MACOSX

In [None]:
# Sanity check: show the notebook's current working directory; the relative paths used
# by the shell cells (e.g. `unilm/layoutlm`, `data/...`) resolve against it.
! pwd

In [None]:
# List the working directory to confirm that the cloned repositories and the extracted
# `data` folder from the previous cells are present.
! ls

In [None]:
! ls /content/data/training_data/images/

In [None]:
from PIL import Image, ImageDraw, ImageFont

# Open one training page and convert it to RGB (a later cell draws coloured boxes on it).
# The path is relative to the working directory where the dataset was extracted, matching
# the relative `data/...` paths used by the preprocessing cells.
img1 = Image.open("data/training_data/images/0000990274.png").convert("RGB")
img1

In [None]:
# Plot annotations
# Each annotation has a label, a bounding box, and one or more words (each word has its own bounding box).
# Bounding-box format: [x_left, y_top, x_right, y_bottom].
# Annotations are stored in training_data/annotations/<docname>.json;
# the annotation entries are found under the "form" key of the parsed JSON.

In [None]:
import json

# Load the annotation file for the same document id as the page image (0000990274) and
# print every entry found under its "form" key.
# The path is relative to the working directory where the dataset was extracted, matching
# the relative `data/...` paths used by the preprocessing cells.
with open("data/training_data/annotations/0000990274.json") as f:
  anno_data = json.load(f)

for annotation in anno_data["form"]:
  print(annotation)

In [None]:
# Overlay the annotations on the page image: one outlined box per annotation (tagged with
# its label) plus a thinner box around each of its words, all in the label's colour.
# Note: this draws directly onto `img1`.
draw = ImageDraw.Draw(img1, "RGBA")
font = ImageFont.load_default()
label_colors_dict = {
  'question': 'blue',
  'answer': 'green',
  'header': 'orange',
  'other': 'violet',
}

for annotation in anno_data["form"]:
  label, bbox = annotation['label'], annotation['box']
  color = label_colors_dict[label]

  # Annotation-level box (width 2).
  draw.rectangle(bbox, outline=color, width=2)
  # Label caption: 10 px right of the box's left edge, 12 px above its top edge.
  draw.text((bbox[0] + 10, bbox[1] - 12), label, fill=color, font=font)

  # Word-level boxes (width 1).
  for word in annotation['words']:
    word_bbox = word['box']
    draw.rectangle(word_bbox, outline=color, width=1)

img1

### Preprocessing Data

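Turn each annotated document into individual tokens and their corresponding labels, using the **BIOES** tagging format:

- **B**egin: the first token of a multi-token entity
- **I**nside: a token in the middle of a multi-token entity
- **O**utside: a token that does not belong to any entity
- **E**nd: the last token of a multi-token entity
- **S**ingle: an entity that consists of a single token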

Example:  

Alex S-PER

is O

playing O

basketball S-SPORT

with O

Marty S-PER

. O

Rick S-PER

likes O

to O

eat O

Pizza S-FOOD

in O

Los B-LOC

Angeles E-LOC

In [None]:
# List the working directory before preprocessing; the commands in the next cell expect
# `unilm/` and `data/` to be here.
! ls

In [None]:
! python unilm/layoutlm/examples/seq_labeling/preprocess.py --data_dir data/training_data/annotations \
                                                      --data_split train \
                                                      --output_dir data \
                                                      --model_name_or_path microsoft/layoutlm-base-uncased \
                                                      --max_len 510

! python unilm/layoutlm/examples/seq_labeling/preprocess.py --data_dir data/testing_data/annotations \
                                                      --data_split test \
                                                      --output_dir data \
                                                      --model_name_or_path microsoft/layoutlm-base-uncased \
                                                      --max_len 510

In [None]:
# Create labels.txt file containing the unique labels of the FUNSD dataset:
! cat data/train.txt | cut -d$'\t' -f 2 | grep -v "^$"| sort | uniq > data/labels.txt

In [None]:
# Show the contents of data/ after preprocessing and label extraction.
# NOTE(review): written without a leading `!`, so this presumably relies on IPython's
# automagic `ls` alias — confirm it is enabled in the target environment.
ls data/

In [None]:
# Preview the generated label file (one label per line).
# The path is relative to the working directory, matching the `data/labels.txt` path that
# the shell pipeline writes to and that `get_labels` is later called with.
with open("data/labels.txt", "r") as f:
  labels = f.read().splitlines()
# Printing happens after the file is closed; the handle is only needed for the read.
print(labels)

### Define a PyTorch Dataset

1. Create a list containing unique labels contained in labels.txt

In [None]:
from torch.nn import CrossEntropyLoss

def get_labels(path):
  """Read the label vocabulary from a text file holding one label per line.

  Blank (empty or whitespace-only) lines are skipped so that stray empty lines in the
  file do not turn into empty-string labels and inflate the label count.

  Args:
    path: Path of the label file to read.

  Returns:
    A list of label strings in file order, with "O" appended at the end when the
    file does not already contain it.
  """
  with open(path, "r") as f:
    labels = [line for line in f.read().splitlines() if line.strip()]
  # The "outside" tag must always be part of the vocabulary.
  if "O" not in labels:
    labels.append("O")
  return labels

# Build the label vocabulary and the lookups derived from it.
labels = get_labels("data/labels.txt")
num_labels = len(labels)
# Maps each label's position in `labels` to the label string.
label_map = dict(enumerate(labels))

# Label id for padded positions: reuse CrossEntropyLoss's default `ignore_index`, the
# target value that loss ignores.
pad_token_label_id = CrossEntropyLoss().ignore_index

In [None]:
# Show the final label list returned by get_labels (includes "O" even if the file lacked it).
print(labels)