In [None]:
%pip install gdown
!gdown https://drive.google.com/uc?id=1uPV0H__c189GDoKCC_xNUJ-hxeu5Ji0Y
!unzip datathon_document_classification.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: datathon_document_classification/data/questionnaire/9215.tif  
  inflating: datathon_document_classification/data/questionnaire/9217.tif  
  inflating: datathon_document_classification/data/questionnaire/9225.tif  
  inflating: datathon_document_classification/data/questionnaire/9252.tif  
  inflating: datathon_document_classification/data/questionnaire/9282.tif  
  inflating: datathon_document_classification/data/questionnaire/9286.tif  
  inflating: datathon_document_classification/data/questionnaire/9335.tif  
  inflating: datathon_document_classification/data/questionnaire/937.tif  
  inflating: datathon_document_classification/data/questionnaire/9402.tif  
  inflating: datathon_document_classification/data/questionnaire/9406.tif  
  inflating: datathon_document_classification/data/questionnaire/943.tif  
  inflating: datathon_document_classification/data/questionnaire/9475.tif  
  inflating: datathon_doc

In [None]:
# Install the Tesseract OCR system package.
!sudo apt install -y tesseract-ocr
# Install the Python packages imported by the cells below.
%pip install transformers datasets pytesseract

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 20 not upgraded.
Need to get 4,795 kB of archives.
After this operation, 15.8 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-eng all 4.00~git24-0e00fe6-1.2 [1,588 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-osd all 4.00~git24-0e00fe6-1.2 [2,989 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr amd64 4.00~git2288-10f4998a-2 [218 kB]
Fetched 4,795 kB in 1s (3,661 kB/s)
debconf: unable to initi

### Import Libraries and Set Preferences

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import torch
import pytesseract

from transformers import LayoutLMTokenizer, LayoutLMForSequenceClassification, AdamW, TrainingArguments, Trainer

from datasets.utils import disable_progress_bar
from datasets import Dataset, Image, load_from_disk, Features, Sequence, ClassLabel, Value, Array2D, load_metric
from sklearn.model_selection import train_test_split

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [None]:
# Seeded NumPy random generator; used further down to shuffle the list of files.
# NOTE(review): the name would shadow the stdlib `random` module if that is ever imported.
random = np.random.RandomState(42)


In [None]:
import warnings
import logging
import matplotlib

logging.getLogger("tensorflow").setLevel(logging.ERROR)
warnings.filterwarnings('ignore')

%matplotlib inline
plt.style.use("seaborn-darkgrid")

np.set_printoptions(precision=2, suppress=True)

disable_progress_bar()


## Load And Preprocess Data

In [None]:
data_dir = "./datathon_document_classification/data/"
submission_dir = os.path.join(data_dir, "..", "validation")

# Each sub-directory of data_dir is one class. os.listdir returns entries in
# arbitrary order, so sort them to keep the label <-> index mapping identical
# across runs and machines, and skip anything that is not a directory.
labels = sorted(
    entry for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry))
)

# Lookup tables in both directions: class index -> name and name -> class index.
idx2label = dict(enumerate(labels))
label2idx = {label: idx for idx, label in enumerate(labels)}


In [None]:
# Collect one (image_path, label) pair per file. Class folders and file names are
# visited in sorted order because os.listdir returns entries in arbitrary order;
# without that the seeded shuffle below would not be reproducible.
records = []

for label in sorted(labels):
    label_dir = os.path.join(data_dir, label)
    for filename in sorted(os.listdir(label_dir)):
        records.append((os.path.join(label_dir, filename), label))

# In-place shuffle with the notebook's seeded generator.
random.shuffle(records)

data = pd.DataFrame(records, columns=["image_path", "label"])


In [None]:
X, y = data[["image_path"]], data[["label"]]

# Stratified 70 / 15 / 15 split: 30% is held out first and then halved into
# validation and test. The notebook's seeded generator is passed as random_state
# so that a fresh "Run All" always produces the same partition.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=random)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_valid, y_valid, test_size=0.5, stratify=y_valid, random_state=random)


In [None]:
# Glue each split's image paths and labels back together, column-wise,
# giving one frame per split.
train_data, valid_data, test_data = (
    pd.concat([paths_part, labels_part], axis=1)
    for paths_part, labels_part in (
        (X_train, y_train),
        (X_valid, y_valid),
        (X_test, y_test),
    )
)


In [None]:
def normalize_box(box, width, height):
    """Rescale a pixel box to a 0-1000 grid relative to the page size.

    Args:
        box: sequence whose first four items are (x0, y0, x1, y1) in pixels.
        width: page width in pixels; divides the x coordinates.
        height: page height in pixels; divides the y coordinates.

    Returns:
        A list of four ints: each coordinate divided by its page extent,
        multiplied by 1000 and truncated with int().
    """
    x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
    scaled = []
    for coord, extent in ((x0, width), (y0, height), (x1, width), (y1, height)):
        scaled.append(int(1000 * (coord / extent)))
    return scaled


def apply_ocr(df_row):
    """Run Tesseract on one example and attach its words and normalized boxes.

    Args:
        df_row: mapping with an 'image_path' entry pointing at the image file.
            It is updated in place.

    Returns:
        The same ``df_row`` with two entries added: 'words' (list of str) and
        'bbox' (one ``normalize_box`` result per word, in the same order).

    Raises:
        AssertionError: if the number of words and boxes ever differ.
    """
    # `Image` in this notebook is the `datasets` feature type, which has no
    # `open()`; its `decode_example` loads the file at `path` and returns a
    # PIL image, which is what pytesseract and `.size` need.
    image = Image().decode_example(
        {"path": df_row['image_path'], "bytes": None})
    width, height = image.size

    ocr_df = pytesseract.image_to_data(image, output_type='data.frame')

    # drop rows containing NaN, then turn whitespace-only cells into NaN and
    # drop those rows as well
    ocr_df = ocr_df.dropna().reset_index(drop=True)
    ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
    ocr_df = ocr_df.dropna().reset_index(drop=True)

    # remaining float columns are rounded and stored as integers
    float_cols = ocr_df.select_dtypes('float').columns
    ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)

    words = [str(w) for w in ocr_df.text]

    # rows come as (left, top, width, height); convert each to
    # (left, top, left+width, top+height) and normalize it to the page size
    coordinates = ocr_df[['left', 'top', 'width', 'height']]
    boxes = [
        normalize_box([left, top, left + box_w, top + box_h], width, height)
        for left, top, box_w, box_h in coordinates.itertuples(index=False, name=None)
    ]

    # add as extra columns
    assert len(words) == len(boxes)
    df_row['words'] = words
    df_row['bbox'] = boxes
    return df_row


In [None]:
# Wrap the training frame in a Dataset, OCR every row with apply_ocr, and drop the
# '__index_level_0__' column (presumably the pandas index carried over by from_pandas).
train_dataset = (
    Dataset.from_pandas(train_data)
    .map(apply_ocr)
    .remove_columns('__index_level_0__')
)


In [None]:
# Wrap the validation frame in a Dataset, OCR every row with apply_ocr, and drop the
# '__index_level_0__' column (presumably the pandas index carried over by from_pandas).
valid_dataset = (
    Dataset.from_pandas(valid_data)
    .map(apply_ocr)
    .remove_columns('__index_level_0__')
)


In [None]:
# Wrap the test frame in a Dataset, OCR every row with apply_ocr, and drop the
# '__index_level_0__' column (presumably the pandas index carried over by from_pandas).
test_dataset = (
    Dataset.from_pandas(test_data)
    .map(apply_ocr)
    .remove_columns('__index_level_0__')
)


In [None]:
# Persist the OCR'd splits so the OCR step does not have to be repeated.
processed_targets = {
    "./datathon_document_classification/processed/train_dataset": train_dataset,
    "./datathon_document_classification/processed/test_dataset": test_dataset,
    "./datathon_document_classification/processed/valid_dataset": valid_dataset,
}
for target_path, split in processed_targets.items():
    split.save_to_disk(target_path)


## Train

In [None]:
# Reload the OCR'd splits from disk, in the order train / test / valid.
train_dataset, test_dataset, valid_dataset = (
    load_from_disk(stored_path)
    for stored_path in (
        "./datathon_document_classification/processed/train_dataset",
        "./datathon_document_classification/processed/test_dataset",
        "./datathon_document_classification/processed/valid_dataset",
    )
)


In [None]:
# Load the pretrained LayoutLM tokenizer; encode_example below reads this
# module-level object.
tokenizer = LayoutLMTokenizer.from_pretrained(
    "microsoft/layoutlm-base-uncased")


def encode_example(example, max_seq_length=512, pad_token_box=[0, 0, 0, 0]):
    """Turn one OCR'd example into fixed-length LayoutLM inputs.

    Args:
        example: mapping with 'words', 'bbox' (one box per word) and 'label'
            (a key of the module-level ``label2idx``).
        max_seq_length: length every returned sequence is truncated/padded to,
            including the two special-token positions.
        pad_token_box: box assigned to padding positions. It is copied for
            every position and never mutated.

    Returns:
        The tokenizer encoding (input_ids, attention_mask, token_type_ids) with
        'bbox' (``max_seq_length`` boxes) and the integer 'label' added.

    Raises:
        AssertionError: if words and boxes differ in length, or if any output
            sequence is not exactly ``max_seq_length`` long.
    """
    words = example['words']
    normalized_word_boxes = example['bbox']

    assert len(words) == len(normalized_word_boxes)

    # every sub-token inherits the box of the word it was split from
    token_boxes = []
    for word, box in zip(words, normalized_word_boxes):
        token_boxes.extend([box] * len(tokenizer.tokenize(word)))

    # truncate so that the two special tokens still fit
    special_tokens_count = 2
    token_boxes = token_boxes[:max_seq_length - special_tokens_count]

    # add bounding boxes of cls + sep tokens
    token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

    # Tokenize once, with max_length tied to max_seq_length so that a value other
    # than the tokenizer's own default yields sequences of the requested length.
    encoding = tokenizer(
        ' '.join(words),
        padding='max_length',
        truncation=True,
        max_length=max_seq_length,
    )

    # The attention mask is 1 on real tokens, so its sum is the unpadded length;
    # the remaining positions receive the padding box.
    padding_length = max_seq_length - sum(encoding['attention_mask'])
    token_boxes += [list(pad_token_box) for _ in range(padding_length)]
    encoding['bbox'] = token_boxes
    encoding['label'] = label2idx[example['label']]

    assert len(encoding['input_ids']) == max_seq_length
    assert len(encoding['attention_mask']) == max_seq_length
    assert len(encoding['token_type_ids']) == max_seq_length
    assert len(encoding['bbox']) == max_seq_length

    return encoding


In [None]:
# Explicit schema passed to Dataset.map when the examples are encoded.
# NOTE(review): the 512 in the bbox shape has to match the max_seq_length default
# of encode_example — keep the two in sync.
features = Features({
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    # class names come from the module-level `labels` list
    'label': ClassLabel(names=labels),
    'image_path': Value(dtype='string'),
    'words': Sequence(feature=Value(dtype='string')),
})


In [None]:
# Encode every split with encode_example (default arguments), casting the result
# to the explicit `features` schema.
encoded_train_dataset, encoded_valid_dataset, encoded_test_dataset = (
    split.map(encode_example, features=features)
    for split in (train_dataset, valid_dataset, test_dataset)
)


In [None]:
# Persist the encoded splits so tokenization does not have to be repeated.
final_targets = {
    "./datathon_document_classification/final/train_dataset": encoded_train_dataset,
    "./datathon_document_classification/final/valid_dataset": encoded_valid_dataset,
    "./datathon_document_classification/final/test_dataset": encoded_test_dataset,
}
for target_path, encoded_split in final_targets.items():
    encoded_split.save_to_disk(target_path)


In [None]:
# Reload the encoded splits from disk, in the order train / valid / test.
encoded_train_dataset, encoded_valid_dataset, encoded_test_dataset = (
    load_from_disk(stored_path)
    for stored_path in (
        "./datathon_document_classification/final/train_dataset",
        "./datathon_document_classification/final/valid_dataset",
        "./datathon_document_classification/final/test_dataset",
    )
)


In [None]:

# Training hyper-parameters consumed by the TrainingArguments call below.
bs = 256  # per-device training batch size; evaluation uses bs * 2
epochs = 5  # number of training epochs
lr = 5e-5  # learning rate

# Accuracy metric object.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Accuracy is calculated directly with NumPy, so the function does not depend
    on a module-level metric object created with the deprecated ``load_metric``.

    Args:
        eval_pred: pair ``(logits, labels)``; the class axis of ``logits`` is
            the last one and ``labels`` holds the reference class indices.

    Returns:
        ``{"accuracy": float}`` — the fraction of rows whose arg-max class
        equals the reference label.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

# fp16 mixed precision is only supported on CUDA devices: TrainingArguments raises a
# ValueError when it is requested on a CPU-only runtime, so enable it only if a GPU
# is actually present.
use_fp16 = torch.cuda.is_available()

# Checkpoints go to the 'model' directory; evaluation runs once per epoch.
args = TrainingArguments(
    'model',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=use_fp16,
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs * 2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',
)

# Pretrained LayoutLM with a classification head sized to the number of classes.
model = LayoutLMForSequenceClassification.from_pretrained(
    "microsoft/layoutlm-base-uncased", num_labels=len(labels))

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_valid_dataset,
    compute_metrics=compute_metrics,
)
trainer.train();

ValueError: ignored

In [None]:
import shutil

# Zip the "model" directory (the name passed to TrainingArguments above) into "mode.zip".
# NOTE(review): "mode" looks like a typo for "model" — confirm before renaming,
# since the archive name is what ends up being downloaded.
shutil.make_archive("mode", "zip", "model")

'/content/mode.zip'