In [20]:
import argparse
import sys
import pickle
import math

import pytorch_lightning as pl
from transformers import LayoutLMv3Tokenizer

sys.path.append('../src')
from model import My_DataLoader
from model.lightning_module import LayoutLMv3ForMLM

In [21]:
# Command-line interface for the training run.
#   --tokenizer_vocab_dir : directory prefix holding vocab.json / merges.txt
#   --input_file          : pickle file with the dataset
#   --ratio_train         : fraction of the dataset used for training
#   --gpus                : one or more GPU indices handed to the trainer
# --learning_rate is parsed as a float: with type=int any value supplied on the
# command line (e.g. "1e-5") would be rejected by argparse.
parser = argparse.ArgumentParser()
parser.add_argument("--tokenizer_vocab_dir", type=str, required=True)
parser.add_argument("--input_file", type=str, required=True)
parser.add_argument("--model_params", type=str)
parser.add_argument("--ratio_train", type=float, default=0.9)
parser.add_argument("--output_model_dir", type=str, required=True)
parser.add_argument("--output_file_name", type=str, required=True)
parser.add_argument("--model_name", type=str, required=True)
parser.add_argument("--batch_size", type=int, default=4)
parser.add_argument("--learning_rate", type=float, default=1e-5)
parser.add_argument("--max_epochs", type=int, default=1)
parser.add_argument("--gpus", type=int, nargs="+", default=[0])

# A notebook has no real command line, so an explicit argument list is parsed instead.
args_list = [
    "--tokenizer_vocab_dir", "../data/vocab/tokenizer_vocab/",
    "--input_file", "../data/preprocessing_shared/encoded_dataset_1000.pkl",
    "--output_model_dir", "../data/train/model/",
    "--output_file_name", "model_pl",
    "--model_name", "microsoft/layoutlmv3-base",
    "--gpus", "0",
]
args = parser.parse_args(args_list)

In [22]:
# Save the parsed hyper-parameters (as the repr of a dict) to hparams.txt
# in the current working directory.
hparams_text = str(vars(args))
with open("hparams.txt", 'w') as f:
    f.write(hparams_text)

In [23]:
# Build the tokenizer from the vocabulary and merges files. The directory
# argument is concatenated directly, so it must end with a path separator.
vocab_file = f"{args.tokenizer_vocab_dir}vocab.json"
merges_file = f"{args.tokenizer_vocab_dir}merges.txt"
tokenizer = LayoutLMv3Tokenizer(vocab_file, merges_file)

# Token strings for every id from 0 to vocab_size - 1, in id order.
ids = range(tokenizer.vocab_size)
vocab = tokenizer.convert_ids_to_tokens(ids)

In [24]:
# Instantiate the project's LayoutLMv3ForMLM Lightning module, handing it the
# whole parsed argument namespace.
model = LayoutLMv3ForMLM(args)

In [25]:
# Load the dataset from the pickle file given by --input_file.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point --input_file at files from a trusted source.
with open(args.input_file, 'rb') as f:
    data = pickle.load(f)

                                       

In [26]:
# Display how many items were loaded.
len(data)

2293

In [27]:
# Split the dataset by position: the first ratio_train fraction (rounded down)
# is used for training, the remainder for validation. No shuffling is applied.
n_total = len(data)
n_train = math.floor(n_total * args.ratio_train)
train_data, valid_data = data[:n_train], data[n_train:]

In [28]:
# Display the sizes of the training and validation splits.
len(train_data), len(valid_data)

(2063, 230)

In [29]:
# Create the project's dataloader factory from the vocabulary, then build one
# loader per split. Both use the same batch size and keep the data order
# (shuffle=False).
my_dataloader = My_DataLoader.My_Dataloader(vocab)
loader_options = {"batch_size": args.batch_size, "shuffle": False}
train_dataloader = my_dataloader(train_data, **loader_options)
valid_dataloader = my_dataloader(valid_data, **loader_options)

In [30]:
# Display the length of each loader (presumably the number of batches — TODO confirm).
len(train_dataloader), len(valid_dataloader)

(516, 58)

In [31]:
# TensorBoard logger writing under the "tb_logs" directory with experiment name "my_model".
logger = pl.loggers.TensorBoardLogger("tb_logs", name="my_model")

In [32]:
# Build the trainer on the requested GPU indices with the TensorBoard logger.
# max_epochs is forwarded explicitly; without it the parsed --max_epochs value
# is never used and Lightning applies its own default epoch limit instead.
trainer = pl.Trainer(gpus=args.gpus, logger=logger, max_epochs=args.max_epochs)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [33]:
# Run training with the training loader and the validation loader.
# NOTE(review): in the saved run this call aborted during sanity checking with
# "MisconfigurationException: You can't `self.log(on_step=True)` inside
# `validation_epoch_end`". The offending self.log call is not in this notebook;
# presumably it lives in model.lightning_module — fix it there (on_step=False).
trainer.fit(model, train_dataloader, valid_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name             | Type             | Params
------------------------------------------------------
0 | model            | LayoutLMv3Model  | 125 M 
1 | dense            | Linear           | 590 K 
2 | transform_act_fn | GELU             | 0     
3 | LayerNorm        | LayerNorm        | 1.5 K 
4 | decoder          | Linear           | 38.7 M
5 | criterion        | CrossEntropyLoss | 0     
------------------------------------------------------
164 M     Trainable params
0         Non-trainable params
164 M     Total params
658.292   Total estimated model params size (MB)


Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 16.83it/s]

MisconfigurationException: You can't `self.log(on_step=True)` inside `validation_epoch_end`, must be one of (False,).

In [2]:
# (Re)load the TensorBoard notebook extension and open it on the tb_logs/
# directory that the logger writes to.
%reload_ext tensorboard
%tensorboard --logdir=tb_logs/