In [1]:
import sys
import argparse
import os
import random
import math

sys.path.append('../src')
import  pickle
import torch
from transformers import LayoutLMv3Tokenizer, AutoConfig, AutoModel, RobertaModel
from model import LayoutLMv3forMLM, My_DataLoader
from utils import utils
from torch.optim import AdamW
from transformers import get_constant_schedule_with_warmup

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
parser = argparse.ArgumentParser()
parser.add_argument("--tokenizer_vocab_dir", type=str, required=True)
parser.add_argument("--input_file", type=str, required=True)
parser.add_argument("--model_params", type=str)
parser.add_argument("--ratio_train", type=float,default=0.9)
parser.add_argument("--output_model_dir", type=str, required=True)
parser.add_argument("--output_file_name", type=str, required=True)
parser.add_argument("--model_name", type=str, required=True)
parser.add_argument("--batch_size", type=int, default=2)
parser.add_argument("--leaning_rate", type=int, default=1e-5)
parser.add_argument("--max_epochs", type=int, default=1)
args_list = ["--tokenizer_vocab_dir", "../data/vocab/tokenizer_vocab/","--input_file",
            "../data/preprocessing_shared/encoded_dataset.pkl",
            "--output_model_dir", "../data/train/model/", \
            "--output_file_name", "model.param", \
            "--model_name", "microsoft/layoutlmv3-base"]
args = parser.parse_args(args_list)

In [3]:
if not torch.cuda.is_available():
    raise ValueError("GPU is not available.")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device_ids = list(range(torch.cuda.device_count()))


In [3]:
tokenizer = LayoutLMv3Tokenizer(f"{args.tokenizer_vocab_dir}vocab.json", f"{args.tokenizer_vocab_dir}merges.txt")
ids = range(tokenizer.vocab_size)
vocab = tokenizer.convert_ids_to_tokens(ids)

In [4]:
if not args.model_params is None:
    model = torch.load(args.model_params)
else:
    config = AutoConfig.from_pretrained(args.model_name)
    model = LayoutLMv3forMLM.LayoutLMv3ForMLM(config)
    # Roberta_model = RobertaModel.from_pretrained("roberta-base")
    # ## embedidng 層の重みをRobertaの重みで初期化
    # weight_size = model.state_dict()["model.embeddings.word_embeddings.weight"].shape
    # for i in range(weight_size[0]):
    #   model.state_dict()["model.embeddings.word_embeddings.weight"][i] = \
    #   Roberta_model.state_dict()["embeddings.word_embeddings.weight"][i]

In [6]:
#cpu only , comment out this cell.
model = torch.nn.DataParallel(model, device_ids = device_ids)
model = model.to(f'cuda:{model.device_ids[0]}')

In [5]:
#optimizer 
optimizer = AdamW(model.parameters(), lr=args.leaning_rate, weight_decay=1e-2, betas=(0.9, 0.98))
#cross entropy
loss_fn = torch.nn.CrossEntropyLoss()

In [18]:
with open(args.input_file, 'rb') as f:
    data = pickle.load(f)

In [7]:
#divide into train and valid
n_train = math.floor(len(data) * args.ratio_train)
train_data = data[:n_train]
valid_data = data[n_train:]

In [8]:
len(valid_data)

230

In [9]:
my_dataloader = My_DataLoader.My_Dataloader(vocab)
train_dataloader = my_dataloader(train_data, batch_size=args.batch_size, shuffle=False)
valid_dataloader = my_dataloader(valid_data, batch_size=args.batch_size, shuffle=False)

In [10]:
iter_per_epoch = len(train_dataloader)
num_warmup_steps = round((iter_per_epoch * args.max_epochs) * 0.048)
scheduler = get_constant_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_warmup_steps)

In [11]:
def cal_loss(logits, batch):
    t = []
    for i in range(len(batch["mask_position"])):
        if len(batch["mask_position"][i]) == 0:
            continue
        t.append(logits[i][batch["mask_position"][i]])
    if len(t) == 0:
        return 
    logits = torch.cat(t)
    labels = torch.cat(batch["mask_label"])
    labels = labels.to(f'cuda:{model.device_ids[0]}')
    loss = loss_fn(logits, labels)
    return loss

In [16]:
def validation():
    valid_losses = []
    with torch.no_grad():
        for batch in valid_dataloader:
            inputs = {k: batch[k].to(f"cuda:{model.device_ids[0]}") for k in ["input_ids", "bbox", "pixel_values", "attention_mask"]}
            logits = model.forward(inputs)
            loss = cal_loss(logits, batch)
            if loss is None:
                continue
            valid_losses.append(loss.item())
            print(loss.item())
        return sum(valid_losses) / len(valid_losses)

In [14]:
iter_per_epoch = len(train_dataloader)

In [15]:
losses = []
model.train()
for epoch in range(args.max_epochs):
    for iter, batch in enumerate(train_dataloader):
        # inputs = {k: v.to(f'cuda:{model.device_ids[0]}') for k in ["input_ids, bbox", "pixel_values", "attention_mask"]}
        inputs = {k: batch[k] for k in ["input_ids", "bbox", "pixel_values", "attention_mask"]}
        logits = model.forward(inputs)
        loss = cal_loss(logits, batch)
        if loss is None:
            continue
        # labels = labels.to(f'cuda:{model.device_ids[0]}')
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        losses.append(loss.item())
        if iter % math.floor(iter_per_epoch*0.01) == 0:
            val_loss = validation()
            print(iter, loss.item())
            print(iter,"val", val_loss)



AttributeError: 'LayoutLMv3ForMLM' object has no attribute 'device_ids'

In [27]:
batch["mask_position"]

torch.Size([89])

In [13]:
# losses = []
# model.train()
# for epoch in range(args.max_epochs):
#     for iter, batch in enumerate(dataloader):
#         # inputs = {k: v.to(f'cuda:{model.device_ids[0]}') for k in ["input_ids, bbox", "pixel_values", "attention_mask"]}
#         inputs = {k: batch[k] for k in ["input_ids", "bbox", "pixel_values", "attention_mask"]}
#         logits = model.forward(inputs)
#         t = []
#         for i in range(len(batch["mask_position"])):
#             if len(batch["mask_position"][i]) == 0:
#                 continue
#             t.append(logits[i][batch["mask_position"][i]])
#         logits = torch.cat(t)

#         labels = torch.cat(batch["mask_label"])
#         # labels = labels.to(f'cuda:{model.device_ids[0]}')
        
#         loss = loss_fn(logits, labels)
#         loss.backward()
#         optimizer.step()
#         scheduler.step()
#         optimizer.zero_grad()
#         losses.append(loss.item())
#         if iter % 4 == 0:
#             print(iter, loss.item())

0 10.87540340423584
1 10.913202285766602
2 10.854154586791992
3 11.003539085388184
4 10.977672576904297
5 10.788056373596191
6 11.0016450881958
7 10.83245849609375
8 11.038727760314941
9 10.941429138183594
10 10.746644020080566
11 10.765816688537598
12 10.817831993103027
13 10.815743446350098
14 10.858794212341309
15 10.890091896057129
16 10.817254066467285
17 10.730331420898438
18 10.704963684082031
19 10.630881309509277
20 10.737689971923828
21 10.534788131713867
22 10.640432357788086
23 10.627967834472656
24 10.67199993133545
25 10.671628952026367
26 10.399262428283691
27 10.393364906311035
28 10.457799911499023
29 10.454362869262695
30 10.372509956359863
31 10.474432945251465
32 10.55550765991211
33 10.313406944274902
34 10.34122371673584
35 10.251405715942383
36 10.119297981262207
37 10.092342376708984
38 9.98108959197998
39 10.217385292053223
40 10.075906753540039
41 10.293874740600586
42 10.036898612976074
43 10.015714645385742
44 10.167057991027832
45 10.199872970581055
46 10.4

KeyboardInterrupt: 

In [25]:
torch.save(
    {
        "epoch": args.max_epochs,
        "batch_size": args.batch_size,
        "loss_list": losses,
        "model_state_dict": model.module.to("cpu").state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    },
    f"{args.output_model_dir}{args.output_file_name}",
)     

In [29]:
# state = torch.load(args.output_model_dir+args.output_file_name)