# Test the thesis BERT model

First thing is first let's import some modules that we will need.

In [43]:
!pip install tensorboard



In [1]:
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.3


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: safetensors, huggingface-hub, transformers
Successfully installed huggingface-hub-0.16.4 safetensors-0.3.1 transformers-4.31.0


In [18]:
import os
import sys
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from transformers import RobertaTokenizer, pipeline
from transformers import RobertaConfig, RobertaForMaskedLM, AdamW
from torch.utils.tensorboard import SummaryWriter
import torch
import tqdm

print(sys.path)
print(os.getcwd())

['/content/bert-transformer', '/content', '/env/python', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.10/dist-packages/IPython/extensions', '/root/.ipython']
/content


Now we must mount google drive and then clone the repo containing the code.

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!git clone https://github.com/christianbromley/bert-transformer.git

Cloning into 'bert-transformer'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 45 (delta 21), reused 35 (delta 14), pack-reused 0[K
Unpacking objects: 100% (45/45), 133.45 KiB | 579.00 KiB/s, done.


In [2]:
sys.path.insert(0, '/content/bert-transformer')

from src.make_tokens import extract_file_paths, run as run_tokenizer

Next get the path to our text.

In [8]:
DATA_DIR='/content/bert-transformer/data'

paths = extract_file_paths(DATA_DIR)

print(paths)

['/content/bert-transformer/data/text_0.txt']


In [4]:
TOKENIZER_OUTPUT_DIR = '/content/bert-transformer/models'

run_tokenizer(paths, TOKENIZER_OUTPUT_DIR)

Ok, so we have run our tokenizer and produced the files now contained in the models directory.

Now we must initialise our tokenizer.

In [5]:
# initialize the tokenizer using the tokenizer we created
tokenizer = RobertaTokenizer.from_pretrained(TOKENIZER_OUTPUT_DIR, max_len=512)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


In [47]:
tokenizer('This is a cytokine')

{'input_ids': [0, 56, 76, 77, 87, 225, 77, 87, 225, 69, 225, 71, 93, 88, 83, 79, 77, 82, 73, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Ok, so we have our tokenizer. Now we want to create our tensors. First we need  a function to create our masked language model.

In [6]:
def masked_language_model(tensor):
    rand = torch.rand(tensor.shape)
    mask_arr = (rand < 0.15) * (tensor > 2)
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        tensor[i,selection] = 4
    return tensor

In [9]:
## input ids - token IDs with a % of tokens masked with the mask token ID which in our case is 4
input_ids = []
## mask - this is a binary tensor of 1 and 0 indicating where the masks are
mask = []
## labels - these are just the unmasked token IDs
labels = []
# initialise the output
tokenized_text = {}

# read file
with open(paths[0], 'r', encoding='utf-8') as fp:
    lines = fp.read().split('\n')

# get the file name
fname = paths[0].split('/')[-1]
print(fname)

text_0.txt


In [10]:
# tokenize the text in these lines
sample = tokenizer(lines, max_length=512, padding='max_length', truncation=True, return_tensors='pt')

tokenized_text[fname] = sample

# the sample object contains some of our tensors - extract these

## get the input IDs and append to labels
labels.append(sample.input_ids)

## get the attention mask - the binary
mask.append(sample.attention_mask)


In [9]:
#print(sample.keys())

In [10]:
#print(sample.attention_mask[0])

In [11]:
## now apply the masked language model function on the input IDs to mask 15% of tokens
mlm_on_input = masked_language_model(sample.input_ids.detach().clone())
input_ids.append(mlm_on_input)

In [12]:
# now format tensors
input_ids = torch.cat(input_ids)
mask = torch.cat(mask)
labels = torch.cat(labels)

In [13]:
# create encodings dict
encodings = {
    'input_ids': input_ids,
    'mask': mask,
    'labels': labels
}

In [14]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    def __getitem__(self, i):
        # return {
        #     'input_ids': self.encodings['input_ids'][i]
        # }
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [15]:
dataset = Dataset(encodings)

dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [16]:
# build RoBERTa config
config = RobertaConfig(
    vocab_size=30_522,  # we align this to the tokenizer vocab_size
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)


In [34]:
# init tensorboard writer
writer = SummaryWriter()
# init model
model = RobertaForMaskedLM(config)
# set device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)
# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-4)
# set number of epochs
epochs = 2
# loop through epochs
for epoch in range(epochs):
    print(f'Epoch {str(epoch)}')
    # setup loop with TQDM and dataloader
    loop = tqdm.tqdm(dataloader, leave=False)
    for batch_idx, batch in enumerate(loop):
        #print(batch.keys())
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        #print(batch.keys())
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['mask'].to(device)
        labels = batch['labels'].to(device)
        # model
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        # extract the loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

        writer.add_scalar('Loss', loss.item(), epoch * len(dataloader) + batch_idx)
writer.flush()
writer.close()



Epoch 0




Epoch 1




In [38]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [45]:
!tensorboard --logdir=runs --port=8008 --bind_all

/usr/local/lib/python3.10/dist-packages/tensorboard_data_server/bin/server: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.33' not found (required by /usr/local/lib/python3.10/dist-packages/tensorboard_data_server/bin/server)
/usr/local/lib/python3.10/dist-packages/tensorboard_data_server/bin/server: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by /usr/local/lib/python3.10/dist-packages/tensorboard_data_server/bin/server)
/usr/local/lib/python3.10/dist-packages/tensorboard_data_server/bin/server: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.32' not found (required by /usr/local/lib/python3.10/dist-packages/tensorboard_data_server/bin/server)
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.12.3 at http://localhost:8008/ (Press CTRL+C to quit)
^C


In [35]:
model.save_pretrained(f'/content/bert-transformer/models/thesis_bert')

In [48]:
my_pl = pipeline('fill-mask',
                 model='/content/bert-transformer/models/thesis_bert',
                 tokenizer=tokenizer)

In [51]:
my_pl(f'Cancer is a {my_pl.tokenizer.mask_token}')

[{'score': 0.1518344134092331,
  'token': 225,
  'token_str': ' ',
  'sequence': 'Cancer is a '},
 {'score': 0.087583027780056,
  'token': 88,
  'token_str': 't',
  'sequence': 'Cancer is at'},
 {'score': 0.06038049980998039,
  'token': 73,
  'token_str': 'e',
  'sequence': 'Cancer is ae'},
 {'score': 0.04791000485420227,
  'token': 83,
  'token_str': 'o',
  'sequence': 'Cancer is ao'},
 {'score': 0.0468127578496933,
  'token': 82,
  'token_str': 'n',
  'sequence': 'Cancer is an'}]

In [50]:
my_pl(f'Inflammation promotes {my_pl.tokenizer.mask_token}')

[{'score': 0.17040212452411652,
  'token': 225,
  'token_str': ' ',
  'sequence': 'Inflammation promotes '},
 {'score': 0.07921242713928223,
  'token': 88,
  'token_str': 't',
  'sequence': 'Inflammation promotest'},
 {'score': 0.055338140577077866,
  'token': 73,
  'token_str': 'e',
  'sequence': 'Inflammation promotese'},
 {'score': 0.050345294177532196,
  'token': 77,
  'token_str': 'i',
  'sequence': 'Inflammation promotesi'},
 {'score': 0.049357637763023376,
  'token': 82,
  'token_str': 'n',
  'sequence': 'Inflammation promotesn'}]

In [22]:
def train_model(config, loader_for_data):
    # init model
    model = RobertaForMaskedLM(config)
    # set device
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # and move our model over to the selected device
    model.to(device)
    # activate training mode
    model.train()
    # initialize optimizer
    optim = AdamW(model.parameters(), lr=1e-4)
    # set number of epochs
    epochs = 2
    # loop through epochs
    for epoch in range(epochs):
        print(f'Epoch {str(epoch)}')
        # setup loop with TQDM and dataloader
        loop = tqdm.tqdm(loader_for_data, leave=False)
        for batch in loop:
            print(batch.keys())
            # initialize calculated gradients (from prev step)
            optim.zero_grad()
            # pull all tensor batches required for training
            #print(batch.keys())
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['mask'].to(device)
            labels = batch['labels'].to(device)
            # model
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            # extract the loss
            loss = outputs.loss
            # calculate loss for every parameter that needs grad update
            loss.backward()
            # update parameters
            optim.step()
            # print relevant info to progress bar
            loop.set_description(f'Epoch {epoch}')
            loop.set_postfix(loss=loss.item())

    return model



In [24]:
model = train_model(config=config, loader_for_data=dataloader)

model.save_pretrained(f'/content/bert-transformer/models/thesis_bert')



Epoch 0



  0%|          | 0/117 [00:00<?, ?it/s][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:   0%|          | 0/117 [00:01<?, ?it/s][A
Epoch 0:   0%|          | 0/117 [00:01<?, ?it/s, loss=10.5][A
Epoch 0:   1%|          | 1/117 [00:01<02:19,  1.20s/it, loss=10.5][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:   1%|          | 1/117 [00:02<02:19,  1.20s/it, loss=10.5][A
Epoch 0:   1%|          | 1/117 [00:02<02:19,  1.20s/it, loss=9.81][A
Epoch 0:   2%|▏         | 2/117 [00:02<02:17,  1.19s/it, loss=9.81][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:   2%|▏         | 2/117 [00:03<02:17,  1.19s/it, loss=9.81][A
Epoch 0:   2%|▏         | 2/117 [00:03<02:17,  1.19s/it, loss=9.52][A
Epoch 0:   3%|▎         | 3/117 [00:03<02:15,  1.19s/it, loss=9.52][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:   3%|▎         | 3/117 [00:04<02:15,  1.19s/it, loss=9.52][A
Epoch 0:   3%|▎         | 3/117 [00:04<02:15,  1.19s/it, loss=9.1] [A
Epoch 0:   3%|▎         | 4/117 [00:04<02:15,  1.20s/it, loss=9.1][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:   3%|▎         | 4/117 [00:05<02:15,  1.20s/it, loss=9.1][A
Epoch 0:   3%|▎         | 4/117 [00:06<02:15,  1.20s/it, loss=9.15][A
Epoch 0:   4%|▍         | 5/117 [00:06<02:15,  1.21s/it, loss=9.15][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:   4%|▍         | 5/117 [00:07<02:15,  1.21s/it, loss=9.15][A
Epoch 0:   4%|▍         | 5/117 [00:07<02:15,  1.21s/it, loss=9.01][A
Epoch 0:   5%|▌         | 6/117 [00:07<02:13,  1.21s/it, loss=9.01][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:   5%|▌         | 6/117 [00:08<02:13,  1.21s/it, loss=9.01][A
Epoch 0:   5%|▌         | 6/117 [00:08<02:13,  1.21s/it, loss=8.56][A
Epoch 0:   6%|▌         | 7/117 [00:08<02:13,  1.22s/it, loss=8.56][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:   6%|▌         | 7/117 [00:09<02:13,  1.22s/it, loss=8.56][A
Epoch 0:   6%|▌         | 7/117 [00:09<02:13,  1.22s/it, loss=8.61][A
Epoch 0:   7%|▋         | 8/117 [00:09<02:12,  1.22s/it, loss=8.61][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:   7%|▋         | 8/117 [00:10<02:12,  1.22s/it, loss=8.61][A
Epoch 0:   7%|▋         | 8/117 [00:10<02:12,  1.22s/it, loss=8.27][A
Epoch 0:   8%|▊         | 9/117 [00:10<02:11,  1.21s/it, loss=8.27][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:   8%|▊         | 9/117 [00:12<02:11,  1.21s/it, loss=8.27][A
Epoch 0:   8%|▊         | 9/117 [00:12<02:11,  1.21s/it, loss=8.36][A
Epoch 0:   9%|▊         | 10/117 [00:12<02:10,  1.22s/it, loss=8.36][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:   9%|▊         | 10/117 [00:13<02:10,  1.22s/it, loss=8.36][A
Epoch 0:   9%|▊         | 10/117 [00:13<02:10,  1.22s/it, loss=8.17][A
Epoch 0:   9%|▉         | 11/117 [00:13<02:08,  1.22s/it, loss=8.17][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:   9%|▉         | 11/117 [00:14<02:08,  1.22s/it, loss=8.17][A
Epoch 0:   9%|▉         | 11/117 [00:14<02:08,  1.22s/it, loss=7.83][A
Epoch 0:  10%|█         | 12/117 [00:14<02:07,  1.22s/it, loss=7.83][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  10%|█         | 12/117 [00:15<02:07,  1.22s/it, loss=7.83][A
Epoch 0:  10%|█         | 12/117 [00:15<02:07,  1.22s/it, loss=7.35][A
Epoch 0:  11%|█         | 13/117 [00:15<02:06,  1.22s/it, loss=7.35][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  11%|█         | 13/117 [00:17<02:06,  1.22s/it, loss=7.35][A
Epoch 0:  11%|█         | 13/117 [00:17<02:06,  1.22s/it, loss=7.3] [A
Epoch 0:  12%|█▏        | 14/117 [00:17<02:06,  1.23s/it, loss=7.3][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  12%|█▏        | 14/117 [00:18<02:06,  1.23s/it, loss=7.3][A
Epoch 0:  12%|█▏        | 14/117 [00:18<02:06,  1.23s/it, loss=7.34][A
Epoch 0:  13%|█▎        | 15/117 [00:18<02:05,  1.23s/it, loss=7.34][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  13%|█▎        | 15/117 [00:19<02:05,  1.23s/it, loss=7.34][A
Epoch 0:  13%|█▎        | 15/117 [00:19<02:05,  1.23s/it, loss=6.79][A
Epoch 0:  14%|█▎        | 16/117 [00:19<02:04,  1.24s/it, loss=6.79][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  14%|█▎        | 16/117 [00:20<02:04,  1.24s/it, loss=6.79][A
Epoch 0:  14%|█▎        | 16/117 [00:20<02:04,  1.24s/it, loss=7.74][A
Epoch 0:  15%|█▍        | 17/117 [00:20<02:03,  1.23s/it, loss=7.74][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  15%|█▍        | 17/117 [00:21<02:03,  1.23s/it, loss=7.74][A
Epoch 0:  15%|█▍        | 17/117 [00:21<02:03,  1.23s/it, loss=7.33][A
Epoch 0:  15%|█▌        | 18/117 [00:21<02:01,  1.23s/it, loss=7.33][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  15%|█▌        | 18/117 [00:23<02:01,  1.23s/it, loss=7.33][A
Epoch 0:  15%|█▌        | 18/117 [00:23<02:01,  1.23s/it, loss=6.89][A
Epoch 0:  16%|█▌        | 19/117 [00:23<02:00,  1.23s/it, loss=6.89][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  16%|█▌        | 19/117 [00:24<02:00,  1.23s/it, loss=6.89][A
Epoch 0:  16%|█▌        | 19/117 [00:24<02:00,  1.23s/it, loss=6.69][A
Epoch 0:  17%|█▋        | 20/117 [00:24<01:58,  1.22s/it, loss=6.69][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  17%|█▋        | 20/117 [00:25<01:58,  1.22s/it, loss=6.69][A
Epoch 0:  17%|█▋        | 20/117 [00:25<01:58,  1.22s/it, loss=6.47][A
Epoch 0:  18%|█▊        | 21/117 [00:25<01:57,  1.22s/it, loss=6.47][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  18%|█▊        | 21/117 [00:26<01:57,  1.22s/it, loss=6.47][A
Epoch 0:  18%|█▊        | 21/117 [00:26<01:57,  1.22s/it, loss=6.41][A
Epoch 0:  19%|█▉        | 22/117 [00:26<01:55,  1.22s/it, loss=6.41][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  19%|█▉        | 22/117 [00:27<01:55,  1.22s/it, loss=6.41][A
Epoch 0:  19%|█▉        | 22/117 [00:28<01:55,  1.22s/it, loss=6.37][A
Epoch 0:  20%|█▉        | 23/117 [00:28<01:53,  1.21s/it, loss=6.37][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  20%|█▉        | 23/117 [00:29<01:53,  1.21s/it, loss=6.37][A
Epoch 0:  20%|█▉        | 23/117 [00:29<01:53,  1.21s/it, loss=6.12][A
Epoch 0:  21%|██        | 24/117 [00:29<01:52,  1.21s/it, loss=6.12][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  21%|██        | 24/117 [00:30<01:52,  1.21s/it, loss=6.12][A
Epoch 0:  21%|██        | 24/117 [00:30<01:52,  1.21s/it, loss=5.93][A
Epoch 0:  21%|██▏       | 25/117 [00:30<01:51,  1.21s/it, loss=5.93][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  21%|██▏       | 25/117 [00:31<01:51,  1.21s/it, loss=5.93][A
Epoch 0:  21%|██▏       | 25/117 [00:31<01:51,  1.21s/it, loss=6.01][A
Epoch 0:  22%|██▏       | 26/117 [00:31<01:49,  1.20s/it, loss=6.01][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  22%|██▏       | 26/117 [00:32<01:49,  1.20s/it, loss=6.01][A
Epoch 0:  22%|██▏       | 26/117 [00:32<01:49,  1.20s/it, loss=5.56][A
Epoch 0:  23%|██▎       | 27/117 [00:32<01:47,  1.20s/it, loss=5.56][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  23%|██▎       | 27/117 [00:33<01:47,  1.20s/it, loss=5.56][A
Epoch 0:  23%|██▎       | 27/117 [00:33<01:47,  1.20s/it, loss=5.83][A
Epoch 0:  24%|██▍       | 28/117 [00:33<01:45,  1.19s/it, loss=5.83][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  24%|██▍       | 28/117 [00:35<01:45,  1.19s/it, loss=5.83][A
Epoch 0:  24%|██▍       | 28/117 [00:35<01:45,  1.19s/it, loss=5.4] [A
Epoch 0:  25%|██▍       | 29/117 [00:35<01:44,  1.18s/it, loss=5.4][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  25%|██▍       | 29/117 [00:36<01:44,  1.18s/it, loss=5.4][A
Epoch 0:  25%|██▍       | 29/117 [00:36<01:44,  1.18s/it, loss=5.82][A
Epoch 0:  26%|██▌       | 30/117 [00:36<01:42,  1.18s/it, loss=5.82][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  26%|██▌       | 30/117 [00:37<01:42,  1.18s/it, loss=5.82][A
Epoch 0:  26%|██▌       | 30/117 [00:37<01:42,  1.18s/it, loss=5.76][A
Epoch 0:  26%|██▋       | 31/117 [00:37<01:41,  1.18s/it, loss=5.76][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  26%|██▋       | 31/117 [00:38<01:41,  1.18s/it, loss=5.76][A
Epoch 0:  26%|██▋       | 31/117 [00:38<01:41,  1.18s/it, loss=4.94][A
Epoch 0:  27%|██▋       | 32/117 [00:38<01:39,  1.18s/it, loss=4.94][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  27%|██▋       | 32/117 [00:39<01:39,  1.18s/it, loss=4.94][A
Epoch 0:  27%|██▋       | 32/117 [00:39<01:39,  1.18s/it, loss=5.06][A
Epoch 0:  28%|██▊       | 33/117 [00:39<01:37,  1.17s/it, loss=5.06][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  28%|██▊       | 33/117 [00:40<01:37,  1.17s/it, loss=5.06][A
Epoch 0:  28%|██▊       | 33/117 [00:40<01:37,  1.17s/it, loss=5.1] [A
Epoch 0:  29%|██▉       | 34/117 [00:40<01:36,  1.16s/it, loss=5.1][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  29%|██▉       | 34/117 [00:42<01:36,  1.16s/it, loss=5.1][A
Epoch 0:  29%|██▉       | 34/117 [00:42<01:36,  1.16s/it, loss=5.18][A
Epoch 0:  30%|██▉       | 35/117 [00:42<01:35,  1.16s/it, loss=5.18][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  30%|██▉       | 35/117 [00:43<01:35,  1.16s/it, loss=5.18][A
Epoch 0:  30%|██▉       | 35/117 [00:43<01:35,  1.16s/it, loss=4.99][A
Epoch 0:  31%|███       | 36/117 [00:43<01:33,  1.16s/it, loss=4.99][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  31%|███       | 36/117 [00:44<01:33,  1.16s/it, loss=4.99][A
Epoch 0:  31%|███       | 36/117 [00:44<01:33,  1.16s/it, loss=4.92][A
Epoch 0:  32%|███▏      | 37/117 [00:44<01:32,  1.16s/it, loss=4.92][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  32%|███▏      | 37/117 [00:45<01:32,  1.16s/it, loss=4.92][A
Epoch 0:  32%|███▏      | 37/117 [00:45<01:32,  1.16s/it, loss=4.94][A
Epoch 0:  32%|███▏      | 38/117 [00:45<01:31,  1.16s/it, loss=4.94][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  32%|███▏      | 38/117 [00:46<01:31,  1.16s/it, loss=4.94][A
Epoch 0:  32%|███▏      | 38/117 [00:46<01:31,  1.16s/it, loss=4.58][A
Epoch 0:  33%|███▎      | 39/117 [00:46<01:30,  1.16s/it, loss=4.58][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  33%|███▎      | 39/117 [00:47<01:30,  1.16s/it, loss=4.58][A
Epoch 0:  33%|███▎      | 39/117 [00:47<01:30,  1.16s/it, loss=4.65][A
Epoch 0:  34%|███▍      | 40/117 [00:47<01:28,  1.15s/it, loss=4.65][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  34%|███▍      | 40/117 [00:49<01:28,  1.15s/it, loss=4.65][A
Epoch 0:  34%|███▍      | 40/117 [00:49<01:28,  1.15s/it, loss=4.32][A
Epoch 0:  35%|███▌      | 41/117 [00:49<01:27,  1.16s/it, loss=4.32][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  35%|███▌      | 41/117 [00:50<01:27,  1.16s/it, loss=4.32][A
Epoch 0:  35%|███▌      | 41/117 [00:50<01:27,  1.16s/it, loss=4.49][A
Epoch 0:  36%|███▌      | 42/117 [00:50<01:26,  1.16s/it, loss=4.49][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  36%|███▌      | 42/117 [00:51<01:26,  1.16s/it, loss=4.49][A
Epoch 0:  36%|███▌      | 42/117 [00:51<01:26,  1.16s/it, loss=4.27][A
Epoch 0:  37%|███▋      | 43/117 [00:51<01:25,  1.15s/it, loss=4.27][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  37%|███▋      | 43/117 [00:52<01:25,  1.15s/it, loss=4.27][A
Epoch 0:  37%|███▋      | 43/117 [00:52<01:25,  1.15s/it, loss=4.08][A
Epoch 0:  38%|███▊      | 44/117 [00:52<01:24,  1.15s/it, loss=4.08][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  38%|███▊      | 44/117 [00:53<01:24,  1.15s/it, loss=4.08][A
Epoch 0:  38%|███▊      | 44/117 [00:53<01:24,  1.15s/it, loss=4.36][A
Epoch 0:  38%|███▊      | 45/117 [00:53<01:22,  1.15s/it, loss=4.36][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  38%|███▊      | 45/117 [00:54<01:22,  1.15s/it, loss=4.36][A
Epoch 0:  38%|███▊      | 45/117 [00:54<01:22,  1.15s/it, loss=4.05][A
Epoch 0:  39%|███▉      | 46/117 [00:54<01:21,  1.15s/it, loss=4.05][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  39%|███▉      | 46/117 [00:55<01:21,  1.15s/it, loss=4.05][A
Epoch 0:  39%|███▉      | 46/117 [00:55<01:21,  1.15s/it, loss=4.04][A
Epoch 0:  40%|████      | 47/117 [00:55<01:20,  1.14s/it, loss=4.04][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  40%|████      | 47/117 [00:57<01:20,  1.14s/it, loss=4.04][A
Epoch 0:  40%|████      | 47/117 [00:57<01:20,  1.14s/it, loss=3.77][A
Epoch 0:  41%|████      | 48/117 [00:57<01:18,  1.14s/it, loss=3.77][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  41%|████      | 48/117 [00:58<01:18,  1.14s/it, loss=3.77][A
Epoch 0:  41%|████      | 48/117 [00:58<01:18,  1.14s/it, loss=3.44][A
Epoch 0:  42%|████▏     | 49/117 [00:58<01:17,  1.15s/it, loss=3.44][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  42%|████▏     | 49/117 [00:59<01:17,  1.15s/it, loss=3.44][A
Epoch 0:  42%|████▏     | 49/117 [00:59<01:17,  1.15s/it, loss=3.66][A
Epoch 0:  43%|████▎     | 50/117 [00:59<01:16,  1.14s/it, loss=3.66][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  43%|████▎     | 50/117 [01:00<01:16,  1.14s/it, loss=3.66][A
Epoch 0:  43%|████▎     | 50/117 [01:00<01:16,  1.14s/it, loss=3.2] [A
Epoch 0:  44%|████▎     | 51/117 [01:00<01:15,  1.14s/it, loss=3.2][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  44%|████▎     | 51/117 [01:01<01:15,  1.14s/it, loss=3.2][A
Epoch 0:  44%|████▎     | 51/117 [01:01<01:15,  1.14s/it, loss=3.4][A
Epoch 0:  44%|████▍     | 52/117 [01:01<01:14,  1.14s/it, loss=3.4][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  44%|████▍     | 52/117 [01:02<01:14,  1.14s/it, loss=3.4][A
Epoch 0:  44%|████▍     | 52/117 [01:02<01:14,  1.14s/it, loss=3.38][A
Epoch 0:  45%|████▌     | 53/117 [01:02<01:13,  1.14s/it, loss=3.38][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  45%|████▌     | 53/117 [01:03<01:13,  1.14s/it, loss=3.38][A
Epoch 0:  45%|████▌     | 53/117 [01:03<01:13,  1.14s/it, loss=3.2] [A
Epoch 0:  46%|████▌     | 54/117 [01:03<01:12,  1.14s/it, loss=3.2][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  46%|████▌     | 54/117 [01:05<01:12,  1.14s/it, loss=3.2][A
Epoch 0:  46%|████▌     | 54/117 [01:05<01:12,  1.14s/it, loss=3.11][A
Epoch 0:  47%|████▋     | 55/117 [01:05<01:10,  1.15s/it, loss=3.11][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  47%|████▋     | 55/117 [01:06<01:10,  1.15s/it, loss=3.11][A
Epoch 0:  47%|████▋     | 55/117 [01:06<01:10,  1.15s/it, loss=2.74][A
Epoch 0:  48%|████▊     | 56/117 [01:06<01:09,  1.15s/it, loss=2.74][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  48%|████▊     | 56/117 [01:07<01:09,  1.15s/it, loss=2.74][A
Epoch 0:  48%|████▊     | 56/117 [01:07<01:09,  1.15s/it, loss=2.83][A
Epoch 0:  49%|████▊     | 57/117 [01:07<01:08,  1.15s/it, loss=2.83][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  49%|████▊     | 57/117 [01:08<01:08,  1.15s/it, loss=2.83][A
Epoch 0:  49%|████▊     | 57/117 [01:08<01:08,  1.15s/it, loss=2.6] [A
Epoch 0:  50%|████▉     | 58/117 [01:08<01:07,  1.15s/it, loss=2.6][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  50%|████▉     | 58/117 [01:09<01:07,  1.15s/it, loss=2.6][A
Epoch 0:  50%|████▉     | 58/117 [01:09<01:07,  1.15s/it, loss=2.77][A
Epoch 0:  50%|█████     | 59/117 [01:09<01:06,  1.15s/it, loss=2.77][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  50%|█████     | 59/117 [01:10<01:06,  1.15s/it, loss=2.77][A
Epoch 0:  50%|█████     | 59/117 [01:10<01:06,  1.15s/it, loss=2.55][A
Epoch 0:  51%|█████▏    | 60/117 [01:10<01:05,  1.15s/it, loss=2.55][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  51%|█████▏    | 60/117 [01:11<01:05,  1.15s/it, loss=2.55][A
Epoch 0:  51%|█████▏    | 60/117 [01:11<01:05,  1.15s/it, loss=2.43][A
Epoch 0:  52%|█████▏    | 61/117 [01:11<01:04,  1.15s/it, loss=2.43][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  52%|█████▏    | 61/117 [01:13<01:04,  1.15s/it, loss=2.43][A
Epoch 0:  52%|█████▏    | 61/117 [01:13<01:04,  1.15s/it, loss=2.4] [A
Epoch 0:  53%|█████▎    | 62/117 [01:13<01:03,  1.15s/it, loss=2.4][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  53%|█████▎    | 62/117 [01:14<01:03,  1.15s/it, loss=2.4][A
Epoch 0:  53%|█████▎    | 62/117 [01:14<01:03,  1.15s/it, loss=2.22][A
Epoch 0:  54%|█████▍    | 63/117 [01:14<01:02,  1.15s/it, loss=2.22][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  54%|█████▍    | 63/117 [01:15<01:02,  1.15s/it, loss=2.22][A
Epoch 0:  54%|█████▍    | 63/117 [01:15<01:02,  1.15s/it, loss=2.36][A
Epoch 0:  55%|█████▍    | 64/117 [01:15<01:01,  1.15s/it, loss=2.36][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  55%|█████▍    | 64/117 [01:16<01:01,  1.15s/it, loss=2.36][A
Epoch 0:  55%|█████▍    | 64/117 [01:16<01:01,  1.15s/it, loss=2.25][A
Epoch 0:  56%|█████▌    | 65/117 [01:16<01:00,  1.15s/it, loss=2.25][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  56%|█████▌    | 65/117 [01:17<01:00,  1.15s/it, loss=2.25][A
Epoch 0:  56%|█████▌    | 65/117 [01:17<01:00,  1.15s/it, loss=2.05][A
Epoch 0:  56%|█████▋    | 66/117 [01:17<00:58,  1.16s/it, loss=2.05][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  56%|█████▋    | 66/117 [01:18<00:58,  1.16s/it, loss=2.05][A
Epoch 0:  56%|█████▋    | 66/117 [01:18<00:58,  1.16s/it, loss=2.07][A
Epoch 0:  57%|█████▋    | 67/117 [01:18<00:57,  1.16s/it, loss=2.07][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  57%|█████▋    | 67/117 [01:20<00:57,  1.16s/it, loss=2.07][A
Epoch 0:  57%|█████▋    | 67/117 [01:20<00:57,  1.16s/it, loss=1.92][A
Epoch 0:  58%|█████▊    | 68/117 [01:20<00:56,  1.16s/it, loss=1.92][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  58%|█████▊    | 68/117 [01:21<00:56,  1.16s/it, loss=1.92][A
Epoch 0:  58%|█████▊    | 68/117 [01:21<00:56,  1.16s/it, loss=1.89][A
Epoch 0:  59%|█████▉    | 69/117 [01:21<00:55,  1.16s/it, loss=1.89][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  59%|█████▉    | 69/117 [01:22<00:55,  1.16s/it, loss=1.89][A
Epoch 0:  59%|█████▉    | 69/117 [01:22<00:55,  1.16s/it, loss=1.81][A
Epoch 0:  60%|█████▉    | 70/117 [01:22<00:54,  1.16s/it, loss=1.81][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  60%|█████▉    | 70/117 [01:23<00:54,  1.16s/it, loss=1.81][A
Epoch 0:  60%|█████▉    | 70/117 [01:23<00:54,  1.16s/it, loss=1.63][A
Epoch 0:  61%|██████    | 71/117 [01:23<00:53,  1.16s/it, loss=1.63][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  61%|██████    | 71/117 [01:24<00:53,  1.16s/it, loss=1.63][A
Epoch 0:  61%|██████    | 71/117 [01:24<00:53,  1.16s/it, loss=1.59][A
Epoch 0:  62%|██████▏   | 72/117 [01:24<00:52,  1.17s/it, loss=1.59][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  62%|██████▏   | 72/117 [01:25<00:52,  1.17s/it, loss=1.59][A
Epoch 0:  62%|██████▏   | 72/117 [01:25<00:52,  1.17s/it, loss=1.46][A
Epoch 0:  62%|██████▏   | 73/117 [01:25<00:51,  1.17s/it, loss=1.46][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  62%|██████▏   | 73/117 [01:27<00:51,  1.17s/it, loss=1.46][A
Epoch 0:  62%|██████▏   | 73/117 [01:27<00:51,  1.17s/it, loss=1.41][A
Epoch 0:  63%|██████▎   | 74/117 [01:27<00:50,  1.17s/it, loss=1.41][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  63%|██████▎   | 74/117 [01:28<00:50,  1.17s/it, loss=1.41][A
Epoch 0:  63%|██████▎   | 74/117 [01:28<00:50,  1.17s/it, loss=1.33][A
Epoch 0:  64%|██████▍   | 75/117 [01:28<00:49,  1.17s/it, loss=1.33][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  64%|██████▍   | 75/117 [01:29<00:49,  1.17s/it, loss=1.33][A
Epoch 0:  64%|██████▍   | 75/117 [01:29<00:49,  1.17s/it, loss=1.28][A
Epoch 0:  65%|██████▍   | 76/117 [01:29<00:48,  1.17s/it, loss=1.28][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  65%|██████▍   | 76/117 [01:30<00:48,  1.17s/it, loss=1.28][A
Epoch 0:  65%|██████▍   | 76/117 [01:30<00:48,  1.17s/it, loss=1.19][A
Epoch 0:  66%|██████▌   | 77/117 [01:30<00:46,  1.17s/it, loss=1.19][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  66%|██████▌   | 77/117 [01:31<00:46,  1.17s/it, loss=1.19][A
Epoch 0:  66%|██████▌   | 77/117 [01:31<00:46,  1.17s/it, loss=1.2] [A
Epoch 0:  67%|██████▋   | 78/117 [01:31<00:45,  1.17s/it, loss=1.2][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  67%|██████▋   | 78/117 [01:32<00:45,  1.17s/it, loss=1.2][A
Epoch 0:  67%|██████▋   | 78/117 [01:32<00:45,  1.17s/it, loss=1.08][A
Epoch 0:  68%|██████▊   | 79/117 [01:32<00:44,  1.17s/it, loss=1.08][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  68%|██████▊   | 79/117 [01:34<00:44,  1.17s/it, loss=1.08][A
Epoch 0:  68%|██████▊   | 79/117 [01:34<00:44,  1.17s/it, loss=1.04][A
Epoch 0:  68%|██████▊   | 80/117 [01:34<00:43,  1.17s/it, loss=1.04][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  68%|██████▊   | 80/117 [01:35<00:43,  1.17s/it, loss=1.04][A
Epoch 0:  68%|██████▊   | 80/117 [01:35<00:43,  1.17s/it, loss=0.946][A
Epoch 0:  69%|██████▉   | 81/117 [01:35<00:42,  1.18s/it, loss=0.946][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  69%|██████▉   | 81/117 [01:36<00:42,  1.18s/it, loss=0.946][A
Epoch 0:  69%|██████▉   | 81/117 [01:36<00:42,  1.18s/it, loss=0.875][A
Epoch 0:  70%|███████   | 82/117 [01:36<00:41,  1.18s/it, loss=0.875][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  70%|███████   | 82/117 [01:37<00:41,  1.18s/it, loss=0.875][A
Epoch 0:  70%|███████   | 82/117 [01:37<00:41,  1.18s/it, loss=0.851][A
Epoch 0:  71%|███████   | 83/117 [01:37<00:40,  1.18s/it, loss=0.851][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  71%|███████   | 83/117 [01:38<00:40,  1.18s/it, loss=0.851][A
Epoch 0:  71%|███████   | 83/117 [01:38<00:40,  1.18s/it, loss=0.788][A
Epoch 0:  72%|███████▏  | 84/117 [01:38<00:38,  1.18s/it, loss=0.788][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  72%|███████▏  | 84/117 [01:40<00:38,  1.18s/it, loss=0.788][A
Epoch 0:  72%|███████▏  | 84/117 [01:40<00:38,  1.18s/it, loss=0.75] [A
Epoch 0:  73%|███████▎  | 85/117 [01:40<00:37,  1.18s/it, loss=0.75][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  73%|███████▎  | 85/117 [01:41<00:37,  1.18s/it, loss=0.75][A
Epoch 0:  73%|███████▎  | 85/117 [01:41<00:37,  1.18s/it, loss=0.736][A
Epoch 0:  74%|███████▎  | 86/117 [01:41<00:36,  1.18s/it, loss=0.736][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  74%|███████▎  | 86/117 [01:42<00:36,  1.18s/it, loss=0.736][A
Epoch 0:  74%|███████▎  | 86/117 [01:42<00:36,  1.18s/it, loss=0.677][A
Epoch 0:  74%|███████▍  | 87/117 [01:42<00:35,  1.18s/it, loss=0.677][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  74%|███████▍  | 87/117 [01:43<00:35,  1.18s/it, loss=0.677][A
Epoch 0:  74%|███████▍  | 87/117 [01:43<00:35,  1.18s/it, loss=0.662][A
Epoch 0:  75%|███████▌  | 88/117 [01:43<00:34,  1.18s/it, loss=0.662][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  75%|███████▌  | 88/117 [01:44<00:34,  1.18s/it, loss=0.662][A
Epoch 0:  75%|███████▌  | 88/117 [01:44<00:34,  1.18s/it, loss=0.605][A
Epoch 0:  76%|███████▌  | 89/117 [01:44<00:33,  1.18s/it, loss=0.605][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  76%|███████▌  | 89/117 [01:45<00:33,  1.18s/it, loss=0.605][A
Epoch 0:  76%|███████▌  | 89/117 [01:45<00:33,  1.18s/it, loss=0.583][A
Epoch 0:  77%|███████▋  | 90/117 [01:45<00:32,  1.19s/it, loss=0.583][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  77%|███████▋  | 90/117 [01:47<00:32,  1.19s/it, loss=0.583][A
Epoch 0:  77%|███████▋  | 90/117 [01:47<00:32,  1.19s/it, loss=0.546][A
Epoch 0:  78%|███████▊  | 91/117 [01:47<00:30,  1.18s/it, loss=0.546][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  78%|███████▊  | 91/117 [01:48<00:30,  1.18s/it, loss=0.546][A
Epoch 0:  78%|███████▊  | 91/117 [01:48<00:30,  1.18s/it, loss=0.533][A
Epoch 0:  79%|███████▊  | 92/117 [01:48<00:29,  1.18s/it, loss=0.533][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  79%|███████▊  | 92/117 [01:49<00:29,  1.18s/it, loss=0.533][A
Epoch 0:  79%|███████▊  | 92/117 [01:49<00:29,  1.18s/it, loss=0.499][A
Epoch 0:  79%|███████▉  | 93/117 [01:49<00:28,  1.18s/it, loss=0.499][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  79%|███████▉  | 93/117 [01:50<00:28,  1.18s/it, loss=0.499][A
Epoch 0:  79%|███████▉  | 93/117 [01:50<00:28,  1.18s/it, loss=0.5]  [A
Epoch 0:  80%|████████  | 94/117 [01:50<00:27,  1.18s/it, loss=0.5][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  80%|████████  | 94/117 [01:51<00:27,  1.18s/it, loss=0.5][A
Epoch 0:  80%|████████  | 94/117 [01:51<00:27,  1.18s/it, loss=0.446][A
Epoch 0:  81%|████████  | 95/117 [01:51<00:25,  1.18s/it, loss=0.446][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  81%|████████  | 95/117 [01:53<00:25,  1.18s/it, loss=0.446][A
Epoch 0:  81%|████████  | 95/117 [01:53<00:25,  1.18s/it, loss=0.464][A
Epoch 0:  82%|████████▏ | 96/117 [01:53<00:24,  1.18s/it, loss=0.464][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  82%|████████▏ | 96/117 [01:54<00:24,  1.18s/it, loss=0.464][A
Epoch 0:  82%|████████▏ | 96/117 [01:54<00:24,  1.18s/it, loss=0.417][A
Epoch 0:  83%|████████▎ | 97/117 [01:54<00:23,  1.18s/it, loss=0.417][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  83%|████████▎ | 97/117 [01:55<00:23,  1.18s/it, loss=0.417][A
Epoch 0:  83%|████████▎ | 97/117 [01:55<00:23,  1.18s/it, loss=0.444][A
Epoch 0:  84%|████████▍ | 98/117 [01:55<00:22,  1.18s/it, loss=0.444][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  84%|████████▍ | 98/117 [01:56<00:22,  1.18s/it, loss=0.444][A
Epoch 0:  84%|████████▍ | 98/117 [01:56<00:22,  1.18s/it, loss=0.371][A
Epoch 0:  85%|████████▍ | 99/117 [01:56<00:21,  1.18s/it, loss=0.371][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  85%|████████▍ | 99/117 [01:57<00:21,  1.18s/it, loss=0.371][A
Epoch 0:  85%|████████▍ | 99/117 [01:57<00:21,  1.18s/it, loss=0.364][A
Epoch 0:  85%|████████▌ | 100/117 [01:57<00:19,  1.18s/it, loss=0.364][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  85%|████████▌ | 100/117 [01:58<00:19,  1.18s/it, loss=0.364][A
Epoch 0:  85%|████████▌ | 100/117 [01:58<00:19,  1.18s/it, loss=0.371][A
Epoch 0:  86%|████████▋ | 101/117 [01:58<00:18,  1.18s/it, loss=0.371][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  86%|████████▋ | 101/117 [02:00<00:18,  1.18s/it, loss=0.371][A
Epoch 0:  86%|████████▋ | 101/117 [02:00<00:18,  1.18s/it, loss=0.385][A
Epoch 0:  87%|████████▋ | 102/117 [02:00<00:17,  1.17s/it, loss=0.385][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  87%|████████▋ | 102/117 [02:01<00:17,  1.17s/it, loss=0.385][A
Epoch 0:  87%|████████▋ | 102/117 [02:01<00:17,  1.17s/it, loss=0.356][A
Epoch 0:  88%|████████▊ | 103/117 [02:01<00:16,  1.17s/it, loss=0.356][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  88%|████████▊ | 103/117 [02:02<00:16,  1.17s/it, loss=0.356][A
Epoch 0:  88%|████████▊ | 103/117 [02:02<00:16,  1.17s/it, loss=0.333][A
Epoch 0:  89%|████████▉ | 104/117 [02:02<00:15,  1.17s/it, loss=0.333][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  89%|████████▉ | 104/117 [02:03<00:15,  1.17s/it, loss=0.333][A
Epoch 0:  89%|████████▉ | 104/117 [02:03<00:15,  1.17s/it, loss=0.347][A
Epoch 0:  90%|████████▉ | 105/117 [02:03<00:14,  1.17s/it, loss=0.347][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  90%|████████▉ | 105/117 [02:04<00:14,  1.17s/it, loss=0.347][A
Epoch 0:  90%|████████▉ | 105/117 [02:04<00:14,  1.17s/it, loss=0.318][A
Epoch 0:  91%|█████████ | 106/117 [02:04<00:12,  1.17s/it, loss=0.318][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  91%|█████████ | 106/117 [02:05<00:12,  1.17s/it, loss=0.318][A
Epoch 0:  91%|█████████ | 106/117 [02:05<00:12,  1.17s/it, loss=0.325][A
Epoch 0:  91%|█████████▏| 107/117 [02:05<00:11,  1.17s/it, loss=0.325][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  91%|█████████▏| 107/117 [02:07<00:11,  1.17s/it, loss=0.325][A
Epoch 0:  91%|█████████▏| 107/117 [02:07<00:11,  1.17s/it, loss=0.311][A
Epoch 0:  92%|█████████▏| 108/117 [02:07<00:10,  1.17s/it, loss=0.311][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  92%|█████████▏| 108/117 [02:08<00:10,  1.17s/it, loss=0.311][A
Epoch 0:  92%|█████████▏| 108/117 [02:08<00:10,  1.17s/it, loss=0.295][A
Epoch 0:  93%|█████████▎| 109/117 [02:08<00:09,  1.17s/it, loss=0.295][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  93%|█████████▎| 109/117 [02:09<00:09,  1.17s/it, loss=0.295][A
Epoch 0:  93%|█████████▎| 109/117 [02:09<00:09,  1.17s/it, loss=0.328][A
Epoch 0:  94%|█████████▍| 110/117 [02:09<00:08,  1.17s/it, loss=0.328][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  94%|█████████▍| 110/117 [02:10<00:08,  1.17s/it, loss=0.328][A
Epoch 0:  94%|█████████▍| 110/117 [02:10<00:08,  1.17s/it, loss=0.311][A
Epoch 0:  95%|█████████▍| 111/117 [02:10<00:06,  1.17s/it, loss=0.311][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  95%|█████████▍| 111/117 [02:11<00:06,  1.17s/it, loss=0.311][A
Epoch 0:  95%|█████████▍| 111/117 [02:11<00:06,  1.17s/it, loss=0.316][A
Epoch 0:  96%|█████████▌| 112/117 [02:11<00:05,  1.17s/it, loss=0.316][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  96%|█████████▌| 112/117 [02:12<00:05,  1.17s/it, loss=0.316][A
Epoch 0:  96%|█████████▌| 112/117 [02:12<00:05,  1.17s/it, loss=0.288][A
Epoch 0:  97%|█████████▋| 113/117 [02:12<00:04,  1.16s/it, loss=0.288][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  97%|█████████▋| 113/117 [02:14<00:04,  1.16s/it, loss=0.288][A
Epoch 0:  97%|█████████▋| 113/117 [02:14<00:04,  1.16s/it, loss=0.278][A
Epoch 0:  97%|█████████▋| 114/117 [02:14<00:03,  1.16s/it, loss=0.278][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  97%|█████████▋| 114/117 [02:15<00:03,  1.16s/it, loss=0.278][A
Epoch 0:  97%|█████████▋| 114/117 [02:15<00:03,  1.16s/it, loss=0.259][A
Epoch 0:  98%|█████████▊| 115/117 [02:15<00:02,  1.16s/it, loss=0.259][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  98%|█████████▊| 115/117 [02:16<00:02,  1.16s/it, loss=0.259][A
Epoch 0:  98%|█████████▊| 115/117 [02:16<00:02,  1.16s/it, loss=0.306][A
Epoch 0:  99%|█████████▉| 116/117 [02:16<00:01,  1.16s/it, loss=0.306][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 0:  99%|█████████▉| 116/117 [02:17<00:01,  1.16s/it, loss=0.306][A
Epoch 0:  99%|█████████▉| 116/117 [02:17<00:01,  1.16s/it, loss=0.3]  [A
Epoch 0: 100%|██████████| 117/117 [02:17<00:00,  1.07s/it, loss=0.3][A
                                                                    [A

Epoch 1



  0%|          | 0/117 [00:00<?, ?it/s][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:   0%|          | 0/117 [00:01<?, ?it/s][A
Epoch 1:   0%|          | 0/117 [00:01<?, ?it/s, loss=0.275][A
Epoch 1:   1%|          | 1/117 [00:01<02:11,  1.14s/it, loss=0.275][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:   1%|          | 1/117 [00:02<02:11,  1.14s/it, loss=0.275][A
Epoch 1:   1%|          | 1/117 [00:02<02:11,  1.14s/it, loss=0.287][A
Epoch 1:   2%|▏         | 2/117 [00:02<02:12,  1.15s/it, loss=0.287][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:   2%|▏         | 2/117 [00:03<02:12,  1.15s/it, loss=0.287][A
Epoch 1:   2%|▏         | 2/117 [00:03<02:12,  1.15s/it, loss=0.274][A
Epoch 1:   3%|▎         | 3/117 [00:03<02:11,  1.15s/it, loss=0.274][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:   3%|▎         | 3/117 [00:04<02:11,  1.15s/it, loss=0.274][A
Epoch 1:   3%|▎         | 3/117 [00:04<02:11,  1.15s/it, loss=0.284][A
Epoch 1:   3%|▎         | 4/117 [00:04<02:10,  1.15s/it, loss=0.284][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:   3%|▎         | 4/117 [00:05<02:10,  1.15s/it, loss=0.284][A
Epoch 1:   3%|▎         | 4/117 [00:05<02:10,  1.15s/it, loss=0.242][A
Epoch 1:   4%|▍         | 5/117 [00:05<02:09,  1.16s/it, loss=0.242][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:   4%|▍         | 5/117 [00:06<02:09,  1.16s/it, loss=0.242][A
Epoch 1:   4%|▍         | 5/117 [00:06<02:09,  1.16s/it, loss=0.283][A
Epoch 1:   5%|▌         | 6/117 [00:06<02:08,  1.16s/it, loss=0.283][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:   5%|▌         | 6/117 [00:08<02:08,  1.16s/it, loss=0.283][A
Epoch 1:   5%|▌         | 6/117 [00:08<02:08,  1.16s/it, loss=0.226][A
Epoch 1:   6%|▌         | 7/117 [00:08<02:07,  1.16s/it, loss=0.226][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:   6%|▌         | 7/117 [00:09<02:07,  1.16s/it, loss=0.226][A
Epoch 1:   6%|▌         | 7/117 [00:09<02:07,  1.16s/it, loss=0.219][A
Epoch 1:   7%|▋         | 8/117 [00:09<02:06,  1.16s/it, loss=0.219][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:   7%|▋         | 8/117 [00:10<02:06,  1.16s/it, loss=0.219][A
Epoch 1:   7%|▋         | 8/117 [00:10<02:06,  1.16s/it, loss=0.232][A
Epoch 1:   8%|▊         | 9/117 [00:10<02:04,  1.16s/it, loss=0.232][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:   8%|▊         | 9/117 [00:11<02:04,  1.16s/it, loss=0.232][A
Epoch 1:   8%|▊         | 9/117 [00:11<02:04,  1.16s/it, loss=0.268][A
Epoch 1:   9%|▊         | 10/117 [00:11<02:04,  1.16s/it, loss=0.268][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:   9%|▊         | 10/117 [00:12<02:04,  1.16s/it, loss=0.268][A
Epoch 1:   9%|▊         | 10/117 [00:12<02:04,  1.16s/it, loss=0.274][A
Epoch 1:   9%|▉         | 11/117 [00:12<02:02,  1.16s/it, loss=0.274][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:   9%|▉         | 11/117 [00:13<02:02,  1.16s/it, loss=0.274][A
Epoch 1:   9%|▉         | 11/117 [00:13<02:02,  1.16s/it, loss=0.238][A
Epoch 1:  10%|█         | 12/117 [00:13<02:01,  1.16s/it, loss=0.238][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  10%|█         | 12/117 [00:15<02:01,  1.16s/it, loss=0.238][A
Epoch 1:  10%|█         | 12/117 [00:15<02:01,  1.16s/it, loss=0.235][A
Epoch 1:  11%|█         | 13/117 [00:15<02:00,  1.16s/it, loss=0.235][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  11%|█         | 13/117 [00:16<02:00,  1.16s/it, loss=0.235][A
Epoch 1:  11%|█         | 13/117 [00:16<02:00,  1.16s/it, loss=0.248][A
Epoch 1:  12%|█▏        | 14/117 [00:16<01:59,  1.16s/it, loss=0.248][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  12%|█▏        | 14/117 [00:17<01:59,  1.16s/it, loss=0.248][A
Epoch 1:  12%|█▏        | 14/117 [00:17<01:59,  1.16s/it, loss=0.223][A
Epoch 1:  13%|█▎        | 15/117 [00:17<01:58,  1.16s/it, loss=0.223][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  13%|█▎        | 15/117 [00:18<01:58,  1.16s/it, loss=0.223][A
Epoch 1:  13%|█▎        | 15/117 [00:18<01:58,  1.16s/it, loss=0.276][A
Epoch 1:  14%|█▎        | 16/117 [00:18<01:57,  1.16s/it, loss=0.276][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  14%|█▎        | 16/117 [00:19<01:57,  1.16s/it, loss=0.276][A
Epoch 1:  14%|█▎        | 16/117 [00:19<01:57,  1.16s/it, loss=0.223][A
Epoch 1:  15%|█▍        | 17/117 [00:19<01:56,  1.16s/it, loss=0.223][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  15%|█▍        | 17/117 [00:20<01:56,  1.16s/it, loss=0.223][A
Epoch 1:  15%|█▍        | 17/117 [00:20<01:56,  1.16s/it, loss=0.212][A
Epoch 1:  15%|█▌        | 18/117 [00:20<01:54,  1.16s/it, loss=0.212][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  15%|█▌        | 18/117 [00:21<01:54,  1.16s/it, loss=0.212][A
Epoch 1:  15%|█▌        | 18/117 [00:22<01:54,  1.16s/it, loss=0.293][A
Epoch 1:  16%|█▌        | 19/117 [00:22<01:54,  1.16s/it, loss=0.293][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  16%|█▌        | 19/117 [00:23<01:54,  1.16s/it, loss=0.293][A
Epoch 1:  16%|█▌        | 19/117 [00:23<01:54,  1.16s/it, loss=0.261][A
Epoch 1:  17%|█▋        | 20/117 [00:23<01:53,  1.17s/it, loss=0.261][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  17%|█▋        | 20/117 [00:24<01:53,  1.17s/it, loss=0.261][A
Epoch 1:  17%|█▋        | 20/117 [00:24<01:53,  1.17s/it, loss=0.245][A
Epoch 1:  18%|█▊        | 21/117 [00:24<01:52,  1.17s/it, loss=0.245][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  18%|█▊        | 21/117 [00:25<01:52,  1.17s/it, loss=0.245][A
Epoch 1:  18%|█▊        | 21/117 [00:25<01:52,  1.17s/it, loss=0.237][A
Epoch 1:  19%|█▉        | 22/117 [00:25<01:50,  1.17s/it, loss=0.237][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  19%|█▉        | 22/117 [00:26<01:50,  1.17s/it, loss=0.237][A
Epoch 1:  19%|█▉        | 22/117 [00:26<01:50,  1.17s/it, loss=0.257][A
Epoch 1:  20%|█▉        | 23/117 [00:26<01:49,  1.16s/it, loss=0.257][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  20%|█▉        | 23/117 [00:27<01:49,  1.16s/it, loss=0.257][A
Epoch 1:  20%|█▉        | 23/117 [00:27<01:49,  1.16s/it, loss=0.244][A
Epoch 1:  21%|██        | 24/117 [00:27<01:48,  1.16s/it, loss=0.244][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  21%|██        | 24/117 [00:28<01:48,  1.16s/it, loss=0.244][A
Epoch 1:  21%|██        | 24/117 [00:28<01:48,  1.16s/it, loss=0.215][A
Epoch 1:  21%|██▏       | 25/117 [00:29<01:47,  1.16s/it, loss=0.215][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  21%|██▏       | 25/117 [00:30<01:47,  1.16s/it, loss=0.215][A
Epoch 1:  21%|██▏       | 25/117 [00:30<01:47,  1.16s/it, loss=0.201][A
Epoch 1:  22%|██▏       | 26/117 [00:30<01:45,  1.16s/it, loss=0.201][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  22%|██▏       | 26/117 [00:31<01:45,  1.16s/it, loss=0.201][A
Epoch 1:  22%|██▏       | 26/117 [00:31<01:45,  1.16s/it, loss=0.182][A
Epoch 1:  23%|██▎       | 27/117 [00:31<01:44,  1.16s/it, loss=0.182][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  23%|██▎       | 27/117 [00:32<01:44,  1.16s/it, loss=0.182][A
Epoch 1:  23%|██▎       | 27/117 [00:32<01:44,  1.16s/it, loss=0.19] [A
Epoch 1:  24%|██▍       | 28/117 [00:32<01:43,  1.16s/it, loss=0.19][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  24%|██▍       | 28/117 [00:33<01:43,  1.16s/it, loss=0.19][A
Epoch 1:  24%|██▍       | 28/117 [00:33<01:43,  1.16s/it, loss=0.259][A
Epoch 1:  25%|██▍       | 29/117 [00:33<01:42,  1.16s/it, loss=0.259][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  25%|██▍       | 29/117 [00:34<01:42,  1.16s/it, loss=0.259][A
Epoch 1:  25%|██▍       | 29/117 [00:34<01:42,  1.16s/it, loss=0.236][A
Epoch 1:  26%|██▌       | 30/117 [00:34<01:41,  1.17s/it, loss=0.236][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  26%|██▌       | 30/117 [00:35<01:41,  1.17s/it, loss=0.236][A
Epoch 1:  26%|██▌       | 30/117 [00:35<01:41,  1.17s/it, loss=0.256][A
Epoch 1:  26%|██▋       | 31/117 [00:35<01:40,  1.17s/it, loss=0.256][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  26%|██▋       | 31/117 [00:37<01:40,  1.17s/it, loss=0.256][A
Epoch 1:  26%|██▋       | 31/117 [00:37<01:40,  1.17s/it, loss=0.224][A
Epoch 1:  27%|██▋       | 32/117 [00:37<01:38,  1.16s/it, loss=0.224][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  27%|██▋       | 32/117 [00:38<01:38,  1.16s/it, loss=0.224][A
Epoch 1:  27%|██▋       | 32/117 [00:38<01:38,  1.16s/it, loss=0.261][A
Epoch 1:  28%|██▊       | 33/117 [00:38<01:37,  1.17s/it, loss=0.261][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  28%|██▊       | 33/117 [00:39<01:37,  1.17s/it, loss=0.261][A
Epoch 1:  28%|██▊       | 33/117 [00:39<01:37,  1.17s/it, loss=0.21] [A
Epoch 1:  29%|██▉       | 34/117 [00:39<01:36,  1.17s/it, loss=0.21][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  29%|██▉       | 34/117 [00:40<01:36,  1.17s/it, loss=0.21][A
Epoch 1:  29%|██▉       | 34/117 [00:40<01:36,  1.17s/it, loss=0.256][A
Epoch 1:  30%|██▉       | 35/117 [00:40<01:35,  1.17s/it, loss=0.256][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  30%|██▉       | 35/117 [00:41<01:35,  1.17s/it, loss=0.256][A
Epoch 1:  30%|██▉       | 35/117 [00:41<01:35,  1.17s/it, loss=0.237][A
Epoch 1:  31%|███       | 36/117 [00:41<01:34,  1.17s/it, loss=0.237][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  31%|███       | 36/117 [00:42<01:34,  1.17s/it, loss=0.237][A
Epoch 1:  31%|███       | 36/117 [00:43<01:34,  1.17s/it, loss=0.235][A
Epoch 1:  32%|███▏      | 37/117 [00:43<01:33,  1.17s/it, loss=0.235][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  32%|███▏      | 37/117 [00:44<01:33,  1.17s/it, loss=0.235][A
Epoch 1:  32%|███▏      | 37/117 [00:44<01:33,  1.17s/it, loss=0.176][A
Epoch 1:  32%|███▏      | 38/117 [00:44<01:32,  1.17s/it, loss=0.176][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  32%|███▏      | 38/117 [00:45<01:32,  1.17s/it, loss=0.176][A
Epoch 1:  32%|███▏      | 38/117 [00:45<01:32,  1.17s/it, loss=0.191][A
Epoch 1:  33%|███▎      | 39/117 [00:45<01:31,  1.17s/it, loss=0.191][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  33%|███▎      | 39/117 [00:46<01:31,  1.17s/it, loss=0.191][A
Epoch 1:  33%|███▎      | 39/117 [00:46<01:31,  1.17s/it, loss=0.239][A
Epoch 1:  34%|███▍      | 40/117 [00:46<01:30,  1.17s/it, loss=0.239][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  34%|███▍      | 40/117 [00:47<01:30,  1.17s/it, loss=0.239][A
Epoch 1:  34%|███▍      | 40/117 [00:47<01:30,  1.17s/it, loss=0.229][A
Epoch 1:  35%|███▌      | 41/117 [00:47<01:28,  1.17s/it, loss=0.229][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  35%|███▌      | 41/117 [00:48<01:28,  1.17s/it, loss=0.229][A
Epoch 1:  35%|███▌      | 41/117 [00:48<01:28,  1.17s/it, loss=0.25] [A
Epoch 1:  36%|███▌      | 42/117 [00:48<01:27,  1.17s/it, loss=0.25][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  36%|███▌      | 42/117 [00:50<01:27,  1.17s/it, loss=0.25][A
Epoch 1:  36%|███▌      | 42/117 [00:50<01:27,  1.17s/it, loss=0.214][A
Epoch 1:  37%|███▋      | 43/117 [00:50<01:26,  1.17s/it, loss=0.214][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  37%|███▋      | 43/117 [00:51<01:26,  1.17s/it, loss=0.214][A
Epoch 1:  37%|███▋      | 43/117 [00:51<01:26,  1.17s/it, loss=0.181][A
Epoch 1:  38%|███▊      | 44/117 [00:51<01:25,  1.17s/it, loss=0.181][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  38%|███▊      | 44/117 [00:52<01:25,  1.17s/it, loss=0.181][A
Epoch 1:  38%|███▊      | 44/117 [00:52<01:25,  1.17s/it, loss=0.242][A
Epoch 1:  38%|███▊      | 45/117 [00:52<01:24,  1.17s/it, loss=0.242][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  38%|███▊      | 45/117 [00:53<01:24,  1.17s/it, loss=0.242][A
Epoch 1:  38%|███▊      | 45/117 [00:53<01:24,  1.17s/it, loss=0.21] [A
Epoch 1:  39%|███▉      | 46/117 [00:53<01:23,  1.17s/it, loss=0.21][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  39%|███▉      | 46/117 [00:54<01:23,  1.17s/it, loss=0.21][A
Epoch 1:  39%|███▉      | 46/117 [00:54<01:23,  1.17s/it, loss=0.224][A
Epoch 1:  40%|████      | 47/117 [00:54<01:22,  1.17s/it, loss=0.224][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  40%|████      | 47/117 [00:55<01:22,  1.17s/it, loss=0.224][A
Epoch 1:  40%|████      | 47/117 [00:55<01:22,  1.17s/it, loss=0.228][A
Epoch 1:  41%|████      | 48/117 [00:55<01:20,  1.17s/it, loss=0.228][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  41%|████      | 48/117 [00:57<01:20,  1.17s/it, loss=0.228][A
Epoch 1:  41%|████      | 48/117 [00:57<01:20,  1.17s/it, loss=0.213][A
Epoch 1:  42%|████▏     | 49/117 [00:57<01:19,  1.17s/it, loss=0.213][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  42%|████▏     | 49/117 [00:58<01:19,  1.17s/it, loss=0.213][A
Epoch 1:  42%|████▏     | 49/117 [00:58<01:19,  1.17s/it, loss=0.219][A
Epoch 1:  43%|████▎     | 50/117 [00:58<01:18,  1.17s/it, loss=0.219][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  43%|████▎     | 50/117 [00:59<01:18,  1.17s/it, loss=0.219][A
Epoch 1:  43%|████▎     | 50/117 [00:59<01:18,  1.17s/it, loss=0.179][A
Epoch 1:  44%|████▎     | 51/117 [00:59<01:16,  1.17s/it, loss=0.179][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  44%|████▎     | 51/117 [01:00<01:16,  1.17s/it, loss=0.179][A
Epoch 1:  44%|████▎     | 51/117 [01:00<01:16,  1.17s/it, loss=0.242][A
Epoch 1:  44%|████▍     | 52/117 [01:00<01:15,  1.17s/it, loss=0.242][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  44%|████▍     | 52/117 [01:01<01:15,  1.17s/it, loss=0.242][A
Epoch 1:  44%|████▍     | 52/117 [01:01<01:15,  1.17s/it, loss=0.202][A
Epoch 1:  45%|████▌     | 53/117 [01:01<01:14,  1.16s/it, loss=0.202][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  45%|████▌     | 53/117 [01:02<01:14,  1.16s/it, loss=0.202][A
Epoch 1:  45%|████▌     | 53/117 [01:02<01:14,  1.16s/it, loss=0.179][A
Epoch 1:  46%|████▌     | 54/117 [01:02<01:13,  1.16s/it, loss=0.179][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  46%|████▌     | 54/117 [01:04<01:13,  1.16s/it, loss=0.179][A
Epoch 1:  46%|████▌     | 54/117 [01:04<01:13,  1.16s/it, loss=0.201][A
Epoch 1:  47%|████▋     | 55/117 [01:04<01:12,  1.17s/it, loss=0.201][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  47%|████▋     | 55/117 [01:05<01:12,  1.17s/it, loss=0.201][A
Epoch 1:  47%|████▋     | 55/117 [01:05<01:12,  1.17s/it, loss=0.164][A
Epoch 1:  48%|████▊     | 56/117 [01:05<01:11,  1.17s/it, loss=0.164][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  48%|████▊     | 56/117 [01:06<01:11,  1.17s/it, loss=0.164][A
Epoch 1:  48%|████▊     | 56/117 [01:06<01:11,  1.17s/it, loss=0.175][A
Epoch 1:  49%|████▊     | 57/117 [01:06<01:09,  1.17s/it, loss=0.175][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  49%|████▊     | 57/117 [01:07<01:09,  1.17s/it, loss=0.175][A
Epoch 1:  49%|████▊     | 57/117 [01:07<01:09,  1.17s/it, loss=0.168][A
Epoch 1:  50%|████▉     | 58/117 [01:07<01:08,  1.16s/it, loss=0.168][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  50%|████▉     | 58/117 [01:08<01:08,  1.16s/it, loss=0.168][A
Epoch 1:  50%|████▉     | 58/117 [01:08<01:08,  1.16s/it, loss=0.227][A
Epoch 1:  50%|█████     | 59/117 [01:08<01:07,  1.17s/it, loss=0.227][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  50%|█████     | 59/117 [01:09<01:07,  1.17s/it, loss=0.227][A
Epoch 1:  50%|█████     | 59/117 [01:09<01:07,  1.17s/it, loss=0.209][A
Epoch 1:  51%|█████▏    | 60/117 [01:09<01:06,  1.17s/it, loss=0.209][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  51%|█████▏    | 60/117 [01:11<01:06,  1.17s/it, loss=0.209][A
Epoch 1:  51%|█████▏    | 60/117 [01:11<01:06,  1.17s/it, loss=0.22] [A
Epoch 1:  52%|█████▏    | 61/117 [01:11<01:05,  1.17s/it, loss=0.22][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  52%|█████▏    | 61/117 [01:12<01:05,  1.17s/it, loss=0.22][A
Epoch 1:  52%|█████▏    | 61/117 [01:12<01:05,  1.17s/it, loss=0.26][A
Epoch 1:  53%|█████▎    | 62/117 [01:12<01:04,  1.17s/it, loss=0.26][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  53%|█████▎    | 62/117 [01:13<01:04,  1.17s/it, loss=0.26][A
Epoch 1:  53%|█████▎    | 62/117 [01:13<01:04,  1.17s/it, loss=0.219][A
Epoch 1:  54%|█████▍    | 63/117 [01:13<01:03,  1.17s/it, loss=0.219][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  54%|█████▍    | 63/117 [01:14<01:03,  1.17s/it, loss=0.219][A
Epoch 1:  54%|█████▍    | 63/117 [01:14<01:03,  1.17s/it, loss=0.185][A
Epoch 1:  55%|█████▍    | 64/117 [01:14<01:01,  1.16s/it, loss=0.185][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  55%|█████▍    | 64/117 [01:15<01:01,  1.16s/it, loss=0.185][A
Epoch 1:  55%|█████▍    | 64/117 [01:15<01:01,  1.16s/it, loss=0.157][A
Epoch 1:  56%|█████▌    | 65/117 [01:15<01:00,  1.16s/it, loss=0.157][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  56%|█████▌    | 65/117 [01:16<01:00,  1.16s/it, loss=0.157][A
Epoch 1:  56%|█████▌    | 65/117 [01:16<01:00,  1.16s/it, loss=0.175][A
Epoch 1:  56%|█████▋    | 66/117 [01:16<00:59,  1.16s/it, loss=0.175][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  56%|█████▋    | 66/117 [01:17<00:59,  1.16s/it, loss=0.175][A
Epoch 1:  56%|█████▋    | 66/117 [01:18<00:59,  1.16s/it, loss=0.221][A
Epoch 1:  57%|█████▋    | 67/117 [01:18<00:58,  1.16s/it, loss=0.221][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  57%|█████▋    | 67/117 [01:19<00:58,  1.16s/it, loss=0.221][A
Epoch 1:  57%|█████▋    | 67/117 [01:19<00:58,  1.16s/it, loss=0.181][A
Epoch 1:  58%|█████▊    | 68/117 [01:19<00:56,  1.16s/it, loss=0.181][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  58%|█████▊    | 68/117 [01:20<00:56,  1.16s/it, loss=0.181][A
Epoch 1:  58%|█████▊    | 68/117 [01:20<00:56,  1.16s/it, loss=0.182][A
Epoch 1:  59%|█████▉    | 69/117 [01:20<00:55,  1.16s/it, loss=0.182][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  59%|█████▉    | 69/117 [01:21<00:55,  1.16s/it, loss=0.182][A
Epoch 1:  59%|█████▉    | 69/117 [01:21<00:55,  1.16s/it, loss=0.183][A
Epoch 1:  60%|█████▉    | 70/117 [01:21<00:54,  1.16s/it, loss=0.183][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  60%|█████▉    | 70/117 [01:22<00:54,  1.16s/it, loss=0.183][A
Epoch 1:  60%|█████▉    | 70/117 [01:22<00:54,  1.16s/it, loss=0.168][A
Epoch 1:  61%|██████    | 71/117 [01:22<00:53,  1.16s/it, loss=0.168][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  61%|██████    | 71/117 [01:23<00:53,  1.16s/it, loss=0.168][A
Epoch 1:  61%|██████    | 71/117 [01:23<00:53,  1.16s/it, loss=0.197][A
Epoch 1:  62%|██████▏   | 72/117 [01:23<00:52,  1.16s/it, loss=0.197][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  62%|██████▏   | 72/117 [01:24<00:52,  1.16s/it, loss=0.197][A
Epoch 1:  62%|██████▏   | 72/117 [01:24<00:52,  1.16s/it, loss=0.217][A
Epoch 1:  62%|██████▏   | 73/117 [01:24<00:51,  1.16s/it, loss=0.217][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  62%|██████▏   | 73/117 [01:26<00:51,  1.16s/it, loss=0.217][A
Epoch 1:  62%|██████▏   | 73/117 [01:26<00:51,  1.16s/it, loss=0.182][A
Epoch 1:  63%|██████▎   | 74/117 [01:26<00:49,  1.16s/it, loss=0.182][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  63%|██████▎   | 74/117 [01:27<00:49,  1.16s/it, loss=0.182][A
Epoch 1:  63%|██████▎   | 74/117 [01:27<00:49,  1.16s/it, loss=0.193][A
Epoch 1:  64%|██████▍   | 75/117 [01:27<00:48,  1.16s/it, loss=0.193][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  64%|██████▍   | 75/117 [01:28<00:48,  1.16s/it, loss=0.193][A
Epoch 1:  64%|██████▍   | 75/117 [01:28<00:48,  1.16s/it, loss=0.2]  [A
Epoch 1:  65%|██████▍   | 76/117 [01:28<00:47,  1.16s/it, loss=0.2][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  65%|██████▍   | 76/117 [01:29<00:47,  1.16s/it, loss=0.2][A
Epoch 1:  65%|██████▍   | 76/117 [01:29<00:47,  1.16s/it, loss=0.184][A
Epoch 1:  66%|██████▌   | 77/117 [01:29<00:46,  1.16s/it, loss=0.184][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  66%|██████▌   | 77/117 [01:30<00:46,  1.16s/it, loss=0.184][A
Epoch 1:  66%|██████▌   | 77/117 [01:30<00:46,  1.16s/it, loss=0.164][A
Epoch 1:  67%|██████▋   | 78/117 [01:30<00:45,  1.16s/it, loss=0.164][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  67%|██████▋   | 78/117 [01:31<00:45,  1.16s/it, loss=0.164][A
Epoch 1:  67%|██████▋   | 78/117 [01:31<00:45,  1.16s/it, loss=0.218][A
Epoch 1:  68%|██████▊   | 79/117 [01:31<00:44,  1.16s/it, loss=0.218][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  68%|██████▊   | 79/117 [01:33<00:44,  1.16s/it, loss=0.218][A
Epoch 1:  68%|██████▊   | 79/117 [01:33<00:44,  1.16s/it, loss=0.18] [A
Epoch 1:  68%|██████▊   | 80/117 [01:33<00:43,  1.16s/it, loss=0.18][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  68%|██████▊   | 80/117 [01:34<00:43,  1.16s/it, loss=0.18][A
Epoch 1:  68%|██████▊   | 80/117 [01:34<00:43,  1.16s/it, loss=0.194][A
Epoch 1:  69%|██████▉   | 81/117 [01:34<00:41,  1.16s/it, loss=0.194][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  69%|██████▉   | 81/117 [01:35<00:41,  1.16s/it, loss=0.194][A
Epoch 1:  69%|██████▉   | 81/117 [01:35<00:41,  1.16s/it, loss=0.226][A
Epoch 1:  70%|███████   | 82/117 [01:35<00:40,  1.16s/it, loss=0.226][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  70%|███████   | 82/117 [01:36<00:40,  1.16s/it, loss=0.226][A
Epoch 1:  70%|███████   | 82/117 [01:36<00:40,  1.16s/it, loss=0.168][A
Epoch 1:  71%|███████   | 83/117 [01:36<00:39,  1.16s/it, loss=0.168][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  71%|███████   | 83/117 [01:37<00:39,  1.16s/it, loss=0.168][A
Epoch 1:  71%|███████   | 83/117 [01:37<00:39,  1.16s/it, loss=0.195][A
Epoch 1:  72%|███████▏  | 84/117 [01:37<00:38,  1.16s/it, loss=0.195][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  72%|███████▏  | 84/117 [01:38<00:38,  1.16s/it, loss=0.195][A
Epoch 1:  72%|███████▏  | 84/117 [01:38<00:38,  1.16s/it, loss=0.213][A
Epoch 1:  73%|███████▎  | 85/117 [01:38<00:37,  1.16s/it, loss=0.213][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  73%|███████▎  | 85/117 [01:40<00:37,  1.16s/it, loss=0.213][A
Epoch 1:  73%|███████▎  | 85/117 [01:40<00:37,  1.16s/it, loss=0.151][A
Epoch 1:  74%|███████▎  | 86/117 [01:40<00:36,  1.16s/it, loss=0.151][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  74%|███████▎  | 86/117 [01:41<00:36,  1.16s/it, loss=0.151][A
Epoch 1:  74%|███████▎  | 86/117 [01:41<00:36,  1.16s/it, loss=0.226][A
Epoch 1:  74%|███████▍  | 87/117 [01:41<00:34,  1.16s/it, loss=0.226][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  74%|███████▍  | 87/117 [01:42<00:34,  1.16s/it, loss=0.226][A
Epoch 1:  74%|███████▍  | 87/117 [01:42<00:34,  1.16s/it, loss=0.269][A
Epoch 1:  75%|███████▌  | 88/117 [01:42<00:33,  1.16s/it, loss=0.269][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  75%|███████▌  | 88/117 [01:43<00:33,  1.16s/it, loss=0.269][A
Epoch 1:  75%|███████▌  | 88/117 [01:43<00:33,  1.16s/it, loss=0.217][A
Epoch 1:  76%|███████▌  | 89/117 [01:43<00:32,  1.16s/it, loss=0.217][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  76%|███████▌  | 89/117 [01:44<00:32,  1.16s/it, loss=0.217][A
Epoch 1:  76%|███████▌  | 89/117 [01:44<00:32,  1.16s/it, loss=0.163][A
Epoch 1:  77%|███████▋  | 90/117 [01:44<00:31,  1.16s/it, loss=0.163][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  77%|███████▋  | 90/117 [01:45<00:31,  1.16s/it, loss=0.163][A
Epoch 1:  77%|███████▋  | 90/117 [01:45<00:31,  1.16s/it, loss=0.157][A
Epoch 1:  78%|███████▊  | 91/117 [01:45<00:30,  1.16s/it, loss=0.157][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  78%|███████▊  | 91/117 [01:47<00:30,  1.16s/it, loss=0.157][A
Epoch 1:  78%|███████▊  | 91/117 [01:47<00:30,  1.16s/it, loss=0.238][A
Epoch 1:  79%|███████▊  | 92/117 [01:47<00:29,  1.16s/it, loss=0.238][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  79%|███████▊  | 92/117 [01:48<00:29,  1.16s/it, loss=0.238][A
Epoch 1:  79%|███████▊  | 92/117 [01:48<00:29,  1.16s/it, loss=0.164][A
Epoch 1:  79%|███████▉  | 93/117 [01:48<00:27,  1.16s/it, loss=0.164][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  79%|███████▉  | 93/117 [01:49<00:27,  1.16s/it, loss=0.164][A
Epoch 1:  79%|███████▉  | 93/117 [01:49<00:27,  1.16s/it, loss=0.174][A
Epoch 1:  80%|████████  | 94/117 [01:49<00:26,  1.16s/it, loss=0.174][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  80%|████████  | 94/117 [01:50<00:26,  1.16s/it, loss=0.174][A
Epoch 1:  80%|████████  | 94/117 [01:50<00:26,  1.16s/it, loss=0.195][A
Epoch 1:  81%|████████  | 95/117 [01:50<00:25,  1.16s/it, loss=0.195][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  81%|████████  | 95/117 [01:51<00:25,  1.16s/it, loss=0.195][A
Epoch 1:  81%|████████  | 95/117 [01:51<00:25,  1.16s/it, loss=0.177][A
Epoch 1:  82%|████████▏ | 96/117 [01:51<00:24,  1.16s/it, loss=0.177][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  82%|████████▏ | 96/117 [01:52<00:24,  1.16s/it, loss=0.177][A
Epoch 1:  82%|████████▏ | 96/117 [01:52<00:24,  1.16s/it, loss=0.172][A
Epoch 1:  83%|████████▎ | 97/117 [01:52<00:23,  1.16s/it, loss=0.172][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  83%|████████▎ | 97/117 [01:54<00:23,  1.16s/it, loss=0.172][A
Epoch 1:  83%|████████▎ | 97/117 [01:54<00:23,  1.16s/it, loss=0.21] [A
Epoch 1:  84%|████████▍ | 98/117 [01:54<00:22,  1.16s/it, loss=0.21][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  84%|████████▍ | 98/117 [01:55<00:22,  1.16s/it, loss=0.21][A
Epoch 1:  84%|████████▍ | 98/117 [01:55<00:22,  1.16s/it, loss=0.191][A
Epoch 1:  85%|████████▍ | 99/117 [01:55<00:20,  1.16s/it, loss=0.191][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  85%|████████▍ | 99/117 [01:56<00:20,  1.16s/it, loss=0.191][A
Epoch 1:  85%|████████▍ | 99/117 [01:56<00:20,  1.16s/it, loss=0.171][A
Epoch 1:  85%|████████▌ | 100/117 [01:56<00:19,  1.16s/it, loss=0.171][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  85%|████████▌ | 100/117 [01:57<00:19,  1.16s/it, loss=0.171][A
Epoch 1:  85%|████████▌ | 100/117 [01:57<00:19,  1.16s/it, loss=0.175][A
Epoch 1:  86%|████████▋ | 101/117 [01:57<00:18,  1.16s/it, loss=0.175][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  86%|████████▋ | 101/117 [01:58<00:18,  1.16s/it, loss=0.175][A
Epoch 1:  86%|████████▋ | 101/117 [01:58<00:18,  1.16s/it, loss=0.158][A
Epoch 1:  87%|████████▋ | 102/117 [01:58<00:17,  1.16s/it, loss=0.158][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  87%|████████▋ | 102/117 [01:59<00:17,  1.16s/it, loss=0.158][A
Epoch 1:  87%|████████▋ | 102/117 [01:59<00:17,  1.16s/it, loss=0.151][A
Epoch 1:  88%|████████▊ | 103/117 [01:59<00:16,  1.16s/it, loss=0.151][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  88%|████████▊ | 103/117 [02:00<00:16,  1.16s/it, loss=0.151][A
Epoch 1:  88%|████████▊ | 103/117 [02:00<00:16,  1.16s/it, loss=0.206][A
Epoch 1:  89%|████████▉ | 104/117 [02:00<00:15,  1.16s/it, loss=0.206][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  89%|████████▉ | 104/117 [02:02<00:15,  1.16s/it, loss=0.206][A
Epoch 1:  89%|████████▉ | 104/117 [02:02<00:15,  1.16s/it, loss=0.189][A
Epoch 1:  90%|████████▉ | 105/117 [02:02<00:13,  1.16s/it, loss=0.189][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  90%|████████▉ | 105/117 [02:03<00:13,  1.16s/it, loss=0.189][A
Epoch 1:  90%|████████▉ | 105/117 [02:03<00:13,  1.16s/it, loss=0.2]  [A
Epoch 1:  91%|█████████ | 106/117 [02:03<00:12,  1.16s/it, loss=0.2][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  91%|█████████ | 106/117 [02:04<00:12,  1.16s/it, loss=0.2][A
Epoch 1:  91%|█████████ | 106/117 [02:04<00:12,  1.16s/it, loss=0.203][A
Epoch 1:  91%|█████████▏| 107/117 [02:04<00:11,  1.17s/it, loss=0.203][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  91%|█████████▏| 107/117 [02:05<00:11,  1.17s/it, loss=0.203][A
Epoch 1:  91%|█████████▏| 107/117 [02:05<00:11,  1.17s/it, loss=0.195][A
Epoch 1:  92%|█████████▏| 108/117 [02:05<00:10,  1.17s/it, loss=0.195][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  92%|█████████▏| 108/117 [02:06<00:10,  1.17s/it, loss=0.195][A
Epoch 1:  92%|█████████▏| 108/117 [02:06<00:10,  1.17s/it, loss=0.262][A
Epoch 1:  93%|█████████▎| 109/117 [02:06<00:09,  1.17s/it, loss=0.262][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  93%|█████████▎| 109/117 [02:07<00:09,  1.17s/it, loss=0.262][A
Epoch 1:  93%|█████████▎| 109/117 [02:07<00:09,  1.17s/it, loss=0.192][A
Epoch 1:  94%|█████████▍| 110/117 [02:08<00:08,  1.17s/it, loss=0.192][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  94%|█████████▍| 110/117 [02:09<00:08,  1.17s/it, loss=0.192][A
Epoch 1:  94%|█████████▍| 110/117 [02:09<00:08,  1.17s/it, loss=0.144][A
Epoch 1:  95%|█████████▍| 111/117 [02:09<00:06,  1.17s/it, loss=0.144][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  95%|█████████▍| 111/117 [02:10<00:06,  1.17s/it, loss=0.144][A
Epoch 1:  95%|█████████▍| 111/117 [02:10<00:06,  1.17s/it, loss=0.181][A
Epoch 1:  96%|█████████▌| 112/117 [02:10<00:05,  1.17s/it, loss=0.181][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  96%|█████████▌| 112/117 [02:11<00:05,  1.17s/it, loss=0.181][A
Epoch 1:  96%|█████████▌| 112/117 [02:11<00:05,  1.17s/it, loss=0.174][A
Epoch 1:  97%|█████████▋| 113/117 [02:11<00:04,  1.17s/it, loss=0.174][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  97%|█████████▋| 113/117 [02:12<00:04,  1.17s/it, loss=0.174][A
Epoch 1:  97%|█████████▋| 113/117 [02:12<00:04,  1.17s/it, loss=0.161][A
Epoch 1:  97%|█████████▋| 114/117 [02:12<00:03,  1.16s/it, loss=0.161][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  97%|█████████▋| 114/117 [02:13<00:03,  1.16s/it, loss=0.161][A
Epoch 1:  97%|█████████▋| 114/117 [02:13<00:03,  1.16s/it, loss=0.163][A
Epoch 1:  98%|█████████▊| 115/117 [02:13<00:02,  1.16s/it, loss=0.163][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  98%|█████████▊| 115/117 [02:14<00:02,  1.16s/it, loss=0.163][A
Epoch 1:  98%|█████████▊| 115/117 [02:14<00:02,  1.16s/it, loss=0.157][A
Epoch 1:  99%|█████████▉| 116/117 [02:14<00:01,  1.16s/it, loss=0.157][A

dict_keys(['input_ids', 'mask', 'labels'])



Epoch 1:  99%|█████████▉| 116/117 [02:15<00:01,  1.16s/it, loss=0.157][A
Epoch 1:  99%|█████████▉| 116/117 [02:15<00:01,  1.16s/it, loss=0.177][A
Epoch 1: 100%|██████████| 117/117 [02:15<00:00,  1.08s/it, loss=0.177][A
                                                                      [A

AttributeError: ignored

In [26]:
print(model)

None


I am up to here

In [12]:
tokenized_text, encodings = build_tensors(paths, tokenizer)

[tensor([[   0, 1202,   37,  ...,    1,    1,    1],
        [   0,    4,  309,  ...,    1,    1,    1],
        [   0,  892,  568,  ...,    1,    1,    1],
        ...,
        [   0, 1543,  261,  ...,    1,    1,    1],
        [   0, 3796,   93,  ...,    1,    1,    1],
        [   0,  564,    4,  ...,    1,    1,    1]])]


Within encodings we should have our three tensors:
1. input_ids - token IDs with a % of tokens masked with the mask token ID which in our case is 4
2. mask - this is a binary tensor of 1 and 0 indicating where the masks are
3. labels - these are just the unmasked token IDs

In [13]:
encodings.keys()

dict_keys(['input_ids', 'mask', 'labels'])

In [14]:
# create the data loader with a batch size of 16
# this function uutilises the torch data loader
loader = create_data_loader(encodings)

In [17]:
# build RoBERTa config
config = RobertaConfig(
    vocab_size=30_522,  # we align this to the tokenizer vocab_size
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)


Now actually train the model

In [18]:
train_model(config, loader)

model.save_pretrained(f'{output_dir}/thesis_bert')

Epoch 0




  0%|          | 0/117 [00:00<?, ?it/s]

dict_keys(['input_ids'])


KeyError: ignored

In [None]:

import argparse
from pathlib import Path

#sys.path.insert(0,'./.env/lib/python3.8/site-packages')
from tokenizers import ByteLevelBPETokenizer
#from transformers import RobertaTokenizer
import os

from transformers import RobertaTokenizer, RobertaConfig, RobertaForMaskedLM, AdamW
import torch
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# constants
PDF_PATH='/Users/christianbromley/Documents/Personal/PhD/Christian_Bromley_Final_Thesis_20211221_cover.pdf'
OUTPUT_PATH='/Users/christianbromley/Documents/Personal/PhD/Christian_Bromley_Final_Thesis_20211221.csv'
TEXT_PARSE_DIR='/Users/christianbromley/Documents/Personal/PhD/thesis_parsed'
MODELS_DIR='../models'

In [None]:
def extract_file_paths(input_dir):
    paths = [str(x) for x in Path(input_dir).glob('**/*.txt')]
    return paths

def run(paths, output_dir):
    # intialise tokeniser
    tokenizer = ByteLevelBPETokenizer()
    # train
    tokenizer.train(files=paths,
                    vocab_size=30_522,
                    min_frequency=2,
                    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])

    tokenizer.save_model(output_dir)

In [None]:
paths = extract_file_paths(input_dir=TEXT_PARSE_DIR)

paths

['/Users/christianbromley/Documents/Personal/PhD/thesis_parsed/text_0.txt']

In [None]:
run(paths, MODELS_DIR)






The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


In [None]:
def masked_language_model(tensor):
    rand = torch.rand(tensor.shape)
    mask_arr = (rand < 0.15) * (tensor > 2)
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        tensor[i,selection] = 4
    return tensor

In [19]:
## input ids - token IDs with a % of tokens masked with the mask token ID which in our case is 4
input_ids = []
## mask - this is a binary tensor of 1 and 0 indicating where the masks are
mask = []
## labels - these are just the unmasked token IDs
labels = []
# initialise the output
tokenized_text = {}

In [20]:
# read file
with open(paths[0], 'r', encoding='utf-8') as fp:
    lines = fp.read().split('\n')

In [None]:
lines

In [21]:
# get the file name
fname = paths[0].split('/')[-1]
fname

'text_0.txt'

In [22]:
# tokenize the text in these lines
sample = tokenizer(lines, max_length=512, padding='max_length', truncation=True, return_tensors='pt')

In [23]:
sample.keys()

dict_keys(['input_ids', 'attention_mask'])

In [24]:
tokenized_text[fname] = sample

In [25]:
# the sample object contains some of our tensors - extract these
## get the input IDs and append to labels
labels.append(sample.input_ids)
## get the attention mask - the binary
mask.append(sample.attention_mask)


In [26]:
sample.input_ids.detach().clone()

tensor([[   0, 1202,   37,  ...,    1,    1,    1],
        [   0,   56,  309,  ...,    1,    1,    1],
        [   0,  892,  568,  ...,    1,    1,    1],
        ...,
        [   0, 1543,  261,  ...,    1,    1,    1],
        [   0, 3796,   93,  ...,    1,    1,    1],
        [   0,  564, 2787,  ...,    1,    1,    1]])

In [27]:
## now apply the masked language model function on the input IDs to mask 15% of tokens
mlm_on_input = masked_language_model(sample.input_ids.detach().clone())
input_ids.append(mlm_on_input)

In [28]:
# construct the output
encodings = {
    'input_ids': input_ids[0],
    'mask': mask[0],
    'labels': labels[0]
}

In [29]:
len(encodings['labels'])

1868

In [31]:
import torch

In [32]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    def __getitem__(self, i):
        return {
            'input_ids': self.encodings['input_ids'][i]
        }

In [None]:
# def create_data_loader(encodings):
#     dataset = Dataset(encodings)
#     data_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)
#     return data_loader

In [33]:
loader = create_data_loader(encodings)

In [34]:
loader.dataset.encodings

{'input_ids': tensor([[   0,    4,   37,  ...,    1,    1,    1],
         [   0,   56,  309,  ...,    1,    1,    1],
         [   0,  892,  568,  ...,    1,    1,    1],
         ...,
         [   0, 1543,    4,  ...,    1,    1,    1],
         [   0, 3796,   93,  ...,    1,    1,    1],
         [   0,  564, 2787,  ...,    1,    1,    1]]),
 'mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[   0, 1202,   37,  ...,    1,    1,    1],
         [   0,   56,  309,  ...,    1,    1,    1],
         [   0,  892,  568,  ...,    1,    1,    1],
         ...,
         [   0, 1543,  261,  ...,    1,    1,    1],
         [   0, 3796,   93,  ...,    1,    1,    1],
         [   0,  564, 2787,  ...,    1,    1,    1]])}

In [35]:
# build RoBERTa config
config = RobertaConfig(
    vocab_size=30_522,  # we align this to the tokenizer vocab_size
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)


# init model
model = RobertaForMaskedLM(config)
# set device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)
# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-4)



In [36]:
# set number of epochs
epochs = 2

In [None]:
for epoch in range(epochs):
    input_ids = loader.dataset.encodings['input_ids'].to(device)
    attention_mask = loader.dataset.encodings['mask'].to(device)
    labels = loader.dataset.encodings['labels'].to(device)
    print('go')
    # model
    outputs = model(input_ids,
                    attention_mask=attention_mask,
                    labels=labels)
    # extract the loss
    loss = outputs.loss
    # calculate loss for every parameter that needs grad update
    loss.backward()
    # update parameters
    optim.step()
    # print relevant info to progress bar
    loop.set_description(f'Epoch {epoch}')
    loop.set_postfix(loss=loss.item())


go


In [None]:
#loop = tqdm(loader, leave=True)
# loop through epochs
for epoch in range(epochs):
    print(f'Epoch {str(epoch)}')
    for batch, feed_dict in tqdm.tqdm(enumerate(loader)):
        print(i_batch)
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        #print(batch.keys())
        input_ids = batch['input_ids'][0].to(device)
        attention_mask = batch['mask'][0].to(device)
        labels = batch['labels'][0].to(device)
        # model
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        # extract the loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

model.save_pretrained(f'{output_dir}/thesis_bert')

Epoch 0


0it [00:00, ?it/s]

116





AttributeError: 'int' object has no attribute 'keys'

In [None]:
# loop through epochs
for epoch in range(epochs):
    print(f'Epoch {str(epoch)}')
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    print(loop)
    for batch in loop:
        print(batch.keys())
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        print(batch.keys())
        input_ids = batch['input_ids'][0].to(device)
        attention_mask = batch['mask'][0].to(device)
        labels = batch['labels'][0].to(device)
        # model
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        # extract the loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

model.save_pretrained(f'{output_dir}/thesis_bert')

Epoch 0


TypeError: 'module' object is not callable

In [None]:
def train_model(config, loader):
    # init model
    model = RobertaForMaskedLM(config)
    # set device
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # and move our model over to the selected device
    model.to(device)
    # activate training mode
    model.train()
    # initialize optimizer
    optim = AdamW(model.parameters(), lr=1e-4)
    # set number of epochs
    epochs = 2
    # loop through epochs
    for epoch in range(epochs):
        print(f'Epoch {str(epoch)}')
        # setup loop with TQDM and dataloader
        loop = tqdm(loader, leave=True)
        for batch in loop:
            print(batch.keys())
            # initialize calculated gradients (from prev step)
            optim.zero_grad()
            # pull all tensor batches required for training
            print(batch.keys())
            input_ids = batch['input_ids'][0].to(device)
            attention_mask = batch['mask'][0].to(device)
            labels = batch['labels'][0].to(device)
            # model
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            # extract the loss
            loss = outputs.loss
            # calculate loss for every parameter that needs grad update
            loss.backward()
            # update parameters
            optim.step()
            # print relevant info to progress bar
            loop.set_description(f'Epoch {epoch}')
            loop.set_postfix(loss=loss.item())

In [None]:

train_model(config, loader)

model.save_pretrained(f'{output_dir}/thesis_bert')

Epoch 0


  0%|                                                                                                                                                                                                           | 0/117 [00:00<?, ?it/s]

dict_keys(['input_ids'])
dict_keys(['input_ids'])





KeyError: 'mask'