Todo

Fine tune a pretrained chinese BERT model
Change hyperparameters (e.g. doc_stride)
Apply linear learning rate decay
Try other pretrained models
Improve preprocessing
Improve postprocessing

https://kozodoi.me/python/deep%20learning/pytorch/tutorial/2021/02/19/gradient-accumulation.html
https://huggingface.co/docs/accelerate/index
https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/

TODO  Error displaying widget: model not found


In [None]:
!nvidia-smi

In [4]:
# You are allowed to change version of transformers or use other toolkits
!pip install transformers==4.5.0

[0m

In [5]:
import json
import numpy as np
import random
import torch
from torch.utils.data import DataLoader, Dataset 
from transformers import AdamW, BertForQuestionAnswering, BertTokenizerFast

from tqdm.auto import tqdm

# device = "cuda" if torch.cuda.is_available() else "cpu"
i=6
device =torch.device(f'cuda:{i}')
# Fix random seed for reproducibility
def same_seeds(seed):
	  torch.manual_seed(seed)
	  if torch.cuda.is_available():
		    torch.cuda.manual_seed(seed)
		    torch.cuda.manual_seed_all(seed)
	  np.random.seed(seed)
	  random.seed(seed)
	  torch.backends.cudnn.benchmark = False
	  torch.backends.cudnn.deterministic = True
same_seeds(0)

In [6]:
!pip install --upgrade pip
!pip install accelerate==0.2.0

[0m

In [7]:

!pip install accelerate

[0m

In [8]:
from accelerate import Accelerator

In [9]:
# Change "fp16_training" to True to support automatic mixed precision training (fp16)	
fp16_training = True

if fp16_training:
    
    
    accelerator = Accelerator(fp16=True)
    device = accelerator.device

# Documentation for the toolkit:  https://huggingface.co/docs/accelerate/

In [10]:
model = BertForQuestionAnswering.from_pretrained("bert-base-chinese").to(device)
tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")

# You can safely ignore the warning message (it pops up because new prediction heads for QA are initialized randomly)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-chinese a

In [11]:
import os
def read_data(file):
    with open(file, 'r', encoding="utf-8") as reader:
        data = json.load(reader)
    return data["questions"], data["paragraphs"]
root=r"/home/jovyan/homework/dataset/hw7"
train_questions, train_paragraphs = read_data(os.path.join(root,"hw7_train.json"))
dev_questions, dev_paragraphs = read_data(os.path.join(root,"hw7_dev.json"))
test_questions, test_paragraphs = read_data(os.path.join(root,"hw7_test.json"))

In [12]:
# Tokenize questions and paragraphs separately
# 「add_special_tokens」 is set to False since special tokens will be added when tokenized questions and paragraphs are combined in datset __getitem__ 

train_questions_tokenized = tokenizer([train_question["question_text"] for train_question in train_questions], add_special_tokens=False)
dev_questions_tokenized = tokenizer([dev_question["question_text"] for dev_question in dev_questions], add_special_tokens=False)
test_questions_tokenized = tokenizer([test_question["question_text"] for test_question in test_questions], add_special_tokens=False) 

train_paragraphs_tokenized = tokenizer(train_paragraphs, add_special_tokens=False)
dev_paragraphs_tokenized = tokenizer(dev_paragraphs, add_special_tokens=False)
test_paragraphs_tokenized = tokenizer(test_paragraphs, add_special_tokens=False)

# You can safely ignore the warning message as tokenized sequences will be futher processed in datset __getitem__ before passing to model

Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors


In [13]:
class QA_Dataset(Dataset):
    def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs):
        self.split = split
        self.questions = questions
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = 40
        self.max_paragraph_len = 150
        
        ##### TODO: Change value of doc_stride #####
        self.doc_stride = 150

        # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]

        ##### TODO: Preprocessing #####
        # Hint: How to prevent model from learning something it should not learn

        if self.split == "train":
            # Convert answer's start/end positions in paragraph_text to start/end positions in tokenized_paragraph  
            answer_start_token = tokenized_paragraph.char_to_token(question["answer_start"])
            answer_end_token = tokenized_paragraph.char_to_token(question["answer_end"])

            # A single window is obtained by slicing the portion of paragraph containing the answer
            mid = (answer_start_token + answer_end_token) // 2
            paragraph_start = max(0, min(mid - self.max_paragraph_len // 2, len(tokenized_paragraph) - self.max_paragraph_len))
            paragraph_end = paragraph_start + self.max_paragraph_len
            
            # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
            input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102] 
            input_ids_paragraph = tokenized_paragraph.ids[paragraph_start : paragraph_end] + [102]		
            
            # Convert answer's start/end positions in tokenized_paragraph to start/end positions in the window  
            answer_start_token += len(input_ids_question) - paragraph_start
            answer_end_token += len(input_ids_question) - paragraph_start
            
            # Pad sequence and obtain inputs to model 
            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start_token, answer_end_token

        # Validation/Testing
        else:
            input_ids_list, token_type_ids_list, attention_mask_list = [], [], []
            
            # Paragraph is split into several windows, each with start positions separated by step "doc_stride"
            for i in range(0, len(tokenized_paragraph), self.doc_stride):
                
                # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
                input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
                input_ids_paragraph = tokenized_paragraph.ids[i : i + self.max_paragraph_len] + [102]
                
                # Pad sequence and obtain inputs to model
                input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
                
                input_ids_list.append(input_ids)
                token_type_ids_list.append(token_type_ids)
                attention_mask_list.append(attention_mask)
            
            return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list)

    def padding(self, input_ids_question, input_ids_paragraph):
        # Pad zeros if sequence length is shorter than max_seq_len
        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        # Indices of input sequence tokens in the vocabulary
        input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len
        # Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len
        # Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len
        
        return input_ids, token_type_ids, attention_mask

train_set = QA_Dataset("train", train_questions, train_questions_tokenized, train_paragraphs_tokenized)
dev_set = QA_Dataset("dev", dev_questions, dev_questions_tokenized, dev_paragraphs_tokenized)
test_set = QA_Dataset("test", test_questions, test_questions_tokenized, test_paragraphs_tokenized)

train_batch_size = 32

# Note: Do NOT change batch size of dev_loader / test_loader !
# Although batch size=1, it is actually a batch consisting of several windows from the same QA pair
train_loader = DataLoader(train_set, batch_size=train_batch_size, shuffle=True, pin_memory=True)
dev_loader = DataLoader(dev_set, batch_size=1, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_set, batch_size=1, shuffle=False, pin_memory=True)

In [14]:
def evaluate(data, output):
    ##### TODO: Postprocessing #####
    # There is a bug and room for improvement in postprocessing 
    # Hint: Open your prediction file to see what is wrong 
    
    answer = ''
    max_prob = float('-inf')
    num_of_windows = data[0].shape[1]
    
    for k in range(num_of_windows):
        # Obtain answer by choosing the most probable start position / end position
        start_prob, start_index = torch.max(output.start_logits[k], dim=0)
        end_prob, end_index = torch.max(output.end_logits[k], dim=0)
        
        # Probability of answer is calculated as sum of start_prob and end_prob
        prob = start_prob + end_prob
        
        # Replace answer if calculated probability is larger than previous windows
        if prob > max_prob:
            max_prob = prob
            # Convert tokens to chars (e.g. [1920, 7032] --> "大 金")
            answer = tokenizer.decode(data[0][0][k][start_index : end_index + 1])
    
    # Remove spaces in answer (e.g. "大 金" --> "大金")
    return answer.replace(' ','')

In [15]:
num_epoch = 100
validation = True
logging_step = 100
learning_rate = 1e-4
optimizer = AdamW(model.parameters(), lr=learning_rate)

if fp16_training:
    model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) 

model.train()

print("Start Training ...")

for epoch in range(num_epoch):
    step = 1
    train_loss = train_acc = 0
    
    for data in tqdm(train_loader):	
        # Load all data into GPU
        data = [i.to(device) for i in data]
        
        # Model inputs: input_ids, token_type_ids, attention_mask, start_positions, end_positions (Note: only "input_ids" is mandatory)
        # Model outputs: start_logits, end_logits, loss (return when start_positions/end_positions are provided)  
        output = model(input_ids=data[0], token_type_ids=data[1], attention_mask=data[2], start_positions=data[3], end_positions=data[4])

        # Choose the most probable start position / end position
        start_index = torch.argmax(output.start_logits, dim=1)
        end_index = torch.argmax(output.end_logits, dim=1)
        
        # Prediction is correct only if both start_index and end_index are correct
        train_acc += ((start_index == data[3]) & (end_index == data[4])).float().mean()
        train_loss += output.loss
        
        if fp16_training:
            accelerator.backward(output.loss)
        else:
            output.loss.backward()
        
        optimizer.step()
        optimizer.zero_grad()
        step += 1

        ##### TODO: Apply linear learning rate decay #####
        
        
        # Print training loss and accuracy over past logging step
        if step % logging_step == 0:
            print(f"Epoch {epoch + 1} | Step {step} | loss = {train_loss.item() / logging_step:.3f}, acc = {train_acc / logging_step:.3f}")
            train_loss = train_acc = 0

    if validation:
        print("Evaluating Dev Set ...")
        model.eval()
        with torch.no_grad():
            dev_acc = 0
            for i, data in enumerate(tqdm(dev_loader)):
                output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),
                       attention_mask=data[2].squeeze(dim=0).to(device))
                # prediction is correct only if answer text exactly matches
                dev_acc += evaluate(data, output) == dev_questions[i]["answer_text"]
            print(f"Validation | Epoch {epoch + 1} | acc = {dev_acc / len(dev_loader):.3f}")
        model.train()

# Save a model and its configuration file to the directory 「saved_model」 
# i.e. there are two files under the direcory 「saved_model」: 「pytorch_model.bin」 and 「config.json」
# Saved model can be re-loaded using 「model = BertForQuestionAnswering.from_pretrained("saved_model")」
print("Saving Model ...")
model_save_dir = "saved_model" 
model.save_pretrained(model_save_dir)

Start Training ...


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 1 | Step 100 | loss = 1.674, acc = 0.437
Epoch 1 | Step 200 | loss = 0.846, acc = 0.656
Epoch 1 | Step 300 | loss = 0.713, acc = 0.711
Epoch 1 | Step 400 | loss = 0.680, acc = 0.713
Epoch 1 | Step 500 | loss = 0.714, acc = 0.711
Epoch 1 | Step 600 | loss = 0.636, acc = 0.738
Epoch 1 | Step 700 | loss = 0.556, acc = 0.761
Epoch 1 | Step 800 | loss = 0.595, acc = 0.746
Epoch 1 | Step 900 | loss = 0.608, acc = 0.741
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 1 | acc = 0.421


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 2 | Step 100 | loss = 0.340, acc = 0.822
Epoch 2 | Step 200 | loss = 0.343, acc = 0.842
Epoch 2 | Step 300 | loss = 0.355, acc = 0.834
Epoch 2 | Step 400 | loss = 0.363, acc = 0.825
Epoch 2 | Step 500 | loss = 0.406, acc = 0.809
Epoch 2 | Step 600 | loss = 0.402, acc = 0.812
Epoch 2 | Step 700 | loss = 0.394, acc = 0.821
Epoch 2 | Step 800 | loss = 0.397, acc = 0.817
Epoch 2 | Step 900 | loss = 0.389, acc = 0.829
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 2 | acc = 0.485


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 3 | Step 100 | loss = 0.227, acc = 0.876
Epoch 3 | Step 200 | loss = 0.236, acc = 0.879
Epoch 3 | Step 300 | loss = 0.244, acc = 0.882
Epoch 3 | Step 400 | loss = 0.269, acc = 0.862
Epoch 3 | Step 500 | loss = 0.278, acc = 0.862
Epoch 3 | Step 600 | loss = 0.284, acc = 0.867
Epoch 3 | Step 700 | loss = 0.281, acc = 0.860
Epoch 3 | Step 800 | loss = 0.285, acc = 0.859
Epoch 3 | Step 900 | loss = 0.290, acc = 0.861
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 3 | acc = 0.451


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 4 | Step 100 | loss = 0.190, acc = 0.889
Epoch 4 | Step 200 | loss = 0.173, acc = 0.909
Epoch 4 | Step 300 | loss = 0.203, acc = 0.898
Epoch 4 | Step 400 | loss = 0.264, acc = 0.866
Epoch 4 | Step 500 | loss = 0.225, acc = 0.889
Epoch 4 | Step 600 | loss = 0.234, acc = 0.883
Epoch 4 | Step 700 | loss = 0.233, acc = 0.881
Epoch 4 | Step 800 | loss = 0.257, acc = 0.882
Epoch 4 | Step 900 | loss = 0.251, acc = 0.875
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 4 | acc = 0.426


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 5 | Step 100 | loss = 0.142, acc = 0.917
Epoch 5 | Step 200 | loss = 0.128, acc = 0.934
Epoch 5 | Step 300 | loss = 0.206, acc = 0.898
Epoch 5 | Step 400 | loss = 0.200, acc = 0.902
Epoch 5 | Step 500 | loss = 0.210, acc = 0.898
Epoch 5 | Step 600 | loss = 0.184, acc = 0.912
Epoch 5 | Step 700 | loss = 0.188, acc = 0.906
Epoch 5 | Step 800 | loss = 0.213, acc = 0.889
Epoch 5 | Step 900 | loss = 0.204, acc = 0.897
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 5 | acc = 0.502


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 6 | Step 100 | loss = 0.135, acc = 0.917
Epoch 6 | Step 200 | loss = 0.153, acc = 0.922
Epoch 6 | Step 300 | loss = 0.149, acc = 0.932
Epoch 6 | Step 400 | loss = 0.201, acc = 0.909
Epoch 6 | Step 500 | loss = 0.170, acc = 0.913
Epoch 6 | Step 600 | loss = 0.181, acc = 0.908
Epoch 6 | Step 700 | loss = 0.174, acc = 0.912
Epoch 6 | Step 800 | loss = 0.179, acc = 0.911
Epoch 6 | Step 900 | loss = 0.185, acc = 0.906
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 6 | acc = 0.472


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 7 | Step 100 | loss = 0.114, acc = 0.933
Epoch 7 | Step 200 | loss = 0.112, acc = 0.945
Epoch 7 | Step 300 | loss = 0.118, acc = 0.942
Epoch 7 | Step 400 | loss = 0.126, acc = 0.940
Epoch 7 | Step 500 | loss = 0.179, acc = 0.911
Epoch 7 | Step 600 | loss = 0.175, acc = 0.910
Epoch 7 | Step 700 | loss = 0.174, acc = 0.903
Epoch 7 | Step 800 | loss = 0.177, acc = 0.910
Epoch 7 | Step 900 | loss = 0.165, acc = 0.919
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 7 | acc = 0.467


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 8 | Step 100 | loss = 0.109, acc = 0.934
Epoch 8 | Step 200 | loss = 0.152, acc = 0.926
Epoch 8 | Step 300 | loss = 0.131, acc = 0.935
Epoch 8 | Step 400 | loss = 0.249, acc = 0.890
Epoch 8 | Step 500 | loss = 0.344, acc = 0.843
Epoch 8 | Step 600 | loss = 0.343, acc = 0.844
Epoch 8 | Step 700 | loss = 0.222, acc = 0.892
Epoch 8 | Step 800 | loss = 0.189, acc = 0.910
Epoch 8 | Step 900 | loss = 0.173, acc = 0.908
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 8 | acc = 0.468


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 9 | Step 100 | loss = 0.094, acc = 0.943
Epoch 9 | Step 200 | loss = 0.168, acc = 0.924
Epoch 9 | Step 300 | loss = 0.129, acc = 0.936
Epoch 9 | Step 400 | loss = 0.139, acc = 0.929
Epoch 9 | Step 500 | loss = 0.320, acc = 0.870
Epoch 9 | Step 600 | loss = 0.210, acc = 0.900
Epoch 9 | Step 700 | loss = 0.191, acc = 0.913
Epoch 9 | Step 800 | loss = 0.160, acc = 0.923
Epoch 9 | Step 900 | loss = 0.143, acc = 0.923
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 9 | acc = 0.435


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 10 | Step 100 | loss = 0.118, acc = 0.932
Epoch 10 | Step 200 | loss = 0.091, acc = 0.951
Epoch 10 | Step 300 | loss = 0.105, acc = 0.944
Epoch 10 | Step 400 | loss = 0.094, acc = 0.952
Epoch 10 | Step 500 | loss = 0.108, acc = 0.940
Epoch 10 | Step 600 | loss = 0.100, acc = 0.949
Epoch 10 | Step 700 | loss = 0.113, acc = 0.942
Epoch 10 | Step 800 | loss = 0.114, acc = 0.942
Epoch 10 | Step 900 | loss = 0.099, acc = 0.951
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 10 | acc = 0.492


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 11 | Step 100 | loss = 0.088, acc = 0.945
Epoch 11 | Step 200 | loss = 0.091, acc = 0.955
Epoch 11 | Step 300 | loss = 0.090, acc = 0.952
Epoch 11 | Step 400 | loss = 0.114, acc = 0.942
Epoch 11 | Step 500 | loss = 0.127, acc = 0.942
Epoch 11 | Step 600 | loss = 0.125, acc = 0.937
Epoch 11 | Step 700 | loss = 0.131, acc = 0.938
Epoch 11 | Step 800 | loss = 0.143, acc = 0.923
Epoch 11 | Step 900 | loss = 0.122, acc = 0.940
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 11 | acc = 0.439


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 12 | Step 100 | loss = 0.101, acc = 0.942
Epoch 12 | Step 200 | loss = 0.090, acc = 0.957
Epoch 12 | Step 300 | loss = 0.145, acc = 0.932
Epoch 12 | Step 400 | loss = 0.181, acc = 0.914
Epoch 12 | Step 500 | loss = 0.270, acc = 0.892
Epoch 12 | Step 600 | loss = 0.165, acc = 0.921
Epoch 12 | Step 700 | loss = 0.151, acc = 0.925
Epoch 12 | Step 800 | loss = 0.131, acc = 0.937
Epoch 12 | Step 900 | loss = 0.146, acc = 0.934
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 12 | acc = 0.478


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 13 | Step 100 | loss = 0.112, acc = 0.938
Epoch 13 | Step 200 | loss = 0.084, acc = 0.958
Epoch 13 | Step 300 | loss = 0.121, acc = 0.943
Epoch 13 | Step 400 | loss = 0.102, acc = 0.951
Epoch 13 | Step 500 | loss = 0.103, acc = 0.948
Epoch 13 | Step 600 | loss = 0.106, acc = 0.949
Epoch 13 | Step 700 | loss = 0.113, acc = 0.948
Epoch 13 | Step 800 | loss = 0.143, acc = 0.933
Epoch 13 | Step 900 | loss = 0.124, acc = 0.942
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 13 | acc = 0.419


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 14 | Step 100 | loss = 0.099, acc = 0.945
Epoch 14 | Step 200 | loss = 0.088, acc = 0.949
Epoch 14 | Step 300 | loss = 0.097, acc = 0.953
Epoch 14 | Step 400 | loss = 0.094, acc = 0.956
Epoch 14 | Step 500 | loss = 0.109, acc = 0.949
Epoch 14 | Step 600 | loss = 0.108, acc = 0.945
Epoch 14 | Step 700 | loss = 0.114, acc = 0.941
Epoch 14 | Step 800 | loss = 0.103, acc = 0.953
Epoch 14 | Step 900 | loss = 0.113, acc = 0.941
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 14 | acc = 0.460


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 15 | Step 100 | loss = 0.059, acc = 0.959
Epoch 15 | Step 200 | loss = 0.068, acc = 0.968
Epoch 15 | Step 300 | loss = 0.078, acc = 0.955
Epoch 15 | Step 400 | loss = 0.085, acc = 0.959
Epoch 15 | Step 500 | loss = 0.096, acc = 0.951
Epoch 15 | Step 600 | loss = 0.106, acc = 0.946
Epoch 15 | Step 700 | loss = 0.166, acc = 0.926
Epoch 15 | Step 800 | loss = 0.131, acc = 0.939
Epoch 15 | Step 900 | loss = 0.125, acc = 0.938
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 15 | acc = 0.432


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 16 | Step 100 | loss = 0.085, acc = 0.947
Epoch 16 | Step 200 | loss = 0.089, acc = 0.954
Epoch 16 | Step 300 | loss = 0.088, acc = 0.957
Epoch 16 | Step 400 | loss = 0.088, acc = 0.951
Epoch 16 | Step 500 | loss = 0.105, acc = 0.949
Epoch 16 | Step 600 | loss = 0.116, acc = 0.945
Epoch 16 | Step 700 | loss = 0.089, acc = 0.954
Epoch 16 | Step 800 | loss = 0.096, acc = 0.951
Epoch 16 | Step 900 | loss = 0.126, acc = 0.939
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 16 | acc = 0.425


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 17 | Step 100 | loss = 0.054, acc = 0.960
Epoch 17 | Step 200 | loss = 0.068, acc = 0.966
Epoch 17 | Step 300 | loss = 0.072, acc = 0.958
Epoch 17 | Step 400 | loss = 0.075, acc = 0.961
Epoch 17 | Step 500 | loss = 0.089, acc = 0.955
Epoch 17 | Step 600 | loss = 0.094, acc = 0.951
Epoch 17 | Step 700 | loss = 0.087, acc = 0.953
Epoch 17 | Step 800 | loss = 0.107, acc = 0.949
Epoch 17 | Step 900 | loss = 0.101, acc = 0.953
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 17 | acc = 0.443


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 18 | Step 100 | loss = 0.085, acc = 0.952
Epoch 18 | Step 200 | loss = 0.093, acc = 0.954
Epoch 18 | Step 300 | loss = 0.091, acc = 0.957
Epoch 18 | Step 400 | loss = 0.085, acc = 0.957
Epoch 18 | Step 500 | loss = 0.109, acc = 0.950
Epoch 18 | Step 600 | loss = 0.096, acc = 0.956
Epoch 18 | Step 700 | loss = 0.099, acc = 0.952
Epoch 18 | Step 800 | loss = 0.104, acc = 0.948
Epoch 18 | Step 900 | loss = 0.093, acc = 0.957
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 18 | acc = 0.429


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 19 | Step 100 | loss = 0.062, acc = 0.957
Epoch 19 | Step 200 | loss = 0.085, acc = 0.957
Epoch 19 | Step 300 | loss = 0.104, acc = 0.952
Epoch 19 | Step 400 | loss = 0.074, acc = 0.963
Epoch 19 | Step 500 | loss = 0.092, acc = 0.957
Epoch 19 | Step 600 | loss = 0.091, acc = 0.953
Epoch 19 | Step 700 | loss = 0.100, acc = 0.947
Epoch 19 | Step 800 | loss = 0.098, acc = 0.956
Epoch 19 | Step 900 | loss = 0.110, acc = 0.944
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 19 | acc = 0.447


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 20 | Step 100 | loss = 0.076, acc = 0.953
Epoch 20 | Step 200 | loss = 0.081, acc = 0.958
Epoch 20 | Step 300 | loss = 0.091, acc = 0.955
Epoch 20 | Step 400 | loss = 0.104, acc = 0.951
Epoch 20 | Step 500 | loss = 0.092, acc = 0.950
Epoch 20 | Step 600 | loss = 0.092, acc = 0.954
Epoch 20 | Step 700 | loss = 0.083, acc = 0.956
Epoch 20 | Step 800 | loss = 0.095, acc = 0.953
Epoch 20 | Step 900 | loss = 0.085, acc = 0.955
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 20 | acc = 0.439


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 21 | Step 100 | loss = 0.067, acc = 0.956
Epoch 21 | Step 200 | loss = 0.089, acc = 0.954
Epoch 21 | Step 300 | loss = 0.079, acc = 0.960
Epoch 21 | Step 400 | loss = 0.071, acc = 0.964
Epoch 21 | Step 500 | loss = 0.078, acc = 0.959
Epoch 21 | Step 600 | loss = 0.084, acc = 0.959
Epoch 21 | Step 700 | loss = 0.096, acc = 0.954
Epoch 21 | Step 800 | loss = 0.079, acc = 0.961
Epoch 21 | Step 900 | loss = 0.093, acc = 0.960
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 21 | acc = 0.474


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 22 | Step 100 | loss = 0.066, acc = 0.954
Epoch 22 | Step 200 | loss = 0.071, acc = 0.965
Epoch 22 | Step 300 | loss = 0.065, acc = 0.969
Epoch 22 | Step 400 | loss = 0.083, acc = 0.954
Epoch 22 | Step 500 | loss = 0.079, acc = 0.965
Epoch 22 | Step 600 | loss = 0.099, acc = 0.952
Epoch 22 | Step 700 | loss = 0.103, acc = 0.950
Epoch 22 | Step 800 | loss = 0.108, acc = 0.945
Epoch 22 | Step 900 | loss = 0.102, acc = 0.948
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 22 | acc = 0.479


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 23 | Step 100 | loss = 0.072, acc = 0.955
Epoch 23 | Step 200 | loss = 0.063, acc = 0.969
Epoch 23 | Step 300 | loss = 0.087, acc = 0.958
Epoch 23 | Step 400 | loss = 0.078, acc = 0.958
Epoch 23 | Step 500 | loss = 0.116, acc = 0.942
Epoch 23 | Step 600 | loss = 0.074, acc = 0.965
Epoch 23 | Step 700 | loss = 0.095, acc = 0.952
Epoch 23 | Step 800 | loss = 0.093, acc = 0.954
Epoch 23 | Step 900 | loss = 0.082, acc = 0.955
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 23 | acc = 0.406


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 24 | Step 100 | loss = 0.058, acc = 0.958
Epoch 24 | Step 200 | loss = 0.059, acc = 0.971
Epoch 24 | Step 300 | loss = 0.064, acc = 0.970
Epoch 24 | Step 400 | loss = 0.070, acc = 0.964
Epoch 24 | Step 500 | loss = 0.075, acc = 0.964
Epoch 24 | Step 600 | loss = 0.083, acc = 0.958
Epoch 24 | Step 700 | loss = 0.067, acc = 0.965
Epoch 24 | Step 800 | loss = 0.080, acc = 0.958
Epoch 24 | Step 900 | loss = 0.069, acc = 0.967
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 24 | acc = 0.433


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 25 | Step 100 | loss = 0.067, acc = 0.959
Epoch 25 | Step 200 | loss = 0.081, acc = 0.966
Epoch 25 | Step 300 | loss = 0.077, acc = 0.961
Epoch 25 | Step 400 | loss = 0.077, acc = 0.965
Epoch 25 | Step 500 | loss = 0.078, acc = 0.959
Epoch 25 | Step 600 | loss = 0.076, acc = 0.966
Epoch 25 | Step 700 | loss = 0.075, acc = 0.963
Epoch 25 | Step 800 | loss = 0.082, acc = 0.960
Epoch 25 | Step 900 | loss = 0.082, acc = 0.960
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 25 | acc = 0.424


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 26 | Step 100 | loss = 0.069, acc = 0.957
Epoch 26 | Step 200 | loss = 0.047, acc = 0.975
Epoch 26 | Step 300 | loss = 0.083, acc = 0.958
Epoch 26 | Step 400 | loss = 0.068, acc = 0.967
Epoch 26 | Step 500 | loss = 0.078, acc = 0.961
Epoch 26 | Step 600 | loss = 0.089, acc = 0.954
Epoch 26 | Step 700 | loss = 0.075, acc = 0.965
Epoch 26 | Step 800 | loss = 0.096, acc = 0.953
Epoch 26 | Step 900 | loss = 0.083, acc = 0.959
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 26 | acc = 0.459


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 27 | Step 100 | loss = 0.060, acc = 0.962
Epoch 27 | Step 200 | loss = 0.073, acc = 0.967
Epoch 27 | Step 300 | loss = 0.071, acc = 0.964
Epoch 27 | Step 400 | loss = 0.082, acc = 0.963
Epoch 27 | Step 500 | loss = 0.069, acc = 0.970
Epoch 27 | Step 600 | loss = 0.087, acc = 0.961
Epoch 27 | Step 700 | loss = 0.078, acc = 0.963
Epoch 27 | Step 800 | loss = 0.074, acc = 0.968
Epoch 27 | Step 900 | loss = 0.070, acc = 0.967
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 27 | acc = 0.384


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 28 | Step 100 | loss = 0.058, acc = 0.964
Epoch 28 | Step 200 | loss = 0.062, acc = 0.970
Epoch 28 | Step 300 | loss = 0.067, acc = 0.967
Epoch 28 | Step 400 | loss = 0.073, acc = 0.965
Epoch 28 | Step 500 | loss = 0.082, acc = 0.963
Epoch 28 | Step 600 | loss = 0.087, acc = 0.963
Epoch 28 | Step 700 | loss = 0.080, acc = 0.965
Epoch 28 | Step 800 | loss = 0.077, acc = 0.963
Epoch 28 | Step 900 | loss = 0.070, acc = 0.967
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 28 | acc = 0.453


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 29 | Step 100 | loss = 0.065, acc = 0.958
Epoch 29 | Step 200 | loss = 0.063, acc = 0.969
Epoch 29 | Step 300 | loss = 0.065, acc = 0.972
Epoch 29 | Step 400 | loss = 0.056, acc = 0.972
Epoch 29 | Step 500 | loss = 0.059, acc = 0.969
Epoch 29 | Step 600 | loss = 0.083, acc = 0.958
Epoch 29 | Step 700 | loss = 0.101, acc = 0.958
Epoch 29 | Step 800 | loss = 0.075, acc = 0.962
Epoch 29 | Step 900 | loss = 0.084, acc = 0.958
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 29 | acc = 0.396


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 30 | Step 100 | loss = 0.071, acc = 0.955
Epoch 30 | Step 200 | loss = 0.074, acc = 0.962
Epoch 30 | Step 300 | loss = 0.067, acc = 0.965
Epoch 30 | Step 400 | loss = 0.073, acc = 0.962
Epoch 30 | Step 500 | loss = 0.088, acc = 0.957
Epoch 30 | Step 600 | loss = 0.080, acc = 0.963
Epoch 30 | Step 700 | loss = 0.079, acc = 0.960
Epoch 30 | Step 800 | loss = 0.081, acc = 0.959
Epoch 30 | Step 900 | loss = 0.081, acc = 0.959
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 30 | acc = 0.458


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 31 | Step 100 | loss = 0.054, acc = 0.962
Epoch 31 | Step 200 | loss = 0.060, acc = 0.969
Epoch 31 | Step 300 | loss = 0.056, acc = 0.972
Epoch 31 | Step 400 | loss = 0.078, acc = 0.962
Epoch 31 | Step 500 | loss = 0.084, acc = 0.963
Epoch 31 | Step 600 | loss = 0.069, acc = 0.972
Epoch 31 | Step 700 | loss = 0.070, acc = 0.967
Epoch 31 | Step 800 | loss = 0.088, acc = 0.958
Epoch 31 | Step 900 | loss = 0.064, acc = 0.964
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 31 | acc = 0.407


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 32 | Step 100 | loss = 0.049, acc = 0.968
Epoch 32 | Step 200 | loss = 0.054, acc = 0.969
Epoch 32 | Step 300 | loss = 0.054, acc = 0.974
Epoch 32 | Step 400 | loss = 0.059, acc = 0.970
Epoch 32 | Step 500 | loss = 0.067, acc = 0.964
Epoch 32 | Step 600 | loss = 0.062, acc = 0.966
Epoch 32 | Step 700 | loss = 0.071, acc = 0.962
Epoch 32 | Step 800 | loss = 0.077, acc = 0.959
Epoch 32 | Step 900 | loss = 0.083, acc = 0.962
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 32 | acc = 0.412


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 33 | Step 100 | loss = 0.044, acc = 0.967
Epoch 33 | Step 200 | loss = 0.047, acc = 0.979
Epoch 33 | Step 300 | loss = 0.069, acc = 0.967
Epoch 33 | Step 400 | loss = 0.058, acc = 0.970
Epoch 33 | Step 500 | loss = 0.063, acc = 0.967
Epoch 33 | Step 600 | loss = 0.071, acc = 0.966
Epoch 33 | Step 700 | loss = 0.063, acc = 0.969
Epoch 33 | Step 800 | loss = 0.052, acc = 0.972
Epoch 33 | Step 900 | loss = 0.067, acc = 0.971
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 33 | acc = 0.377


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 34 | Step 100 | loss = 0.060, acc = 0.961
Epoch 34 | Step 200 | loss = 0.057, acc = 0.971
Epoch 34 | Step 300 | loss = 0.076, acc = 0.961
Epoch 34 | Step 400 | loss = 0.081, acc = 0.965
Epoch 34 | Step 500 | loss = 0.071, acc = 0.965
Epoch 34 | Step 600 | loss = 0.066, acc = 0.966
Epoch 34 | Step 700 | loss = 0.067, acc = 0.964
Epoch 34 | Step 800 | loss = 0.068, acc = 0.967
Epoch 34 | Step 900 | loss = 0.086, acc = 0.964
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 34 | acc = 0.400


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 35 | Step 100 | loss = 0.060, acc = 0.959
Epoch 35 | Step 200 | loss = 0.055, acc = 0.970
Epoch 35 | Step 300 | loss = 0.062, acc = 0.971
Epoch 35 | Step 400 | loss = 0.055, acc = 0.974
Epoch 35 | Step 500 | loss = 0.050, acc = 0.977
Epoch 35 | Step 600 | loss = 0.057, acc = 0.972
Epoch 35 | Step 700 | loss = 0.062, acc = 0.973
Epoch 35 | Step 800 | loss = 0.047, acc = 0.976
Epoch 35 | Step 900 | loss = 0.066, acc = 0.970
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 35 | acc = 0.375


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 36 | Step 100 | loss = 0.055, acc = 0.963
Epoch 36 | Step 200 | loss = 0.059, acc = 0.969
Epoch 36 | Step 300 | loss = 0.055, acc = 0.971
Epoch 36 | Step 400 | loss = 0.062, acc = 0.972
Epoch 36 | Step 500 | loss = 0.060, acc = 0.971
Epoch 36 | Step 600 | loss = 0.057, acc = 0.971
Epoch 36 | Step 700 | loss = 0.059, acc = 0.970
Epoch 36 | Step 800 | loss = 0.071, acc = 0.962
Epoch 36 | Step 900 | loss = 0.074, acc = 0.963
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 36 | acc = 0.402


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 37 | Step 100 | loss = 0.062, acc = 0.962
Epoch 37 | Step 200 | loss = 0.068, acc = 0.970
Epoch 37 | Step 300 | loss = 0.078, acc = 0.957
Epoch 37 | Step 400 | loss = 0.061, acc = 0.971
Epoch 37 | Step 500 | loss = 0.064, acc = 0.966
Epoch 37 | Step 600 | loss = 0.066, acc = 0.968
Epoch 37 | Step 700 | loss = 0.066, acc = 0.971
Epoch 37 | Step 800 | loss = 0.055, acc = 0.972
Epoch 37 | Step 900 | loss = 0.051, acc = 0.971
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 37 | acc = 0.399


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 38 | Step 100 | loss = 0.060, acc = 0.966
Epoch 38 | Step 200 | loss = 0.041, acc = 0.977
Epoch 38 | Step 300 | loss = 0.064, acc = 0.971
Epoch 38 | Step 400 | loss = 0.072, acc = 0.962
Epoch 38 | Step 500 | loss = 0.043, acc = 0.978
Epoch 38 | Step 600 | loss = 0.062, acc = 0.971
Epoch 38 | Step 700 | loss = 0.096, acc = 0.957
Epoch 38 | Step 800 | loss = 0.076, acc = 0.963
Epoch 38 | Step 900 | loss = 0.082, acc = 0.958
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 38 | acc = 0.456


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 39 | Step 100 | loss = 0.049, acc = 0.967
Epoch 39 | Step 200 | loss = 0.050, acc = 0.977
Epoch 39 | Step 300 | loss = 0.053, acc = 0.972
Epoch 39 | Step 400 | loss = 0.066, acc = 0.967
Epoch 39 | Step 500 | loss = 0.086, acc = 0.965
Epoch 39 | Step 600 | loss = 0.074, acc = 0.964
Epoch 39 | Step 700 | loss = 0.075, acc = 0.966
Epoch 39 | Step 800 | loss = 0.058, acc = 0.973
Epoch 39 | Step 900 | loss = 0.073, acc = 0.965
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 39 | acc = 0.359


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 40 | Step 100 | loss = 0.047, acc = 0.966
Epoch 40 | Step 200 | loss = 0.055, acc = 0.976
Epoch 40 | Step 300 | loss = 0.081, acc = 0.961
Epoch 40 | Step 400 | loss = 0.070, acc = 0.962
Epoch 40 | Step 500 | loss = 0.057, acc = 0.972
Epoch 40 | Step 600 | loss = 0.066, acc = 0.973
Epoch 40 | Step 700 | loss = 0.061, acc = 0.972
Epoch 40 | Step 800 | loss = 0.046, acc = 0.975
Epoch 40 | Step 900 | loss = 0.074, acc = 0.967
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 40 | acc = 0.363


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 41 | Step 100 | loss = 0.564, acc = 0.800
Epoch 41 | Step 200 | loss = 0.237, acc = 0.900
Epoch 41 | Step 300 | loss = 0.121, acc = 0.941
Epoch 41 | Step 400 | loss = 0.093, acc = 0.957
Epoch 41 | Step 500 | loss = 0.084, acc = 0.958
Epoch 41 | Step 600 | loss = 0.072, acc = 0.962
Epoch 41 | Step 700 | loss = 0.061, acc = 0.970
Epoch 41 | Step 800 | loss = 0.059, acc = 0.970
Epoch 41 | Step 900 | loss = 0.057, acc = 0.975
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 41 | acc = 0.405


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 42 | Step 100 | loss = 0.029, acc = 0.972
Epoch 42 | Step 200 | loss = 0.028, acc = 0.984
Epoch 42 | Step 300 | loss = 0.032, acc = 0.981
Epoch 42 | Step 400 | loss = 0.027, acc = 0.989
Epoch 42 | Step 500 | loss = 0.041, acc = 0.980
Epoch 42 | Step 600 | loss = 0.042, acc = 0.978
Epoch 42 | Step 700 | loss = 0.025, acc = 0.987
Epoch 42 | Step 800 | loss = 0.030, acc = 0.985
Epoch 42 | Step 900 | loss = 0.033, acc = 0.982
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 42 | acc = 0.436


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 43 | Step 100 | loss = 0.028, acc = 0.975
Epoch 43 | Step 200 | loss = 0.038, acc = 0.979
Epoch 43 | Step 300 | loss = 0.025, acc = 0.988
Epoch 43 | Step 400 | loss = 0.044, acc = 0.979
Epoch 43 | Step 500 | loss = 0.043, acc = 0.978
Epoch 43 | Step 600 | loss = 0.036, acc = 0.981
Epoch 43 | Step 700 | loss = 0.035, acc = 0.982
Epoch 43 | Step 800 | loss = 0.037, acc = 0.981
Epoch 43 | Step 900 | loss = 0.040, acc = 0.981
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 43 | acc = 0.375


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 44 | Step 100 | loss = 0.036, acc = 0.968
Epoch 44 | Step 200 | loss = 0.036, acc = 0.981
Epoch 44 | Step 300 | loss = 0.038, acc = 0.978
Epoch 44 | Step 400 | loss = 0.062, acc = 0.969
Epoch 44 | Step 500 | loss = 0.048, acc = 0.979
Epoch 44 | Step 600 | loss = 0.058, acc = 0.974
Epoch 44 | Step 700 | loss = 0.063, acc = 0.967
Epoch 44 | Step 800 | loss = 0.056, acc = 0.974
Epoch 44 | Step 900 | loss = 0.056, acc = 0.969
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 44 | acc = 0.396


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 45 | Step 100 | loss = 0.033, acc = 0.973
Epoch 45 | Step 200 | loss = 0.041, acc = 0.980
Epoch 45 | Step 300 | loss = 0.044, acc = 0.975
Epoch 45 | Step 400 | loss = 0.056, acc = 0.969
Epoch 45 | Step 500 | loss = 0.069, acc = 0.967
Epoch 45 | Step 600 | loss = 0.059, acc = 0.974
Epoch 45 | Step 700 | loss = 0.060, acc = 0.967
Epoch 45 | Step 800 | loss = 0.063, acc = 0.968
Epoch 45 | Step 900 | loss = 0.083, acc = 0.962
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 45 | acc = 0.385


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 46 | Step 100 | loss = 0.058, acc = 0.959
Epoch 46 | Step 200 | loss = 0.080, acc = 0.963
Epoch 46 | Step 300 | loss = 0.056, acc = 0.972
Epoch 46 | Step 400 | loss = 0.060, acc = 0.971
Epoch 46 | Step 500 | loss = 0.046, acc = 0.977
Epoch 46 | Step 600 | loss = 0.064, acc = 0.967
Epoch 46 | Step 700 | loss = 0.054, acc = 0.975
Epoch 46 | Step 800 | loss = 0.060, acc = 0.970
Epoch 46 | Step 900 | loss = 0.069, acc = 0.968
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 46 | acc = 0.381


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 47 | Step 100 | loss = 0.045, acc = 0.969
Epoch 47 | Step 200 | loss = 0.039, acc = 0.983
Epoch 47 | Step 300 | loss = 0.093, acc = 0.953
Epoch 47 | Step 400 | loss = 0.100, acc = 0.953
Epoch 47 | Step 500 | loss = 0.071, acc = 0.963
Epoch 47 | Step 600 | loss = 0.061, acc = 0.972
Epoch 47 | Step 700 | loss = 0.072, acc = 0.967
Epoch 47 | Step 800 | loss = 0.061, acc = 0.970
Epoch 47 | Step 900 | loss = 0.050, acc = 0.975
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 47 | acc = 0.427


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 48 | Step 100 | loss = 0.053, acc = 0.966
Epoch 48 | Step 200 | loss = 0.061, acc = 0.968
Epoch 48 | Step 300 | loss = 0.048, acc = 0.975
Epoch 48 | Step 400 | loss = 0.053, acc = 0.972
Epoch 48 | Step 500 | loss = 0.047, acc = 0.979
Epoch 48 | Step 600 | loss = 0.048, acc = 0.979
Epoch 48 | Step 700 | loss = 0.057, acc = 0.975
Epoch 48 | Step 800 | loss = 0.053, acc = 0.974
Epoch 48 | Step 900 | loss = 0.056, acc = 0.971
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 48 | acc = 0.355


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 49 | Step 100 | loss = 0.046, acc = 0.969
Epoch 49 | Step 200 | loss = 0.038, acc = 0.979
Epoch 49 | Step 300 | loss = 0.045, acc = 0.977
Epoch 49 | Step 400 | loss = 0.050, acc = 0.974
Epoch 49 | Step 500 | loss = 0.040, acc = 0.981
Epoch 49 | Step 600 | loss = 0.054, acc = 0.977
Epoch 49 | Step 700 | loss = 0.059, acc = 0.973
Epoch 49 | Step 800 | loss = 0.054, acc = 0.973
Epoch 49 | Step 900 | loss = 0.059, acc = 0.968
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 49 | acc = 0.389


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 50 | Step 100 | loss = 0.053, acc = 0.965
Epoch 50 | Step 200 | loss = 0.047, acc = 0.980
Epoch 50 | Step 300 | loss = 0.045, acc = 0.980
Epoch 50 | Step 400 | loss = 0.051, acc = 0.975
Epoch 50 | Step 500 | loss = 0.048, acc = 0.975
Epoch 50 | Step 600 | loss = 0.040, acc = 0.978
Epoch 50 | Step 700 | loss = 0.048, acc = 0.976
Epoch 50 | Step 800 | loss = 0.065, acc = 0.966
Epoch 50 | Step 900 | loss = 0.054, acc = 0.974
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 50 | acc = 0.381


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 51 | Step 100 | loss = 0.047, acc = 0.968
Epoch 51 | Step 200 | loss = 0.053, acc = 0.975
Epoch 51 | Step 300 | loss = 0.048, acc = 0.979
Epoch 51 | Step 400 | loss = 0.047, acc = 0.972
Epoch 51 | Step 500 | loss = 0.053, acc = 0.974
Epoch 51 | Step 600 | loss = 0.059, acc = 0.970
Epoch 51 | Step 700 | loss = 0.061, acc = 0.966
Epoch 51 | Step 800 | loss = 0.058, acc = 0.969
Epoch 51 | Step 900 | loss = 0.064, acc = 0.967
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 51 | acc = 0.356


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 52 | Step 100 | loss = 0.043, acc = 0.967
Epoch 52 | Step 200 | loss = 0.064, acc = 0.970
Epoch 52 | Step 300 | loss = 0.084, acc = 0.963
Epoch 52 | Step 400 | loss = 0.066, acc = 0.970
Epoch 52 | Step 500 | loss = 0.053, acc = 0.971
Epoch 52 | Step 600 | loss = 0.052, acc = 0.971
Epoch 52 | Step 700 | loss = 0.099, acc = 0.960
Epoch 52 | Step 800 | loss = 0.066, acc = 0.969
Epoch 52 | Step 900 | loss = 0.060, acc = 0.975
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 52 | acc = 0.399


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 53 | Step 100 | loss = 0.050, acc = 0.968
Epoch 53 | Step 200 | loss = 0.044, acc = 0.978
Epoch 53 | Step 300 | loss = 0.048, acc = 0.972
Epoch 53 | Step 400 | loss = 0.057, acc = 0.970
Epoch 53 | Step 500 | loss = 0.044, acc = 0.979
Epoch 53 | Step 600 | loss = 0.046, acc = 0.978
Epoch 53 | Step 700 | loss = 0.055, acc = 0.974
Epoch 53 | Step 800 | loss = 0.058, acc = 0.974
Epoch 53 | Step 900 | loss = 0.055, acc = 0.972
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 53 | acc = 0.360


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 54 | Step 100 | loss = 0.046, acc = 0.966
Epoch 54 | Step 200 | loss = 0.035, acc = 0.980
Epoch 54 | Step 300 | loss = 0.048, acc = 0.976
Epoch 54 | Step 400 | loss = 0.049, acc = 0.974
Epoch 54 | Step 500 | loss = 0.056, acc = 0.975
Epoch 54 | Step 600 | loss = 0.052, acc = 0.972
Epoch 54 | Step 700 | loss = 0.048, acc = 0.975
Epoch 54 | Step 800 | loss = 0.065, acc = 0.970
Epoch 54 | Step 900 | loss = 0.051, acc = 0.978
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 54 | acc = 0.366


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 55 | Step 100 | loss = 0.037, acc = 0.975
Epoch 55 | Step 200 | loss = 0.039, acc = 0.978
Epoch 55 | Step 300 | loss = 0.052, acc = 0.976
Epoch 55 | Step 400 | loss = 0.049, acc = 0.973
Epoch 55 | Step 500 | loss = 0.050, acc = 0.979
Epoch 55 | Step 600 | loss = 0.043, acc = 0.975
Epoch 55 | Step 700 | loss = 0.056, acc = 0.976
Epoch 55 | Step 800 | loss = 0.042, acc = 0.981
Epoch 55 | Step 900 | loss = 0.038, acc = 0.977
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 55 | acc = 0.377


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 56 | Step 100 | loss = 0.049, acc = 0.968
Epoch 56 | Step 200 | loss = 0.043, acc = 0.978
Epoch 56 | Step 300 | loss = 0.060, acc = 0.975
Epoch 56 | Step 400 | loss = 0.043, acc = 0.977
Epoch 56 | Step 500 | loss = 0.054, acc = 0.975
Epoch 56 | Step 600 | loss = 0.053, acc = 0.975
Epoch 56 | Step 700 | loss = 0.076, acc = 0.967
Epoch 56 | Step 800 | loss = 0.118, acc = 0.954
Epoch 56 | Step 900 | loss = 0.082, acc = 0.962
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 56 | acc = 0.349


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 57 | Step 100 | loss = 0.044, acc = 0.967
Epoch 57 | Step 200 | loss = 0.043, acc = 0.978
Epoch 57 | Step 300 | loss = 0.050, acc = 0.976
Epoch 57 | Step 400 | loss = 0.041, acc = 0.981
Epoch 57 | Step 500 | loss = 0.039, acc = 0.981
Epoch 57 | Step 600 | loss = 0.044, acc = 0.978
Epoch 57 | Step 700 | loss = 0.062, acc = 0.971
Epoch 57 | Step 800 | loss = 0.050, acc = 0.974
Epoch 57 | Step 900 | loss = 0.057, acc = 0.971
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 57 | acc = 0.404


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 58 | Step 100 | loss = 0.039, acc = 0.970
Epoch 58 | Step 200 | loss = 0.045, acc = 0.975
Epoch 58 | Step 300 | loss = 0.056, acc = 0.974
Epoch 58 | Step 400 | loss = 0.055, acc = 0.973
Epoch 58 | Step 500 | loss = 0.051, acc = 0.978
Epoch 58 | Step 600 | loss = 0.043, acc = 0.977
Epoch 58 | Step 700 | loss = 0.058, acc = 0.971
Epoch 58 | Step 800 | loss = 0.045, acc = 0.980
Epoch 58 | Step 900 | loss = 0.064, acc = 0.971
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 58 | acc = 0.404


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 59 | Step 100 | loss = 0.036, acc = 0.974
Epoch 59 | Step 200 | loss = 0.041, acc = 0.977
Epoch 59 | Step 300 | loss = 0.043, acc = 0.977
Epoch 59 | Step 400 | loss = 0.045, acc = 0.977
Epoch 59 | Step 500 | loss = 0.043, acc = 0.978
Epoch 59 | Step 600 | loss = 0.054, acc = 0.974
Epoch 59 | Step 700 | loss = 0.041, acc = 0.980
Epoch 59 | Step 800 | loss = 0.051, acc = 0.976
Epoch 59 | Step 900 | loss = 0.056, acc = 0.971
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 59 | acc = 0.395


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 60 | Step 100 | loss = 0.038, acc = 0.971
Epoch 60 | Step 200 | loss = 0.042, acc = 0.980
Epoch 60 | Step 300 | loss = 0.029, acc = 0.986
Epoch 60 | Step 400 | loss = 0.055, acc = 0.970
Epoch 60 | Step 500 | loss = 0.036, acc = 0.980
Epoch 60 | Step 600 | loss = 0.071, acc = 0.967
Epoch 60 | Step 700 | loss = 0.054, acc = 0.974
Epoch 60 | Step 800 | loss = 0.049, acc = 0.974
Epoch 60 | Step 900 | loss = 0.040, acc = 0.978
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 60 | acc = 0.396


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 61 | Step 100 | loss = 0.029, acc = 0.972
Epoch 61 | Step 200 | loss = 0.035, acc = 0.981
Epoch 61 | Step 300 | loss = 0.038, acc = 0.982
Epoch 61 | Step 400 | loss = 0.041, acc = 0.978
Epoch 61 | Step 500 | loss = 0.038, acc = 0.980
Epoch 61 | Step 600 | loss = 0.046, acc = 0.975
Epoch 61 | Step 700 | loss = 0.047, acc = 0.977
Epoch 61 | Step 800 | loss = 0.046, acc = 0.976
Epoch 61 | Step 900 | loss = 0.049, acc = 0.974
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 61 | acc = 0.382


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 62 | Step 100 | loss = 0.045, acc = 0.968
Epoch 62 | Step 200 | loss = 0.039, acc = 0.982
Epoch 62 | Step 300 | loss = 0.046, acc = 0.980
Epoch 62 | Step 400 | loss = 0.042, acc = 0.982
Epoch 62 | Step 500 | loss = 0.042, acc = 0.979
Epoch 62 | Step 600 | loss = 0.064, acc = 0.970
Epoch 62 | Step 700 | loss = 0.061, acc = 0.974
Epoch 62 | Step 800 | loss = 0.055, acc = 0.973
Epoch 62 | Step 900 | loss = 0.054, acc = 0.971
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 62 | acc = 0.390


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 63 | Step 100 | loss = 0.041, acc = 0.968
Epoch 63 | Step 200 | loss = 0.044, acc = 0.980
Epoch 63 | Step 300 | loss = 0.055, acc = 0.975
Epoch 63 | Step 400 | loss = 0.040, acc = 0.984
Epoch 63 | Step 500 | loss = 0.037, acc = 0.981
Epoch 63 | Step 600 | loss = 0.052, acc = 0.978
Epoch 63 | Step 700 | loss = 0.043, acc = 0.981
Epoch 63 | Step 800 | loss = 0.033, acc = 0.985
Epoch 63 | Step 900 | loss = 0.043, acc = 0.976
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 63 | acc = 0.391


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 64 | Step 100 | loss = 0.045, acc = 0.971
Epoch 64 | Step 200 | loss = 0.033, acc = 0.983
Epoch 64 | Step 300 | loss = 0.040, acc = 0.981
Epoch 64 | Step 400 | loss = 0.050, acc = 0.975
Epoch 64 | Step 500 | loss = 0.045, acc = 0.979
Epoch 64 | Step 600 | loss = 0.049, acc = 0.973
Epoch 64 | Step 700 | loss = 0.052, acc = 0.969
Epoch 64 | Step 800 | loss = 0.053, acc = 0.976
Epoch 64 | Step 900 | loss = 0.046, acc = 0.975
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 64 | acc = 0.384


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 65 | Step 100 | loss = 0.024, acc = 0.974
Epoch 65 | Step 200 | loss = 0.039, acc = 0.982
Epoch 65 | Step 300 | loss = 0.039, acc = 0.982
Epoch 65 | Step 400 | loss = 0.042, acc = 0.982
Epoch 65 | Step 500 | loss = 0.041, acc = 0.977
Epoch 65 | Step 600 | loss = 0.042, acc = 0.980
Epoch 65 | Step 700 | loss = 0.048, acc = 0.975
Epoch 65 | Step 800 | loss = 0.040, acc = 0.979
Epoch 65 | Step 900 | loss = 0.082, acc = 0.964
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 65 | acc = 0.375


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 66 | Step 100 | loss = 0.043, acc = 0.967
Epoch 66 | Step 200 | loss = 0.035, acc = 0.982
Epoch 66 | Step 300 | loss = 0.049, acc = 0.977
Epoch 66 | Step 400 | loss = 0.068, acc = 0.969
Epoch 66 | Step 500 | loss = 0.053, acc = 0.974
Epoch 66 | Step 600 | loss = 0.061, acc = 0.968
Epoch 66 | Step 700 | loss = 0.057, acc = 0.975
Epoch 66 | Step 800 | loss = 0.048, acc = 0.975
Epoch 66 | Step 900 | loss = 0.049, acc = 0.975
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 66 | acc = 0.356


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 67 | Step 100 | loss = 0.032, acc = 0.974
Epoch 67 | Step 200 | loss = 0.034, acc = 0.985
Epoch 67 | Step 300 | loss = 0.030, acc = 0.986
Epoch 67 | Step 400 | loss = 0.038, acc = 0.983
Epoch 67 | Step 500 | loss = 0.031, acc = 0.983
Epoch 67 | Step 600 | loss = 0.047, acc = 0.977
Epoch 67 | Step 700 | loss = 0.050, acc = 0.974
Epoch 67 | Step 800 | loss = 0.047, acc = 0.977
Epoch 67 | Step 900 | loss = 0.052, acc = 0.974
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 67 | acc = 0.371


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 68 | Step 100 | loss = 0.045, acc = 0.970
Epoch 68 | Step 200 | loss = 0.042, acc = 0.975
Epoch 68 | Step 300 | loss = 0.041, acc = 0.978
Epoch 68 | Step 400 | loss = 0.047, acc = 0.976
Epoch 68 | Step 500 | loss = 0.033, acc = 0.983
Epoch 68 | Step 600 | loss = 0.032, acc = 0.982
Epoch 68 | Step 700 | loss = 0.040, acc = 0.979
Epoch 68 | Step 800 | loss = 0.040, acc = 0.980
Epoch 68 | Step 900 | loss = 0.042, acc = 0.981
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 68 | acc = 0.376


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 69 | Step 100 | loss = 0.023, acc = 0.980
Epoch 69 | Step 200 | loss = 0.035, acc = 0.986
Epoch 69 | Step 300 | loss = 0.033, acc = 0.981
Epoch 69 | Step 400 | loss = 0.039, acc = 0.982
Epoch 69 | Step 500 | loss = 0.039, acc = 0.982
Epoch 69 | Step 600 | loss = 0.045, acc = 0.980
Epoch 69 | Step 700 | loss = 0.045, acc = 0.976
Epoch 69 | Step 800 | loss = 0.057, acc = 0.973
Epoch 69 | Step 900 | loss = 0.041, acc = 0.975
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 69 | acc = 0.386


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 70 | Step 100 | loss = 0.045, acc = 0.969
Epoch 70 | Step 200 | loss = 0.041, acc = 0.977
Epoch 70 | Step 300 | loss = 0.036, acc = 0.981
Epoch 70 | Step 400 | loss = 0.036, acc = 0.980
Epoch 70 | Step 500 | loss = 0.040, acc = 0.980
Epoch 70 | Step 600 | loss = 0.044, acc = 0.977
Epoch 70 | Step 700 | loss = 0.045, acc = 0.978
Epoch 70 | Step 800 | loss = 0.049, acc = 0.974
Epoch 70 | Step 900 | loss = 0.057, acc = 0.971
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 70 | acc = 0.336


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 71 | Step 100 | loss = 0.051, acc = 0.966
Epoch 71 | Step 200 | loss = 0.032, acc = 0.983
Epoch 71 | Step 300 | loss = 0.039, acc = 0.983
Epoch 71 | Step 400 | loss = 0.044, acc = 0.978
Epoch 71 | Step 500 | loss = 0.063, acc = 0.975
Epoch 71 | Step 600 | loss = 0.052, acc = 0.976
Epoch 71 | Step 700 | loss = 0.053, acc = 0.978
Epoch 71 | Step 800 | loss = 0.055, acc = 0.976
Epoch 71 | Step 900 | loss = 0.028, acc = 0.988
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 71 | acc = 0.399


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 72 | Step 100 | loss = 0.043, acc = 0.969
Epoch 72 | Step 200 | loss = 0.058, acc = 0.970
Epoch 72 | Step 300 | loss = 0.042, acc = 0.978
Epoch 72 | Step 400 | loss = 0.036, acc = 0.981
Epoch 72 | Step 500 | loss = 0.033, acc = 0.982
Epoch 72 | Step 600 | loss = 0.039, acc = 0.981
Epoch 72 | Step 700 | loss = 0.050, acc = 0.975
Epoch 72 | Step 800 | loss = 0.046, acc = 0.979
Epoch 72 | Step 900 | loss = 0.049, acc = 0.976
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 72 | acc = 0.394


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 73 | Step 100 | loss = 0.046, acc = 0.971
Epoch 73 | Step 200 | loss = 0.029, acc = 0.985
Epoch 73 | Step 300 | loss = 0.021, acc = 0.991
Epoch 73 | Step 400 | loss = 0.028, acc = 0.986
Epoch 73 | Step 500 | loss = 0.034, acc = 0.984
Epoch 73 | Step 600 | loss = 0.047, acc = 0.978
Epoch 73 | Step 700 | loss = 0.045, acc = 0.981
Epoch 73 | Step 800 | loss = 0.040, acc = 0.979
Epoch 73 | Step 900 | loss = 0.044, acc = 0.977
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 73 | acc = 0.363


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 74 | Step 100 | loss = 0.029, acc = 0.975
Epoch 74 | Step 200 | loss = 0.040, acc = 0.981
Epoch 74 | Step 300 | loss = 0.040, acc = 0.982
Epoch 74 | Step 400 | loss = 0.031, acc = 0.986
Epoch 74 | Step 500 | loss = 0.036, acc = 0.984
Epoch 74 | Step 600 | loss = 0.033, acc = 0.984
Epoch 74 | Step 700 | loss = 0.042, acc = 0.981
Epoch 74 | Step 800 | loss = 0.047, acc = 0.978
Epoch 74 | Step 900 | loss = 0.050, acc = 0.978
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 74 | acc = 0.352


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 75 | Step 100 | loss = 0.042, acc = 0.969
Epoch 75 | Step 200 | loss = 0.036, acc = 0.983
Epoch 75 | Step 300 | loss = 0.036, acc = 0.982
Epoch 75 | Step 400 | loss = 0.034, acc = 0.982
Epoch 75 | Step 500 | loss = 0.055, acc = 0.973
Epoch 75 | Step 600 | loss = 0.049, acc = 0.977
Epoch 75 | Step 700 | loss = 0.034, acc = 0.982
Epoch 75 | Step 800 | loss = 0.047, acc = 0.977
Epoch 75 | Step 900 | loss = 0.047, acc = 0.975
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 75 | acc = 0.368


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 76 | Step 100 | loss = 0.033, acc = 0.973
Epoch 76 | Step 200 | loss = 0.039, acc = 0.982
Epoch 76 | Step 300 | loss = 0.049, acc = 0.977
Epoch 76 | Step 400 | loss = 0.040, acc = 0.982
Epoch 76 | Step 500 | loss = 0.035, acc = 0.981
Epoch 76 | Step 600 | loss = 0.028, acc = 0.985
Epoch 76 | Step 700 | loss = 0.055, acc = 0.976
Epoch 76 | Step 800 | loss = 0.045, acc = 0.979
Epoch 76 | Step 900 | loss = 0.039, acc = 0.982
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 76 | acc = 0.371


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 77 | Step 100 | loss = 0.037, acc = 0.971
Epoch 77 | Step 200 | loss = 0.025, acc = 0.987
Epoch 77 | Step 300 | loss = 0.029, acc = 0.983
Epoch 77 | Step 400 | loss = 0.030, acc = 0.982
Epoch 77 | Step 500 | loss = 0.050, acc = 0.973
Epoch 77 | Step 600 | loss = 0.049, acc = 0.976
Epoch 77 | Step 700 | loss = 0.052, acc = 0.978
Epoch 77 | Step 800 | loss = 0.042, acc = 0.979
Epoch 77 | Step 900 | loss = 0.045, acc = 0.980
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 77 | acc = 0.359


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 78 | Step 100 | loss = 0.040, acc = 0.970
Epoch 78 | Step 200 | loss = 0.037, acc = 0.979
Epoch 78 | Step 300 | loss = 0.040, acc = 0.980
Epoch 78 | Step 400 | loss = 0.034, acc = 0.984
Epoch 78 | Step 500 | loss = 0.029, acc = 0.986
Epoch 78 | Step 600 | loss = 0.042, acc = 0.980
Epoch 78 | Step 700 | loss = 0.036, acc = 0.982
Epoch 78 | Step 800 | loss = 0.053, acc = 0.976
Epoch 78 | Step 900 | loss = 0.044, acc = 0.981
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 78 | acc = 0.368


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 79 | Step 100 | loss = 0.026, acc = 0.977
Epoch 79 | Step 200 | loss = 0.046, acc = 0.979
Epoch 79 | Step 300 | loss = 0.056, acc = 0.973
Epoch 79 | Step 400 | loss = 0.059, acc = 0.972
Epoch 79 | Step 500 | loss = 0.037, acc = 0.983
Epoch 79 | Step 600 | loss = 0.037, acc = 0.982
Epoch 79 | Step 700 | loss = 0.040, acc = 0.983
Epoch 79 | Step 800 | loss = 0.042, acc = 0.980
Epoch 79 | Step 900 | loss = 0.061, acc = 0.973
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 79 | acc = 0.362


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 80 | Step 100 | loss = 0.033, acc = 0.972
Epoch 80 | Step 200 | loss = 0.025, acc = 0.984
Epoch 80 | Step 300 | loss = 0.030, acc = 0.987
Epoch 80 | Step 400 | loss = 0.025, acc = 0.984
Epoch 80 | Step 500 | loss = 0.038, acc = 0.979
Epoch 80 | Step 600 | loss = 0.047, acc = 0.977
Epoch 80 | Step 700 | loss = 0.046, acc = 0.977
Epoch 80 | Step 800 | loss = 0.044, acc = 0.973
Epoch 80 | Step 900 | loss = 0.038, acc = 0.981
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 80 | acc = 0.374


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 81 | Step 100 | loss = 0.027, acc = 0.974
Epoch 81 | Step 200 | loss = 0.030, acc = 0.982
Epoch 81 | Step 300 | loss = 0.039, acc = 0.978
Epoch 81 | Step 400 | loss = 0.029, acc = 0.984
Epoch 81 | Step 500 | loss = 0.053, acc = 0.976
Epoch 81 | Step 600 | loss = 0.044, acc = 0.977
Epoch 81 | Step 700 | loss = 0.050, acc = 0.976
Epoch 81 | Step 800 | loss = 0.037, acc = 0.983
Epoch 81 | Step 900 | loss = 0.046, acc = 0.980
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 81 | acc = 0.385


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 82 | Step 100 | loss = 0.037, acc = 0.972
Epoch 82 | Step 200 | loss = 0.035, acc = 0.979
Epoch 82 | Step 300 | loss = 0.031, acc = 0.985
Epoch 82 | Step 400 | loss = 0.044, acc = 0.980
Epoch 82 | Step 500 | loss = 0.043, acc = 0.982
Epoch 82 | Step 600 | loss = 0.038, acc = 0.983
Epoch 82 | Step 700 | loss = 0.039, acc = 0.982
Epoch 82 | Step 800 | loss = 0.041, acc = 0.984
Epoch 82 | Step 900 | loss = 0.045, acc = 0.978
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 82 | acc = 0.353


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 83 | Step 100 | loss = 0.032, acc = 0.972
Epoch 83 | Step 200 | loss = 0.037, acc = 0.982
Epoch 83 | Step 300 | loss = 0.028, acc = 0.986
Epoch 83 | Step 400 | loss = 0.041, acc = 0.980
Epoch 83 | Step 500 | loss = 0.045, acc = 0.978
Epoch 83 | Step 600 | loss = 0.037, acc = 0.982
Epoch 83 | Step 700 | loss = 0.033, acc = 0.984
Epoch 83 | Step 800 | loss = 0.049, acc = 0.978
Epoch 83 | Step 900 | loss = 0.044, acc = 0.976
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 83 | acc = 0.389


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 84 | Step 100 | loss = 0.026, acc = 0.974
Epoch 84 | Step 200 | loss = 0.043, acc = 0.979
Epoch 84 | Step 300 | loss = 0.031, acc = 0.985
Epoch 84 | Step 400 | loss = 0.040, acc = 0.980
Epoch 84 | Step 500 | loss = 0.044, acc = 0.978
Epoch 84 | Step 600 | loss = 0.106, acc = 0.965
Epoch 84 | Step 700 | loss = 0.088, acc = 0.967
Epoch 84 | Step 800 | loss = 0.072, acc = 0.972
Epoch 84 | Step 900 | loss = 0.063, acc = 0.970
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 84 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 85 | Step 100 | loss = 5.216, acc = 0.000
Epoch 85 | Step 200 | loss = 5.268, acc = 0.000
Epoch 85 | Step 300 | loss = 5.287, acc = 0.000
Epoch 85 | Step 400 | loss = 5.273, acc = 0.000
Epoch 85 | Step 500 | loss = 5.275, acc = 0.000
Epoch 85 | Step 600 | loss = 5.273, acc = 0.000
Epoch 85 | Step 700 | loss = 5.269, acc = 0.000
Epoch 85 | Step 800 | loss = 5.274, acc = 0.000
Epoch 85 | Step 900 | loss = 5.269, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 85 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 86 | Step 100 | loss = 5.216, acc = 0.000
Epoch 86 | Step 200 | loss = 5.267, acc = 0.000
Epoch 86 | Step 300 | loss = 5.267, acc = 0.000
Epoch 86 | Step 400 | loss = 5.270, acc = 0.000
Epoch 86 | Step 500 | loss = 5.268, acc = 0.000
Epoch 86 | Step 600 | loss = 5.267, acc = 0.000
Epoch 86 | Step 700 | loss = 5.267, acc = 0.000
Epoch 86 | Step 800 | loss = 5.266, acc = 0.000
Epoch 86 | Step 900 | loss = 5.266, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 86 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 87 | Step 100 | loss = 5.213, acc = 0.000
Epoch 87 | Step 200 | loss = 5.266, acc = 0.000
Epoch 87 | Step 300 | loss = 5.267, acc = 0.000
Epoch 87 | Step 400 | loss = 5.265, acc = 0.000
Epoch 87 | Step 500 | loss = 5.267, acc = 0.000
Epoch 87 | Step 600 | loss = 5.264, acc = 0.000
Epoch 87 | Step 700 | loss = 5.265, acc = 0.000
Epoch 87 | Step 800 | loss = 5.266, acc = 0.000
Epoch 87 | Step 900 | loss = 5.266, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 87 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 88 | Step 100 | loss = 5.213, acc = 0.000
Epoch 88 | Step 200 | loss = 5.267, acc = 0.000
Epoch 88 | Step 300 | loss = 5.265, acc = 0.000
Epoch 88 | Step 400 | loss = 5.263, acc = 0.000
Epoch 88 | Step 500 | loss = 5.266, acc = 0.000
Epoch 88 | Step 600 | loss = 5.265, acc = 0.000
Epoch 88 | Step 700 | loss = 5.264, acc = 0.000
Epoch 88 | Step 800 | loss = 5.266, acc = 0.000
Epoch 88 | Step 900 | loss = 5.266, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 88 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 89 | Step 100 | loss = 5.212, acc = 0.000
Epoch 89 | Step 200 | loss = 5.266, acc = 0.000
Epoch 89 | Step 300 | loss = 5.265, acc = 0.000
Epoch 89 | Step 400 | loss = 5.264, acc = 0.000
Epoch 89 | Step 500 | loss = 5.265, acc = 0.000
Epoch 89 | Step 600 | loss = 5.264, acc = 0.000
Epoch 89 | Step 700 | loss = 5.266, acc = 0.000
Epoch 89 | Step 800 | loss = 5.263, acc = 0.000
Epoch 89 | Step 900 | loss = 5.266, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 89 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 90 | Step 100 | loss = 5.211, acc = 0.000
Epoch 90 | Step 200 | loss = 5.265, acc = 0.000
Epoch 90 | Step 300 | loss = 5.265, acc = 0.000
Epoch 90 | Step 400 | loss = 5.264, acc = 0.000
Epoch 90 | Step 500 | loss = 5.266, acc = 0.000
Epoch 90 | Step 600 | loss = 5.265, acc = 0.000
Epoch 90 | Step 700 | loss = 5.264, acc = 0.000
Epoch 90 | Step 800 | loss = 5.264, acc = 0.000
Epoch 90 | Step 900 | loss = 5.264, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 90 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 91 | Step 100 | loss = 5.212, acc = 0.000
Epoch 91 | Step 200 | loss = 5.264, acc = 0.000
Epoch 91 | Step 300 | loss = 5.264, acc = 0.000
Epoch 91 | Step 400 | loss = 5.264, acc = 0.000
Epoch 91 | Step 500 | loss = 5.264, acc = 0.000
Epoch 91 | Step 600 | loss = 5.266, acc = 0.000
Epoch 91 | Step 700 | loss = 5.264, acc = 0.000
Epoch 91 | Step 800 | loss = 5.264, acc = 0.000
Epoch 91 | Step 900 | loss = 5.264, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 91 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 92 | Step 100 | loss = 5.212, acc = 0.000
Epoch 92 | Step 200 | loss = 5.264, acc = 0.000
Epoch 92 | Step 300 | loss = 5.265, acc = 0.000
Epoch 92 | Step 400 | loss = 5.263, acc = 0.000
Epoch 92 | Step 500 | loss = 5.264, acc = 0.000
Epoch 92 | Step 600 | loss = 5.264, acc = 0.000
Epoch 92 | Step 700 | loss = 5.264, acc = 0.000
Epoch 92 | Step 800 | loss = 5.264, acc = 0.000
Epoch 92 | Step 900 | loss = 5.265, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 92 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 93 | Step 100 | loss = 5.211, acc = 0.000
Epoch 93 | Step 200 | loss = 5.264, acc = 0.000
Epoch 93 | Step 300 | loss = 5.264, acc = 0.000
Epoch 93 | Step 400 | loss = 5.263, acc = 0.000
Epoch 93 | Step 500 | loss = 5.263, acc = 0.000
Epoch 93 | Step 600 | loss = 5.264, acc = 0.000
Epoch 93 | Step 700 | loss = 5.265, acc = 0.000
Epoch 93 | Step 800 | loss = 5.263, acc = 0.000
Epoch 93 | Step 900 | loss = 5.264, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 93 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 94 | Step 100 | loss = 5.211, acc = 0.000
Epoch 94 | Step 200 | loss = 5.263, acc = 0.000
Epoch 94 | Step 300 | loss = 5.264, acc = 0.000
Epoch 94 | Step 400 | loss = 5.264, acc = 0.000
Epoch 94 | Step 500 | loss = 5.263, acc = 0.000
Epoch 94 | Step 600 | loss = 5.263, acc = 0.000
Epoch 94 | Step 700 | loss = 5.263, acc = 0.000
Epoch 94 | Step 800 | loss = 5.264, acc = 0.000
Epoch 94 | Step 900 | loss = 5.263, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 94 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 95 | Step 100 | loss = 5.211, acc = 0.000
Epoch 95 | Step 200 | loss = 5.263, acc = 0.000
Epoch 95 | Step 300 | loss = 5.263, acc = 0.000
Epoch 95 | Step 400 | loss = 5.263, acc = 0.000
Epoch 95 | Step 500 | loss = 5.265, acc = 0.000
Epoch 95 | Step 600 | loss = 5.264, acc = 0.000
Epoch 95 | Step 700 | loss = 5.264, acc = 0.000
Epoch 95 | Step 800 | loss = 5.263, acc = 0.000
Epoch 95 | Step 900 | loss = 5.264, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 95 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 96 | Step 100 | loss = 5.209, acc = 0.000
Epoch 96 | Step 200 | loss = 5.264, acc = 0.000
Epoch 96 | Step 300 | loss = 5.264, acc = 0.000
Epoch 96 | Step 400 | loss = 5.264, acc = 0.000
Epoch 96 | Step 500 | loss = 5.264, acc = 0.000
Epoch 96 | Step 600 | loss = 5.264, acc = 0.000
Epoch 96 | Step 700 | loss = 5.264, acc = 0.000
Epoch 96 | Step 800 | loss = 5.265, acc = 0.000
Epoch 96 | Step 900 | loss = 5.264, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 96 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 97 | Step 100 | loss = 5.212, acc = 0.000
Epoch 97 | Step 200 | loss = 5.265, acc = 0.000
Epoch 97 | Step 300 | loss = 5.263, acc = 0.000
Epoch 97 | Step 400 | loss = 5.263, acc = 0.000
Epoch 97 | Step 500 | loss = 5.263, acc = 0.000
Epoch 97 | Step 600 | loss = 5.263, acc = 0.000
Epoch 97 | Step 700 | loss = 5.264, acc = 0.000
Epoch 97 | Step 800 | loss = 5.263, acc = 0.000
Epoch 97 | Step 900 | loss = 5.264, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 97 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 98 | Step 100 | loss = 5.210, acc = 0.000
Epoch 98 | Step 200 | loss = 5.264, acc = 0.000
Epoch 98 | Step 300 | loss = 5.263, acc = 0.000
Epoch 98 | Step 400 | loss = 5.263, acc = 0.000
Epoch 98 | Step 500 | loss = 5.265, acc = 0.000
Epoch 98 | Step 600 | loss = 5.264, acc = 0.000
Epoch 98 | Step 700 | loss = 5.264, acc = 0.000
Epoch 98 | Step 800 | loss = 5.264, acc = 0.000
Epoch 98 | Step 900 | loss = 5.264, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 98 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 99 | Step 100 | loss = 5.210, acc = 0.000
Epoch 99 | Step 200 | loss = 5.264, acc = 0.000
Epoch 99 | Step 300 | loss = 5.264, acc = 0.000
Epoch 99 | Step 400 | loss = 5.264, acc = 0.000
Epoch 99 | Step 500 | loss = 5.263, acc = 0.000
Epoch 99 | Step 600 | loss = 5.264, acc = 0.000
Epoch 99 | Step 700 | loss = 5.263, acc = 0.000
Epoch 99 | Step 800 | loss = 5.264, acc = 0.000
Epoch 99 | Step 900 | loss = 5.263, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 99 | acc = 0.000


  0%|          | 0/991 [00:00<?, ?it/s]

Epoch 100 | Step 100 | loss = 5.210, acc = 0.000
Epoch 100 | Step 200 | loss = 5.264, acc = 0.000
Epoch 100 | Step 300 | loss = 5.263, acc = 0.000
Epoch 100 | Step 400 | loss = 5.264, acc = 0.000
Epoch 100 | Step 500 | loss = 5.264, acc = 0.000
Epoch 100 | Step 600 | loss = 5.264, acc = 0.000
Epoch 100 | Step 700 | loss = 5.263, acc = 0.000
Epoch 100 | Step 800 | loss = 5.263, acc = 0.000
Epoch 100 | Step 900 | loss = 5.264, acc = 0.000
Evaluating Dev Set ...


  0%|          | 0/4131 [00:00<?, ?it/s]

Validation | Epoch 100 | acc = 0.000
Saving Model ...


In [16]:
print("Evaluating Test Set ...")

result = []

model.eval()
with torch.no_grad():
    for data in tqdm(test_loader):
        output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),
                       attention_mask=data[2].squeeze(dim=0).to(device))
        result.append(evaluate(data, output))

result_file = "result.csv"
with open(result_file, 'w') as f:	
	  f.write("ID,Answer\n")
	  for i, test_question in enumerate(test_questions):
        # Replace commas in answers with empty strings (since csv is separated by comma)
        # Answers in kaggle are processed in the same way
		    f.write(f"{test_question['id']},{result[i].replace(',','')}\n")

print(f"Completed! Result is in {result_file}")

Evaluating Test Set ...


  0%|          | 0/4957 [00:00<?, ?it/s]

Completed! Result is in result.csv
