In [20]:
#Load train, test and dev data as list of tuples
def load_data(file_path, mode):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        index, words, tags = [], [], []
        maxlen = 0
        for line in f:
            line = line.strip()
            if not line:
                if words:
                    data.append((index, words, tags))
                    maxlen = max(maxlen, len(words))
                    index, words, tags = [], [], []
            else:
                if mode == 'train' or mode == 'dev':
                  ind, word, tag = line.split(' ')[0], line.split(' ')[1], line.split(' ')[2]
                elif mode == 'test':
                  ind, word = line.split(' ')[0], line.split(' ')[1]

                if mode == 'test':
                  index.append(ind)
                  words.append(word)
                if mode == 'train' or mode == 'dev':
                  index.append(ind)
                  words.append(word)
                  tags.append(tag)
                
    print(f"max sentence length = {maxlen}")
    return data

train_data = load_data('data/train', mode = 'train')
dev_data = load_data('data/dev', mode = 'dev')
test_data = load_data('/data/test', mode = 'test')

max sentence length = 113
max sentence length = 109
max sentence length = 124


In [21]:
# Experiment: Skip learning from sentence that has only O tags to ameliorate class imbalance
# Observation: Might work -> Can't vet reliability

# from collections import Counter
# from copy import deepcopy

# def augment_data(data):
#     count_only_O = 0
#     count_one_other = 0
#     for i, outer_tuple in enumerate(data):
#         tgs = outer_tuple[2]
#         tg_counter = Counter(tg for tg in tgs)
#         keys = list(tg_counter.keys())
#         if len(keys) == 1 and keys[0] == 'O':
#             count_only_O += 1
#             data.remove(outer_tuple)
#         else:
#             count_one_other += 1
#     print(f"Only O count: {count_only_O}")
#     print(f"One other O count: {count_one_other}")
#     return data
    
# augmented_train_data = augment_data(deepcopy(train_data))
# augmented_dev_data = augment_data(deepcopy(dev_data))

In [56]:
# Build vocabulary and word<->tag maps 

from collections import Counter
def build_vocab(data):
    word_counts = Counter(word for _, sentence, _ in data for word in sentence)
    filtered_dict = {key: value for key, value in word_counts.items()}
    vocabulary = ['<pad>', '<unk>'] + sorted(filtered_dict)
    word2idx = {word: idx for idx, word in enumerate(vocabulary)}
    return vocabulary, word2idx

def build_tag_map(data):
    tags = set(tag for _,_, tags in data for tag in tags)
    # tags.add('<pad>')
    tag2idx = {tag: idx for idx, tag in enumerate(sorted(tags))}
    return tag2idx

vocabulary, word2idx = build_vocab(train_data + dev_data + test_data)
tag2idx = build_tag_map(train_data + dev_data)

In [57]:
# Make a reverse word2idx -> idx2word. Needed for decoding after forward pass.
# reverse_word2idx = {v: k for k, v in word2idx.items()}

In [250]:
test_data[0]

(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'],
 ['SOCCER',
  '-',
  'JAPAN',
  'GET',
  'LUCKY',
  'WIN',
  ',',
  'CHINA',
  'IN',
  'SURPRISE',
  'DEFEAT',
  '.'],
 [])

In [268]:
#Make a wrapper class for our Dataset -- Take care of no tag in test data

import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from copy import deepcopy

class NERDataset(Dataset):
    def __init__(self, data, word2idx, tag2idx, mode = 'train', test_word2idx_untouched=None):
        self.data = data
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.maxlen=125
        self.mode = mode

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        mode = self.mode
        if mode == 'train' or mode == 'devtrain' or mode == 'dev_out' or mode == 'dev_perl':
          index, words, tags = self.data[idx]
        if mode == 'test_out':
          index, words, _ = self.data[idx]

        
        # print(words)
        temp = [self.word2idx.get(word, self.word2idx['<unk>']) for word in words]
        if len(temp) < self.maxlen:
            pad_arr = np.pad(temp, (0, self.maxlen-len(temp)), 'constant', constant_values=(-1, 0))
        elif len(temp) > self.maxlen:
            pad_arr = temp[:self.maxlen]
        else:
            pad_arr = temp
        x = torch.tensor(pad_arr)

        if mode == 'test_out' or mode =='dev_out' or mode == 'dev_perl':
#           words_untouched = deepcopy(words)
#           print(word)
          temp = [test_word2idx_untouched.get(word, -1) for word in words]
          # print(index)
          if len(temp) < self.maxlen:
              pad_arr = np.pad(temp, (0, self.maxlen-len(temp)), 'constant', constant_values=(-1, 0))
          elif len(temp) > self.maxlen:
              pad_arr = temp[:self.maxlen]
          else:
              pad_arr = temp
          k = torch.tensor(pad_arr)

        # print(words)
        # print(index)
        temp = [int(ind) for ind in index]
        if len(temp) < self.maxlen:
            pad_arr = np.pad(temp, (0, self.maxlen-len(temp)), 'constant', constant_values=(-1, 0))
        elif len(temp) > self.maxlen:
            pad_arr = temp[:self.maxlen]
        else:
            pad_arr = temp
        z = torch.tensor(pad_arr)

        if mode == 'train' or mode == 'devtrain' or mode =="dev_out" or mode == 'dev_perl':
          temp = [self.tag2idx[tag] for tag in tags]
          if len(temp) < self.maxlen:
              pad_arr = np.pad(temp, (0,self.maxlen-len(temp)), 'constant', constant_values=(-1, -1))
          elif len(temp) > self.maxlen:
              pad_arr = temp[:self.maxlen]
          else:
              pad_arr = temp
          y = torch.LongTensor(pad_arr)
        
        if mode == 'train' or mode == 'devtrain':
          return x, y
        # print(x, z, k)
        if mode == 'test_out' or mode == 'dev_out':
            return x, k, z
        if mode == 'dev_perl':
            return x, y, z, k


train_dataset = NERDataset(train_data, word2idx, tag2idx, mode = 'train')
dev_dataset = NERDataset(dev_data, word2idx, tag2idx, mode = 'devtrain')

batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)

In [59]:
tag2idx

{'B-LOC': 0,
 'B-MISC': 1,
 'B-ORG': 2,
 'B-PER': 3,
 'I-LOC': 4,
 'I-MISC': 5,
 'I-ORG': 6,
 'I-PER': 7,
 'O': 8}

In [60]:
#Define Model

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.init as init


class BLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout, pretrained_embedding = None):
        super(BLSTM, self).__init__()
        if pretrained_embedding:
            self.embedding = pretrained_embedding
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
#         init.xavier_uniform_(self.embedding.weight)
        self.blstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.dropout1 = nn.Dropout(dropout)
        self.linear1 = nn.Linear(hidden_dim * 2, 128)
        self.activation = nn.ELU()
        self.classifier = nn.Sequential(
            nn.Linear(128, 256),
            nn.Tanh(),
            nn.Dropout(0.2),
            nn.Linear(256, output_dim)
        )

    def forward(self, x):
        # print(x.shape)
        embedded = self.embedding(x)
        seq_lengths = torch.count_nonzero(x, dim=1).cpu()
        x = pack_padded_sequence(embedded, seq_lengths, batch_first=True, enforce_sorted=False)
        x, _ = self.blstm(x)
        x, unpacked_len = pad_packed_sequence(x, batch_first=True)
        x = self.dropout1(x)
        x = self.linear1(x)
        x = self.activation(x)
        x = self.classifier(x)
        return x

In [61]:
!pip install torchmetrics

[0m

In [62]:
# piecewise accuracy 
def accuracy(outputs, labels):
    acc = 0
    count = 0
    for i in range(outputs.shape[0]):
        sentence_pred = outputs[i]
        for j, word in enumerate(sentence_pred):
            word_pred = torch.argmax(word).item()
            label = labels[i][j].item()
            if label == -1:
                continue
            count += 1
            if word_pred == label:
                acc += 1
    return acc/count

#evaluate function for dev test during training
def evaluate(model, criterion, dataloader, device = 'cuda'):
  with torch.no_grad():
    dev_loss, dev_acc, dev_f1 = 0.0, 0.0, 0.0
    for batch_x, batch_y in tqdm(dataloader):
      batch_x = batch_x.to(device)
      batch_y = batch_y.to(device)
      outputs = model(batch_x)
      seq_lengths = torch.count_nonzero(batch_x, dim=1).to('cpu')
      packed_y = pack_padded_sequence(batch_y, seq_lengths, batch_first=True, enforce_sorted=False)
      unpacked_y, unpacked_len = pad_packed_sequence(packed_y, batch_first=True, padding_value=-1)
      unpacked_y = unpacked_y.to(device)
      loss = criterion(outputs.permute(0, 2, 1), unpacked_y)
      dev_loss += loss.item()
      out_for_f1 = torch.argmax(outputs, dim = -1)
      mask = (unpacked_y >= 0)
      f1 = f1_score(out_for_f1[mask].cpu(), unpacked_y[mask].cpu(), average='weighted')
      #-->costly operation, uncomment to see accuracy
#       acc = accuracy(outputs, batch_y) 
#       dev_acc += acc
      dev_f1 += f1
    dev_loss /= len(dataloader)
    dev_acc /= len(dataloader)
    dev_f1 /= len(dataloader)

    print(f"Average Dev Loss: {dev_loss}")
    print(f"Average Dev accuracy: {dev_acc}")
    print(f"Average Dev F1: {dev_f1}")
    return dev_loss, dev_f1


In [63]:
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

In [64]:
from sklearn.utils.class_weight import compute_class_weight 

def get_class_weights(data):
    all_y = []
    for data in data:
        all_y.extend(data[2])

    class_weights = compute_class_weight(
                                            class_weight = "balanced",
                                            classes = np.unique(all_y),
                                            y = all_y                                                    
                                        )
    class_weights=torch.tensor(class_weights,dtype=torch.float).to('cuda')
    return class_weights

class_weights = get_class_weights(train_data + dev_data)

In [65]:
# class_weights -= torch.min(class_weights)
class_weights

tensor([ 3.1704,  6.5276,  3.7145,  3.3713, 20.1275, 18.9609,  6.3884,  4.8775,
         0.1333], device='cuda:0')

In [66]:
from tqdm import tqdm
from sklearn.metrics import f1_score
from torchmetrics.functional.classification import multiclass_f1_score
from torch.optim.lr_scheduler import MultiStepLR


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BLSTM(len(vocabulary), embedding_dim=100, hidden_dim=256, output_dim=len(tag2idx), dropout=0.33).to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 1e-5, eps=1e-08, betas = (0.9, 0.999))
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01, momentum = 0.9, weight_decay = 1e-3)
# scheduler = MultiStepLR(optimizer, milestones=[3,5,7,9], gamma=0.1)
# scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.5, max_lr=1.2, step_size_up=20, step_size_down=None, mode='triangular', gamma=1.0)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', min_lr=1, verbose=True)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',factor=0.55, patience = 3, threshold=0.1, verbose=True, min_lr=5e-4)
# scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.5, total_iters=20)

# class_weights = torch.tensor([1,1,1,1,1,1,1,1,0.01], dtype=torch.float).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean', weight=class_weights)
    

def train(model, train_loader, optimizer, criterion, device, epochs):
    model.train()
    SAVE_PATH = "./best_model.pt"
    best_f1 = -1
    for epoch in range(epochs):
      print(f"Epoch: {epoch}")
      train_loss, train_acc, train_f1 = 0.0, 0.0, 0.0
      for batch_x, batch_y in tqdm(train_loader):
          batch_x = batch_x.to(device)
          batch_y = batch_y.to(device)
          outputs = model(batch_x)

          seq_lengths = torch.count_nonzero(batch_x, dim=1).to('cpu')
          packed_y = pack_padded_sequence(batch_y, seq_lengths, batch_first=True, enforce_sorted=False)
          unpacked_y, unpacked_len = pad_packed_sequence(packed_y, batch_first=True, padding_value = -1)
          unpacked_y = unpacked_y.to(device)
            
          loss = criterion(outputs.permute(0, 2, 1), unpacked_y)
          train_loss += loss.item()
  
#           mask = (unpacked_y >= 0) -->costly operation, uncomment to see accuracy, f1
#           acc = accuracy(outputs, batch_y)
#           out_for_f1 = torch.argmax(outputs, dim = -1)
#           f1 = f1_score(out_for_f1[mask].cpu(), unpacked_y[mask].cpu(), average='weighted')
#           train_acc += acc
#           train_f1 += f1

          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          # print(get_lr(optimizer))

      train_loss /= len(train_loader)
#       train_acc /= len(train_loader)
#       train_f1 /= len(train_loader)
      
#       print(f"Average train accuracy: {train_acc}")
#       print(f"Average train f1: {train_f1}")
      val_loss, val_f1 = evaluate(model, criterion, dev_loader)
      if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), SAVE_PATH)
      scheduler.step(val_loss)
      print(f"Average train Loss: {train_loss}")
      print(f"Current Learning Rate: {get_lr(optimizer)}")
      print(f"Best sklearn masked F1: {best_f1}")

    return model

model = train(model, train_loader, optimizer, criterion, device, epochs = 100)


Epoch: 0


100%|██████████| 1874/1874 [00:18<00:00, 101.18it/s]
100%|██████████| 434/434 [00:02<00:00, 164.55it/s]


Average Dev Loss: 1.4999754266804814
Average Dev accuracy: 0.0
Average Dev F1: 0.44007233480329083
Average train Loss: 1.7876404129135697
Current Learning Rate: 0.01
Best sklearn masked F1: 0.44007233480329083
Epoch: 1


100%|██████████| 1874/1874 [00:18<00:00, 103.24it/s]
100%|██████████| 434/434 [00:02<00:00, 168.88it/s]


Average Dev Loss: 1.2480844843634813
Average Dev accuracy: 0.0
Average Dev F1: 0.512941720119775
Average train Loss: 1.3169132836854827
Current Learning Rate: 0.01
Best sklearn masked F1: 0.512941720119775
Epoch: 2


100%|██████████| 1874/1874 [00:18<00:00, 99.47it/s] 
100%|██████████| 434/434 [00:02<00:00, 180.12it/s]


Average Dev Loss: 1.130530881613905
Average Dev accuracy: 0.0
Average Dev F1: 0.514520515226967
Average train Loss: 1.0797002766849901
Current Learning Rate: 0.01
Best sklearn masked F1: 0.514520515226967
Epoch: 3


100%|██████████| 1874/1874 [00:18<00:00, 103.54it/s]
100%|██████████| 434/434 [00:02<00:00, 153.28it/s]


Average Dev Loss: 1.064261501354556
Average Dev accuracy: 0.0
Average Dev F1: 0.5735050098465089
Average train Loss: 0.9368670036024319
Current Learning Rate: 0.01
Best sklearn masked F1: 0.5735050098465089
Epoch: 4


100%|██████████| 1874/1874 [00:18<00:00, 103.72it/s]
100%|██████████| 434/434 [00:02<00:00, 176.84it/s]


Average Dev Loss: 1.0281429839779705
Average Dev accuracy: 0.0
Average Dev F1: 0.6522480739744296
Average train Loss: 0.8510956046198322
Current Learning Rate: 0.01
Best sklearn masked F1: 0.6522480739744296
Epoch: 5


100%|██████████| 1874/1874 [00:18<00:00, 101.43it/s]
100%|██████████| 434/434 [00:02<00:00, 157.99it/s]


Average Dev Loss: 1.0095814675401706
Average Dev accuracy: 0.0
Average Dev F1: 0.494183893979238
Average train Loss: 0.7891013945354469
Current Learning Rate: 0.01
Best sklearn masked F1: 0.6522480739744296
Epoch: 6


100%|██████████| 1874/1874 [00:17<00:00, 104.99it/s]
100%|██████████| 434/434 [00:03<00:00, 129.39it/s]


Average Dev Loss: 0.9445313932159529
Average Dev accuracy: 0.0
Average Dev F1: 0.5514244565029169
Average train Loss: 0.7330168228449345
Current Learning Rate: 0.01
Best sklearn masked F1: 0.6522480739744296
Epoch: 7


100%|██████████| 1874/1874 [00:17<00:00, 105.17it/s]
100%|██████████| 434/434 [00:02<00:00, 167.72it/s]


Average Dev Loss: 0.844585098489295
Average Dev accuracy: 0.0
Average Dev F1: 0.6604513368750051
Average train Loss: 0.6564743758607636
Current Learning Rate: 0.01
Best sklearn masked F1: 0.6604513368750051
Epoch: 8


100%|██████████| 1874/1874 [00:18<00:00, 101.00it/s]
100%|██████████| 434/434 [00:02<00:00, 174.89it/s]


Average Dev Loss: 0.771380121909803
Average Dev accuracy: 0.0
Average Dev F1: 0.6502297873333864
Average train Loss: 0.5470623770647204
Current Learning Rate: 0.01
Best sklearn masked F1: 0.6604513368750051
Epoch: 9


100%|██████████| 1874/1874 [00:17<00:00, 104.61it/s]
100%|██████████| 434/434 [00:02<00:00, 155.50it/s]


Average Dev Loss: 0.7848207376519656
Average Dev accuracy: 0.0
Average Dev F1: 0.7484228179118905
Average train Loss: 0.45110162590871883
Current Learning Rate: 0.01
Best sklearn masked F1: 0.7484228179118905
Epoch: 10


100%|██████████| 1874/1874 [00:18<00:00, 102.63it/s]
100%|██████████| 434/434 [00:02<00:00, 176.01it/s]


Average Dev Loss: 0.7393187300349298
Average Dev accuracy: 0.0
Average Dev F1: 0.7963076131068119
Average train Loss: 0.35331375297882545
Current Learning Rate: 0.01
Best sklearn masked F1: 0.7963076131068119
Epoch: 11


100%|██████████| 1874/1874 [00:18<00:00, 100.04it/s]
100%|██████████| 434/434 [00:02<00:00, 175.39it/s]


Average Dev Loss: 0.6710721600311487
Average Dev accuracy: 0.0
Average Dev F1: 0.7839808407411788
Average train Loss: 0.2753413212659551
Current Learning Rate: 0.01
Best sklearn masked F1: 0.7963076131068119
Epoch: 12


100%|██████████| 1874/1874 [00:18<00:00, 104.02it/s]
100%|██████████| 434/434 [00:02<00:00, 174.52it/s]


Average Dev Loss: 0.6625001065793538
Average Dev accuracy: 0.0
Average Dev F1: 0.7980084265106325
Average train Loss: 0.21655556874890416
Current Learning Rate: 0.01
Best sklearn masked F1: 0.7980084265106325
Epoch: 13


100%|██████████| 1874/1874 [00:18<00:00, 99.33it/s] 
100%|██████████| 434/434 [00:02<00:00, 175.40it/s]


Average Dev Loss: 0.6193133681617712
Average Dev accuracy: 0.0
Average Dev F1: 0.8468065335447617
Average train Loss: 0.1635887921815629
Current Learning Rate: 0.01
Best sklearn masked F1: 0.8468065335447617
Epoch: 14


100%|██████████| 1874/1874 [00:18<00:00, 101.39it/s]
100%|██████████| 434/434 [00:02<00:00, 180.05it/s]


Average Dev Loss: 0.6622465192361774
Average Dev accuracy: 0.0
Average Dev F1: 0.8832944068887653
Average train Loss: 0.13550277006600683
Current Learning Rate: 0.01
Best sklearn masked F1: 0.8832944068887653
Epoch: 15


100%|██████████| 1874/1874 [00:17<00:00, 106.81it/s]
100%|██████████| 434/434 [00:02<00:00, 159.56it/s]


Average Dev Loss: 0.7985770021552383
Average Dev accuracy: 0.0
Average Dev F1: 0.8879235069796121
Average train Loss: 0.14454920303817986
Current Learning Rate: 0.01
Best sklearn masked F1: 0.8879235069796121
Epoch: 16


100%|██████████| 1874/1874 [00:18<00:00, 101.65it/s]
100%|██████████| 434/434 [00:02<00:00, 161.30it/s]


Average Dev Loss: 0.5892526331542206
Average Dev accuracy: 0.0
Average Dev F1: 0.907269602116235
Average train Loss: 0.14065833947310183
Current Learning Rate: 0.01
Best sklearn masked F1: 0.907269602116235
Epoch: 17


100%|██████████| 1874/1874 [00:18<00:00, 102.43it/s]
100%|██████████| 434/434 [00:02<00:00, 164.95it/s]


Average Dev Loss: 0.5404795522808016
Average Dev accuracy: 0.0
Average Dev F1: 0.8882028142478163
Average train Loss: 0.12336549964825859
Current Learning Rate: 0.01
Best sklearn masked F1: 0.907269602116235
Epoch: 18


100%|██████████| 1874/1874 [00:17<00:00, 106.70it/s]
100%|██████████| 434/434 [00:02<00:00, 180.18it/s]


Average Dev Loss: 0.5522902141140628
Average Dev accuracy: 0.0
Average Dev F1: 0.8207288538771875
Average train Loss: 0.12209372708212143
Current Learning Rate: 0.01
Best sklearn masked F1: 0.907269602116235
Epoch: 19


100%|██████████| 1874/1874 [00:18<00:00, 99.24it/s] 
100%|██████████| 434/434 [00:02<00:00, 177.98it/s]


Average Dev Loss: 0.5559448633999629
Average Dev accuracy: 0.0
Average Dev F1: 0.8941424735937628
Average train Loss: 0.12718491694714035
Current Learning Rate: 0.01
Best sklearn masked F1: 0.907269602116235
Epoch: 20


100%|██████████| 1874/1874 [00:18<00:00, 100.62it/s]
100%|██████████| 434/434 [00:02<00:00, 181.38it/s]


Average Dev Loss: 0.5652647981847194
Average Dev accuracy: 0.0
Average Dev F1: 0.8848081275531343
Epoch 00021: reducing learning rate of group 0 to 5.5000e-03.
Average train Loss: 0.138814229696836
Current Learning Rate: 0.0055000000000000005
Best sklearn masked F1: 0.907269602116235
Epoch: 21


100%|██████████| 1874/1874 [00:17<00:00, 104.85it/s]
100%|██████████| 434/434 [00:02<00:00, 179.44it/s]


Average Dev Loss: 0.5171162204394194
Average Dev accuracy: 0.0
Average Dev F1: 0.9291542311817061
Average train Loss: 0.07150061884020299
Current Learning Rate: 0.0055000000000000005
Best sklearn masked F1: 0.9291542311817061
Epoch: 22


100%|██████████| 1874/1874 [00:18<00:00, 101.42it/s]
100%|██████████| 434/434 [00:02<00:00, 178.90it/s]


Average Dev Loss: 0.4627846716742207
Average Dev accuracy: 0.0
Average Dev F1: 0.9084329461422918
Average train Loss: 0.054476069276401994
Current Learning Rate: 0.0055000000000000005
Best sklearn masked F1: 0.9291542311817061
Epoch: 23


100%|██████████| 1874/1874 [00:18<00:00, 102.25it/s]
100%|██████████| 434/434 [00:02<00:00, 181.07it/s]


Average Dev Loss: 0.5892483890523345
Average Dev accuracy: 0.0
Average Dev F1: 0.9432747507853879
Average train Loss: 0.05017059958955371
Current Learning Rate: 0.0055000000000000005
Best sklearn masked F1: 0.9432747507853879
Epoch: 24


100%|██████████| 1874/1874 [00:17<00:00, 107.00it/s]
100%|██████████| 434/434 [00:02<00:00, 155.65it/s]


Average Dev Loss: 0.5732304112683158
Average Dev accuracy: 0.0
Average Dev F1: 0.9248346605465257
Average train Loss: 0.05114158124338911
Current Learning Rate: 0.0055000000000000005
Best sklearn masked F1: 0.9432747507853879
Epoch: 25


100%|██████████| 1874/1874 [00:18<00:00, 101.53it/s]
100%|██████████| 434/434 [00:02<00:00, 162.66it/s]


Average Dev Loss: 0.6417392429440767
Average Dev accuracy: 0.0
Average Dev F1: 0.9087040009811914
Average train Loss: 0.06948207600068067
Current Learning Rate: 0.0055000000000000005
Best sklearn masked F1: 0.9432747507853879
Epoch: 26


100%|██████████| 1874/1874 [00:18<00:00, 103.50it/s]
100%|██████████| 434/434 [00:02<00:00, 180.20it/s]


Average Dev Loss: 0.5226940676604274
Average Dev accuracy: 0.0
Average Dev F1: 0.8890601953579155
Epoch 00027: reducing learning rate of group 0 to 3.0250e-03.
Average train Loss: 0.06622153572020545
Current Learning Rate: 0.0030250000000000003
Best sklearn masked F1: 0.9432747507853879
Epoch: 27


100%|██████████| 1874/1874 [00:17<00:00, 105.62it/s]
100%|██████████| 434/434 [00:02<00:00, 181.82it/s]


Average Dev Loss: 0.5103786404133015
Average Dev accuracy: 0.0
Average Dev F1: 0.9467638182087739
Average train Loss: 0.04504686199812459
Current Learning Rate: 0.0030250000000000003
Best sklearn masked F1: 0.9467638182087739
Epoch: 28


100%|██████████| 1874/1874 [00:18<00:00, 100.16it/s]
100%|██████████| 434/434 [00:02<00:00, 178.98it/s]


Average Dev Loss: 0.5526041169092918
Average Dev accuracy: 0.0
Average Dev F1: 0.9501818741224434
Average train Loss: 0.035685574441547865
Current Learning Rate: 0.0030250000000000003
Best sklearn masked F1: 0.9501818741224434
Epoch: 29


100%|██████████| 1874/1874 [00:18<00:00, 101.73it/s]
100%|██████████| 434/434 [00:02<00:00, 179.84it/s]


Average Dev Loss: 0.4964179217499951
Average Dev accuracy: 0.0
Average Dev F1: 0.9485634963502978
Average train Loss: 0.034402520647158166
Current Learning Rate: 0.0030250000000000003
Best sklearn masked F1: 0.9501818741224434
Epoch: 30


100%|██████████| 1874/1874 [00:18<00:00, 103.35it/s]
100%|██████████| 434/434 [00:02<00:00, 175.67it/s]


Average Dev Loss: 0.5539881996107653
Average Dev accuracy: 0.0
Average Dev F1: 0.9545033819255129
Epoch 00031: reducing learning rate of group 0 to 1.6638e-03.
Average train Loss: 0.035568564326632586
Current Learning Rate: 0.0016637500000000003
Best sklearn masked F1: 0.9545033819255129
Epoch: 31


100%|██████████| 1874/1874 [00:18<00:00, 101.07it/s]
100%|██████████| 434/434 [00:02<00:00, 181.84it/s]


Average Dev Loss: 0.49398434677532516
Average Dev accuracy: 0.0
Average Dev F1: 0.951606439518711
Average train Loss: 0.029350286822148828
Current Learning Rate: 0.0016637500000000003
Best sklearn masked F1: 0.9545033819255129
Epoch: 32


100%|██████████| 1874/1874 [00:17<00:00, 105.25it/s]
100%|██████████| 434/434 [00:02<00:00, 146.34it/s]


Average Dev Loss: 0.48585651505712746
Average Dev accuracy: 0.0
Average Dev F1: 0.945636939646246
Average train Loss: 0.03005341692341108
Current Learning Rate: 0.0016637500000000003
Best sklearn masked F1: 0.9545033819255129
Epoch: 33


100%|██████████| 1874/1874 [00:18<00:00, 99.12it/s] 
100%|██████████| 434/434 [00:02<00:00, 169.24it/s]


Average Dev Loss: 0.5040041729142624
Average Dev accuracy: 0.0
Average Dev F1: 0.9518190275437689
Average train Loss: 0.02841280624103954
Current Learning Rate: 0.0016637500000000003
Best sklearn masked F1: 0.9545033819255129
Epoch: 34


100%|██████████| 1874/1874 [00:18<00:00, 101.21it/s]
100%|██████████| 434/434 [00:02<00:00, 160.56it/s]


Average Dev Loss: 0.5433510855201864
Average Dev accuracy: 0.0
Average Dev F1: 0.9586137989939412
Epoch 00035: reducing learning rate of group 0 to 9.1506e-04.
Average train Loss: 0.028029488994498423
Current Learning Rate: 0.0009150625000000002
Best sklearn masked F1: 0.9586137989939412
Epoch: 35


100%|██████████| 1874/1874 [00:17<00:00, 106.02it/s]
100%|██████████| 434/434 [00:03<00:00, 135.84it/s]


Average Dev Loss: 0.5190065345981197
Average Dev accuracy: 0.0
Average Dev F1: 0.9553085519788052
Average train Loss: 0.026563351132768816
Current Learning Rate: 0.0009150625000000002
Best sklearn masked F1: 0.9586137989939412
Epoch: 36


100%|██████████| 1874/1874 [00:18<00:00, 103.94it/s]
100%|██████████| 434/434 [00:02<00:00, 174.30it/s]


Average Dev Loss: 0.49420831729024906
Average Dev accuracy: 0.0
Average Dev F1: 0.95154073362952
Average train Loss: 0.026031320380258672
Current Learning Rate: 0.0009150625000000002
Best sklearn masked F1: 0.9586137989939412
Epoch: 37


100%|██████████| 1874/1874 [00:18<00:00, 101.32it/s]
100%|██████████| 434/434 [00:02<00:00, 179.88it/s]


Average Dev Loss: 0.4954144683508803
Average Dev accuracy: 0.0
Average Dev F1: 0.9538095810003172
Average train Loss: 0.02592437876735813
Current Learning Rate: 0.0009150625000000002
Best sklearn masked F1: 0.9586137989939412
Epoch: 38


100%|██████████| 1874/1874 [00:17<00:00, 105.22it/s]
100%|██████████| 434/434 [00:02<00:00, 147.49it/s]


Average Dev Loss: 0.49609620673721444
Average Dev accuracy: 0.0
Average Dev F1: 0.9536176858206864
Epoch 00039: reducing learning rate of group 0 to 5.0328e-04.
Average train Loss: 0.02558693283234038
Current Learning Rate: 0.0005032843750000001
Best sklearn masked F1: 0.9586137989939412
Epoch: 39


100%|██████████| 1874/1874 [00:18<00:00, 103.63it/s]
100%|██████████| 434/434 [00:02<00:00, 176.07it/s]


Average Dev Loss: 0.5213525064862526
Average Dev accuracy: 0.0
Average Dev F1: 0.9564477498291976
Average train Loss: 0.02483578140423966
Current Learning Rate: 0.0005032843750000001
Best sklearn masked F1: 0.9586137989939412
Epoch: 40


100%|██████████| 1874/1874 [00:18<00:00, 101.07it/s]
100%|██████████| 434/434 [00:02<00:00, 179.10it/s]


Average Dev Loss: 0.5076256749903663
Average Dev accuracy: 0.0
Average Dev F1: 0.9564566381864565
Average train Loss: 0.024498249280138246
Current Learning Rate: 0.0005032843750000001
Best sklearn masked F1: 0.9586137989939412
Epoch: 41


100%|██████████| 1874/1874 [00:17<00:00, 105.08it/s]
100%|██████████| 434/434 [00:02<00:00, 177.73it/s]


Average Dev Loss: 0.5131901696990652
Average Dev accuracy: 0.0
Average Dev F1: 0.9553126683767884
Average train Loss: 0.024834927340314436
Current Learning Rate: 0.0005032843750000001
Best sklearn masked F1: 0.9586137989939412
Epoch: 42


100%|██████████| 1874/1874 [00:18<00:00, 100.61it/s]
100%|██████████| 434/434 [00:02<00:00, 175.12it/s]


Average Dev Loss: 0.5312311725201695
Average Dev accuracy: 0.0
Average Dev F1: 0.9558648858779323
Epoch 00043: reducing learning rate of group 0 to 5.0000e-04.
Average train Loss: 0.024787299041666107
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 43


100%|██████████| 1874/1874 [00:18<00:00, 101.72it/s]
100%|██████████| 434/434 [00:02<00:00, 160.21it/s]


Average Dev Loss: 0.5243587680171728
Average Dev accuracy: 0.0
Average Dev F1: 0.9571033076660588
Average train Loss: 0.024805034804003853
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 44


100%|██████████| 1874/1874 [00:17<00:00, 106.36it/s]
100%|██████████| 434/434 [00:02<00:00, 163.22it/s]


Average Dev Loss: 0.5318925149989859
Average Dev accuracy: 0.0
Average Dev F1: 0.9573924693575294
Average train Loss: 0.02508204504402738
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 45


100%|██████████| 1874/1874 [00:18<00:00, 102.26it/s]
100%|██████████| 434/434 [00:02<00:00, 179.99it/s]


Average Dev Loss: 0.49632678950561665
Average Dev accuracy: 0.0
Average Dev F1: 0.953731828185093
Average train Loss: 0.024735132924339702
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 46


100%|██████████| 1874/1874 [00:18<00:00, 101.33it/s]
100%|██████████| 434/434 [00:02<00:00, 183.24it/s]


Average Dev Loss: 0.5090225390544857
Average Dev accuracy: 0.0
Average Dev F1: 0.9569594446322426
Average train Loss: 0.02500309744748419
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 47


100%|██████████| 1874/1874 [00:17<00:00, 105.46it/s]
100%|██████████| 434/434 [00:02<00:00, 181.99it/s]


Average Dev Loss: 0.49623678520070774
Average Dev accuracy: 0.0
Average Dev F1: 0.9549238258694712
Average train Loss: 0.02471084234910036
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 48


100%|██████████| 1874/1874 [00:18<00:00, 100.47it/s]
100%|██████████| 434/434 [00:02<00:00, 177.08it/s]


Average Dev Loss: 0.5238808714613009
Average Dev accuracy: 0.0
Average Dev F1: 0.9566368366099411
Average train Loss: 0.024758686950398467
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 49


100%|██████████| 1874/1874 [00:18<00:00, 102.40it/s]
100%|██████████| 434/434 [00:02<00:00, 179.27it/s]


Average Dev Loss: 0.5241402261852918
Average Dev accuracy: 0.0
Average Dev F1: 0.9574215187258607
Average train Loss: 0.02473101542771594
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 50


100%|██████████| 1874/1874 [00:17<00:00, 105.90it/s]
100%|██████████| 434/434 [00:02<00:00, 176.59it/s]


Average Dev Loss: 0.5308078815516788
Average Dev accuracy: 0.0
Average Dev F1: 0.9575341031359015
Average train Loss: 0.024617549944162098
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 51


100%|██████████| 1874/1874 [00:18<00:00, 102.45it/s]
100%|██████████| 434/434 [00:02<00:00, 160.53it/s]


Average Dev Loss: 0.5196646113599939
Average Dev accuracy: 0.0
Average Dev F1: 0.9577203229309822
Average train Loss: 0.02491253394926247
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 52


100%|██████████| 1874/1874 [00:18<00:00, 104.03it/s]
100%|██████████| 434/434 [00:02<00:00, 165.94it/s]


Average Dev Loss: 0.5099104048988958
Average Dev accuracy: 0.0
Average Dev F1: 0.9553601982675807
Average train Loss: 0.024989594885306145
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 53


100%|██████████| 1874/1874 [00:17<00:00, 104.54it/s]
100%|██████████| 434/434 [00:02<00:00, 178.28it/s]


Average Dev Loss: 0.5168288928410467
Average Dev accuracy: 0.0
Average Dev F1: 0.9558663139144794
Average train Loss: 0.025219468833737498
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 54


100%|██████████| 1874/1874 [00:18<00:00, 100.36it/s]
100%|██████████| 434/434 [00:02<00:00, 179.88it/s]


Average Dev Loss: 0.5099845534943748
Average Dev accuracy: 0.0
Average Dev F1: 0.9547802808141241
Average train Loss: 0.02542801315646734
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 55


100%|██████████| 1874/1874 [00:18<00:00, 102.05it/s]
100%|██████████| 434/434 [00:02<00:00, 177.23it/s]


Average Dev Loss: 0.5166763762990656
Average Dev accuracy: 0.0
Average Dev F1: 0.9556775408476218
Average train Loss: 0.025246096993851195
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 56


100%|██████████| 1874/1874 [00:17<00:00, 104.63it/s]
100%|██████████| 434/434 [00:02<00:00, 178.09it/s]


Average Dev Loss: 0.5008210004108834
Average Dev accuracy: 0.0
Average Dev F1: 0.9529033592297445
Average train Loss: 0.02522931516164538
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 57


100%|██████████| 1874/1874 [00:18<00:00, 101.53it/s]
100%|██████████| 434/434 [00:02<00:00, 180.56it/s]


Average Dev Loss: 0.5267910385495781
Average Dev accuracy: 0.0
Average Dev F1: 0.9577082910029601
Average train Loss: 0.025097342479870662
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 58


100%|██████████| 1874/1874 [00:18<00:00, 104.01it/s]
100%|██████████| 434/434 [00:03<00:00, 141.64it/s]


Average Dev Loss: 0.5310622220537046
Average Dev accuracy: 0.0
Average Dev F1: 0.9547201614518014
Average train Loss: 0.025013475653391407
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 59


100%|██████████| 1874/1874 [00:18<00:00, 104.08it/s]
100%|██████████| 434/434 [00:02<00:00, 178.40it/s]


Average Dev Loss: 0.5386066934767647
Average Dev accuracy: 0.0
Average Dev F1: 0.9584973398782425
Average train Loss: 0.02450032528761174
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 60


100%|██████████| 1874/1874 [00:18<00:00, 101.22it/s]
100%|██████████| 434/434 [00:02<00:00, 157.91it/s]


Average Dev Loss: 0.5345784968713702
Average Dev accuracy: 0.0
Average Dev F1: 0.9567098578965825
Average train Loss: 0.02465295808274486
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 61


100%|██████████| 1874/1874 [00:17<00:00, 106.57it/s]
100%|██████████| 434/434 [00:03<00:00, 137.09it/s]


Average Dev Loss: 0.5366326287669915
Average Dev accuracy: 0.0
Average Dev F1: 0.9583609051780319
Average train Loss: 0.024963835691148566
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 62


100%|██████████| 1874/1874 [00:17<00:00, 106.53it/s]
100%|██████████| 434/434 [00:02<00:00, 174.65it/s]


Average Dev Loss: 0.50486313750697
Average Dev accuracy: 0.0
Average Dev F1: 0.9546890089421761
Average train Loss: 0.02580907455243701
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 63


100%|██████████| 1874/1874 [00:18<00:00, 101.27it/s]
100%|██████████| 434/434 [00:02<00:00, 180.74it/s]


Average Dev Loss: 0.5082594714205807
Average Dev accuracy: 0.0
Average Dev F1: 0.9528154598657427
Average train Loss: 0.024451355225100453
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9586137989939412
Epoch: 64


100%|██████████| 1874/1874 [00:17<00:00, 104.38it/s]
100%|██████████| 434/434 [00:02<00:00, 179.60it/s]


Average Dev Loss: 0.5458326456823691
Average Dev accuracy: 0.0
Average Dev F1: 0.9589052561006882
Average train Loss: 0.025193837691153614
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9589052561006882
Epoch: 65


100%|██████████| 1874/1874 [00:18<00:00, 101.08it/s]
100%|██████████| 434/434 [00:02<00:00, 176.66it/s]


Average Dev Loss: 0.5331980920287304
Average Dev accuracy: 0.0
Average Dev F1: 0.9569542380807249
Average train Loss: 0.024823872767599536
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9589052561006882
Epoch: 66


100%|██████████| 1874/1874 [00:18<00:00, 100.47it/s]
100%|██████████| 434/434 [00:02<00:00, 174.46it/s]


Average Dev Loss: 0.536122080606563
Average Dev accuracy: 0.0
Average Dev F1: 0.9571795386263144
Average train Loss: 0.024865301093335086
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9589052561006882
Epoch: 67


100%|██████████| 1874/1874 [00:18<00:00, 102.53it/s]
100%|██████████| 434/434 [00:02<00:00, 178.58it/s]


Average Dev Loss: 0.5141477736903028
Average Dev accuracy: 0.0
Average Dev F1: 0.9556949433024174
Average train Loss: 0.02473770835801657
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9589052561006882
Epoch: 68


100%|██████████| 1874/1874 [00:18<00:00, 102.05it/s]
100%|██████████| 434/434 [00:02<00:00, 179.41it/s]


Average Dev Loss: 0.5362068440181457
Average Dev accuracy: 0.0
Average Dev F1: 0.9570537601646297
Average train Loss: 0.024837046356229638
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9589052561006882
Epoch: 69


100%|██████████| 1874/1874 [00:18<00:00, 102.47it/s]
100%|██████████| 434/434 [00:02<00:00, 159.80it/s]


Average Dev Loss: 0.5593087645875255
Average Dev accuracy: 0.0
Average Dev F1: 0.9594512111891867
Average train Loss: 0.024850876016265008
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9594512111891867
Epoch: 70


100%|██████████| 1874/1874 [00:17<00:00, 105.13it/s]
100%|██████████| 434/434 [00:02<00:00, 162.56it/s]


Average Dev Loss: 0.5634360502260493
Average Dev accuracy: 0.0
Average Dev F1: 0.959490004714848
Average train Loss: 0.024885634519954947
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.959490004714848
Epoch: 71


100%|██████████| 1874/1874 [00:18<00:00, 103.11it/s]
100%|██████████| 434/434 [00:02<00:00, 179.39it/s]


Average Dev Loss: 0.5255306155596804
Average Dev accuracy: 0.0
Average Dev F1: 0.9548675633902068
Average train Loss: 0.02466592366688884
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.959490004714848
Epoch: 72


100%|██████████| 1874/1874 [00:18<00:00, 101.02it/s]
100%|██████████| 434/434 [00:02<00:00, 180.66it/s]


Average Dev Loss: 0.532541634712351
Average Dev accuracy: 0.0
Average Dev F1: 0.9572757674959286
Average train Loss: 0.025095726458282234
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.959490004714848
Epoch: 73


100%|██████████| 1874/1874 [00:18<00:00, 103.20it/s]
100%|██████████| 434/434 [00:02<00:00, 184.43it/s]


Average Dev Loss: 0.5363046515905749
Average Dev accuracy: 0.0
Average Dev F1: 0.9582060505625389
Average train Loss: 0.025159406310852853
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.959490004714848
Epoch: 74


100%|██████████| 1874/1874 [00:18<00:00, 101.32it/s]
100%|██████████| 434/434 [00:02<00:00, 178.20it/s]


Average Dev Loss: 0.5759180517296397
Average Dev accuracy: 0.0
Average Dev F1: 0.9596342079482945
Average train Loss: 0.02470302767803139
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 75


100%|██████████| 1874/1874 [00:18<00:00, 100.65it/s]
100%|██████████| 434/434 [00:02<00:00, 177.40it/s]


Average Dev Loss: 0.515766860412923
Average Dev accuracy: 0.0
Average Dev F1: 0.9557625637096162
Average train Loss: 0.0253076410309184
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 76


100%|██████████| 1874/1874 [00:18<00:00, 101.07it/s]
100%|██████████| 434/434 [00:02<00:00, 183.84it/s]


Average Dev Loss: 0.5058564495945496
Average Dev accuracy: 0.0
Average Dev F1: 0.954624001987343
Average train Loss: 0.02454933798881874
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 77


100%|██████████| 1874/1874 [00:18<00:00, 100.05it/s]
100%|██████████| 434/434 [00:02<00:00, 173.22it/s]


Average Dev Loss: 0.5366435028389439
Average Dev accuracy: 0.0
Average Dev F1: 0.9582630255579149
Average train Loss: 0.024738387053876144
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 78


100%|██████████| 1874/1874 [00:18<00:00, 100.68it/s]
100%|██████████| 434/434 [00:02<00:00, 176.78it/s]


Average Dev Loss: 0.5462962718758827
Average Dev accuracy: 0.0
Average Dev F1: 0.9579958611852926
Average train Loss: 0.02477162748487247
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 79


100%|██████████| 1874/1874 [00:17<00:00, 105.17it/s]
100%|██████████| 434/434 [00:02<00:00, 163.31it/s]


Average Dev Loss: 0.5215105217063911
Average Dev accuracy: 0.0
Average Dev F1: 0.9567501021629387
Average train Loss: 0.0247328689071154
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 80


100%|██████████| 1874/1874 [00:18<00:00, 102.15it/s]
100%|██████████| 434/434 [00:02<00:00, 159.14it/s]


Average Dev Loss: 0.5339030304141471
Average Dev accuracy: 0.0
Average Dev F1: 0.957063506286473
Average train Loss: 0.024599326225449882
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 81


100%|██████████| 1874/1874 [00:18<00:00, 101.63it/s]
100%|██████████| 434/434 [00:02<00:00, 177.27it/s]


Average Dev Loss: 0.5115180816286788
Average Dev accuracy: 0.0
Average Dev F1: 0.9543443452601892
Average train Loss: 0.024741769144848276
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 82


100%|██████████| 1874/1874 [00:17<00:00, 104.17it/s]
100%|██████████| 434/434 [00:02<00:00, 177.33it/s]


Average Dev Loss: 0.5286455220062165
Average Dev accuracy: 0.0
Average Dev F1: 0.9528276362742669
Average train Loss: 0.025111017062894307
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 83


100%|██████████| 1874/1874 [00:18<00:00, 100.92it/s]
100%|██████████| 434/434 [00:02<00:00, 178.76it/s]


Average Dev Loss: 0.5256873474543279
Average Dev accuracy: 0.0
Average Dev F1: 0.9551385479561869
Average train Loss: 0.0248981564889166
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 84


100%|██████████| 1874/1874 [00:18<00:00, 100.10it/s]
100%|██████████| 434/434 [00:02<00:00, 179.48it/s]


Average Dev Loss: 0.5307582686826896
Average Dev accuracy: 0.0
Average Dev F1: 0.9557963199446208
Average train Loss: 0.02456116761312509
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 85


100%|██████████| 1874/1874 [00:17<00:00, 105.35it/s]
100%|██████████| 434/434 [00:02<00:00, 181.84it/s]


Average Dev Loss: 0.5198615184987248
Average Dev accuracy: 0.0
Average Dev F1: 0.9542208481757558
Average train Loss: 0.024724854513199696
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 86


100%|██████████| 1874/1874 [00:18<00:00, 101.39it/s]
100%|██████████| 434/434 [00:02<00:00, 174.21it/s]


Average Dev Loss: 0.5309718994204865
Average Dev accuracy: 0.0
Average Dev F1: 0.956586528252427
Average train Loss: 0.02525987066507435
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 87


100%|██████████| 1874/1874 [00:18<00:00, 101.52it/s]
100%|██████████| 434/434 [00:02<00:00, 155.42it/s]


Average Dev Loss: 0.5504928594143966
Average Dev accuracy: 0.0
Average Dev F1: 0.9580298634519724
Average train Loss: 0.024504284365967376
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 88


100%|██████████| 1874/1874 [00:17<00:00, 105.16it/s]
100%|██████████| 434/434 [00:02<00:00, 159.18it/s]


Average Dev Loss: 0.5345790118653937
Average Dev accuracy: 0.0
Average Dev F1: 0.9570234944763346
Average train Loss: 0.02471729672750957
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 89


100%|██████████| 1874/1874 [00:18<00:00, 102.15it/s]
100%|██████████| 434/434 [00:02<00:00, 158.80it/s]


Average Dev Loss: 0.5237683891275821
Average Dev accuracy: 0.0
Average Dev F1: 0.9558378324719145
Average train Loss: 0.024890618644413224
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 90


100%|██████████| 1874/1874 [00:17<00:00, 105.47it/s]
100%|██████████| 434/434 [00:03<00:00, 141.05it/s]


Average Dev Loss: 0.5708617403232041
Average Dev accuracy: 0.0
Average Dev F1: 0.958722920423951
Average train Loss: 0.02426586563553832
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 91


100%|██████████| 1874/1874 [00:17<00:00, 105.75it/s]
100%|██████████| 434/434 [00:02<00:00, 179.31it/s]


Average Dev Loss: 0.5128031480539169
Average Dev accuracy: 0.0
Average Dev F1: 0.9548022320883662
Average train Loss: 0.024657135480207766
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 92


100%|██████████| 1874/1874 [00:18<00:00, 100.66it/s]
100%|██████████| 434/434 [00:02<00:00, 173.29it/s]


Average Dev Loss: 0.5367378963255173
Average Dev accuracy: 0.0
Average Dev F1: 0.9573136030497559
Average train Loss: 0.024197830812070765
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 93


100%|██████████| 1874/1874 [00:18<00:00, 103.34it/s]
100%|██████████| 434/434 [00:02<00:00, 151.03it/s]


Average Dev Loss: 0.5108109898151709
Average Dev accuracy: 0.0
Average Dev F1: 0.9499535725712688
Average train Loss: 0.02485531952188389
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 94


100%|██████████| 1874/1874 [00:18<00:00, 101.80it/s]
100%|██████████| 434/434 [00:02<00:00, 177.79it/s]


Average Dev Loss: 0.527703418314869
Average Dev accuracy: 0.0
Average Dev F1: 0.9566209490684192
Average train Loss: 0.02436251421245322
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 95


100%|██████████| 1874/1874 [00:18<00:00, 99.43it/s] 
100%|██████████| 434/434 [00:02<00:00, 174.03it/s]


Average Dev Loss: 0.5247278810775955
Average Dev accuracy: 0.0
Average Dev F1: 0.9539465449150528
Average train Loss: 0.025110982044991563
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 96


100%|██████████| 1874/1874 [00:18<00:00, 103.45it/s]
100%|██████████| 434/434 [00:02<00:00, 160.55it/s]


Average Dev Loss: 0.5426721245561156
Average Dev accuracy: 0.0
Average Dev F1: 0.9574477554482997
Average train Loss: 0.02484619014177336
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 97


100%|██████████| 1874/1874 [00:18<00:00, 102.49it/s]
100%|██████████| 434/434 [00:02<00:00, 178.45it/s]


Average Dev Loss: 0.5446643792452239
Average Dev accuracy: 0.0
Average Dev F1: 0.9574796125732961
Average train Loss: 0.02480737278372121
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 98


100%|██████████| 1874/1874 [00:18<00:00, 101.17it/s]
100%|██████████| 434/434 [00:02<00:00, 156.27it/s]


Average Dev Loss: 0.5294855833879762
Average Dev accuracy: 0.0
Average Dev F1: 0.9546081339872722
Average train Loss: 0.024253755565230777
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945
Epoch: 99


100%|██████████| 1874/1874 [00:17<00:00, 104.61it/s]
100%|██████████| 434/434 [00:02<00:00, 160.47it/s]

Average Dev Loss: 0.5564461686929542
Average Dev accuracy: 0.0
Average Dev F1: 0.9567873982440075
Average train Loss: 0.024438474007152624
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9596342079482945





In [86]:
from IPython.display import FileLink
FileLink(r'./best_model.pt')


In [143]:
SAVE_PATH = "best_model.pt"
model = BLSTM(len(vocabulary), embedding_dim=100, hidden_dim=256, output_dim=len(tag2idx), dropout=0.33).to(device)
model.load_state_dict(torch.load(SAVE_PATH))
model.eval()

BLSTM(
  (embedding): Embedding(30292, 100, padding_idx=0)
  (blstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (dropout1): Dropout(p=0.33, inplace=False)
  (linear1): Linear(in_features=512, out_features=128, bias=True)
  (activation): ELU(alpha=1.0)
  (classifier): Sequential(
    (0): Linear(in_features=128, out_features=256, bias=True)
    (1): Tanh()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=256, out_features=9, bias=True)
  )
)

In [144]:
evaluate(model, criterion, dev_loader)

100%|██████████| 434/434 [00:02<00:00, 174.39it/s]

Average Dev Loss: 0.5625147515669688
Average Dev accuracy: 0.0
Average Dev F1: 0.9627304227974504





(0.5625147515669688, 0.9627304227974504)

In [245]:
def build_test_vocab(data):
    # print(data)
    word_counts = Counter(word for _, sentence, _ in data for word in sentence)
    vocabulary = ['<pad>', '<unk>'] + sorted(word_counts)
    word2idx = {word: idx for idx, word in enumerate(vocabulary)}
    return vocabulary, word2idx

_, test_word2idx_untouched = build_test_vocab(train_data+dev_data+test_data)
# print(test_word2idx_untouched)

In [240]:
def make_dev_for_perl_helper(idx2tag, reverse_test_word2idx_untouched, outputs, batch_x_untouched, batch_gold, batch_ind, file):
    acc = 0
    for i in range(outputs.shape[0]):
        sentence_pred = outputs[i]
        one_x = batch_x_untouched[i]
        sentence_index = batch_ind[i]
        goldens = batch_gold[i]
        for j, word_probs in enumerate(sentence_pred):
            word_pred = torch.argmax(word_probs).item()
            # tag = tag2idx[str(word_pred)]
            # print(sentence_index)
            index = sentence_index[j].item()
            # with open("")
            if index == 0:
              break
            word = reverse_test_word2idx_untouched[one_x[j].item()]
            tag = idx2tag[word_pred]
            gold = goldens[j].item()
            gold = idx2tag[gold]

            if index == 1:
              # print('\n')
              file.write('\n')
            file.write(str(index) + ' ' + word + ' ' + gold+ ' ' + tag + '\n')
            # print(index, word, tag)

def make_dev_for_perl(model, idx2tag, reverse_test_word2idx_untouched, dataloader, file_name, device):
  with torch.no_grad():
    with open(file_name, 'w') as file:
      dev_acc = 0.0
      for batch_x, batch_gold, batch_ind, batch_x_untouched in tqdm(dataloader):
        batch_x = batch_x.to(device)
        batch_gold = batch_gold.to(device)
#         print(batch_x.shape)
        outputs = model(batch_x)
        make_dev_for_perl_helper(idx2tag, reverse_test_word2idx_untouched, outputs, batch_x_untouched, batch_gold, batch_ind, file)
        acc = accuracy(outputs, batch_gold)
        dev_acc += acc
    dev_acc /= len(dataloader)
    print(f"Average accuracy: {dev_acc}")

In [241]:
def make_output_file_helper(idx2tag, reverse_test_word2idx_untouched, outputs, batch_x_untouched, batch_ind, file):
    acc = 0
    for i in range(outputs.shape[0]):
        sentence_pred = outputs[i]
        one_x = batch_x_untouched[i]
        sentence_index = batch_ind[i]
        for j, word_probs in enumerate(sentence_pred):
            word_pred = torch.argmax(word_probs).item()
            # tag = tag2idx[str(word_pred)]
            # print(sentence_index)
            index = sentence_index[j].item()
            # with open("")
            if index == 0:
              break
            word = reverse_test_word2idx_untouched[one_x[j].item()]
            tag = idx2tag[word_pred]

            if index == 1:
              # print('\n')
              file.write('\n')
            file.write(str(index) + ' ' + word + ' ' + tag + '\n')
            # print(index, word, tag)

def make_output_file(model, idx2tag, reverse_test_word2idx_untouched, dataloader, file_name, device):
  with torch.no_grad():
    with open(file_name, 'w') as file:
      dev_acc = 0.0
      for batch_x, batch_x_untouched, batch_ind in tqdm(dataloader):
        batch_x = batch_x.to(device)
        outputs = model(batch_x)
        make_output_file_helper(idx2tag, reverse_test_word2idx_untouched, outputs, batch_x_untouched, batch_ind, file)

In [277]:
mode = 'test_out'

if mode == 'test_out':
    mode_data = test_data
elif mode == 'dev_out' or mode == 'dev_perl':
    mode_data = dev_data
    
print_dataset = NERDataset(mode_data, word2idx, tag2idx, mode, test_word2idx_untouched)
print_loader = DataLoader(print_dataset, batch_size=batch_size)

reverse_test_word2idx_untouched = {v: k for k, v in test_word2idx_untouched.items()}
idx2tag = {v: k for k, v in tag2idx.items()}

In [278]:
from IPython.display import FileLink
if mode == 'dev_perl':
    file_name = 'dev1_perl.out'
    make_dev_for_perl(model, idx2tag, reverse_test_word2idx_untouched, print_loader, file_name, device)
    
FileLink(r'dev1_perl.out')

In [279]:
if mode == 'dev_out':
    file_name = 'dev1.out'
    make_output_file(model, idx2tag, reverse_test_word2idx_untouched, print_loader, file_name, device)    
FileLink(r'dev1.out')

In [280]:
if mode == 'test_out':
    file_name = 'test1.out'
    make_output_file(model, idx2tag, reverse_test_word2idx_untouched, print_loader, file_name, device)    
FileLink(r'test1.out')

100%|██████████| 461/461 [00:04<00:00, 108.68it/s]


In [128]:
!perl conll03eval < dev1.out

processed 51577 tokens with 5942 phrases; found: 5673 phrases; correct: 4645.
accuracy:  96.06%; precision:  81.88%; recall:  78.17%; FB1:  79.98
              LOC: precision:  90.58%; recall:  85.36%; FB1:  87.89  1731
             MISC: precision:  79.12%; recall:  74.40%; FB1:  76.69  867
              ORG: precision:  73.10%; recall:  73.97%; FB1:  73.54  1357
              PER: precision:  81.43%; recall:  75.95%; FB1:  78.60  1718


In [84]:
embeddings_index = {}
with open('glove.6B.100d', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        word = values[0].lower()
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
len(embeddings_index)

400000it [00:09, 41931.32it/s]


400000

In [87]:
def make_weight_matrix(word2idx):
    weights_matrix = np.zeros((len(vocabulary), 100))
    hits = misses = 0
    # Initialize the unk and pad vector randomly using a normal distribution
    unk_weight = np.random.normal(scale=0.8, size=(100,))
    pad_weight = np.random.normal(scale=0.8, size=(100,))
#     pad_weight = np.zeros(100)
    for word, i in word2idx.items():
    #     print(word)
        embedding_vector = embeddings_index.get(word.lower())
        if embedding_vector is not None:
            weights_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1
            if word == '<pad>':
                weights_matrix[i] = pad_weight
            else:
                weights_matrix[i] = unk_weight
    print(f"Hits: {hits} Misses: {misses} Hit Ratio: {hits/(hits+misses)}")
    return weights_matrix


weights_matrix = make_weight_matrix(word2idx)       
embedding_layer = nn.Embedding(len(vocabulary), 100)
embedding_layer.weight.data.copy_(torch.from_numpy(weights_matrix))
embedding_layer.weight.requires_grad = True

Hits: 26340 Misses: 3952 Hit Ratio: 0.8695365112901096


In [88]:
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BLSTM(len(vocabulary), embedding_dim=100, hidden_dim=256, output_dim=len(tag2idx), dropout=0.33, pretrained_embedding = embedding_layer).to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 1e-5, eps=1e-08, betas = (0.9, 0.999))
# optimizer = torch.optim.SGD(model.parameters(), lr = 0.05, momentum = 0.65, weight_decay = 1e-3)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01, momentum = 0.9, weight_decay = 1e-3)

# scheduler = MultiStepLR(optimizer, milestones=[3,5,7,9], gamma=0.1)
# scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.5, max_lr=1.2, step_size_up=20, step_size_down=None, mode='triangular', gamma=1.0)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', min_lr=1, verbose=True)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',factor=0.75, patience = 5, threshold=0.1, verbose=True, min_lr=5e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',factor=0.55, patience = 3, threshold=0.1, verbose=True, min_lr=5e-4)

# scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.5, total_iters=20)

# class_weights = torch.tensor([1,1,1,1,1,1,1,1,0.01], dtype=torch.float).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean', weight=class_weights)
    

def train(model, train_loader, optimizer, criterion, device, epochs):
    model.train()
    SAVE_PATH = "./best_model_2.pt"
    best_f1 = -1
    for epoch in range(epochs):
      print(f"Epoch: {epoch}")
      train_loss, train_acc, train_f1 = 0.0, 0.0, 0.0
      for batch_x, batch_y in tqdm(train_loader):
          batch_x = batch_x.to(device)
          batch_y = batch_y.to(device)
          outputs = model(batch_x)

          seq_lengths = torch.count_nonzero(batch_x, dim=1).to('cpu')
          packed_y = pack_padded_sequence(batch_y, seq_lengths, batch_first=True, enforce_sorted=False)
          unpacked_y, unpacked_len = pad_packed_sequence(packed_y, batch_first=True, padding_value = -1)
          unpacked_y = unpacked_y.to(device)
            
          loss = criterion(outputs.permute(0, 2, 1), unpacked_y)
          train_loss += loss.item()
  
#           mask = (unpacked_y >= 0) -->costly operation, uncomment to see accuracy, f1
#           acc = accuracy(outputs, batch_y)
#           out_for_f1 = torch.argmax(outputs, dim = -1)
#           f1 = f1_score(out_for_f1[mask].cpu(), unpacked_y[mask].cpu(), average='weighted')
#           train_acc += acc
#           train_f1 += f1

          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          # print(get_lr(optimizer))

      train_loss /= len(train_loader)
#       train_acc /= len(train_loader)
#       train_f1 /= len(train_loader)
      
#       print(f"Average train accuracy: {train_acc}")
#       print(f"Average train f1: {train_f1}")
      val_loss, val_f1 = evaluate(model, criterion, dev_loader)
      if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), SAVE_PATH)
      scheduler.step(val_loss)
      print(f"Average train Loss: {train_loss}")
      print(f"Current Learning Rate: {get_lr(optimizer)}")
      print(f"Best sklearn masked F1: {best_f1}")

    return model

model = train(model, train_loader, optimizer, criterion, device, epochs = 100)


Epoch: 0


100%|██████████| 1874/1874 [00:18<00:00, 101.88it/s]
100%|██████████| 434/434 [00:02<00:00, 174.71it/s]


Average Dev Loss: 0.7340873273149613
Average Dev accuracy: 0.0
Average Dev F1: 0.7705576421574798
Average train Loss: 1.0544551648954954
Current Learning Rate: 0.01
Best sklearn masked F1: 0.7705576421574798
Epoch: 1


100%|██████████| 1874/1874 [00:18<00:00, 103.64it/s]
100%|██████████| 434/434 [00:02<00:00, 178.20it/s]


Average Dev Loss: 0.541081804717775
Average Dev accuracy: 0.0
Average Dev F1: 0.7525595739373699
Average train Loss: 0.5337999259681304
Current Learning Rate: 0.01
Best sklearn masked F1: 0.7705576421574798
Epoch: 2


100%|██████████| 1874/1874 [00:18<00:00, 102.19it/s]
100%|██████████| 434/434 [00:02<00:00, 170.28it/s]


Average Dev Loss: 0.4140992299245868
Average Dev accuracy: 0.0
Average Dev F1: 0.8172337322708556
Average train Loss: 0.40893494477147674
Current Learning Rate: 0.01
Best sklearn masked F1: 0.8172337322708556
Epoch: 3


100%|██████████| 1874/1874 [00:18<00:00, 103.21it/s]
100%|██████████| 434/434 [00:02<00:00, 158.26it/s]


Average Dev Loss: 0.4684875528070612
Average Dev accuracy: 0.0
Average Dev F1: 0.8982840367921393
Average train Loss: 0.33479627251640964
Current Learning Rate: 0.01
Best sklearn masked F1: 0.8982840367921393
Epoch: 4


100%|██████████| 1874/1874 [00:17<00:00, 104.90it/s]
100%|██████████| 434/434 [00:02<00:00, 169.59it/s]


Average Dev Loss: 0.42387279222208646
Average Dev accuracy: 0.0
Average Dev F1: 0.8954061583715427
Average train Loss: 0.2702637093445821
Current Learning Rate: 0.01
Best sklearn masked F1: 0.8982840367921393
Epoch: 5


100%|██████████| 1874/1874 [00:18<00:00, 102.16it/s]
100%|██████████| 434/434 [00:02<00:00, 179.66it/s]


Average Dev Loss: 0.38185237123618065
Average Dev accuracy: 0.0
Average Dev F1: 0.8988944260272429
Average train Loss: 0.2163824252063645
Current Learning Rate: 0.01
Best sklearn masked F1: 0.8988944260272429
Epoch: 6


100%|██████████| 1874/1874 [00:18<00:00, 101.60it/s]
100%|██████████| 434/434 [00:02<00:00, 176.17it/s]


Average Dev Loss: 0.3418062004123381
Average Dev accuracy: 0.0
Average Dev F1: 0.9031941570597836
Average train Loss: 0.17325236418074755
Current Learning Rate: 0.01
Best sklearn masked F1: 0.9031941570597836
Epoch: 7


100%|██████████| 1874/1874 [00:18<00:00, 103.73it/s]
100%|██████████| 434/434 [00:02<00:00, 179.76it/s]


Average Dev Loss: 0.40430830066598744
Average Dev accuracy: 0.0
Average Dev F1: 0.9451792849373514
Average train Loss: 0.1486453927482857
Current Learning Rate: 0.01
Best sklearn masked F1: 0.9451792849373514
Epoch: 8


100%|██████████| 1874/1874 [00:18<00:00, 101.19it/s]
100%|██████████| 434/434 [00:02<00:00, 179.87it/s]


Average Dev Loss: 0.42299528956876786
Average Dev accuracy: 0.0
Average Dev F1: 0.9401154687470477
Average train Loss: 0.12461662309203524
Current Learning Rate: 0.01
Best sklearn masked F1: 0.9451792849373514
Epoch: 9


100%|██████████| 1874/1874 [00:18<00:00, 100.40it/s]
100%|██████████| 434/434 [00:02<00:00, 176.38it/s]


Average Dev Loss: 0.3905213369605934
Average Dev accuracy: 0.0
Average Dev F1: 0.9132165193150874
Average train Loss: 0.1220061460059144
Current Learning Rate: 0.01
Best sklearn masked F1: 0.9451792849373514
Epoch: 10


100%|██████████| 1874/1874 [00:18<00:00, 104.06it/s]
100%|██████████| 434/434 [00:02<00:00, 176.00it/s]


Average Dev Loss: 0.36011768190494914
Average Dev accuracy: 0.0
Average Dev F1: 0.9140407793385151
Epoch 00011: reducing learning rate of group 0 to 5.5000e-03.
Average train Loss: 0.13264057458837575
Current Learning Rate: 0.0055000000000000005
Best sklearn masked F1: 0.9451792849373514
Epoch: 11


100%|██████████| 1874/1874 [00:18<00:00, 101.28it/s]
100%|██████████| 434/434 [00:02<00:00, 180.32it/s]


Average Dev Loss: 0.36015738576139894
Average Dev accuracy: 0.0
Average Dev F1: 0.956800734157179
Average train Loss: 0.08330352341640927
Current Learning Rate: 0.0055000000000000005
Best sklearn masked F1: 0.956800734157179
Epoch: 12


100%|██████████| 1874/1874 [00:18<00:00, 102.71it/s]
100%|██████████| 434/434 [00:02<00:00, 158.45it/s]


Average Dev Loss: 0.44202935310136443
Average Dev accuracy: 0.0
Average Dev F1: 0.9413330845111751
Average train Loss: 0.06459587105706914
Current Learning Rate: 0.0055000000000000005
Best sklearn masked F1: 0.956800734157179
Epoch: 13


100%|██████████| 1874/1874 [00:17<00:00, 106.07it/s]
100%|██████████| 434/434 [00:02<00:00, 175.01it/s]


Average Dev Loss: 0.3710743197548946
Average Dev accuracy: 0.0
Average Dev F1: 0.9406147234092199
Average train Loss: 0.06472840981250838
Current Learning Rate: 0.0055000000000000005
Best sklearn masked F1: 0.956800734157179
Epoch: 14


100%|██████████| 1874/1874 [00:18<00:00, 102.98it/s]
100%|██████████| 434/434 [00:02<00:00, 177.95it/s]


Average Dev Loss: 0.3911772697465494
Average Dev accuracy: 0.0
Average Dev F1: 0.9535653151111407
Epoch 00015: reducing learning rate of group 0 to 3.0250e-03.
Average train Loss: 0.06965213474654071
Current Learning Rate: 0.0030250000000000003
Best sklearn masked F1: 0.956800734157179
Epoch: 15


100%|██████████| 1874/1874 [00:18<00:00, 101.43it/s]
100%|██████████| 434/434 [00:02<00:00, 178.42it/s]


Average Dev Loss: 0.35976583116005434
Average Dev accuracy: 0.0
Average Dev F1: 0.9575308446179066
Average train Loss: 0.04689966810453711
Current Learning Rate: 0.0030250000000000003
Best sklearn masked F1: 0.9575308446179066
Epoch: 16


100%|██████████| 1874/1874 [00:17<00:00, 105.18it/s]
100%|██████████| 434/434 [00:02<00:00, 179.76it/s]


Average Dev Loss: 0.3261260737456845
Average Dev accuracy: 0.0
Average Dev F1: 0.9511090103120464
Average train Loss: 0.04188861966462609
Current Learning Rate: 0.0030250000000000003
Best sklearn masked F1: 0.9575308446179066
Epoch: 17


100%|██████████| 1874/1874 [00:18<00:00, 103.17it/s]
100%|██████████| 434/434 [00:02<00:00, 181.06it/s]


Average Dev Loss: 0.3353053120851156
Average Dev accuracy: 0.0
Average Dev F1: 0.950987513217435
Average train Loss: 0.042946167028058826
Current Learning Rate: 0.0030250000000000003
Best sklearn masked F1: 0.9575308446179066
Epoch: 18


100%|██████████| 1874/1874 [00:18<00:00, 103.09it/s]
100%|██████████| 434/434 [00:02<00:00, 149.58it/s]


Average Dev Loss: 0.3618248980932264
Average Dev accuracy: 0.0
Average Dev F1: 0.9565311853352448
Epoch 00019: reducing learning rate of group 0 to 1.6638e-03.
Average train Loss: 0.041932010846366566
Current Learning Rate: 0.0016637500000000003
Best sklearn masked F1: 0.9575308446179066
Epoch: 19


100%|██████████| 1874/1874 [00:17<00:00, 104.78it/s]
100%|██████████| 434/434 [00:02<00:00, 178.53it/s]


Average Dev Loss: 0.3563236067680994
Average Dev accuracy: 0.0
Average Dev F1: 0.9610906880511347
Average train Loss: 0.03492314823611919
Current Learning Rate: 0.0016637500000000003
Best sklearn masked F1: 0.9610906880511347
Epoch: 20


100%|██████████| 1874/1874 [00:18<00:00, 102.75it/s]
100%|██████████| 434/434 [00:02<00:00, 162.17it/s]


Average Dev Loss: 0.35178450059707916
Average Dev accuracy: 0.0
Average Dev F1: 0.9612852996126205
Average train Loss: 0.03289560864683491
Current Learning Rate: 0.0016637500000000003
Best sklearn masked F1: 0.9612852996126205
Epoch: 21


100%|██████████| 1874/1874 [00:17<00:00, 104.65it/s]
100%|██████████| 434/434 [00:03<00:00, 128.40it/s]


Average Dev Loss: 0.37081929343679604
Average Dev accuracy: 0.0
Average Dev F1: 0.9593031089864901
Average train Loss: 0.031822571572807906
Current Learning Rate: 0.0016637500000000003
Best sklearn masked F1: 0.9612852996126205
Epoch: 22


100%|██████████| 1874/1874 [00:17<00:00, 105.28it/s]
100%|██████████| 434/434 [00:02<00:00, 172.41it/s]


Average Dev Loss: 0.36622599357136093
Average Dev accuracy: 0.0
Average Dev F1: 0.9633455300102063
Epoch 00023: reducing learning rate of group 0 to 9.1506e-04.
Average train Loss: 0.03218096335085055
Current Learning Rate: 0.0009150625000000002
Best sklearn masked F1: 0.9633455300102063
Epoch: 23


100%|██████████| 1874/1874 [00:18<00:00, 100.99it/s]
100%|██████████| 434/434 [00:02<00:00, 175.56it/s]


Average Dev Loss: 0.363576629606273
Average Dev accuracy: 0.0
Average Dev F1: 0.9645140410524294
Average train Loss: 0.03019841169350032
Current Learning Rate: 0.0009150625000000002
Best sklearn masked F1: 0.9645140410524294
Epoch: 24


100%|██████████| 1874/1874 [00:18<00:00, 101.83it/s]
100%|██████████| 434/434 [00:02<00:00, 150.89it/s]


Average Dev Loss: 0.32067516113927563
Average Dev accuracy: 0.0
Average Dev F1: 0.9529870035127158
Average train Loss: 0.03054079951409334
Current Learning Rate: 0.0009150625000000002
Best sklearn masked F1: 0.9645140410524294
Epoch: 25


100%|██████████| 1874/1874 [00:18<00:00, 103.20it/s]
100%|██████████| 434/434 [00:02<00:00, 175.95it/s]


Average Dev Loss: 0.3578039292988586
Average Dev accuracy: 0.0
Average Dev F1: 0.963553423806116
Average train Loss: 0.029673461927416097
Current Learning Rate: 0.0009150625000000002
Best sklearn masked F1: 0.9645140410524294
Epoch: 26


100%|██████████| 1874/1874 [00:18<00:00, 100.69it/s]
100%|██████████| 434/434 [00:02<00:00, 174.48it/s]


Average Dev Loss: 0.36277928732631703
Average Dev accuracy: 0.0
Average Dev F1: 0.9638650004074688
Epoch 00027: reducing learning rate of group 0 to 5.0328e-04.
Average train Loss: 0.030650463044939597
Current Learning Rate: 0.0005032843750000001
Best sklearn masked F1: 0.9645140410524294
Epoch: 27


100%|██████████| 1874/1874 [00:18<00:00, 102.42it/s]
100%|██████████| 434/434 [00:02<00:00, 167.28it/s]


Average Dev Loss: 0.3766009843820268
Average Dev accuracy: 0.0
Average Dev F1: 0.9651730024036317
Average train Loss: 0.028650360605594966
Current Learning Rate: 0.0005032843750000001
Best sklearn masked F1: 0.9651730024036317
Epoch: 28


100%|██████████| 1874/1874 [00:18<00:00, 101.07it/s]
100%|██████████| 434/434 [00:02<00:00, 178.82it/s]


Average Dev Loss: 0.3564932592688472
Average Dev accuracy: 0.0
Average Dev F1: 0.9628406811133814
Average train Loss: 0.02826363526696186
Current Learning Rate: 0.0005032843750000001
Best sklearn masked F1: 0.9651730024036317
Epoch: 29


100%|██████████| 1874/1874 [00:18<00:00, 100.65it/s]
100%|██████████| 434/434 [00:02<00:00, 170.44it/s]


Average Dev Loss: 0.3480597430945284
Average Dev accuracy: 0.0
Average Dev F1: 0.9624290939448523
Average train Loss: 0.028744143206132187
Current Learning Rate: 0.0005032843750000001
Best sklearn masked F1: 0.9651730024036317
Epoch: 30


100%|██████████| 1874/1874 [00:18<00:00, 103.32it/s]
100%|██████████| 434/434 [00:02<00:00, 165.33it/s]


Average Dev Loss: 0.37299482981740895
Average Dev accuracy: 0.0
Average Dev F1: 0.9628705314111237
Epoch 00031: reducing learning rate of group 0 to 5.0000e-04.
Average train Loss: 0.028104597656404166
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9651730024036317
Epoch: 31


100%|██████████| 1874/1874 [00:18<00:00, 102.66it/s]
100%|██████████| 434/434 [00:02<00:00, 160.79it/s]


Average Dev Loss: 0.366466130207782
Average Dev accuracy: 0.0
Average Dev F1: 0.9634373627679855
Average train Loss: 0.02872835464175222
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9651730024036317
Epoch: 32


100%|██████████| 1874/1874 [00:18<00:00, 100.97it/s]
100%|██████████| 434/434 [00:02<00:00, 155.42it/s]


Average Dev Loss: 0.3795759295041497
Average Dev accuracy: 0.0
Average Dev F1: 0.9658862369696051
Average train Loss: 0.028495470814062968
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 33


100%|██████████| 1874/1874 [00:17<00:00, 104.29it/s]
100%|██████████| 434/434 [00:02<00:00, 177.15it/s]


Average Dev Loss: 0.3575249152812755
Average Dev accuracy: 0.0
Average Dev F1: 0.9622282463571162
Average train Loss: 0.028236778821496007
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 34


100%|██████████| 1874/1874 [00:18<00:00, 100.21it/s]
100%|██████████| 434/434 [00:02<00:00, 175.97it/s]


Average Dev Loss: 0.3866844594978603
Average Dev accuracy: 0.0
Average Dev F1: 0.9648304083786925
Average train Loss: 0.028537423445707984
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 35


100%|██████████| 1874/1874 [00:19<00:00, 98.06it/s] 
100%|██████████| 434/434 [00:02<00:00, 173.28it/s]


Average Dev Loss: 0.3660453311194481
Average Dev accuracy: 0.0
Average Dev F1: 0.9652629438868251
Average train Loss: 0.029345443409896407
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 36


100%|██████████| 1874/1874 [00:18<00:00, 102.94it/s]
100%|██████████| 434/434 [00:02<00:00, 176.31it/s]


Average Dev Loss: 0.3828624541488206
Average Dev accuracy: 0.0
Average Dev F1: 0.9650552234638863
Average train Loss: 0.028256158887139317
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 37


100%|██████████| 1874/1874 [00:18<00:00, 100.99it/s]
100%|██████████| 434/434 [00:02<00:00, 174.82it/s]


Average Dev Loss: 0.3767317843468954
Average Dev accuracy: 0.0
Average Dev F1: 0.9644929534300652
Average train Loss: 0.028405374981783003
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 38


100%|██████████| 1874/1874 [00:18<00:00, 98.93it/s] 
100%|██████████| 434/434 [00:02<00:00, 174.41it/s]


Average Dev Loss: 0.3737032901751058
Average Dev accuracy: 0.0
Average Dev F1: 0.9635902680799093
Average train Loss: 0.027971379814486604
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 39


100%|██████████| 1874/1874 [00:18<00:00, 102.42it/s]
100%|██████████| 434/434 [00:02<00:00, 178.09it/s]


Average Dev Loss: 0.3689041641660996
Average Dev accuracy: 0.0
Average Dev F1: 0.96396172260198
Average train Loss: 0.028550754619808022
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 40


100%|██████████| 1874/1874 [00:18<00:00, 100.12it/s]
100%|██████████| 434/434 [00:02<00:00, 174.75it/s]


Average Dev Loss: 0.3751048952891504
Average Dev accuracy: 0.0
Average Dev F1: 0.9638432117862434
Average train Loss: 0.028440312126585807
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 41


100%|██████████| 1874/1874 [00:18<00:00, 99.15it/s] 
100%|██████████| 434/434 [00:02<00:00, 173.52it/s]


Average Dev Loss: 0.36094401925501807
Average Dev accuracy: 0.0
Average Dev F1: 0.962096460741511
Average train Loss: 0.02837298496719748
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 42


100%|██████████| 1874/1874 [00:17<00:00, 104.66it/s]
100%|██████████| 434/434 [00:02<00:00, 159.03it/s]


Average Dev Loss: 0.3761223366997454
Average Dev accuracy: 0.0
Average Dev F1: 0.9643948303855163
Average train Loss: 0.02844897223939163
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 43


100%|██████████| 1874/1874 [00:18<00:00, 102.44it/s]
100%|██████████| 434/434 [00:02<00:00, 156.72it/s]


Average Dev Loss: 0.3727320826243317
Average Dev accuracy: 0.0
Average Dev F1: 0.9649908832410029
Average train Loss: 0.02766169782225023
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 44


100%|██████████| 1874/1874 [00:18<00:00, 100.66it/s]
100%|██████████| 434/434 [00:02<00:00, 160.26it/s]


Average Dev Loss: 0.37350334819891245
Average Dev accuracy: 0.0
Average Dev F1: 0.9642927627686403
Average train Loss: 0.028252283606072938
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 45


100%|██████████| 1874/1874 [00:17<00:00, 105.18it/s]
100%|██████████| 434/434 [00:02<00:00, 168.18it/s]


Average Dev Loss: 0.38059321768562554
Average Dev accuracy: 0.0
Average Dev F1: 0.9643911995727965
Average train Loss: 0.028454872131498735
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 46


100%|██████████| 1874/1874 [00:18<00:00, 101.28it/s]
100%|██████████| 434/434 [00:02<00:00, 176.15it/s]


Average Dev Loss: 0.36068550879854605
Average Dev accuracy: 0.0
Average Dev F1: 0.9629862673531945
Average train Loss: 0.02786830733450708
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 47


100%|██████████| 1874/1874 [00:18<00:00, 99.51it/s] 
100%|██████████| 434/434 [00:02<00:00, 177.13it/s]


Average Dev Loss: 0.3722898307495669
Average Dev accuracy: 0.0
Average Dev F1: 0.9644453313741828
Average train Loss: 0.028220085026358383
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 48


100%|██████████| 1874/1874 [00:18<00:00, 103.98it/s]
100%|██████████| 434/434 [00:02<00:00, 179.15it/s]


Average Dev Loss: 0.3800243582224585
Average Dev accuracy: 0.0
Average Dev F1: 0.9642104102748111
Average train Loss: 0.028588363375855668
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 49


100%|██████████| 1874/1874 [00:18<00:00, 99.24it/s] 
100%|██████████| 434/434 [00:02<00:00, 174.11it/s]


Average Dev Loss: 0.36627055413930887
Average Dev accuracy: 0.0
Average Dev F1: 0.9619624014534035
Average train Loss: 0.028160296838726596
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 50


100%|██████████| 1874/1874 [00:18<00:00, 99.84it/s] 
100%|██████████| 434/434 [00:02<00:00, 174.86it/s]


Average Dev Loss: 0.38496645338123037
Average Dev accuracy: 0.0
Average Dev F1: 0.9654946714981857
Average train Loss: 0.02788333107146813
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 51


100%|██████████| 1874/1874 [00:18<00:00, 103.39it/s]
100%|██████████| 434/434 [00:02<00:00, 176.15it/s]


Average Dev Loss: 0.36618605339985494
Average Dev accuracy: 0.0
Average Dev F1: 0.9651450058583206
Average train Loss: 0.027926630890538585
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 52


100%|██████████| 1874/1874 [00:18<00:00, 100.12it/s]
100%|██████████| 434/434 [00:02<00:00, 173.39it/s]


Average Dev Loss: 0.3789454476558067
Average Dev accuracy: 0.0
Average Dev F1: 0.9639947557574563
Average train Loss: 0.02795042090933957
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 53


100%|██████████| 1874/1874 [00:18<00:00, 100.37it/s]
100%|██████████| 434/434 [00:02<00:00, 163.50it/s]


Average Dev Loss: 0.383052503029185
Average Dev accuracy: 0.0
Average Dev F1: 0.9655429121892455
Average train Loss: 0.028579363797213823
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 54


100%|██████████| 1874/1874 [00:17<00:00, 104.78it/s]
100%|██████████| 434/434 [00:02<00:00, 161.98it/s]


Average Dev Loss: 0.3649205887853275
Average Dev accuracy: 0.0
Average Dev F1: 0.9629464525208369
Average train Loss: 0.028580935488791943
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 55


100%|██████████| 1874/1874 [00:18<00:00, 100.91it/s]
100%|██████████| 434/434 [00:02<00:00, 156.91it/s]


Average Dev Loss: 0.3816017303862588
Average Dev accuracy: 0.0
Average Dev F1: 0.9650467507913613
Average train Loss: 0.02764693062440388
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 56


100%|██████████| 1874/1874 [00:18<00:00, 101.40it/s]
100%|██████████| 434/434 [00:02<00:00, 175.69it/s]


Average Dev Loss: 0.3654713448070522
Average Dev accuracy: 0.0
Average Dev F1: 0.9636345451958777
Average train Loss: 0.028442350228012937
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 57


100%|██████████| 1874/1874 [00:18<00:00, 103.09it/s]
100%|██████████| 434/434 [00:02<00:00, 177.93it/s]


Average Dev Loss: 0.3760748871656839
Average Dev accuracy: 0.0
Average Dev F1: 0.9636361282606253
Average train Loss: 0.027818127460986487
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 58


100%|██████████| 1874/1874 [00:18<00:00, 99.34it/s] 
100%|██████████| 434/434 [00:02<00:00, 174.04it/s]


Average Dev Loss: 0.3707637390806361
Average Dev accuracy: 0.0
Average Dev F1: 0.9622901127268284
Average train Loss: 0.02769620058384178
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 59


100%|██████████| 1874/1874 [00:18<00:00, 100.45it/s]
100%|██████████| 434/434 [00:02<00:00, 173.76it/s]


Average Dev Loss: 0.37950362402877086
Average Dev accuracy: 0.0
Average Dev F1: 0.9643205499323937
Average train Loss: 0.027795198568557423
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 60


100%|██████████| 1874/1874 [00:18<00:00, 103.63it/s]
100%|██████████| 434/434 [00:02<00:00, 175.56it/s]


Average Dev Loss: 0.39299678431356044
Average Dev accuracy: 0.0
Average Dev F1: 0.9649934448041538
Average train Loss: 0.02783437112068982
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 61


100%|██████████| 1874/1874 [00:18<00:00, 99.00it/s] 
100%|██████████| 434/434 [00:02<00:00, 176.29it/s]


Average Dev Loss: 0.3733963322507063
Average Dev accuracy: 0.0
Average Dev F1: 0.9639927163816059
Average train Loss: 0.027197373145594873
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 62


100%|██████████| 1874/1874 [00:18<00:00, 99.53it/s] 
100%|██████████| 434/434 [00:02<00:00, 167.65it/s]


Average Dev Loss: 0.3778380778669659
Average Dev accuracy: 0.0
Average Dev F1: 0.9642246590493028
Average train Loss: 0.0283962509682076
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 63


100%|██████████| 1874/1874 [00:17<00:00, 104.18it/s]
100%|██████████| 434/434 [00:02<00:00, 174.48it/s]


Average Dev Loss: 0.38566355078056247
Average Dev accuracy: 0.0
Average Dev F1: 0.9632897931105834
Average train Loss: 0.027436398739529935
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 64


100%|██████████| 1874/1874 [00:18<00:00, 99.46it/s] 
100%|██████████| 434/434 [00:02<00:00, 174.47it/s]


Average Dev Loss: 0.38098970210836336
Average Dev accuracy: 0.0
Average Dev F1: 0.9632455554681149
Average train Loss: 0.02787364976954713
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 65


100%|██████████| 1874/1874 [00:18<00:00, 101.24it/s]
100%|██████████| 434/434 [00:02<00:00, 160.16it/s]


Average Dev Loss: 0.3688525071309682
Average Dev accuracy: 0.0
Average Dev F1: 0.9626743979628897
Average train Loss: 0.027569913359015003
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9658862369696051
Epoch: 66


100%|██████████| 1874/1874 [00:17<00:00, 104.88it/s]
100%|██████████| 434/434 [00:02<00:00, 156.78it/s]


Average Dev Loss: 0.3959426120001917
Average Dev accuracy: 0.0
Average Dev F1: 0.9666129355622163
Average train Loss: 0.028251088518309413
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9666129355622163
Epoch: 67


100%|██████████| 1874/1874 [00:18<00:00, 101.22it/s]
100%|██████████| 434/434 [00:02<00:00, 171.43it/s]


Average Dev Loss: 0.3973756260780429
Average Dev accuracy: 0.0
Average Dev F1: 0.9656785566310175
Average train Loss: 0.027602993611846687
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9666129355622163
Epoch: 68


100%|██████████| 1874/1874 [00:18<00:00, 100.49it/s]
100%|██████████| 434/434 [00:02<00:00, 171.16it/s]


Average Dev Loss: 0.3755531265348848
Average Dev accuracy: 0.0
Average Dev F1: 0.9643820770886209
Average train Loss: 0.027380479729546506
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9666129355622163
Epoch: 69


100%|██████████| 1874/1874 [00:17<00:00, 104.29it/s]
100%|██████████| 434/434 [00:02<00:00, 175.24it/s]


Average Dev Loss: 0.39218454388365115
Average Dev accuracy: 0.0
Average Dev F1: 0.9651977481583158
Average train Loss: 0.027665170454524515
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9666129355622163
Epoch: 70


100%|██████████| 1874/1874 [00:18<00:00, 100.80it/s]
100%|██████████| 434/434 [00:02<00:00, 178.67it/s]


Average Dev Loss: 0.40095388552888556
Average Dev accuracy: 0.0
Average Dev F1: 0.9667740115298565
Average train Loss: 0.027984382950585522
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 71


100%|██████████| 1874/1874 [00:17<00:00, 104.68it/s]
100%|██████████| 434/434 [00:02<00:00, 153.72it/s]


Average Dev Loss: 0.38349128872465155
Average Dev accuracy: 0.0
Average Dev F1: 0.9649189053438354
Average train Loss: 0.0280627757248746
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 72


100%|██████████| 1874/1874 [00:18<00:00, 104.02it/s]
100%|██████████| 434/434 [00:02<00:00, 175.51it/s]


Average Dev Loss: 0.38772129286177165
Average Dev accuracy: 0.0
Average Dev F1: 0.9610830277212258
Average train Loss: 0.028412259861815965
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 73


100%|██████████| 1874/1874 [00:18<00:00, 100.51it/s]
100%|██████████| 434/434 [00:02<00:00, 175.98it/s]


Average Dev Loss: 0.3720822725026581
Average Dev accuracy: 0.0
Average Dev F1: 0.9628278134482394
Average train Loss: 0.02826492162148012
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 74


100%|██████████| 1874/1874 [00:17<00:00, 104.43it/s]
100%|██████████| 434/434 [00:03<00:00, 141.16it/s]


Average Dev Loss: 0.39037783153425937
Average Dev accuracy: 0.0
Average Dev F1: 0.963508208429195
Average train Loss: 0.02765279785350259
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 75


100%|██████████| 1874/1874 [00:17<00:00, 104.82it/s]
100%|██████████| 434/434 [00:02<00:00, 155.12it/s]


Average Dev Loss: 0.3750019280507105
Average Dev accuracy: 0.0
Average Dev F1: 0.9622665807004814
Average train Loss: 0.027685741218937045
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 76


100%|██████████| 1874/1874 [00:18<00:00, 101.62it/s]
100%|██████████| 434/434 [00:02<00:00, 161.84it/s]


Average Dev Loss: 0.3924258245447058
Average Dev accuracy: 0.0
Average Dev F1: 0.9650222923852549
Average train Loss: 0.02746304524456138
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 77


100%|██████████| 1874/1874 [00:18<00:00, 103.96it/s]
100%|██████████| 434/434 [00:02<00:00, 166.58it/s]


Average Dev Loss: 0.38428022235810055
Average Dev accuracy: 0.0
Average Dev F1: 0.9638591797744208
Average train Loss: 0.02747486061774385
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 78


100%|██████████| 1874/1874 [00:18<00:00, 100.21it/s]
100%|██████████| 434/434 [00:02<00:00, 176.69it/s]


Average Dev Loss: 0.3713027905347684
Average Dev accuracy: 0.0
Average Dev F1: 0.9625236910753792
Average train Loss: 0.02725455979170554
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 79


100%|██████████| 1874/1874 [00:18<00:00, 100.40it/s]
100%|██████████| 434/434 [00:02<00:00, 177.09it/s]


Average Dev Loss: 0.3964959149895888
Average Dev accuracy: 0.0
Average Dev F1: 0.9641178098969956
Average train Loss: 0.02772812001762977
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 80


100%|██████████| 1874/1874 [00:18<00:00, 103.25it/s]
100%|██████████| 434/434 [00:02<00:00, 180.06it/s]


Average Dev Loss: 0.4048247821162313
Average Dev accuracy: 0.0
Average Dev F1: 0.9662723127066368
Average train Loss: 0.02738431377014346
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 81


100%|██████████| 1874/1874 [00:18<00:00, 99.98it/s] 
100%|██████████| 434/434 [00:02<00:00, 176.32it/s]


Average Dev Loss: 0.3970283558463446
Average Dev accuracy: 0.0
Average Dev F1: 0.9647906821972178
Average train Loss: 0.027125827424186445
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 82


100%|██████████| 1874/1874 [00:18<00:00, 100.67it/s]
100%|██████████| 434/434 [00:02<00:00, 178.22it/s]


Average Dev Loss: 0.3836888997324137
Average Dev accuracy: 0.0
Average Dev F1: 0.9630923609405769
Average train Loss: 0.027148180329260738
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 83


100%|██████████| 1874/1874 [00:18<00:00, 102.64it/s]
100%|██████████| 434/434 [00:02<00:00, 174.11it/s]


Average Dev Loss: 0.3848345639706645
Average Dev accuracy: 0.0
Average Dev F1: 0.9653334375357785
Average train Loss: 0.02734184223763931
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 84


100%|██████████| 1874/1874 [00:18<00:00, 98.75it/s] 
100%|██████████| 434/434 [00:02<00:00, 175.81it/s]


Average Dev Loss: 0.38008875819137705
Average Dev accuracy: 0.0
Average Dev F1: 0.9634495603452587
Average train Loss: 0.027998781443092466
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 85


100%|██████████| 1874/1874 [00:18<00:00, 100.53it/s]
100%|██████████| 434/434 [00:02<00:00, 167.20it/s]


Average Dev Loss: 0.4039447942651027
Average Dev accuracy: 0.0
Average Dev F1: 0.9646262081131926
Average train Loss: 0.02759397403026952
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 86


100%|██████████| 1874/1874 [00:17<00:00, 105.96it/s]
100%|██████████| 434/434 [00:02<00:00, 160.24it/s]


Average Dev Loss: 0.41123437608296076
Average Dev accuracy: 0.0
Average Dev F1: 0.965930642696262
Average train Loss: 0.026694173262660315
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 87


100%|██████████| 1874/1874 [00:18<00:00, 100.29it/s]
100%|██████████| 434/434 [00:02<00:00, 158.83it/s]


Average Dev Loss: 0.39044779088575615
Average Dev accuracy: 0.0
Average Dev F1: 0.9632857329862702
Average train Loss: 0.026920986213580196
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 88


100%|██████████| 1874/1874 [00:18<00:00, 101.77it/s]
100%|██████████| 434/434 [00:02<00:00, 175.73it/s]


Average Dev Loss: 0.4012495892096637
Average Dev accuracy: 0.0
Average Dev F1: 0.9633667383774915
Average train Loss: 0.02731161598929203
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 89


100%|██████████| 1874/1874 [00:18<00:00, 104.01it/s]
100%|██████████| 434/434 [00:02<00:00, 174.09it/s]


Average Dev Loss: 0.3953391899234585
Average Dev accuracy: 0.0
Average Dev F1: 0.9637122534694664
Average train Loss: 0.027484629102068156
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 90


100%|██████████| 1874/1874 [00:18<00:00, 98.96it/s] 
100%|██████████| 434/434 [00:02<00:00, 177.66it/s]


Average Dev Loss: 0.395673350387034
Average Dev accuracy: 0.0
Average Dev F1: 0.9635911003640862
Average train Loss: 0.027694861349682527
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 91


100%|██████████| 1874/1874 [00:18<00:00, 99.52it/s] 
100%|██████████| 434/434 [00:02<00:00, 180.08it/s]


Average Dev Loss: 0.3962217952357605
Average Dev accuracy: 0.0
Average Dev F1: 0.9638237000958894
Average train Loss: 0.027041805757501876
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 92


100%|██████████| 1874/1874 [00:18<00:00, 103.70it/s]
100%|██████████| 434/434 [00:02<00:00, 173.02it/s]


Average Dev Loss: 0.3856771288950357
Average Dev accuracy: 0.0
Average Dev F1: 0.9624039159078915
Average train Loss: 0.027477514502177912
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 93


100%|██████████| 1874/1874 [00:18<00:00, 99.67it/s] 
100%|██████████| 434/434 [00:02<00:00, 173.38it/s]


Average Dev Loss: 0.36869373172199704
Average Dev accuracy: 0.0
Average Dev F1: 0.9582133959414156
Average train Loss: 0.027440873631688675
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 94


100%|██████████| 1874/1874 [00:18<00:00, 101.24it/s]
100%|██████████| 434/434 [00:02<00:00, 174.93it/s]


Average Dev Loss: 0.39935854548687105
Average Dev accuracy: 0.0
Average Dev F1: 0.9643121425472292
Average train Loss: 0.027491494972858153
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 95


100%|██████████| 1874/1874 [00:18<00:00, 102.61it/s]
100%|██████████| 434/434 [00:02<00:00, 173.70it/s]


Average Dev Loss: 0.42446297493952106
Average Dev accuracy: 0.0
Average Dev F1: 0.9653871368599587
Average train Loss: 0.027550317879180795
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 96


100%|██████████| 1874/1874 [00:18<00:00, 99.38it/s] 
100%|██████████| 434/434 [00:02<00:00, 175.26it/s]


Average Dev Loss: 0.3887692813505097
Average Dev accuracy: 0.0
Average Dev F1: 0.9608418341539071
Average train Loss: 0.027241104272747833
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 97


100%|██████████| 1874/1874 [00:18<00:00, 101.51it/s]
100%|██████████| 434/434 [00:02<00:00, 161.98it/s]


Average Dev Loss: 0.3799737037153065
Average Dev accuracy: 0.0
Average Dev F1: 0.9603896379264439
Average train Loss: 0.027420283760502737
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 98


100%|██████████| 1874/1874 [00:18<00:00, 103.28it/s]
100%|██████████| 434/434 [00:02<00:00, 158.76it/s]


Average Dev Loss: 0.40372101074054595
Average Dev accuracy: 0.0
Average Dev F1: 0.9632844905972828
Average train Loss: 0.027640321590413866
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565
Epoch: 99


100%|██████████| 1874/1874 [00:18<00:00, 101.40it/s]
100%|██████████| 434/434 [00:02<00:00, 164.83it/s]

Average Dev Loss: 0.4062223510756608
Average Dev accuracy: 0.0
Average Dev F1: 0.9620557984641228
Average train Loss: 0.02718517045453668
Current Learning Rate: 0.0005
Best sklearn masked F1: 0.9667740115298565





In [140]:
SAVE_PATH = "best_model_2.pt"
model = BLSTM(len(vocabulary), embedding_dim=100, hidden_dim=256, output_dim=len(tag2idx), dropout=0.33).to(device)
model.load_state_dict(torch.load(SAVE_PATH))
model.eval()

BLSTM(
  (embedding): Embedding(30292, 100, padding_idx=0)
  (blstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (dropout1): Dropout(p=0.33, inplace=False)
  (linear1): Linear(in_features=512, out_features=128, bias=True)
  (activation): ELU(alpha=1.0)
  (classifier): Sequential(
    (0): Linear(in_features=128, out_features=256, bias=True)
    (1): Tanh()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=256, out_features=9, bias=True)
  )
)

In [None]:
mode = 'test_out'

if mode == 'test_out':
    mode_data = test_data
elif mode == 'dev_out' or mode == 'dev_perl':
    mode_data = dev_data
    
print_dataset = NERDataset(mode_data, word2idx, tag2idx, mode, test_word2idx_untouched)
print_loader = DataLoader(print_dataset, batch_size=batch_size)

In [None]:
from IPython.display import FileLink
if mode == 'dev_perl':
    file_name = 'dev2_perl.out'
    make_dev_for_perl(model, idx2tag, reverse_test_word2idx_untouched, print_loader, file_name, device)
    
FileLink(r'dev2_perl.out')

In [None]:
if mode == 'dev_out':
    file_name = 'dev2.out'
    make_output_file(model, idx2tag, reverse_test_word2idx_untouched, print_loader, file_name, device)    
FileLink(r'dev2.out')

In [None]:
if mode == 'test_out':
    file_name = 'test2.out'
    make_output_file(model, idx2tag, reverse_test_word2idx_untouched, print_loader, file_name, device)    
FileLink(r'test2.out')

In [134]:
!perl conll03eval < dev2.out

processed 51577 tokens with 5942 phrases; found: 6190 phrases; correct: 5066.
accuracy:  96.98%; precision:  81.84%; recall:  85.26%; FB1:  83.51
              LOC: precision:  88.91%; recall:  92.54%; FB1:  90.69  1912
             MISC: precision:  70.30%; recall:  73.43%; FB1:  71.83  963
              ORG: precision:  78.05%; recall:  79.27%; FB1:  78.65  1362
              PER: precision:  83.26%; recall:  88.27%; FB1:  85.69  1953
