In [None]:
# from nltk.parse.corenlp import CoreNLPParser,CoreNLPDependencyParser
from tqdm.auto import trange, tqdm
from dataclasses import dataclass
import pandas as pd
import ast
import itertools
import wandb
import evaluate
from itertools import cycle
import numpy as np
import random
import time
from datetime import datetime
import collections

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, GATConv, GATv2Conv, TransformerConv, SAGEConv, GraphConv, ResGatedGraphConv, ChebConv, global_mean_pool
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

from transformers import AutoTokenizer, get_scheduler
from transformers.models.bert.modeling_bert import BertModel

from sklearn.metrics import top_k_accuracy_score

In [None]:
import transformers
transformers.__version__

'4.18.0'

In [None]:
import torch_geometric as pyg
pyg.__version__

'2.2.0'

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# definitions

In [None]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: integer seed handed unchanged to ``random.seed``,
            ``np.random.seed`` and ``torch.manual_seed`` (in that order).
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

In [None]:
@dataclass
class myGNNoutput:
    """Bundle of the three values returned by ``myHomoGNN.forward``."""
    # NOTE: the ``None`` annotations only register the names as dataclass
    # fields; they are placeholders and do not describe the stored types.
    loss: None  # result of the model's loss function on (logit, y)
    logit: None  # output of the model's final linear classifier
    emb: None  # the representation that was fed to the classifier

In [None]:
def get_loader(df, add_syllables=False, col='pos_seqs', limit=None, batch_size=32, shuffle=True, max_length=128):
    """Turn a dataframe of parsed texts into a PyG ``DataLoader`` of graphs.

    Each row becomes one ``Data`` object holding:
      * ``edge_index``: a self loop on node 0, the row's ``homo_edges``
        (transposed to shape ``(2, E)``), and a self loop on node ``E + 1``;
      * ``text``: the tokens of ``row[col]`` joined by single spaces;
      * ``y``: the row's ``author`` label;
      * ``num_syllables`` (only if ``add_syllables``): the row's syllable
        counts padded with ``17`` on both ends;
      * ``doc_id`` (only if the column exists).

    Args:
        df: dataframe with at least ``homo_edges``, ``author`` and ``col``.
        add_syllables: also attach the ``num_syllables`` column.
        col: name of the column holding the token sequence.
        limit: if given, use a random subsample of ``limit`` rows.
        batch_size: forwarded to ``DataLoader``.
        shuffle: forwarded to ``DataLoader``.
        max_length: rows whose ``edge_index`` has more than this many
            columns are skipped (and counted in the printed summary).

    Returns:
        A ``DataLoader`` over the retained graphs.
    """
    if limit is not None:
        dfnew = df.sample(frac=1).reset_index(drop=True)[:limit]
    else:
        dfnew = df

    data_list = []
    count = 0
    for i in trange(len(dfnew), leave=False):
        # Read from the (possibly subsampled) frame; indexing ``df`` here would
        # silently ignore the shuffle and always take the first ``limit`` rows.
        curr = dfnew.iloc[i]

        # reshape(-1, 2) keeps an empty edge list two-dimensional so the
        # concatenation below also works for rows without any edges.
        edges = torch.tensor(curr['homo_edges'], dtype=torch.long).reshape(-1, 2).T
        last_node = len(curr['homo_edges']) + 1

        data = Data()
        data.edge_index = torch.cat([torch.tensor([[0], [0]]),  # for self loop of CLS token
                                     edges,
                                     # for batching purpose, if data.x is missing, edge_index is used to inference batch
                                     # an isolated node (the SEP in this case) will mess all up
                                     torch.tensor([[last_node], [last_node]])],
                                    dim=1)
        if data.edge_index.shape[1] > max_length:
            count += 1
            continue

        data.text = ' '.join(curr[col])
        data.y = torch.tensor([curr['author']])
        if add_syllables:
            # 17 is the syllable id used for the two extra end nodes.
            data.num_syllables = torch.tensor([17] + curr['num_syllables'] + [17])
        if 'doc_id' in curr:
            data.doc_id = torch.tensor([curr['doc_id']])
        data_list.append(data)

    print(f'{count} data dropped because of exceeding max_length {max_length}')
    loader = DataLoader(data_list, batch_size=batch_size, shuffle=shuffle)
    return loader


In [None]:
# Lookup table from the ``gnntype`` string accepted by ``myHomoGNN`` to the
# torch_geometric convolution class it instantiates.
GNNtype2layer = dict(
    GCNConv=GCNConv,
    ChebConv=ChebConv,
    SAGEConv=SAGEConv,
    GraphConv=GraphConv,
    ResGatedGraphConv=ResGatedGraphConv,
    GATConv=GATConv,
    GATv2Conv=GATv2Conv,
)

class myHomoGNN(torch.nn.Module):
    """BERT token embeddings -> stack of homogeneous GNN layers -> classifier.

    The texts of a batch are tokenized and encoded with a pretrained BERT
    model; the hidden states of all non-padding tokens become the node
    features of the batched graph, which is then processed by ``num_layers``
    graph convolutions, reduced to one vector per graph and classified.

    Args:
        num_layers: number of graph convolution layers.
        num_classes: size of the classifier output.
        add_self_loops: forwarded to the convolution for ``GCNConv``,
            ``GATConv`` and ``GATv2Conv``; ignored for the other layer types.
        gnntype: key into ``GNNtype2layer`` selecting the convolution class.
        add_syllables: add a learned embedding of each node's syllable count
            to its BERT embedding.
        checkpoint: local directory holding the tokenizer and BERT weights.
        max_length: truncation length handed to the tokenizer.
    """

    def __init__(self,
                 num_layers,
                 num_classes,
                 add_self_loops=False,
                 gnntype='GCNConv',
                 add_syllables=False,
                 checkpoint='/scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/',
                 max_length=256):

        super().__init__()
        self.checkpoint = checkpoint
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint, local_files_only=True)
        self.bert = BertModel.from_pretrained(self.checkpoint, local_files_only=True, add_pooling_layer = False).to(device)
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.add_self_loops = add_self_loops
        self.GNNlayer = GNNtype2layer[gnntype]
        # Node feature width equals the hidden size of the loaded BERT model
        # (the original notebook hard-coded 32 for its POS BERT checkpoint).
        self.pos_emb_dim = self.bert.config.hidden_size

        self.add_syllables = add_syllables
        if add_syllables:
            self.num_syllables = 18 # the longest word has 17 syllables
            self.syllable_emb_layer = nn.Embedding(self.num_syllables, self.pos_emb_dim)

        # Only some layer types take extra constructor arguments.
        if gnntype in ['GCNConv', 'GATConv', 'GATv2Conv']:
            layer_kwargs = {'add_self_loops': self.add_self_loops}
        elif gnntype == 'ChebConv':
            layer_kwargs = {'K': 2} # hard coded as 2
        else:
            layer_kwargs = {}

        self.gnns = nn.ModuleList(
            self.GNNlayer(self.pos_emb_dim, self.pos_emb_dim, **layer_kwargs)
            for _ in range(num_layers)
        )

        self.classifier = nn.Linear(self.pos_emb_dim, self.num_classes)
        self.lossfn = nn.CrossEntropyLoss()

    def forward(self, text, edge_index, batch, y, ptr, num_syllable=None, readout='pool'):
        """Classify a batch of graphs and compute the cross-entropy loss.

        Args:
            text: list of strings, one per graph in the batch.
            edge_index: batched edge index passed to every GNN layer.
            batch: node-to-graph assignment, used by the ``'pool'`` readout.
            y: target labels passed to the loss function.
            ptr: graph offsets into the node dimension; ``ptr[:-1]`` indexes
                the first node of each graph for the ``'cls'`` readout.
            num_syllable: per-node syllable ids; required when the model was
                built with ``add_syllables=True``.
            readout: ``'pool'`` (mean over the nodes of each graph) or
                ``'cls'`` (embedding of each graph's first node).

        Returns:
            ``myGNNoutput`` with the loss, the logits and the (dropout-applied)
            graph embeddings.

        Raises:
            ValueError: on an unknown ``readout`` or a missing ``num_syllable``.
        """
        # Fail early: with an unknown readout the node embeddings would reach
        # the classifier un-reduced and break (or mis-shape) the loss.
        if readout not in ('pool', 'cls'):
            raise ValueError(f"readout must be 'pool' or 'cls', got {readout!r}")
        if self.add_syllables and num_syllable is None:
            raise ValueError('num_syllable is required when add_syllables=True')

        tokens = self.tokenizer(text, padding=True, truncation=True, max_length=self.max_length, return_tensors='pt').to(device)
        x = self.bert(**tokens).last_hidden_state
        # Flatten (batch, seq, hidden) to one row per real token by dropping
        # the padded positions.
        x = x[tokens.attention_mask.bool()]

        if self.add_syllables:
            x = x + self.syllable_emb_layer(num_syllable)

        for gnn in self.gnns:
            x = F.relu(gnn(x, edge_index))

        if readout == 'pool':
            x = global_mean_pool(x, batch)
        else:  # 'cls'
            x = x[ptr[:-1], :]

        x = F.dropout(x, training=self.training)
        logit = self.classifier(x)
        loss = self.lossfn(logit, y)
        return myGNNoutput(loss=loss, logit=logit, emb=x)

In [None]:
# Columns of the processed CSV files that the run loop parses with
# ``ast.literal_eval`` after ``pd.read_csv``.
cols_to_eval = ['homo_edges', 'hetoro_edges', 'pos_seqs', 'upos_seqs', 'num_syllables']

# runs

In [None]:
# ---- sweep configuration -------------------------------------------------
max_length = 256   # used for both the loaders (sample filter) and the model (tokenizer truncation)
num_classes = 50   # number of author classes

epochs = 100
warmup_ratio = 0.15
monitering_metric = 'accuracy'  # NOTE(review): not read anywhere in this cell

LIMIT = [None]
NUM_LAYERS = [3]
LR = [1e-3]
READOUT = ['pool']
GNNTYPE = ['ChebConv']
ADD_SELF_LOOPS = [True]
ADD_SYLLABLES = [True, False]
NUM_SENT = [1,2,3]
REPEAT = list(range(5))

# Materialise the grid once so it can be both counted and iterated.
ARGS = list(itertools.product(LIMIT, NUM_LAYERS, LR, READOUT, GNNTYPE, ADD_SELF_LOOPS, ADD_SYLLABLES, NUM_SENT, REPEAT))
num_runs = len(ARGS)
run_pbar = trange(num_runs, leave=False)

# Runs with index <= skip_runs are skipped (i.e. skip_runs + 1 runs), which
# allows resuming a sweep after run ``skip_runs`` has finished.
skip_runs = 7


def load_split(num_sent_per_text, split):
    """Read one processed CCAT50 split and parse its literal-encoded columns."""
    frame = pd.read_csv(f'../../data/CCAT50/processed/author_all_sent_{num_sent_per_text}_{split}.csv')
    for col in cols_to_eval:
        frame[col] = frame[col].apply(ast.literal_eval)
    return frame


def run_batch(model, data, add_syllables, readout):
    """Run the model on one (already device-resident) batch."""
    num_syllable = data.num_syllables if add_syllables else None
    return model(data.text, data.edge_index, data.batch, data.y, data.ptr, num_syllable, readout=readout)


for i_run, args in enumerate(ARGS):

    if i_run <= skip_runs:
        run_pbar.update(1)
        continue
    limit, num_layers, lr, readout, gnntype, add_self_loops, add_syllables, num_sent_per_text, repeat = args

    # Time-based seed; it is logged to wandb below so a run can be reproduced.
    seed = int(datetime.now().timestamp())
    set_seed(seed)

    df = load_split(num_sent_per_text, 'train')
    df_val = load_split(num_sent_per_text, 'val')
    val_docid2index = {doc_id:i for i,doc_id in enumerate(df_val['doc_id'].unique())}

    # Validation order does not matter, so do not shuffle it.
    valid_loader = get_loader(df_val, add_syllables=add_syllables, shuffle=False, max_length=max_length)
    num_valid_steps = len(valid_loader)
    train_loader = get_loader(df, limit = limit, add_syllables=add_syllables, max_length=max_length)
    num_training_steps = len(train_loader)

    model = myHomoGNN(num_layers=num_layers,
                      num_classes=num_classes,
                      add_self_loops=add_self_loops,
                      gnntype=gnntype,
                      add_syllables=add_syllables,
                      # keep tokenizer truncation in sync with the loader filter
                      max_length=max_length
                     )

    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    scheduler = get_scheduler("linear",
                            optimizer=optimizer,
                            num_warmup_steps=int(warmup_ratio*epochs*num_training_steps),
                            num_training_steps=epochs*num_training_steps)

    wconfig = {
        'seed': seed,
        'num_sent_per_text': num_sent_per_text,
        'limit': limit,
        'num_layers': num_layers,
        'lr': lr,
        'readout': readout,
        'GNNtype': gnntype,
        'add_self_loops': add_self_loops,
        'add_syllables': add_syllables,
    }

    run = wandb.init(project="homo POS GNN (all authors, bert unfrozen)",
                     entity="fsu-dsc-cil",
                     dir='/scratch/data_jz17d/wandb_tmp/',
                     config=wconfig,
                     name=f'run_{i_run}',
                     reinit=True,
                     settings=wandb.Settings(start_method='thread'))

    best_evaluation = collections.defaultdict(float)
    pbar = trange(epochs*num_training_steps, leave=False)
    try:
        # compute() resets the metric, so one instance serves every epoch.
        metric = evaluate.load('/home/jz17d/Desktop/metrics/accuracy')
        for i_epoch in range(epochs):
            model.train()
            for data in train_loader:
                data = data.to(device)
                optimizer.zero_grad()
                output = run_batch(model, data, add_syllables, readout)
                output.loss.backward()
                optimizer.step()
                scheduler.step()
                pbar.update(1)

            model.eval()
            # Per-document vote counts; the tiny constant keeps documents
            # without any vote at a uniform score.
            doc_score = 1e-8*np.ones((len(val_docid2index), num_classes))
            doc_true = np.zeros(len(val_docid2index))
            with torch.no_grad():  # no autograd graph is needed for evaluation
                for data in valid_loader:
                    data = data.to(device)
                    output = run_batch(model, data, add_syllables, readout)

                    pred = output.logit.argmax(axis=-1).cpu().numpy()
                    true = data.y.cpu().numpy()
                    metric.add_batch(predictions=pred, references=true)

                    doc_id = np.vectorize(val_docid2index.get)(data.doc_id.cpu().numpy())
                    # ``doc_score[doc_id, pred] += 1`` would count a repeated
                    # (document, class) pair within one batch only once.
                    np.add.at(doc_score, (doc_id, pred), 1)
                    doc_true[doc_id] = true

            # logging
            evaluation = metric.compute()
            for k in range(1, 6):
                evaluation.update({f'doc_acc@{k}': top_k_accuracy_score(doc_true, doc_score, k=k,
                                                                       labels=np.arange(num_classes))})
            wandb.log(evaluation, step=pbar.n)

            # logging best
            for key in evaluation:
                best_evaluation[f'best_{key}'] = max(best_evaluation[f'best_{key}'], evaluation[key])
            wandb.log(best_evaluation, step=pbar.n)
    except BaseException:
        # Close the wandb run as failed instead of leaving it dangling.
        run.finish(exit_code=1)
        raise
    else:
        run.finish()
    finally:
        pbar.close()

    run_pbar.update(1)

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

3 data dropped because of exceeding max_length 256


Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

  0%|          | 0/96700 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_accuracy,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@1,▁▁▂▂▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@2,▁▁▂▂▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@3,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@4,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇███████████████
best_doc_acc@5,▁▁▂▃▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇███████████████
doc_acc@1,▁▁▂▂▄▄▄▅▅▅▆▆▆▆▇▇▆▆▇▇▇▆▇▇▇▇▇█▇▇██████▇▇▇█
doc_acc@2,▁▁▂▂▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▆▇▇▇▇▇█▇▇█▇████████
doc_acc@3,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████

0,1
accuracy,0.18451
best_accuracy,0.19061
best_doc_acc@1,0.35815
best_doc_acc@2,0.46076
best_doc_acc@3,0.52515
best_doc_acc@4,0.56942
best_doc_acc@5,0.61167
doc_acc@1,0.33199
doc_acc@2,0.43461
doc_acc@3,0.50302


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▃▄▄▅▅▅▆▆▆▇▆▆▇▇▇▇▇▇▇▇▇▇▇██████████████
best_accuracy,▁▁▂▃▄▄▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████████████████
best_doc_acc@1,▁▁▂▃▃▃▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████
best_doc_acc@2,▁▁▂▃▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@3,▁▁▂▃▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@4,▁▁▃▃▄▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████████████
best_doc_acc@5,▁▁▃▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇██████████████████
doc_acc@1,▁▁▂▃▃▄▄▅▅▅▅▅▆▆▅▆▇▆▇▆▆▇▇▇▇▇▇▇▇█▇▇█▇█▇████
doc_acc@2,▁▁▂▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇████▇▇█▇██████
doc_acc@3,▁▁▂▃▄▃▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇█▇▇███▇█▇███████

0,1
accuracy,0.1945
best_accuracy,0.19541
best_doc_acc@1,0.38028
best_doc_acc@2,0.46076
best_doc_acc@3,0.52515
best_doc_acc@4,0.58551
best_doc_acc@5,0.61771
doc_acc@1,0.37223
doc_acc@2,0.45674
doc_acc@3,0.51509


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇████████████████
best_accuracy,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇██████████████████
best_doc_acc@1,▁▁▂▃▃▃▄▅▅▅▅▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@2,▁▁▂▃▃▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@3,▁▁▂▃▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@4,▁▁▂▃▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@5,▁▁▂▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇████████████████
doc_acc@1,▁▁▂▃▃▃▅▅▅▅▅▅▆▆▆▆▇▇▇▇▆▇▇▇▇▇█▇██▇█████████
doc_acc@2,▁▁▂▃▃▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████
doc_acc@3,▁▁▂▃▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇█▇▇█████████

0,1
accuracy,0.22552
best_accuracy,0.22913
best_doc_acc@1,0.41093
best_doc_acc@2,0.52227
best_doc_acc@3,0.60324
best_doc_acc@4,0.65587
best_doc_acc@5,0.67409
doc_acc@1,0.38866
doc_acc@2,0.51417
doc_acc@3,0.59312


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▃▃▄▃▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇██▇▇███████████
best_accuracy,▁▁▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇███████████████
best_doc_acc@1,▁▁▃▃▃▃▄▄▅▅▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@2,▁▁▃▃▃▃▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇██████████████
best_doc_acc@3,▁▁▃▃▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇█████████████████
best_doc_acc@4,▁▁▃▃▄▄▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇███████████████████
best_doc_acc@5,▁▁▂▃▄▄▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇███████████████████
doc_acc@1,▁▁▃▃▃▃▄▄▅▅▅▅▅▇▆▇▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████▇█
doc_acc@2,▁▁▃▃▃▃▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██▇██████
doc_acc@3,▁▁▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇▆▇▇▇▇▇█▇█▇▇▇███████████

0,1
accuracy,0.23495
best_accuracy,0.23703
best_doc_acc@1,0.417
best_doc_acc@2,0.53846
best_doc_acc@3,0.60121
best_doc_acc@4,0.64777
best_doc_acc@5,0.68421
doc_acc@1,0.40688
doc_acc@2,0.52834
doc_acc@3,0.58907


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇███▇████████████
best_accuracy,▁▁▃▃▄▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇███████████████████
best_doc_acc@1,▁▁▂▃▃▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@2,▁▁▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@3,▁▁▃▃▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@4,▁▁▃▃▄▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███████████████████
best_doc_acc@5,▁▁▃▃▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
doc_acc@1,▁▁▂▃▃▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇█▇▇█▇▇▇█▇▇
doc_acc@2,▁▁▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇███▇█▇██████
doc_acc@3,▁▁▃▃▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████

0,1
accuracy,0.22566
best_accuracy,0.23287
best_doc_acc@1,0.42105
best_doc_acc@2,0.51619
best_doc_acc@3,0.58704
best_doc_acc@4,0.63158
best_doc_acc@5,0.66802
doc_acc@1,0.37449
doc_acc@2,0.49393
doc_acc@3,0.57085


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▃▃▄▄▅▅▅▅▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_accuracy,▁▁▂▃▃▄▄▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@1,▁▁▂▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@2,▁▁▂▃▃▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@3,▁▁▂▃▃▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@4,▁▁▂▃▄▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@5,▁▁▂▃▄▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
doc_acc@1,▁▁▂▃▂▃▄▅▅▅▅▆▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇▇███▇▇███████
doc_acc@2,▁▁▂▃▃▃▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█▇██████████
doc_acc@3,▁▁▂▃▃▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████████

0,1
accuracy,0.23537
best_accuracy,0.23925
best_doc_acc@1,0.40688
best_doc_acc@2,0.53846
best_doc_acc@3,0.61336
best_doc_acc@4,0.66599
best_doc_acc@5,0.70243
doc_acc@1,0.38462
doc_acc@2,0.51619
doc_acc@3,0.58704


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▃▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇███▇█████████████
best_accuracy,▁▁▂▃▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇█████████████████
best_doc_acc@1,▁▁▂▃▃▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@2,▁▁▂▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@3,▁▁▂▃▄▄▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@4,▁▁▂▃▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@5,▁▁▂▃▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
doc_acc@1,▁▁▂▃▄▄▄▄▅▅▅▆▆▆▇▆▆▆▆▇▇▇▆▇▇▇▇███▇█▇█▇▇████
doc_acc@2,▁▁▂▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██▇███▇█████
doc_acc@3,▁▁▃▃▄▅▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇██▇▇████████████

0,1
accuracy,0.23481
best_accuracy,0.23648
best_doc_acc@1,0.417
best_doc_acc@2,0.51822
best_doc_acc@3,0.58704
best_doc_acc@4,0.64777
best_doc_acc@5,0.7004
doc_acc@1,0.40081
doc_acc@2,0.49798
doc_acc@3,0.5749


  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/103000 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▃▄▄▄▅▅▆▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_accuracy,▁▁▂▃▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@1,▁▁▂▂▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██
best_doc_acc@2,▁▁▂▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████
best_doc_acc@3,▁▁▂▃▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████
best_doc_acc@4,▁▁▂▃▄▄▄▅▅▅▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@5,▁▁▂▃▄▄▄▅▅▅▅▅▆▇▇▇▇▇▇▇▇▇▇▇▇███████████████
doc_acc@1,▁▁▂▂▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▆▇▇▇▇▇▇▇█████
doc_acc@2,▁▁▂▃▃▄▄▄▄▅▅▅▅▆▅▆▆▆▆▆▇▇▇▆▇▇▇▆▇█▇▇▇▇██▇███
doc_acc@3,▁▁▂▃▄▄▄▅▅▅▅▅▅▆▅▆▆▆▆▆▇▇▆▇▇▇▇▇▇▇█▇█▇▇█████

0,1
accuracy,0.12355
best_accuracy,0.12477
best_doc_acc@1,0.266
best_doc_acc@2,0.344
best_doc_acc@3,0.402
best_doc_acc@4,0.446
best_doc_acc@5,0.478
doc_acc@1,0.258
doc_acc@2,0.334
doc_acc@3,0.386


  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/103000 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▃▄▄▅▅▅▅▅▅▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_accuracy,▁▂▃▃▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@1,▁▁▂▃▃▃▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@2,▁▁▂▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇███████████████
best_doc_acc@3,▁▁▂▃▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇███████████████
best_doc_acc@4,▁▁▂▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇███████████████
best_doc_acc@5,▁▁▂▂▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇███████████████
doc_acc@1,▁▁▂▃▃▃▄▄▅▅▅▅▅▅▅▆▆▆▇▆▇▇▆▇▇▇▇█▆▇▇▇▇▇█▇▇▇▇█
doc_acc@2,▁▁▂▃▄▄▄▄▅▅▄▅▆▆▅▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇██▇▇███▇██
doc_acc@3,▁▁▂▃▄▄▅▄▅▅▄▅▆▆▆▆▆▆▆▇▇▇▇▇▇█▇▇▇▇█▇█▇▇█████

0,1
accuracy,0.12794
best_accuracy,0.12867
best_doc_acc@1,0.258
best_doc_acc@2,0.34
best_doc_acc@3,0.39
best_doc_acc@4,0.432
best_doc_acc@5,0.476
doc_acc@1,0.258
doc_acc@2,0.336
doc_acc@3,0.384


  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

1 data dropped because of exceeding max_length 256


Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/103000 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇█▇█▇█████████
best_accuracy,▁▁▂▃▃▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@1,▁▁▂▂▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@2,▁▁▁▂▂▄▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@3,▁▁▁▂▃▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@4,▁▁▁▂▃▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@5,▁▁▁▂▃▃▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇██████████
doc_acc@1,▁▁▂▃▃▃▄▃▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇██▇█████
doc_acc@2,▁▁▁▂▃▄▃▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████▇█████
doc_acc@3,▁▁▁▂▃▄▃▄▄▅▅▅▅▅▆▆▆▆▇▇▆▇▇▆▇▇▇▇▇▇▇███▇█████

0,1
accuracy,0.12367
best_accuracy,0.12514
best_doc_acc@1,0.258
best_doc_acc@2,0.352
best_doc_acc@3,0.408
best_doc_acc@4,0.456
best_doc_acc@5,0.5
doc_acc@1,0.246
doc_acc@2,0.33
doc_acc@3,0.386


  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

1 data dropped because of exceeding max_length 256


Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.033350507418314614, max=1.0…

  0%|          | 0/103000 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▃▄▃▄▅▅▅▅▅▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇██████████
best_accuracy,▁▁▂▃▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@1,▁▁▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇████████████████
best_doc_acc@2,▁▁▂▃▄▄▄▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@3,▁▁▂▃▄▄▄▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇█████████████
best_doc_acc@4,▁▁▂▃▄▄▄▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@5,▁▁▂▃▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇████████████
doc_acc@1,▁▁▂▃▃▃▄▄▄▄▄▄▅▅▆▆▆▆▆▆▆▇▆▇▇▇▇▇▇██▇█▇██████
doc_acc@2,▁▁▂▃▄▃▄▅▅▆▅▅▆▅▆▆▆▆▆▆▆▇▆▇▇▇▇▇▇█▇▇▇▇▇███▇█
doc_acc@3,▁▁▂▃▄▃▄▅▄▅▅▅▅▅▅▆▅▆▆▆▆▇▆▇▆▇▇▇▇▇▇▇▇▇▇███▇█

0,1
accuracy,0.12294
best_accuracy,0.12392
best_doc_acc@1,0.25
best_doc_acc@2,0.324
best_doc_acc@3,0.39
best_doc_acc@4,0.426
best_doc_acc@5,0.458
doc_acc@1,0.242
doc_acc@2,0.312
doc_acc@3,0.376


  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/103000 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▂▃▄▄▅▅▅▅▆▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇█▇▇█████████
best_accuracy,▁▂▂▃▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@1,▁▁▂▂▃▃▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@2,▁▁▂▂▃▃▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@3,▁▁▂▂▃▄▄▄▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@4,▁▁▂▃▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇█████████████████
best_doc_acc@5,▁▁▂▃▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████
doc_acc@1,▁▁▂▂▃▄▅▄▄▅▅▅▆▆▅▆▆▆▆▆▇▇▇▇▇▆▇▇▇▇▇▇███▇▇▇██
doc_acc@2,▁▁▂▂▄▄▅▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▆▇▇▇▇▇▇████▇█▇▇█
doc_acc@3,▁▁▂▂▃▄▄▄▄▅▅▅▆▆▆▆▆▆▅▆▆▆▇▇▆▇▇▇▇▇▇████▇▇▇██

0,1
accuracy,0.11844
best_accuracy,0.11904
best_doc_acc@1,0.242
best_doc_acc@2,0.322
best_doc_acc@3,0.374
best_doc_acc@4,0.414
best_doc_acc@5,0.448
doc_acc@1,0.226
doc_acc@2,0.312
doc_acc@3,0.362


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▃▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇████████████
best_accuracy,▁▂▂▃▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@1,▁▁▂▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@2,▁▁▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@3,▁▁▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@4,▁▁▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@5,▁▁▃▄▄▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇████████████████
doc_acc@1,▁▁▂▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▆▆▆▇▇▇▇▇▇▇▇▇▇▇████
doc_acc@2,▁▁▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▇▆▆▇▇▆▇▇▇▇▇█▇▇▇████████
doc_acc@3,▁▁▃▄▄▄▅▅▅▅▆▅▆▆▆▆▆▇▇▆▇▇▇▇▇▇▇███▇█████████

0,1
accuracy,0.17283
best_accuracy,0.17841
best_doc_acc@1,0.33602
best_doc_acc@2,0.42254
best_doc_acc@3,0.47887
best_doc_acc@4,0.52515
best_doc_acc@5,0.55332
doc_acc@1,0.31992
doc_acc@2,0.4004
doc_acc@3,0.45875


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▃▄▄▅▅▅▅▆▅▆▆▇▆▇▇▇▇▇▇▇█▇▇█▇████████████
best_accuracy,▁▁▂▃▄▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇█████████████████
best_doc_acc@1,▁▁▂▃▃▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@2,▁▁▂▃▃▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@3,▁▁▂▃▃▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@4,▁▁▂▃▃▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@5,▁▁▂▃▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇████████████████
doc_acc@1,▁▁▂▃▃▄▄▄▅▅▆▄▆▆▆▅▆▇▆▇▆▇▇▇▇▇█▇▇██▇█▇▇█████
doc_acc@2,▁▁▂▃▃▄▄▅▅▅▆▅▆▆▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇████▇██████
doc_acc@3,▁▁▂▃▃▄▅▅▅▅▆▅▆▆▇▆▆▇▇▇▇▇▇▇▇▇█▇████████████

0,1
accuracy,0.15596
best_accuracy,0.15648
best_doc_acc@1,0.29175
best_doc_acc@2,0.39235
best_doc_acc@3,0.44869
best_doc_acc@4,0.49296
best_doc_acc@5,0.52918
doc_acc@1,0.29175
doc_acc@2,0.37827
doc_acc@3,0.43058


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▃▃▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇█▇█▇████████
best_accuracy,▁▁▂▃▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@1,▁▁▂▂▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@2,▁▁▂▂▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇███████████████
best_doc_acc@3,▁▁▂▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███████████████
best_doc_acc@4,▁▁▁▃▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇███████████████
best_doc_acc@5,▁▁▂▃▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███████████████████
doc_acc@1,▁▁▂▂▃▄▄▄▅▅▅▅▅▆▅▅▆▆▆▇▆▆▇▇▇▇▇▇▇▇▇█▇▇▇█▇███
doc_acc@2,▁▁▂▂▃▄▄▄▅▅▅▅▆▆▅▆▆▆▇▇▇▇▇▇▇█▇▇▇▇▇█████████
doc_acc@3,▁▁▂▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇█▇▇▇▇██████████

0,1
accuracy,0.17075
best_accuracy,0.17088
best_doc_acc@1,0.32998
best_doc_acc@2,0.41247
best_doc_acc@3,0.4829
best_doc_acc@4,0.52113
best_doc_acc@5,0.55332
doc_acc@1,0.32193
doc_acc@2,0.39839
doc_acc@3,0.46881


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇█▇█▇█████████████
best_accuracy,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇█████████████████
best_doc_acc@1,▁▁▂▂▃▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@2,▁▁▂▃▃▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@3,▁▁▂▃▃▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@4,▁▁▂▃▃▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@5,▁▁▂▃▃▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇██████████████
doc_acc@1,▁▁▂▂▃▄▄▃▄▄▅▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇████▇▇████
doc_acc@2,▁▁▂▃▃▄▄▄▅▅▅▅▅▅▆▆▆▇▆▆▆▇▆▇▇▇▇▇▇▇▇▇█▇▇▇████
doc_acc@3,▁▁▂▃▃▄▅▄▅▅▅▅▅▅▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇██▇▇████

0,1
accuracy,0.17088
best_accuracy,0.17283
best_doc_acc@1,0.34004
best_doc_acc@2,0.44266
best_doc_acc@3,0.49698
best_doc_acc@4,0.54326
best_doc_acc@5,0.57948
doc_acc@1,0.34004
doc_acc@2,0.42656
doc_acc@3,0.47686


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▇▆▆▇▇▇▇▇▇▇▇█▇███▇█████████
best_accuracy,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇███████████████
best_doc_acc@1,▁▁▂▂▃▄▄▅▅▅▅▆▆▆▇▇▇▇▇▇▇███████████████████
best_doc_acc@2,▁▁▂▂▃▃▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████
best_doc_acc@3,▁▁▂▂▄▄▄▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████
best_doc_acc@4,▁▁▂▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@5,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇██████████████
doc_acc@1,▁▁▂▂▃▄▄▄▅▄▅▅▆▆▇▆▆▇▆▆▇▇██▇█▇████████▇████
doc_acc@2,▁▁▂▂▄▄▄▄▅▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇█▇██████
doc_acc@3,▁▁▂▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇█▇▇█████████

0,1
accuracy,0.17568
best_accuracy,0.17633
best_doc_acc@1,0.31992
best_doc_acc@2,0.45473
best_doc_acc@3,0.51107
best_doc_acc@4,0.54728
best_doc_acc@5,0.58551
doc_acc@1,0.31791
doc_acc@2,0.44064
doc_acc@3,0.49497


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▃▃▄▅▅▅▅▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇███████▇████████
best_accuracy,▁▁▃▃▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@1,▁▁▂▂▃▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇███████████████
best_doc_acc@2,▁▁▂▃▃▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@3,▁▁▂▃▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@4,▁▁▂▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@5,▁▁▂▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇████████████████
doc_acc@1,▁▁▂▂▃▄▄▄▅▅▅▆▆▆▅▆▆▆▆▇▇▇▇▆▇█▇▇▇█▇███▇▇████
doc_acc@2,▁▁▂▂▃▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▆▇▇█▇██▇▇▇███████
doc_acc@3,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▆▇▇█▇█▇▇▇██▇▇████

0,1
accuracy,0.18044
best_accuracy,0.18169
best_doc_acc@1,0.33603
best_doc_acc@2,0.43117
best_doc_acc@3,0.5
best_doc_acc@4,0.53846
best_doc_acc@5,0.57692
doc_acc@1,0.33401
doc_acc@2,0.42105
doc_acc@3,0.47166


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▃▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_accuracy,▁▁▃▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@1,▁▁▂▃▃▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@2,▁▁▃▃▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@3,▁▁▃▃▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@4,▁▁▃▃▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@5,▁▁▃▃▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇████████████████
doc_acc@1,▁▁▂▃▃▃▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████▇▇
doc_acc@2,▁▁▃▃▄▄▅▄▅▅▅▅▆▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████
doc_acc@3,▁▁▃▃▄▄▅▅▅▆▆▅▆▆▆▆▇▇▆▇▇▇▇▇▇▇▇██▇▇█████████

0,1
accuracy,0.20374
best_accuracy,0.20555
best_doc_acc@1,0.36842
best_doc_acc@2,0.46559
best_doc_acc@3,0.53036
best_doc_acc@4,0.57085
best_doc_acc@5,0.61741
doc_acc@1,0.34211
doc_acc@2,0.45344
doc_acc@3,0.52632


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▃▄▄▄▅▅▅▆▆▆▇▆▇▇▇▇▇▇██▇▇███████████████
best_accuracy,▁▂▃▃▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇█████████████████
best_doc_acc@1,▁▁▃▃▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇████████████████
best_doc_acc@2,▁▁▂▃▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@3,▁▁▂▃▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@4,▁▁▂▃▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@5,▁▁▃▃▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇██████████████████
doc_acc@1,▁▁▃▃▄▄▄▄▅▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇████▇▇██▇██
doc_acc@2,▁▁▂▃▄▄▄▄▅▅▆▅▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇█████████████
doc_acc@3,▁▁▂▃▄▅▅▄▆▅▆▆▆▆▆▇▆▇▇▇▇▇█▇██▇█████████████

0,1
accuracy,0.19501
best_accuracy,0.19806
best_doc_acc@1,0.34818
best_doc_acc@2,0.44332
best_doc_acc@3,0.51822
best_doc_acc@4,0.56883
best_doc_acc@5,0.59717
doc_acc@1,0.32389
doc_acc@2,0.43117
doc_acc@3,0.49393


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▂▃▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█▇████████
best_accuracy,▁▂▂▃▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇███████████████
best_doc_acc@1,▁▁▂▃▃▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@2,▁▁▂▃▃▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇███████████████
best_doc_acc@3,▁▁▂▃▃▄▅▅▅▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@4,▁▁▂▃▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
best_doc_acc@5,▁▁▂▃▄▄▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
doc_acc@1,▁▁▂▃▃▄▅▄▅▅▅▅▅▆▆▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇█▇▇▇▇▇████
doc_acc@2,▁▁▂▃▃▄▅▄▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇██▇▇██████
doc_acc@3,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇▇▇▇▇▇█████

0,1
accuracy,0.18558
best_accuracy,0.18558
best_doc_acc@1,0.35223
best_doc_acc@2,0.4413
best_doc_acc@3,0.5081
best_doc_acc@4,0.54453
best_doc_acc@5,0.58502
doc_acc@1,0.34818
doc_acc@2,0.4332
doc_acc@3,0.5


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇█▇█████████████████
best_accuracy,▁▂▃▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███████████████████
best_doc_acc@1,▁▁▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█████████████████
best_doc_acc@2,▁▁▃▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███████████████████
best_doc_acc@3,▁▁▃▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@4,▁▁▂▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███████████████████
best_doc_acc@5,▁▁▂▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███████████████████
doc_acc@1,▁▁▃▄▃▄▄▅▄▅▅▅▅▆▅▆▆▆▇▇▇▇▇█▇▇▇█▇▇█▇█▇██▇███
doc_acc@2,▁▁▃▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇██▇█████████▇██████
doc_acc@3,▁▁▃▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇█▇▇███████▇█▇██████

0,1
accuracy,0.19196
best_accuracy,0.19431
best_doc_acc@1,0.35628
best_doc_acc@2,0.44332
best_doc_acc@3,0.51012
best_doc_acc@4,0.5668
best_doc_acc@5,0.60324
doc_acc@1,0.33401
doc_acc@2,0.41903
doc_acc@3,0.50202
