In [None]:
from tqdm.auto import trange, tqdm
import pandas as pd
import ast
import itertools
from transformers import get_scheduler
import torch
import wandb
import evaluate
from itertools import cycle
import numpy as np
import random
import time
from datetime import datetime
import collections
from sklearn.metrics import top_k_accuracy_score

In [None]:
import torch_geometric.transforms as T
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from transformers import AutoTokenizer
from transformers.models.bert.modeling_bert import BertModel
import torch

In [None]:
import transformers
transformers.__version__

'4.18.0'

In [None]:
import torch_geometric as pyg
pyg.__version__

'2.2.0'

In [None]:
# Use CUDA when it is available, otherwise fall back to the CPU.
# This module-level `device` is read as a global by the model and training cells below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Definitions

In [None]:
def set_seed(seed):
    """Seed the `random`, NumPy and PyTorch global RNGs with the same value."""
    # Same order as before: Python stdlib, then NumPy, then PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

In [None]:
from dataclasses import dataclass
@dataclass
class myGNNoutput:
    """Bundle of values returned by one forward pass of the GNN classifier."""
    # Loss computed from `logit` and the target labels.
    loss: torch.Tensor
    # Unnormalised class scores produced by the classifier layer.
    logit: torch.Tensor
    # Features that were fed to the classifier layer (after readout and dropout).
    emb: torch.Tensor

In [None]:
# English-specific dependency relations: https://universaldependencies.org/en/dep/
# The table below is pasted from that page. Cells are separated by whitespace and
# sub-typed relations (e.g. `nsubj:pass`) are marked with a leading '↳'.
UD_RELATION_TABLE = '''nsubj 	csubj
↳nsubj:pass 	↳csubj:pass
↳nsubj:outer 	↳csubj:outer
obj 	ccomp 	xcomp
iobj
obl 	advcl 	advmod
↳obl:npmod 	↳advcl:relcl
↳obl:tmod
vocative 	aux 	mark
discourse 	↳aux:pass
expl 	cop
nummod 	acl 	amod
  	↳acl:relcl
appos 	  	det
  	  	↳det:predet
nmod 	  	 
↳nmod:npmod
↳nmod:tmod
↳nmod:poss
compound 	flat
↳compound:prt 	↳flat:foreign
fixed 	goeswith
conj 	cc
  	↳cc:preconj
list 	parataxis 	orphan
dislocated 		reparandum
root 	punct 	dep'''

all_relations = []
for table_row in UD_RELATION_TABLE.split('\n'):
    for cell in table_row.split():
        # Skip only the sub-typed cells. Skipping the whole row (as an earlier
        # version did) silently loses base relations that share a row with a
        # sub-type, e.g. `discourse` next to `↳aux:pass`.
        if cell.startswith('↳'):
            continue
        all_relations.append(cell.split(':')[0])

# The edge labels in the data spell the root relation in upper case
# (presumably the parser's convention -- confirm against the parsed CSVs).
if 'root' in all_relations:
    all_relations.remove('root')
    all_relations.append('ROOT')

# `case` does not appear in the table above, so it is added by hand.
if 'case' not in all_relations:
    all_relations.append('case')

# Sorted and de-duplicated so that the relation -> id mapping is stable.
all_relations = sorted(set(all_relations))

In [None]:
# Map every dependency relation to its position in the sorted relation list.
relation2id = {relation: index for index, relation in enumerate(all_relations)}

In [None]:
# Extra edge type used for self loops. Its id is the first one after the real
# relations, derived from the table size so it cannot collide if the table changes
# (re-running this cell yields the same id).
relation2id['self'] = len(all_relations)

In [None]:
# Sanity check: one id per dependency relation plus the extra 'self' type.
len(relation2id)

37

In [None]:
def get_loader(df, add_syllables=False, col='pos_seqs', limit=None, batch_size=32, shuffle=True, max_length=128):
    """Turn the parsed rows of `df` into a PyG `DataLoader` of dependency graphs.

    Every row becomes one `Data` object carrying:
      * `edge_index`    -- a self loop on node 0, the row's `homo_edges`, and a self
                           loop on node `len(homo_edges) + 1`;
      * `edge_type_ids` -- the relation id of every edge (sub-types such as
                           `nsubj:pass` are collapsed to their base relation), with the
                           'self' id for the two self loops;
      * `text`          -- the tokens of `row[col]` joined by single spaces;
      * `y`             -- the row's `author` label;
      * `num_syllables` -- only when `add_syllables` is true;
      * `doc_id`        -- only when the row has a `doc_id` entry.

    Args:
        df: DataFrame with at least `homo_edges`, `hetoro_edges`, `author` and `col`.
        add_syllables: also attach the per-token syllable counts.
        col: name of the column holding the token sequence that is joined into `text`.
        limit: if given, keep a random subset of at most `limit` rows.
        batch_size: batch size of the returned loader.
        shuffle: whether the returned loader shuffles its graphs.
        max_length: rows whose graph has `max_length - 1` or more edges are dropped.

    Returns:
        A `torch_geometric.loader.DataLoader` over the graphs that were kept.
    """
    if limit is not None:
        # Shuffle first so that `limit` keeps a random subset instead of the head of `df`.
        rows = df.sample(frac=1).reset_index(drop=True)[:limit]
    else:
        rows = df

    self_loop_id = relation2id['self']
    data_list = []
    num_dropped = 0
    for i in trange(len(rows), leave=False):
        # Read from `rows`, not `df`: indexing `df` here would ignore the sampling above.
        curr = rows.iloc[i]
        # Node 0 is the CLS token; the node after the last edge target is the SEP token.
        # assumes one incoming edge per word so that words occupy nodes 1..len(homo_edges)
        # -- TODO confirm against the preprocessing script
        sep_index = len(curr['homo_edges']) + 1

        data = Data()
        data.edge_index = torch.cat([
            torch.tensor([[0], [0]]),  # self loop of the CLS token
            # reshape(-1, 2) keeps an empty edge list concatenable as a (2, 0) tensor
            torch.tensor(curr['homo_edges'], dtype=torch.long).reshape(-1, 2).T,
            # For batching purposes: `data.x` is missing, so PyG infers the batch from
            # `edge_index`; an isolated node (the SEP here) would break that inference.
            torch.tensor([[sep_index], [sep_index]]),
        ], dim=1)
        data.edge_type_ids = torch.tensor(
            [self_loop_id]
            + [relation2id[t.split(':')[0]] for t in curr['hetoro_edges']]
            + [self_loop_id]
        )
        if data.edge_index.shape[1] >= max_length-1:
            num_dropped += 1
            continue

        data.text = ' '.join(curr[col])
        data.y = torch.tensor([curr['author']])
        if add_syllables:
            # 17 is the value used for the CLS and SEP positions.
            # NOTE(review): the model's syllable embedding has 18 rows, so 17 is its last
            # index -- confirm it cannot collide with a real 17-syllable word.
            data.num_syllables = torch.tensor([17]+curr['num_syllables']+[17])

        if 'doc_id' in curr:
            data.doc_id = torch.tensor([curr['doc_id']])
        data_list.append(data)
    print(f'{num_dropped} data dropped because of exceeding max_length {max_length}')
    loader = DataLoader(data_list, batch_size=batch_size, shuffle=shuffle)
    return loader



In [None]:
def preprocess_author_ids(df):
    """Shift the `author` ids in place so that the smallest id becomes 0.

    The `author` column of `df` is overwritten and the same DataFrame is returned.
    Raises AssertionError when `df` has no `author` column.
    """
    assert 'author' in df, 'no column named "author" found in df'

    authors = df['author']
    highest = authors.max()
    lowest = authors.min()
    # old id -> new id for every integer in [lowest, highest]
    shifted = {old_id: new_id for new_id, old_id in enumerate(range(lowest, highest + 1))}
    df['author'] = authors.map(shifted)

    return df

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, GATv2Conv, TransformerConv, PDNConv, global_mean_pool

# Lookup from the `gnntype` option string to the PyG layer class it selects.
GNNtype2layer = dict(
    GATConv=GATConv,
    GATv2Conv=GATv2Conv,
    TransformerConv=TransformerConv,
    PDNConv=PDNConv,
)

class myHeteroGNN(torch.nn.Module):
    """Graph classifier: BERT token embeddings refined by edge-aware graph layers.

    The forward pass tokenizes the input text, encodes it with the BERT checkpoint,
    keeps one embedding per non-padding token as the node features, runs `num_layers`
    graph layers that receive a learned embedding of every edge's type as `edge_attr`,
    reads out one vector per graph and classifies it with a linear layer.

    Args:
        num_layers: number of graph layers.
        num_classes: size of the classifier output.
        num_dep_type: number of distinct edge-type ids (rows of the edge embedding).
        heads: number of attention heads per graph layer.
        hidden_dim: per-head output width of every graph layer.
        dep_emb_dim: width of the edge-type embedding.
        add_self_loops: forwarded to the first graph layer only (see note in `__init__`).
        gnntype: key of `GNNtype2layer` selecting the graph layer class.
        add_syllables: when truthy, a syllable-count embedding is added to the node features.
        checkpoint: local directory of the tokenizer / BERT checkpoint.
        max_length: truncation length handed to the tokenizer.
    """

    def __init__(self, 
                 num_layers, 
                 num_classes, 
                 num_dep_type, 
                 heads, 
                 hidden_dim, 
                 dep_emb_dim=32, 
                 add_self_loops=False, 
                 gnntype='GATConv', 
                 add_syllables=None,
                 checkpoint='/scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/',
                 max_length=256):
        
        super().__init__()
        self.checkpoint = checkpoint
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint, local_files_only=True)
        self.bert = BertModel.from_pretrained(self.checkpoint, local_files_only=True, add_pooling_layer = False).to(device)
        self.num_layers = num_layers
        self.num_classes = num_classes
        # Node feature width is whatever the loaded BERT emits; reading it from the
        # config keeps the model usable with checkpoints of a different hidden size.
        self.pos_emb_dim = self.bert.config.hidden_size
        self.heads = heads
        self.hidden_dim = hidden_dim
        self.dep_emb_dim = dep_emb_dim
        self.add_syllables = add_syllables
        
        if add_syllables:
            self.num_syllables = 18 # the longest word has 17 syllables
            self.syllable_emb_layer = nn.Embedding(self.num_syllables, self.pos_emb_dim)
            
        self.GNNlayer = GNNtype2layer[gnntype]
        
        self.add_self_loops = add_self_loops
        self.dep_emb_layer = nn.Embedding(num_dep_type, self.dep_emb_dim)
        
        self.gnns = nn.ModuleList()
        self.gnns.append(self.GNNlayer(self.pos_emb_dim, self.hidden_dim, heads = self.heads, add_self_loops=self.add_self_loops, edge_dim=self.dep_emb_dim))
        # NOTE(review): only the first layer receives `add_self_loops`; the remaining
        # layers use the layer class's own default. Left unchanged so that existing
        # results stay comparable -- confirm whether this asymmetry is intended.
        for _ in range(self.num_layers-1):
            # Heads are concatenated, so every later layer sees hidden_dim * heads inputs.
            self.gnns.append(self.GNNlayer(self.hidden_dim * self.heads, self.hidden_dim, heads = self.heads, edge_dim=self.dep_emb_dim))
        
        self.classifier = nn.Linear(self.hidden_dim * self.heads, self.num_classes)
        self.lossfn = nn.CrossEntropyLoss()
        
    def forward(self, text, edge_index, edge_type_ids, batch, y, ptr, num_syllable=None, readout='pool'):
        """Classify a batch of graphs and compute the cross-entropy loss.

        Args:
            text: list of strings, one per graph, handed to the tokenizer.
            edge_index: batched edge index of the graphs.
            edge_type_ids: edge-type id of every edge in `edge_index`.
            batch: graph assignment of every node (used by the 'pool' readout).
            y: target class of every graph.
            ptr: node offsets of the graphs (used by the 'cls' readout).
            num_syllable: per-node syllable ids; required when `add_syllables` is set.
            readout: 'pool' averages all nodes of a graph, 'cls' takes its first node.

        Returns:
            `myGNNoutput` with the loss, the logits and the features fed to the classifier.

        Raises:
            ValueError: on an unknown `readout`, a missing `num_syllable`, or when the
                number of tokens does not match the number of graph nodes.
        """
        if readout not in ('pool', 'cls'):
            # Without a readout the classifier would be applied to node-level features.
            raise ValueError(f"readout must be 'pool' or 'cls', got {readout!r}")
        if self.add_syllables and num_syllable is None:
            raise ValueError('num_syllable is required when the model was built with add_syllables')

        # Follow the model's own device instead of the notebook-level global.
        tokens = self.tokenizer(text, padding=True, truncation=True, max_length=self.max_length, return_tensors='pt').to(self.bert.device)
        hidden = self.bert(**tokens).last_hidden_state
        # Drop padded positions: one row per real token, in batch order.
        x = hidden[tokens.attention_mask.bool()]
        if x.size(0) != batch.size(0):
            # Node features are aligned with graph nodes purely by position.
            raise ValueError(f'tokenizer produced {x.size(0)} tokens but the batch has {batch.size(0)} graph nodes')
        
        if self.add_syllables:
            syllable_emb = self.syllable_emb_layer(num_syllable)
            x = x + syllable_emb
            
        edge_attr = self.dep_emb_layer(edge_type_ids)
        for gnn in self.gnns:
            x = gnn(x, edge_index, edge_attr=edge_attr)
            x = F.relu(x)
        
        if readout == 'pool':
            x = global_mean_pool(x, batch) 
        else:
            # `ptr[:-1]` holds the index of the first node of every graph.
            x = x[ptr[:-1],:]
        
        x = F.dropout(x, training=self.training)
        logit = self.classifier(x)
        loss = self.lossfn(logit, y)
        return myGNNoutput(loss=loss, logit=logit, emb=x)

In [None]:
# CSV columns that hold Python literals serialised as strings; the training cell
# parses each of them with `ast.literal_eval` after reading the file.
cols_to_eval = ['homo_edges', 'hetoro_edges', 'pos_seqs', 'upos_seqs', 'num_syllables']

# Training

In [None]:
# ---- experiment configuration ------------------------------------------------
max_length = 256      # handed to get_loader (graph-size filter) and to the model (tokenizer)
num_classes = 50      # size of the classifier output and of the per-document vote table

epochs = 100
warmup_ratio = 0.15
monitering_metric = 'accuracy'  # not read in this cell; kept for any later cell that uses it

# Hyper-parameter grid: one run per element of the cartesian product.
LIMIT = [None]
NUM_LAYERS = [4]
LR = [1e-3]
HEADS = [4]
READOUT = ['pool']
GNNTYPE = ['TransformerConv'] # 'GATConv', 'GATv2Conv', 
HIDDEN_DIM = [32]
DEP_EMB_DIM = [32]
NUM_SENT = [1,2,3]
ADD_SELF_LOOPS = [False]
ADD_SYLLABLES = [True, False]
REPEAT = list(range(5))

GRID = (LIMIT, NUM_LAYERS, LR, HEADS, READOUT, GNNTYPE, HIDDEN_DIM, DEP_EMB_DIM, NUM_SENT, ADD_SELF_LOOPS, ADD_SYLLABLES, REPEAT)
num_runs = len(list(itertools.product(*GRID)))
run_pbar = trange(num_runs, leave=False)

# Runs with index <= skip_runs are skipped (use it to resume an interrupted sweep).
skip_runs = -1
ARGS = itertools.product(*GRID)


def _load_split(num_sent, split):
    """Read one processed CCAT50 CSV and parse its list-valued columns."""
    frame = pd.read_csv(f'../../data/CCAT50/processed/author_all_sent_{num_sent}_{split}.csv')
    for col in cols_to_eval:
        frame[col] = frame[col].apply(ast.literal_eval)
    return frame


def _forward(model, data, add_syllables, readout):
    """Run the model on one batch, passing the syllable ids only when they are used."""
    num_syllable = data.num_syllables if add_syllables else None
    return model(data.text, data.edge_index, data.edge_type_ids, data.batch, data.y, data.ptr,
                 num_syllable, readout=readout)


# Loaded once: `compute()` resets the metric, so the same object serves every epoch.
metric = evaluate.load('/home/jz17d/Desktop/metrics/accuracy')

loaded_num_sent = None  # NUM_SENT value of the data frames currently held in memory
for i_run, args in enumerate(ARGS):

    if i_run <= skip_runs:
        run_pbar.update(1)
        continue
    limit, num_layers, lr, heads, readout, gnntype, hidden_dim, dep_emb_dim, num_sent_per_text, add_self_loops, add_syllables, repeat = args
    
    # A fresh time-based seed per run; it is logged to wandb so a run can be repeated.
    seed = int(datetime.now().timestamp())
    set_seed(seed)
    
    # The CSVs only depend on num_sent_per_text, so re-parse them only when it changes.
    if num_sent_per_text != loaded_num_sent:
        df = _load_split(num_sent_per_text, 'train')
        df_val = _load_split(num_sent_per_text, 'val')
        val_docid2index = {doc_id:i for i,doc_id in enumerate(df_val['doc_id'].unique())}
        loaded_num_sent = num_sent_per_text
    
    # Validation results do not depend on batch order, so there is no need to shuffle.
    valid_loader = get_loader(df_val, add_syllables=add_syllables, shuffle=False, max_length=max_length)
    num_valid_steps = len(valid_loader)
    train_loader = get_loader(df, limit = limit, add_syllables=add_syllables, max_length=max_length)
    num_training_steps = len(train_loader)
    
    model = myHeteroGNN(num_layers=num_layers,
                       num_classes=num_classes, 
                       num_dep_type=len(relation2id), 
                       heads=heads,
                       hidden_dim=hidden_dim,
                       dep_emb_dim=dep_emb_dim, 
                       add_self_loops=add_self_loops,
                       gnntype=gnntype,
                       add_syllables=add_syllables,
                       max_length=max_length
                      )
    
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Linear warm-up over the first `warmup_ratio` of all steps, then linear decay.
    scheduler = get_scheduler("linear",
                            optimizer=optimizer,
                            num_warmup_steps=int(warmup_ratio*epochs*num_training_steps),
                            num_training_steps=epochs*num_training_steps)
    
    # Everything needed to tell runs apart and to repeat one of them.
    wconfig = {
        'seed': seed,
        'num_sent_per_text': num_sent_per_text,
        'limit': limit,
        'num_layers': num_layers,
        'lr': lr,
        'heads': heads,
        'readout': readout,
        'GNNtype': gnntype,
        'add_self_loops': add_self_loops,
        'add_syllables': add_syllables,
        'hidden_dim': hidden_dim,
        'dep_emb_dim': dep_emb_dim,
        'repeat': repeat,
        'max_length': max_length,
        'epochs': epochs,
        'warmup_ratio': warmup_ratio,
    }
    
    run = wandb.init(project="hetero POS GNN (all authors, bert unfrozen)", 
                     entity="fsu-dsc-cil", 
                     dir='/scratch/data_jz17d/wandb_tmp/', 
                     config=wconfig,
                     name=f'run_{i_run}',
                     reinit=True,
                     settings=wandb.Settings(start_method='thread'))
    
    best_evaluation = collections.defaultdict(float)
    pbar = trange(epochs*num_training_steps, leave=False)
    try:
        for i_epoch in range(epochs):
            model.train()
            for data in train_loader:
                data.to(device)
                optimizer.zero_grad()
                output = _forward(model, data, add_syllables, readout)
                output.loss.backward()
                optimizer.step()
                scheduler.step()
                pbar.update(1)

            model.eval()
            # Per-document vote table: doc_score[d, c] counts the sentences of document d
            # predicted as class c. The small constant is the initial value of every cell.
            doc_score = np.full((len(val_docid2index), num_classes), 1e-8)
            doc_true = np.zeros(len(val_docid2index), dtype=int)
            with torch.no_grad():  # no autograd graph is needed for evaluation
                for data in valid_loader:
                    data.to(device)
                    output = _forward(model, data, add_syllables, readout)

                    pred = output.logit.argmax(axis=-1).cpu().numpy()
                    labels = data.y.cpu().numpy()
                    metric.add_batch(predictions=pred, references=labels)

                    doc_id = np.array([val_docid2index[d] for d in data.doc_id.cpu().tolist()])
                    # `doc_score[doc_id, pred] += 1` would count a (document, class) pair only
                    # once per batch even if it occurs several times; np.add.at accumulates
                    # every occurrence.
                    np.add.at(doc_score, (doc_id, pred), 1)
                    doc_true[doc_id] = labels

            # logging
            evaluation = metric.compute()
            for k in range(1, 6):
                # Explicit labels keep the score columns aligned with the class ids even
                # if some class is absent from the validation documents.
                evaluation.update({f'doc_acc@{k}': top_k_accuracy_score(doc_true, doc_score, k=k, labels=np.arange(num_classes))})

            # logging best
            for key in evaluation:
                best_evaluation[f'best_{key}'] = max(best_evaluation[f'best_{key}'], evaluation[key])
            wandb.log({**evaluation, **best_evaluation}, step=pbar.n)
    except BaseException:
        # Mark the wandb run as failed instead of leaving it open, then re-raise.
        run.finish(exit_code=1)
        raise
    else:
        run.finish()
    finally:
        pbar.close()
    run_pbar.update(1)

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 data dropped because of exceeding max_length 256


[34m[1mwandb[0m: Currently logged in as: [33mcpuyyp[0m ([33mfsu-dsc-cil[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/103000 [00:00<?, ?it/s]











VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▃▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_accuracy,▁▁▂▃▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@1,▁▁▂▃▃▄▅▅▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████
best_doc_acc@2,▁▁▂▃▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@3,▁▁▂▃▄▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇███████████████
best_doc_acc@4,▁▁▂▃▄▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
best_doc_acc@5,▁▁▃▄▅▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
doc_acc@1,▁▁▂▂▃▅▅▅▅▆▆▆▆▆▆▆▇▆▆▇▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇███
doc_acc@2,▁▁▂▃▄▅▅▆▆▆▆▆▆▇▆▆▇▇▇▆▇▇▇▇▇▇▇▇█▇██▇▇██████
doc_acc@3,▁▁▃▃▄▅▆▆▆▆▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇▇█▇█▇██████████

0,1
accuracy,0.23651
best_accuracy,0.23663
best_doc_acc@1,0.56
best_doc_acc@2,0.642
best_doc_acc@3,0.698
best_doc_acc@4,0.728
best_doc_acc@5,0.744
doc_acc@1,0.548
doc_acc@2,0.63
doc_acc@3,0.688


  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/103000 [00:00<?, ?it/s]











VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▃▄▄▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇█▇███████
best_accuracy,▁▂▃▃▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@1,▁▁▂▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@2,▁▂▂▃▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████████████████
best_doc_acc@3,▁▂▃▄▄▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@4,▁▂▃▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@5,▁▂▃▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████████
doc_acc@1,▁▁▂▃▄▄▅▅▅▅▆▆▆▇▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇█████████
doc_acc@2,▁▂▂▃▄▅▆▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇▇▇▇████████████
doc_acc@3,▁▂▃▄▄▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███▇███████

0,1
accuracy,0.24345
best_accuracy,0.24394
best_doc_acc@1,0.552
best_doc_acc@2,0.634
best_doc_acc@3,0.694
best_doc_acc@4,0.73
best_doc_acc@5,0.758
doc_acc@1,0.544
doc_acc@2,0.63
doc_acc@3,0.694


  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/103000 [00:00<?, ?it/s]











VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▃▄▅▅▅▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇██▇▇████████
best_accuracy,▁▂▃▃▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@1,▁▁▂▃▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@2,▁▁▂▃▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@3,▁▂▃▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@4,▁▂▂▄▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████████
best_doc_acc@5,▁▂▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
doc_acc@1,▁▁▂▃▄▅▅▅▆▆▆▆▆▇▆▆▇▇▇▇▇▇▇█▇█▇████▇████████
doc_acc@2,▁▁▂▃▄▅▅▆▆▆▆▆▆▇▆▆▇▇▇▇▇▇▇▇▇▇▇█▇█▇▇█▇██▇███
doc_acc@3,▁▂▃▃▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█▇█████▇█████████

0,1
accuracy,0.23139
best_accuracy,0.23309
best_doc_acc@1,0.514
best_doc_acc@2,0.63
best_doc_acc@3,0.68
best_doc_acc@4,0.718
best_doc_acc@5,0.738
doc_acc@1,0.504
doc_acc@2,0.614
doc_acc@3,0.672


  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/103000 [00:00<?, ?it/s]











VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▂▃▄▄▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_accuracy,▁▂▂▃▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@1,▁▁▂▃▄▄▄▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@2,▁▁▂▃▄▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@3,▁▁▃▄▄▄▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@4,▁▂▃▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇█████████████████
best_doc_acc@5,▁▂▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████████
doc_acc@1,▁▁▂▃▄▄▅▅▆▆▆▇▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██▇███████
doc_acc@2,▁▁▂▄▄▄▅▅▆▆▆▇▆▇▆▇▇▇▇▇▇▇▇▇▇▇██▇███████████
doc_acc@3,▁▁▃▄▄▄▅▆▆▆▆▇▇▇▆▇▇▇▇▇▇▇▇▇▇▇█▇▇▇██████████

0,1
accuracy,0.24418
best_accuracy,0.24455
best_doc_acc@1,0.546
best_doc_acc@2,0.64
best_doc_acc@3,0.696
best_doc_acc@4,0.728
best_doc_acc@5,0.756
doc_acc@1,0.54
doc_acc@2,0.624
doc_acc@3,0.682


  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/103000 [00:00<?, ?it/s]











VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▃▄▅▅▆▆▆▆▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇██▇████████
best_accuracy,▁▁▂▃▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@1,▁▁▂▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@2,▁▁▂▃▄▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@3,▁▁▂▄▄▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@4,▁▂▃▄▄▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
best_doc_acc@5,▁▂▃▄▄▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
doc_acc@1,▁▁▂▃▄▅▅▆▆▆▇▆▇▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
doc_acc@2,▁▁▂▃▄▅▅▆▆▆▇▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█▇██████████
doc_acc@3,▁▁▂▄▄▅▆▆▆▆▇▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇███▇████████

0,1
accuracy,0.23151
best_accuracy,0.23358
best_doc_acc@1,0.508
best_doc_acc@2,0.616
best_doc_acc@3,0.668
best_doc_acc@4,0.706
best_doc_acc@5,0.736
doc_acc@1,0.49
doc_acc@2,0.588
doc_acc@3,0.652


  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/103000 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▂▃▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_accuracy,▁▂▂▃▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@1,▁▁▂▂▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇███████
best_doc_acc@2,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@3,▁▁▂▃▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇███████████████
best_doc_acc@4,▁▂▂▃▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@5,▁▂▂▃▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
doc_acc@1,▁▁▂▂▃▄▄▄▅▅▆▅▆▅▆▆▆▆▆▆▇▇▇▇▇▆▇▇█▇▇█████████
doc_acc@2,▁▁▂▃▄▄▅▅▅▅▆▅▆▆▆▆▆▇▆▆▇▇▇▇▇▆▇▇█▇▇█████████
doc_acc@3,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▆▇▇▇▆▆▇▇▇▇▇▇▇▇██▇█████████

0,1
accuracy,0.19812
best_accuracy,0.19825
best_doc_acc@1,0.488
best_doc_acc@2,0.574
best_doc_acc@3,0.622
best_doc_acc@4,0.67
best_doc_acc@5,0.71
doc_acc@1,0.48
doc_acc@2,0.562
doc_acc@3,0.608


  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/103000 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▂▃▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
best_accuracy,▁▂▂▃▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@1,▁▁▂▂▃▃▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████
best_doc_acc@2,▁▂▂▃▄▄▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@3,▁▁▂▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████████████
best_doc_acc@4,▁▂▂▃▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████████████
best_doc_acc@5,▁▂▂▃▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇█████████████████████
doc_acc@1,▁▁▂▂▃▃▃▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▆▇▆▇▇▇▇▇▇▇▇█▇██▇██
doc_acc@2,▁▂▂▃▄▄▄▅▅▅▆▅▆▆▆▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇▇█▇███████
doc_acc@3,▁▁▂▃▄▅▄▅▆▆▆▆▆▇▆▆▇▇▇▇▇▇▇▇▇▇█▇▇███████████

0,1
accuracy,0.20105
best_accuracy,0.20105
best_doc_acc@1,0.51
best_doc_acc@2,0.594
best_doc_acc@3,0.64
best_doc_acc@4,0.68
best_doc_acc@5,0.704
doc_acc@1,0.508
doc_acc@2,0.588
doc_acc@3,0.624


  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/103000 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▂▃▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇▇████████
best_accuracy,▁▂▂▃▄▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@1,▁▁▂▃▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@2,▁▂▂▃▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@3,▁▁▂▃▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@4,▁▂▂▃▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@5,▁▁▂▃▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████████
doc_acc@1,▁▁▂▃▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▆▇▇▇▇▇▇▇█████▇██
doc_acc@2,▁▂▂▃▄▅▄▅▅▅▆▅▆▆▆▆▆▆▇▆▇▆▇▇▇▇▇██▇▇█▇███████
doc_acc@3,▁▁▂▃▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▆▇▇▇▇▇█▇▇▇█▇▇██████

0,1
accuracy,0.19898
best_accuracy,0.19898
best_doc_acc@1,0.488
best_doc_acc@2,0.576
best_doc_acc@3,0.638
best_doc_acc@4,0.67
best_doc_acc@5,0.698
doc_acc@1,0.47
doc_acc@2,0.568
doc_acc@3,0.618


  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/103000 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▃▄▄▅▅▆▆▆▆▆▆▇▆▆▇▇▇▆▆▇▇▇▇▇▇▇▇▇▇████████
best_accuracy,▁▂▃▃▄▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@1,▁▁▂▃▄▄▄▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@2,▁▁▃▃▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@3,▁▁▃▃▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇███████████████████
best_doc_acc@4,▁▁▃▃▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@5,▁▁▃▃▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇███████████████████
doc_acc@1,▁▁▂▃▄▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▆▆▇▇▇▇▇▇▇▇▇█████████
doc_acc@2,▁▁▃▃▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▆▇▇▇▇▇▇█▇▇██████████
doc_acc@3,▁▁▃▃▅▄▅▅▆▅▆▆▆▇▇▆▆▆▇▇▇▇▇▇▇▇▇█▇▇██████████

0,1
accuracy,0.1941
best_accuracy,0.19459
best_doc_acc@1,0.458
best_doc_acc@2,0.56
best_doc_acc@3,0.618
best_doc_acc@4,0.658
best_doc_acc@5,0.688
doc_acc@1,0.454
doc_acc@2,0.534
doc_acc@3,0.616


  0%|          | 0/8207 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/32937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/103000 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▃▃▄▄▅▅▅▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇███████
best_accuracy,▁▁▃▃▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@1,▁▁▂▃▄▄▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇███████████████
best_doc_acc@2,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇████████████
best_doc_acc@3,▁▁▂▃▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@4,▁▂▂▃▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@5,▁▂▂▃▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
doc_acc@1,▁▁▂▂▄▃▅▅▅▆▆▅▆▆▆▆▆▆▆▆▆▆▆▇▇█▇▇▇█▇█▇███████
doc_acc@2,▁▁▂▃▄▄▅▅▅▅▆▅▆▆▆▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇█▇█████
doc_acc@3,▁▁▂▃▄▄▅▅▅▆▆▆▆▆▆▇▆▇▇▆▇▇▇▇▇▇▇▇█▇▇███▇█████

0,1
accuracy,0.18923
best_accuracy,0.19008
best_doc_acc@1,0.446
best_doc_acc@2,0.564
best_doc_acc@3,0.626
best_doc_acc@4,0.668
best_doc_acc@5,0.702
doc_acc@1,0.44
doc_acc@2,0.55
doc_acc@3,0.616


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]











VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▄▄▅▅▆▆▆▆▆▆▆▇▇▇▆▃▇▇▇▆▇▇▇▇▇███▇████████
best_accuracy,▁▂▃▄▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@1,▁▂▂▃▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@2,▁▂▃▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@3,▁▂▃▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@4,▁▂▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@5,▁▂▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
doc_acc@1,▁▂▂▃▄▅▅▅▅▆▆▆▆▆▆▆▆▆▃▇▇▇▆▇▆▆▇▇▇▇▇▇█▇██████
doc_acc@2,▁▂▃▃▄▅▅▆▆▆▆▇▆▆▇▇▇▇▃▇▇▇▆▇▇▇▇▇▇██▇████████
doc_acc@3,▁▂▃▄▅▅▅▆▆▆▆▇▇▆▇▇▇▇▄▇▇▇▆▇▇▇▇▇▇██▇████████

0,1
accuracy,0.30388
best_accuracy,0.30544
best_doc_acc@1,0.58551
best_doc_acc@2,0.69014
best_doc_acc@3,0.7505
best_doc_acc@4,0.77465
best_doc_acc@5,0.79074
doc_acc@1,0.55936
doc_acc@2,0.67807
doc_acc@3,0.73843


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]











VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▄▄▅▅▆▆▆▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇▇▇▇█▇██████████
best_accuracy,▁▂▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@1,▁▁▃▃▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@2,▁▂▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████████
best_doc_acc@3,▁▂▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇███████████████████████
best_doc_acc@4,▁▂▃▄▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇███████████████████
best_doc_acc@5,▁▁▄▄▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████████████
doc_acc@1,▁▁▃▃▄▄▅▅▆▆▆▆▇▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇█▇▇█████████
doc_acc@2,▁▂▃▄▅▅▆▆▆▇▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
doc_acc@3,▁▂▃▄▅▅▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██▇██████████

0,1
accuracy,0.30622
best_accuracy,0.30881
best_doc_acc@1,0.57143
best_doc_acc@2,0.67606
best_doc_acc@3,0.73038
best_doc_acc@4,0.76258
best_doc_acc@5,0.78471
doc_acc@1,0.55533
doc_acc@2,0.66197
doc_acc@3,0.72032


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]











VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▂▃▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_accuracy,▁▂▂▃▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@1,▁▂▂▃▄▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@2,▁▂▃▃▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@3,▁▂▃▄▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@4,▁▂▃▄▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████████
best_doc_acc@5,▁▂▃▄▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████████
doc_acc@1,▁▂▂▃▄▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇▇▇███████████
doc_acc@2,▁▂▃▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇█▇▇██▇▇███████████
doc_acc@3,▁▂▃▄▅▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇█▇▇▇██▇███████████

0,1
accuracy,0.32075
best_accuracy,0.32204
best_doc_acc@1,0.5996
best_doc_acc@2,0.70221
best_doc_acc@3,0.75855
best_doc_acc@4,0.7827
best_doc_acc@5,0.80483
doc_acc@1,0.58551
doc_acc@2,0.69014
doc_acc@3,0.75252


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]

wandb: Network error (ConnectTimeout), entering retry loop.










VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▃▄▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_accuracy,▁▂▃▃▄▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@1,▁▁▂▃▄▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@2,▁▁▂▄▄▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@3,▁▂▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@4,▁▂▃▄▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
best_doc_acc@5,▁▂▃▄▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████████
doc_acc@1,▁▁▂▃▄▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇██▇▇▇██
doc_acc@2,▁▁▂▄▄▅▆▆▆▇▆▇▆▇▇▇▇▇▇▇▇▇▇▇█▇▇▇█▇██████████
doc_acc@3,▁▂▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇▇▇█▇██████████

0,1
accuracy,0.31491
best_accuracy,0.31828
best_doc_acc@1,0.61167
best_doc_acc@2,0.70221
best_doc_acc@3,0.75252
best_doc_acc@4,0.7827
best_doc_acc@5,0.80483
doc_acc@1,0.57545
doc_acc@2,0.65996
doc_acc@3,0.71227


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]











VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▂▄▄▅▅▆▆▆▆▆▆▇▆▆▇▇▇▇▆▇▇▇▇▇▇▇▇▇██████████
best_accuracy,▁▁▂▄▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@1,▁▁▂▃▄▄▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@2,▁▂▂▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@3,▁▂▃▄▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@4,▁▂▃▄▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@5,▁▂▃▄▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
doc_acc@1,▁▁▂▃▄▄▅▆▆▆▆▇▇▆▇▇▇▇▇▇▆▇▇▇▇▇▇██▇██████████
doc_acc@2,▁▂▂▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇███████████
doc_acc@3,▁▂▃▄▅▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇█▇█████████

0,1
accuracy,0.30518
best_accuracy,0.30647
best_doc_acc@1,0.56137
best_doc_acc@2,0.6841
best_doc_acc@3,0.73441
best_doc_acc@4,0.78068
best_doc_acc@5,0.8008
doc_acc@1,0.53924
doc_acc@2,0.68008
doc_acc@3,0.72233


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▄▅▅▆▆▇▇▅▆▆▇▇▇▇▅▆▇▆▇▇▆▇▆▇▇████████████
best_accuracy,▁▂▃▄▅▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@1,▁▂▃▃▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@2,▁▂▃▃▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@3,▁▂▃▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@4,▁▂▃▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@5,▁▂▄▄▅▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
doc_acc@1,▁▂▃▃▄▄▅▅▆▆▄▆▅▆▆▇▇▄▆▇▆▇▇▆▆▆▇▇▇█▇▇██▇█▇███
doc_acc@2,▁▂▃▃▅▅▅▅▆▆▅▆▆▇▆▇▇▅▇▇▆▇▇▇▇▆▇▇▇▇▇▇█████▇██
doc_acc@3,▁▂▃▄▅▅▅▆▆▆▅▆▆▆▆▇▇▅▇▇▆▇▇▇▇▆▇▇▇██▇█████▇██

0,1
accuracy,0.22603
best_accuracy,0.22914
best_doc_acc@1,0.47485
best_doc_acc@2,0.5996
best_doc_acc@3,0.65392
best_doc_acc@4,0.69416
best_doc_acc@5,0.72032
doc_acc@1,0.4668
doc_acc@2,0.57545
doc_acc@3,0.63783


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]



wandb: Network error (ConnectTimeout), entering retry loop.
wandb: Network error (ConnectTimeout), entering retry loop.






VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▄▅▅▅▆▆▆▆▆▆▆▅▆▆▇▆▇▇▇▇▇▆▇▇▇▇███████████
best_accuracy,▁▂▃▄▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@1,▁▂▃▃▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@2,▁▂▃▄▄▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@3,▁▂▃▄▄▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@4,▁▂▃▄▄▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@5,▁▂▃▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇██████████████
doc_acc@1,▁▂▃▃▄▅▅▅▅▆▆▅▆▆▅▅▆▆▆▆▇▇▇▇▆▇▇▇▇████▇██████
doc_acc@2,▁▂▃▄▄▅▅▅▆▆▆▆▆▆▅▆▆▇▆▇▆▇▇▇▇▇▇▇▇▇██▇▇██████
doc_acc@3,▁▂▃▄▄▅▅▅▆▆▆▆▆▆▅▆▆▇▆▇▇▇▇▇▇▇▇▇▇███▇███████

0,1
accuracy,0.24744
best_accuracy,0.24912
best_doc_acc@1,0.50905
best_doc_acc@2,0.62777
best_doc_acc@3,0.69618
best_doc_acc@4,0.74044
best_doc_acc@5,0.76258
doc_acc@1,0.50503
doc_acc@2,0.60765
doc_acc@3,0.69618


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▂▄▅▅▆▆▆▆▆▆▆▆▇▇▇▆▇▆▆▆▇▇▇▇▇▇▇▇▇▇█▇██████
best_accuracy,▁▂▂▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@1,▁▁▂▃▄▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████
best_doc_acc@2,▁▂▂▄▄▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@3,▁▂▂▄▄▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@4,▁▂▂▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@5,▁▂▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
doc_acc@1,▁▁▂▃▄▅▅▆▅▆▆▆▆▅▆▇▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████
doc_acc@2,▁▂▂▄▄▅▆▆▆▆▆▆▆▆▇▇▆▇▇▆▆▆▇▇▇▇█▇▇█▇█████████
doc_acc@3,▁▂▂▄▅▆▆▆▇▇▇▆▇▆▇▇▇▆▇▆▇▆▇▇▇▇█▇▇█▇█████████

0,1
accuracy,0.2368
best_accuracy,0.2381
best_doc_acc@1,0.49296
best_doc_acc@2,0.57948
best_doc_acc@3,0.64386
best_doc_acc@4,0.69215
best_doc_acc@5,0.72435
doc_acc@1,0.49095
doc_acc@2,0.57344
doc_acc@3,0.63179


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▄▅▅▆▆▆▆▆▆▆▆▆▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████████
best_accuracy,▁▂▃▄▅▅▆▆▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@1,▁▁▂▃▄▅▅▆▆▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇████████████
best_doc_acc@2,▁▁▃▃▄▅▅▆▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇████████████
best_doc_acc@3,▁▂▃▄▄▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@4,▁▂▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@5,▁▂▄▄▅▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
doc_acc@1,▁▁▂▃▄▄▅▆▆▆▅▅▆▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇██▇█████
doc_acc@2,▁▁▃▃▄▄▅▆▆▆▆▅▆▅▆▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇█▇▇▇▇█████
doc_acc@3,▁▂▃▄▄▄▆▆▆▆▆▅▆▆▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇███▇▇█████

0,1
accuracy,0.23485
best_accuracy,0.23745
best_doc_acc@1,0.48692
best_doc_acc@2,0.59759
best_doc_acc@3,0.65795
best_doc_acc@4,0.69215
best_doc_acc@5,0.72435
doc_acc@1,0.46881
doc_acc@2,0.55936
doc_acc@3,0.64185


  0%|          | 0/7707 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/30937 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


3 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/96700 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▃▃▅▅▅▆▅▆▆▆▆▆▇▇▇▆▆▆▆▆▆▇▇▇▇▇▇█▇█▇█▇█████
best_accuracy,▁▁▃▃▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@1,▁▁▃▃▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇███████████
best_doc_acc@2,▁▁▃▃▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@3,▁▂▃▄▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████
best_doc_acc@4,▁▂▃▄▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@5,▁▁▃▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
doc_acc@1,▁▁▃▃▄▅▄▅▅▅▅▅▅▆▆▆▆▅▆▆▆▇▆▆▇▇▇▇▇██▇▇█▇▇████
doc_acc@2,▁▁▃▃▄▅▅▅▅▆▆▆▆▆▇▆▇▆▆▆▆▇▆▇▆▇▇▇▇▇▇█▇█▇█████
doc_acc@3,▁▂▃▄▅▅▅▅▅▆▆▆▆▆▇▆▇▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████

0,1
accuracy,0.23122
best_accuracy,0.23187
best_doc_acc@1,0.47485
best_doc_acc@2,0.59356
best_doc_acc@3,0.65996
best_doc_acc@4,0.69215
best_doc_acc@5,0.71831
doc_acc@1,0.4668
doc_acc@2,0.58954
doc_acc@3,0.65795


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]





wandb: Network error (ConnectTimeout), entering retry loop.






VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▄▅▅▆▆▆▇▇▇▆▇▇▇▆▆▆▇▇▇▇▇▇▇███████▇██████
best_accuracy,▁▂▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@1,▁▂▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@2,▁▂▃▄▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
best_doc_acc@3,▁▂▃▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@4,▁▂▄▅▆▆▇▇▇▇██████████████████████████████
best_doc_acc@5,▁▂▄▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████
doc_acc@1,▁▂▂▄▅▅▆▆▆▇▇▇▆▇▇▇▆▇▆▇▇▇▇▇▇▇█▇████████████
doc_acc@2,▁▂▃▄▅▅▇▆▆▇▇▇▆▇▇▇▆▇▆▇▇▇▇▇▇▇███████▇█▇████
doc_acc@3,▁▂▃▅▆▆▇▆▆▇▇▇▆▇▇▇▇▇▆▇▇▇▇▇▇▇▇██████▇██████

0,1
accuracy,0.30638
best_accuracy,0.30791
best_doc_acc@1,0.51012
best_doc_acc@2,0.61943
best_doc_acc@3,0.68219
best_doc_acc@4,0.7247
best_doc_acc@5,0.75911
doc_acc@1,0.50607
doc_acc@2,0.60324
doc_acc@3,0.65182


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.03336155414581299, max=1.0)…

  0%|          | 0/90500 [00:00<?, ?it/s]











VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▂▃▄▅▅▅▆▆▆▆▆▆▆▇▇▇▆▇▆▇▇▇▇▇▇▇▇▇█▇████████
best_accuracy,▁▂▂▃▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@1,▁▂▂▃▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@2,▁▂▂▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@3,▁▂▃▄▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@4,▁▂▃▄▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████████
best_doc_acc@5,▁▂▃▅▆▆▆▆▇▇▇▇▇▇▇█████████████████████████
doc_acc@1,▁▂▂▃▄▅▅▅▆▆▆▆▆▇▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█▇█████████
doc_acc@2,▁▂▂▄▅▆▅▆▆▆▆▇▆▇▆▇▇▇▇▇▆▇▇▇▇▇▇█▇▇▇█████████
doc_acc@3,▁▂▃▄▅▆▅▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇▇▇▇█▇▇▇█▇█████████

0,1
accuracy,0.36269
best_accuracy,0.3663
best_doc_acc@1,0.59109
best_doc_acc@2,0.70648
best_doc_acc@3,0.75911
best_doc_acc@4,0.78947
best_doc_acc@5,0.80769
doc_acc@1,0.56073
doc_acc@2,0.69028
doc_acc@3,0.74696


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.0333436648050944, max=1.0))…

  0%|          | 0/90500 [00:00<?, ?it/s]











VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▆▆▆▇▇▇▇▇▇▇▇▇████████
best_accuracy,▁▂▃▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@1,▁▂▂▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@2,▁▂▃▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@3,▁▂▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@4,▁▂▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@5,▁▂▄▄▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
doc_acc@1,▁▂▂▄▄▅▅▆▅▆▆▆▆▆▆▆▆▆▆▇▇▇▆▇▇▇▇▇▇█▇▇██▇█▇███
doc_acc@2,▁▂▃▄▅▅▅▆▅▆▆▆▆▇▆▇▆▇▆▇▇▇▆▇▇▇▇▇▇▇▇▇██▇▇████
doc_acc@3,▁▂▃▄▅▅▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███▇█████████

0,1
accuracy,0.36436
best_accuracy,0.36436
best_doc_acc@1,0.60526
best_doc_acc@2,0.72065
best_doc_acc@3,0.77126
best_doc_acc@4,0.81781
best_doc_acc@5,0.83603
doc_acc@1,0.59514
doc_acc@2,0.7166
doc_acc@3,0.76518


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]











VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▃▄▅▅▅▅▆▆▆▆▇▆▇▆▆▆▇▇▇▇▇▇▇▇▇█▇██████████
best_accuracy,▁▂▃▃▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@1,▁▂▂▃▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
best_doc_acc@2,▁▂▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
best_doc_acc@3,▁▂▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████
best_doc_acc@4,▁▂▃▄▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████████
best_doc_acc@5,▁▂▄▄▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████████
doc_acc@1,▁▂▂▃▄▅▅▆▆▆▆▆▆▆▆▇▆▆▆▇▇▇▇▇▇▇▇▇███▇████████
doc_acc@2,▁▂▃▃▅▅▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇██▇█▇██████████
doc_acc@3,▁▂▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇███▇████████████

0,1
accuracy,0.36338
best_accuracy,0.36519
best_doc_acc@1,0.59919
best_doc_acc@2,0.69636
best_doc_acc@3,0.74899
best_doc_acc@4,0.78543
best_doc_acc@5,0.81174
doc_acc@1,0.57895
doc_acc@2,0.66802
doc_acc@3,0.72065


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]











VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▄▅▅▆▆▆▇▇▆▇▅█▇█▇██▇█▆▆▆▆▇▇▇▇▇▇████████
best_accuracy,▁▂▃▄▅▅▆▆▇▇▇▇▇▇██████████████████████████
best_doc_acc@1,▁▂▃▄▅▅▆▆▇▇▇▇▇▇▇▇████████████████████████
best_doc_acc@2,▁▂▃▄▅▅▆▇▇▇▇▇▇▇▇▇▇███████████████████████
best_doc_acc@3,▁▂▄▅▆▆▆▇▇▇▇▇▇███████████████████████████
best_doc_acc@4,▁▂▄▅▆▆▇▇▇▇▇▇████████████████████████████
best_doc_acc@5,▁▂▄▅▆▆▇▇▇▇██████████████████████████████
doc_acc@1,▁▂▃▄▅▅▆▆▆▇▇▆▇▆▇▇██▇█▇█▆▇▇▆▇▇▇▇▇▇▇▇██████
doc_acc@2,▁▂▃▄▅▅▆▆▆▇▇▇▇▆▇▇▇██▇▇▇▆▇▇▆▇▇█▇▇▇█▇█▇████
doc_acc@3,▁▂▄▅▆▅▆▆▆▇▇▆▇▆█▇▇██▇▇█▆▇▇▇▇▇█████▇██████

0,1
accuracy,0.31221
best_accuracy,0.31221
best_doc_acc@1,0.51215
best_doc_acc@2,0.62753
best_doc_acc@3,0.69028
best_doc_acc@4,0.72672
best_doc_acc@5,0.76113
doc_acc@1,0.5081
doc_acc@2,0.62551
doc_acc@3,0.68623


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▄▄▅▆▆▆▆▆▆▆▆▇▇▆▆▆▆▇▇▇▇▇▇▄▇▇▇▇▇█▇▇██████
best_accuracy,▁▂▄▄▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@1,▁▁▃▄▅▅▅▆▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@2,▁▁▃▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@3,▁▂▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@4,▁▂▄▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@5,▁▁▄▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
doc_acc@1,▁▁▃▄▅▅▅▅▅▆▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▄▇▇▇▇▇▇█▇▇█████
doc_acc@2,▁▁▄▄▅▅▅▆▆▆▆▆▆▇▆▆▆▆▆▇▇▇▇▇▇▄▇▇▇▇██▇▇██████
doc_acc@3,▁▂▄▄▅▅▆▆▆▆▆▆▆▇▇▆▆▆▆▇▇▇▇▇▇▅▇▇▇▇▇█▇▇██████

0,1
accuracy,0.27836
best_accuracy,0.27961
best_doc_acc@1,0.51012
best_doc_acc@2,0.61336
best_doc_acc@3,0.67004
best_doc_acc@4,0.7085
best_doc_acc@5,0.74291
doc_acc@1,0.50202
doc_acc@2,0.60324
doc_acc@3,0.67004


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▄▅▅▅▅▆▅▆▆▆▆▆▆▆▇▇▆▇▆▇▇▇▇▇▇▇▇▇▇▇███████
best_accuracy,▁▂▃▄▅▅▅▅▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇███████
best_doc_acc@1,▁▂▂▃▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████
best_doc_acc@2,▁▂▃▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@3,▁▂▃▄▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@4,▁▂▃▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
best_doc_acc@5,▁▂▃▄▅▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████
doc_acc@1,▁▂▂▃▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇██████
doc_acc@2,▁▂▂▄▅▄▅▅▆▆▆▆▆▆▆▆▇▇▇▆▇▆▇▇▇▇▇▇▇▇▇▇████████
doc_acc@3,▁▂▃▄▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▆▇▇▇▇▇▇▇▇█▇▇█████████

0,1
accuracy,0.3129
best_accuracy,0.31387
best_doc_acc@1,0.55466
best_doc_acc@2,0.65992
best_doc_acc@3,0.7166
best_doc_acc@4,0.74696
best_doc_acc@5,0.77935
doc_acc@1,0.54049
doc_acc@2,0.65385
doc_acc@3,0.7004


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▃▄▄▅▅▅▆▆▆▆▆▆▆▇▇▆▆▇▇▇▇▇▇▇▇▇▇▇█████████
best_accuracy,▁▂▃▃▄▄▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@1,▁▂▂▃▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
best_doc_acc@2,▁▂▃▃▄▄▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇█████████████
best_doc_acc@3,▁▂▃▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@4,▁▂▃▄▄▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
best_doc_acc@5,▁▂▃▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
doc_acc@1,▁▂▂▃▄▄▅▄▅▅▆▅▆▅▆▆▇▇▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇███████
doc_acc@2,▁▂▃▃▄▄▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█▇█████████
doc_acc@3,▁▂▃▄▄▅▅▅▆▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇█▇█████████

0,1
accuracy,0.30693
best_accuracy,0.30693
best_doc_acc@1,0.55466
best_doc_acc@2,0.66194
best_doc_acc@3,0.72672
best_doc_acc@4,0.76113
best_doc_acc@5,0.78745
doc_acc@1,0.54656
doc_acc@2,0.64575
doc_acc@3,0.70445


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▃▄▅▆▅▅▆▆▆▆▆▆▆▆▇▅▆▆▆▇▆▇▇▇▇▇▇▇█████████
best_accuracy,▁▂▃▃▄▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@1,▁▂▃▃▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
best_doc_acc@2,▁▂▃▃▄▅▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@3,▁▂▃▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@4,▁▂▃▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@5,▁▂▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████
doc_acc@1,▁▂▃▃▄▄▅▄▅▅▆▆▆▆▆▆▆▇▄▆▆▆▆▆▇▇▇▇▇▇▇█▇▇██████
doc_acc@2,▁▂▃▃▄▅▆▅▅▆▆▆▆▆▆▇▆▇▅▆▆▇▇▇▇▇▇▇▇▇▇█████████
doc_acc@3,▁▂▃▄▅▅▆▅▅▆▆▇▆▆▇▇▆▇▅▆▆▇▇▇▇▇▇▇▇▇▇█████████

0,1
accuracy,0.30666
best_accuracy,0.30693
best_doc_acc@1,0.56883
best_doc_acc@2,0.66397
best_doc_acc@3,0.72267
best_doc_acc@4,0.75911
best_doc_acc@5,0.7753
doc_acc@1,0.56478
doc_acc@2,0.65385
doc_acc@3,0.71255


  0%|          | 0/7210 [00:00<?, ?it/s]

0 data dropped because of exceeding max_length 256


  0%|          | 0/28943 [00:00<?, ?it/s]

Some weights of the model checkpoint at /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_256_pos_mlm_0/checkpoint-120000/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4 data dropped because of exceeding max_length 256
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/90500 [00:00<?, ?it/s]









VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂▃▃▅▅▆▆▆▆▇▇▆▆▆▆▆▆▆▆▇▇▆▇▆▇▇▇▇▇▇▇████████
best_accuracy,▁▂▃▃▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████
best_doc_acc@1,▁▂▃▃▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@2,▁▂▃▃▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@3,▁▂▃▃▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@4,▁▂▃▄▅▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
best_doc_acc@5,▁▂▃▄▅▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
doc_acc@1,▁▂▃▃▅▅▆▅▅▆▇▆▆▆▆▆▆▆▆▆▇▇▆▇▆▆▇▇▇▇█▇▇███████
doc_acc@2,▁▂▃▃▅▅▆▅▆▆▇▆▇▆▆▆▇▇▆▆▇▇▆▇▆▇▇▇▇▇█▇████████
doc_acc@3,▁▂▃▃▅▆▆▆▆▆▇▇▇▇▇▆▇▇▆▆▇▇▇▇▆▇▇▇▇▇█▇████████

0,1
accuracy,0.28946
best_accuracy,0.29182
best_doc_acc@1,0.52429
best_doc_acc@2,0.62753
best_doc_acc@3,0.69028
best_doc_acc@4,0.72065
best_doc_acc@5,0.74089
doc_acc@1,0.52024
doc_acc@2,0.62348
doc_acc@3,0.67409
