In [None]:
from tokenizers import Tokenizer
from tokenizers.trainers import WordLevelTrainer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers import BertForMaskedLM, BertConfig
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser
from nltk.tag.hunpos import HunposTagger
from nltk.tokenize import word_tokenize
import stanza
import nltk
import numpy as np
from datasets import load_dataset, load_from_disk
from numerize import numerize
import wandb
import os 
import typing
import tokenizers
from tqdm.auto import trange, tqdm

In [None]:
# Tagger back-ends. Each *_tagging function below reads its tagger from a
# module-level global, so only the back-ends enabled here can be selected.

# Stanza pipeline read by stanza_tagging as the global `nlp`. It is disabled
# here, so stanza_tagging raises NameError until this line is re-enabled.
# nlp = stanza.Pipeline(lang='en', processors='tokenize,pos')

# CoreNLP client read by corenlp_tagging as the global `pos_tagger`.
# NOTE(review): presumably a CoreNLP server must already be listening on
# localhost:9000 before any tagging call is made - confirm.
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')

# HunPos tagger read by hunpos_tagging as the global `ht`. It is disabled
# here, so hunpos_tagging raises NameError until this line is re-enabled.
# ht = HunposTagger('/home/jz17d/bin/english.model')

In [None]:
# Tag table copied from
# https://emorynlp.github.io/nlp4j/components/part-of-speech-tagging.html
# Each row is "<tag> \t<description> \t<version>"; only the first tab-separated
# field (the tag itself, stripped of surrounding whitespace) is kept.
corenlp_vocab = [row.split('\t')[0].strip() for row in '''$ 	Dollar 	1.0.0
: 	Colon 	1.0.0
, 	Comma 	1.0.0
. 	Period 	1.0.0
`` 	Left quote 	1.0.0
'' 	Right quote 	1.0.0
-LRB- 	Left bracket 	1.0.0
-RRB- 	Right bracket 	1.0.0
ADD 	Email 	1.0.0
AFX 	Affix 	1.0.0
CC 	Coordinating conjunction 	1.0.0
CD 	Cardinal number 	1.0.0
DT 	Determiner 	1.0.0
EX 	Existential there 	1.0.0
FW 	Foreign word 	1.0.0
GW 	Go with 	1.0.0
HYPH 	Hyphen 	1.0.0
IN 	Preposition or subordinating conjunction 	1.0.0
JJ 	Adjective 	1.0.0
JJR 	Adjective, comparative 	1.0.0
JJS 	Adjective, superlative 	1.0.0
LS 	List item marker 	1.0.0
MD 	Modal 	1.0.0
NFP 	Superfluous punctuation 	1.0.0
NN 	Noun, singular or mass 	1.0.0
NNS 	Noun, plural 	1.0.0
NNP 	Proper noun, singular 	1.0.0
NNPS 	Proper noun, plural 	1.0.0
PDT 	Predeterminer 	1.0.0
POS 	Possessive ending 	1.0.0
PRP 	Personal pronoun 	1.0.0
PRP$ 	Possessive pronoun 	1.0.0
RB 	Adverb 	1.0.0
RBR 	Adverb, comparative 	1.0.0
RBS 	Adverb, superlative 	1.0.0
RP 	Particle 	1.0.0
SYM 	Symbol 	1.0.0
TO 	To 	1.0.0
UH 	Interjection 	1.0.0
VB 	Verb, base form 	1.0.0
VBD 	Verb, past tense 	1.0.0
VBG 	Verb, gerund or present participle 	1.0.0
VBN 	Verb, past participle 	1.0.0
VBP 	Verb, non-3rd person singular present 	1.0.0
VBZ 	Verb, 3rd person singular present 	1.0.0
WDT 	Wh-determiner 	1.0.0
WP 	Wh-pronoun 	1.0.0
WP$ 	Wh-pronoun, possessive 	1.0.0
WRB 	Wh-adverb 	1.0.0
XX'''.split('\n')]

# Number of tags in the table, and a tag -> position lookup in table order.
num_xpos_tokens = len(corenlp_vocab)
corenlp_token2id = {tag: position for position, tag in enumerate(corenlp_vocab)}

In [None]:
# get tagset 
def get_pos_vocab(tagger, tagset='xpos'):
    """Return the list of POS tag strings that make up the tokenizer vocabulary.

    Args:
        tagger: Name of the tagger whose tag inventory is wanted. 'corenlp'
            selects the module-level ``corenlp_vocab``; any other value uses
            the keys of NLTK's ``help/tagsets/upenn_tagset.pickle`` resource.
        tagset: 'xpos' returns the tagger-specific tags in sorted order.
            Any other value returns the fixed UPOS list below. 'upos' is only
            accepted together with ``tagger == 'stanza'``.

    Returns:
        A new list of tag strings (the module-level ``corenlp_vocab`` is
        never modified; ``sorted`` copies it).

    Raises:
        AssertionError: if ``tagset == 'upos'`` and ``tagger != 'stanza'``.

    The NLTK resource is loaded only on the path that actually needs it, so
    UPOS and CoreNLP lookups do not depend on the NLTK tagset data being
    installed.
    """
    if tagset == 'upos':
        assert tagger == 'stanza', 'only stanza support upos'

    if tagset != 'xpos':
        # The full list of upos tokens. With the "simple conversion" the upos
        # vocab is smaller: it is this list without 'AUX' and 'SCONJ'.
        return ['ADJ',
                'ADP',
                'ADV',
                'AUX',
                'CCONJ',
                'DET',
                'INTJ',
                'NOUN',
                'NUM',
                'PART',
                'PRON',
                'PROPN',
                'PUNCT',
                'SCONJ',
                'SYM',
                'VERB',
                'X']

    if tagger == 'corenlp':
        xpos_vocab = corenlp_vocab
    else:
        # Imported lazily: only this branch needs NLTK's tagset help data.
        from nltk.data import load
        tagdict = load('help/tagsets/upenn_tagset.pickle')
        xpos_vocab = list(tagdict.keys())

    # sorted() returns a fresh list, so the caller gets a stable ordering
    # without the source list being reordered.
    return sorted(xpos_vocab)


In [None]:
def stanza_tagging(examples, tag_set='xpos'): # stanza
    """Tag a batch of texts with the global Stanza pipeline and tokenize the tags.

    Used as a batched ``Dataset.map`` callback via ``tagger2func``.

    Relies on two module-level globals: ``nlp`` (called on each text; its
    result is read through ``.sentences`` -> ``.words`` -> ``.upos``/``.xpos``)
    and ``tokenizer``. The ``nlp = stanza.Pipeline(...)`` line in the tagger
    setup cell is commented out, so ``nlp`` must be defined before this
    function is called.

    Args:
        examples: Batch with a 'text' entry that is iterated text by text.
        tag_set: 'upos' tokenizes the UPOS tag strings; any other value
            tokenizes the XPOS tag strings.

    Returns:
        Whatever ``tokenizer(tag_strings, truncation=True)`` returns, where
        each tag string is the space-joined tags of one input text.
    """
    upos_strings, xpos_strings = [], []
    for text in examples['text']:
        # Flatten every sentence of the parsed text into one word sequence.
        words = [word for sentence in nlp(text).sentences for word in sentence.words]
        # Both tag strings are always built, whichever one is requested.
        upos_strings.append(' '.join([word.upos for word in words]))
        xpos_strings.append(' '.join([word.xpos for word in words]))

    selected = upos_strings if tag_set == 'upos' else xpos_strings
    return tokenizer(selected, truncation=True)

In [None]:
def hunpos_tagging(examples): # hunpos
    """Tag a batch of sentences with the global HunPos tagger and tokenize the tags.

    Used as a batched ``Dataset.map`` callback via ``tagger2func``.

    Relies on the module-level globals ``ht`` (its ``tag`` method is called on
    the ``word_tokenize`` output of each sentence) and ``tokenizer``. The
    ``ht = HunposTagger(...)`` line in the tagger setup cell is commented out,
    so ``ht`` must be defined before this function is called.

    Args:
        examples: Batch with a 'text' entry that is iterated sentence by sentence.

    Returns:
        Whatever ``tokenizer(tag_strings, truncation=True)`` returns, where
        each tag string is the space-joined tags of one input sentence.
    """
    tag_strings = [
        # Element 1 of each tagged entry is the tag; it is UTF-8 decoded
        # before joining.
        ' '.join([entry[1].decode('utf-8') for entry in ht.tag(word_tokenize(sentence))])
        for sentence in examples['text']
    ]
    return tokenizer(tag_strings, truncation=True)

In [None]:
def corenlp_tagging(examples): # corenlp
    """Tag a batch of texts with the global CoreNLP client and tokenize the tags.

    Used as a batched ``Dataset.map`` callback via ``tagger2func``.

    Relies on the module-level globals ``pos_tagger`` and ``tokenizer``.

    Args:
        examples: Batch with a 'text' entry that is passed as a whole to
            ``pos_tagger.raw_tag_sents``.

    Returns:
        Whatever ``tokenizer(tag_strings, truncation=True)`` returns, where
        each tag string is the space-joined tags of one input text.
    """
    # Materialise all tagging results first, then build the tag strings.
    tagged_texts = list(pos_tagger.raw_tag_sents(examples['text']))
    tag_strings = [
        # Only the first element of each per-text result is read; within it,
        # element 1 of every entry is the tag.
        ' '.join([entry[1] for entry in tagged_text[0]])
        for tagged_text in tagged_texts
    ]
    return tokenizer(tag_strings, truncation=True)

In [None]:
# Dispatch table: tagger name -> batched tagging callback passed to Dataset.map.
tagger2func = dict(
    stanza=stanza_tagging,
    hunpos=hunpos_tagging,
    corenlp=corenlp_tagging,
)

In [None]:
def get_tokenizer(vocab, model_max_length = 128,
                  tokenizer_file = '/home/jz17d/Desktop/pos_tokenizer.json'):
    """Build a word-level tokenizer over POS tags and wrap it for transformers.

    A ``tokenizers.Tokenizer`` with a ``WordLevel`` model is configured with
    the BERT-style special tokens, the given tags, a ``[CLS] ... [SEP]``
    post-processor, padding and truncation. It is saved to ``tokenizer_file``
    and then reloaded as a ``PreTrainedTokenizerFast``.

    Args:
        vocab: POS tag strings to register as tokens.
        model_max_length: Truncation length of the underlying tokenizer and
            ``model_max_length`` of the returned tokenizer.
        tokenizer_file: Path of the JSON file the tokenizer is written to and
            reloaded from. Defaults to the previously hard-coded location, so
            existing callers are unaffected.

    Returns:
        A ``PreTrainedTokenizerFast`` with unk/sep/pad/cls/mask tokens set.
    """
    # One path value for both save and load, so the two can never drift apart.
    tokenizer_file = os.fspath(tokenizer_file)

    # Tokenizer is from tokenizers package. PreTrainedTokenizerFast is from transformers package.
    # PreTrainedTokenizerFast can load vocab saved/trained by Tokenizer
    t = Tokenizer(WordLevel(unk_token="[UNK]"))
    t.pre_tokenizer = Whitespace()
    t.add_special_tokens(["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]",])

    # Tags are registered directly instead of being trained, because
    # trainer makes "-LRB-" 3 tokens:
    #     trainer = WordLevelTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    #     data = [' '.join(vocab)]
    #     t.train_from_iterator(data, trainer=trainer)
    t.add_tokens(vocab)

    t.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", t.token_to_id("[CLS]")),
            ("[SEP]", t.token_to_id("[SEP]")),
        ],
    )

    t.enable_padding(pad_id=t.token_to_id("[PAD]"), pad_token="[PAD]")
    t.enable_truncation(max_length=model_max_length)
    t.save(tokenizer_file)

    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file, unk_token="[UNK]")
    # add_special_tokens assigns every key below on the tokenizer, so no
    # separate pad/mask/unk attribute assignments are needed beforehand.
    special_tokens = {
         "unk_token": "[UNK]",
         "sep_token": "[SEP]",
         "pad_token": "[PAD]",
         "cls_token": "[CLS]",
         "mask_token": "[MASK]" }
    tokenizer.add_special_tokens(special_tokens)
    tokenizer.model_max_length=model_max_length
    return tokenizer

In [None]:
# def evaluate_ppl(model, input_ids, stride=128, max_length=128):
#     '''
#     Example from https://huggingface.co/docs/transformers/perplexity
#     '''
#     seq_len = input_ids.size(1)
#     nlls = []
#     prev_end_loc = 0
#     for begin_loc in range(0, seq_len, stride):
#         end_loc = min(begin_loc + max_length, seq_len)
#         trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
#         curr_input_ids = input_ids[:, begin_loc:end_loc].to(device)
#         target_ids = curr_input_ids.clone()
#         target_ids[:, :-trg_len] = -100

#         with torch.no_grad():
#             outputs = model(curr_input_ids, labels=target_ids)
#             # loss is calculated using CrossEntropyLoss which averages over input tokens.
#             # Multiply it with trg_len to get the summation instead of average.
#             # We will take average over all the tokens to get the true average
#             # in the last step of this example.
#             neg_log_likelihood = outputs.loss * trg_len

#         nlls.append(neg_log_likelihood)

#         prev_end_loc = end_loc
#         if end_loc == seq_len:
#             break
#     ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
#     return ppl.cpu().item()

In [None]:
# Load the raw bookcorpus dataset, keeping the Hugging Face datasets cache
# under /scratch (the same directory is exported and passed as cache_dir).
os.environ.update(HF_DATASETS_CACHE='/scratch/data_jz17d/hf_datasets_cache')
dataset = load_dataset(path="bookcorpus", cache_dir=os.environ['HF_DATASETS_CACHE'])

Found cached dataset bookcorpus (/scratch/data_jz17d/hf_datasets_cache/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Reprocess the corenlp training split with the correct tag list.
tagger, tagset = 'corenlp', 'xpos'
reprocess = True            # force re-tagging even if a cached copy exists
select = 1000000            # number of leading sentences used for training
model_max_length = 128

# The tagging callbacks read the module-level `tokenizer`, so it has to be
# built before the map() call below.
vocab = get_pos_vocab(tagger, tagset=tagset)
tokenizer = get_tokenizer(vocab, model_max_length=model_max_length)

cache_location = f'/scratch/data_jz17d/data/bookcorpus/bookcorpus_{numerize.numerize(select).lower()}_{tagger}_transformed.hf'
if os.path.exists(cache_location) and not reprocess:
    # A cached transform exists and no reprocessing was requested.
    trainset = load_from_disk(cache_location)
else:
    tagging_func = tagger2func[tagger]
    trainset = dataset['train'].select(range(select)).map(tagging_func, batched=True)
    trainset.save_to_disk(cache_location)
# The raw text is dropped only after saving/loading, so the cached copy
# on disk still contains it.
trainset = trainset.remove_columns(['text'])

  0%|          | 0/1000 [00:00<?, ?ba/s]

In [None]:
# Held-out split: the 50000 sentences that directly follow the first
# `select` training sentences of dataset['train'].
reprocess = True
select2 = range(select, select + 50000)
cache_location = f'/scratch/data_jz17d/data/bookcorpus/bookcorpus_50k_{tagger}_transformed_test.hf'
if os.path.exists(cache_location) and not reprocess:
    # A cached transform exists and no reprocessing was requested.
    testset = load_from_disk(cache_location)
else:
    tagging_func = tagger2func[tagger]
    testset = dataset['train'].select(select2).map(tagging_func, batched=True)
    testset.save_to_disk(cache_location)
# The raw text is dropped only after saving/loading, so the cached copy
# on disk still contains it.
testset = testset.remove_columns(['text'])

  0%|          | 0/50 [00:00<?, ?ba/s]

In [None]:
# ---- data ----
tagger = 'corenlp'
tagset = 'xpos'
select = 1_000_000          # number of leading sentences used for training
reprocess = False           # reuse cached tagged datasets when they exist

# ---- BERT architecture (num_hidden_layers is swept below) ----
model_max_length = 128
hidden_size = 32
num_attention_heads = 4
intermediate_size = 128
dropout_prob = 0.1

# ---- training ----
control_steps = 5000        # num_steps to log and save
num_train_epochs = 20
batchsize = 128

# ---- sweep variables ----
# TAGGER = ['corenlp']
# TAGGER = ['hunpos', 'corenlp', 'stanza']
NUM_LAYERS = [2, 3, 4]
MLM_P = [0.15, 0.25]
LR = [1e-4, 5e-4, 8e-4]

# Expand the three lists into the full grid and flatten each axis, so index
# i_run selects one (num_layers, mlm_p, lr) combination. With numpy's default
# meshgrid indexing the flattened order has LR varying fastest, then
# NUM_LAYERS, then MLM_P.
NUM_LAYERS, MLM_P, LR = (axis.flatten() for axis in np.meshgrid(NUM_LAYERS, MLM_P, LR))
num_runs = LR.size

# ---------------------------------------------------------------------------
# Loop-invariant setup.
# The tokenizer and the tagged train/test sets depend only on tagger, tagset,
# select and model_max_length, none of which change between runs. They are
# built once here instead of once per run; with reprocess=True the per-run
# version would re-tag the whole corpus for every hyper-parameter combination.
# ---------------------------------------------------------------------------

# create tokenizer
vocab = get_pos_vocab(tagger, tagset=tagset)
tokenizer = get_tokenizer(vocab, model_max_length = model_max_length)

# transform or load dataset
cache_location = f'/scratch/data_jz17d/data/bookcorpus/bookcorpus_{numerize.numerize(select).lower()}_{tagger}_transformed.hf'
if not os.path.exists(cache_location) or reprocess:
    tagging_func = tagger2func[tagger]
    trainset = dataset['train'].select(range(select)).map(tagging_func, batched=True)
    trainset.save_to_disk(cache_location)
else:
    trainset = load_from_disk(cache_location)
trainset = trainset.remove_columns(['text'])

# held-out set: the 50000 sentences right after the training slice
select2 = range(select,select+50000)
cache_location = f'/scratch/data_jz17d/data/bookcorpus/bookcorpus_50k_{tagger}_transformed_test.hf'
if not os.path.exists(cache_location) or reprocess:
    tagging_func = tagger2func[tagger]
    testset = dataset['train'].select(select2).map(tagging_func, batched=True)
    testset.save_to_disk(cache_location)
else:
    testset = load_from_disk(cache_location)
testset = testset.remove_columns(['text'])
# test_input_ids = torch.LongTensor([[1]+[item for t in testset['input_ids'] for item in t[1:-1]]+[2]])

# ---------------------------------------------------------------------------
# One training run per hyper-parameter combination of the flattened grid.
# ---------------------------------------------------------------------------
for i_run in trange(num_runs):

    # Cast the numpy scalars to plain Python numbers before handing them to
    # the config objects and to W&B.
    num_hidden_layers = int(NUM_LAYERS[i_run])
    mlm_probability = float(MLM_P[i_run])
    lr = float(LR[i_run])

    # mlm data collater (masking probability is one of the swept variables)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=mlm_probability
    )

    # model config
    config = BertConfig(vocab_size = len(tokenizer.get_vocab()),
                        hidden_size = hidden_size,
                        num_hidden_layers = num_hidden_layers,
                        num_attention_heads = num_attention_heads,
                        intermediate_size = intermediate_size,
                        hidden_act = 'gelu',
                        hidden_dropout_prob = dropout_prob,
                        attention_probs_dropout_prob = dropout_prob,
                        max_position_embeddings = model_max_length,
                        type_vocab_size = 2,
                        initializer_range = 0.02,
                        layer_norm_eps = 1e-12,
                        pad_token_id = tokenizer.pad_token_id)
    # init model (fresh, randomly initialised weights for every run)
    bert = BertForMaskedLM(config)

    # trainer config: log, evaluate and checkpoint every control_steps steps
    training_args = TrainingArguments(
        learning_rate=lr,
        output_dir= f"/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_{i_run}",
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batchsize,
        per_device_eval_batch_size=batchsize,
        evaluation_strategy='steps',
        save_steps=control_steps,
        logging_steps=control_steps,
        eval_steps=control_steps,
        save_total_limit=2,
        prediction_loss_only=True,
        remove_unused_columns=False,
#         report_to='wandb',
        )

    # wandb config: record the swept values with the run
    wconfig = {}
    wconfig['num_hidden_layers'] = num_hidden_layers
    wconfig['mlm_probability'] = mlm_probability
    wconfig['lr'] = lr
    run = wandb.init(project="POS MLM CoreNLP",
                     entity="fsu-dsc-cil",
                     dir='/scratch/data_jz17d/wandb_tmp/',
                     config=wconfig,
                     name=f'pos mlm {i_run}',
                     reinit=True)

    try:
        trainer = Trainer(
            model=bert,
            args=training_args,
            tokenizer=tokenizer,
            data_collator=data_collator,
            train_dataset=trainset,
            eval_dataset=testset,
        )
        trainer.train()
    except BaseException:
        # Close the W&B run even when training fails or is interrupted, and
        # mark it as failed rather than leaving it open.
        run.finish(exit_code=1)
        raise
    else:
        run.finish()

  0%|          | 0/18 [00:00<?, ?it/s]

Assigning [UNK] to the unk_token key of the tokenizer
Assigning [SEP] to the sep_token key of the tokenizer
Assigning [PAD] to the pad_token key of the tokenizer
Assigning [CLS] to the cls_token key of the tokenizer
Assigning [MASK] to the mask_token key of the tokenizer
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.033339222272237144, max=1.0…

***** Running training *****
  Num examples = 1000000
  Num Epochs = 20
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 156260
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
5000,2.6919,2.283304
10000,2.3505,2.093428
15000,2.2203,1.927104
20000,2.0985,1.765798
25000,1.9988,1.657201
30000,1.9195,1.577371
35000,1.8514,1.513302
40000,1.782,1.456283
45000,1.7254,1.432919
50000,1.6912,1.40577


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-5000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-5000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-5000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-10000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-10000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-10000/pytor

Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-60000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-60000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-60000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-60000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-50000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-65000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-65000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-65000/pytorch_model.bin
tokenizer config file sav

***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-115000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-115000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-115000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-115000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-115000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-105000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_0/checkpoint-120000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenl

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▇▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,█▄▃▃▃█▄▂▆▅▃▁▁▂▁▁▅▅▂▂▂▁▁▁▄▆▃▁▂▂▁
eval/samples_per_second,▁▅▆▆▆▁▅▇▃▄▆██▇██▄▄▇▇▇▇██▅▃▆█▇▇█
eval/steps_per_second,▁▅▆▆▆▁▅▇▃▄▆██▇██▄▄▇▇▇▇██▅▃▆█▇▇█
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,███▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▆▅▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,1.29922
eval/runtime,13.5053
eval/samples_per_second,3702.238
eval/steps_per_second,28.952
train/epoch,20.0
train/global_step,156260.0
train/learning_rate,0.0
train/loss,1.5076
train/total_flos,181982190094848.0
train/train_loss,1.7014


Assigning [UNK] to the unk_token key of the tokenizer
Assigning [SEP] to the sep_token key of the tokenizer
Assigning [PAD] to the pad_token key of the tokenizer
Assigning [CLS] to the cls_token key of the tokenizer
Assigning [MASK] to the mask_token key of the tokenizer
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 1000000
  Num Epochs = 20
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 156260
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
5000,2.23,1.507956
10000,1.6164,1.35743
15000,1.5137,1.31245
20000,1.4671,1.288372
25000,1.4409,1.2759
30000,1.4261,1.259849
35000,1.4105,1.246999
40000,1.4043,1.244564
45000,1.3919,1.245173
50000,1.3878,1.242039


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-5000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-5000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-5000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-10000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-10000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-10000/pytor

Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-60000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-60000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-60000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-60000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-50000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-65000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-65000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-65000/pytorch_model.bin
tokenizer config file sav

***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-115000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-115000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-115000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-115000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-115000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-105000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_1/checkpoint-120000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenl

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▅▄▃▃▂▂▂▂▂▂▂▂▂▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▄▄▆█▅▇▁▁▅▃▅▆▅▅▆▂▂▁▂▂▁▆▁▂▂▂▅▂▂▂▁
eval/samples_per_second,▅▅▃▁▄▂██▄▆▄▃▄▄▃▇▇█▇▇█▃█▇▇▇▄▇▇▇█
eval/steps_per_second,▅▅▃▁▄▂██▄▆▄▃▄▄▃▇▇█▇▇█▃█▇▇▇▄▇▇▇█
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,███▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,1.19965
eval/runtime,13.5082
eval/samples_per_second,3701.468
eval/steps_per_second,28.945
train/epoch,20.0
train/global_step,156260.0
train/learning_rate,0.0
train/loss,1.3421
train/total_flos,181982190094848.0
train/train_loss,1.41094


Assigning [UNK] to the unk_token key of the tokenizer
Assigning [SEP] to the sep_token key of the tokenizer
Assigning [PAD] to the pad_token key of the tokenizer
Assigning [CLS] to the cls_token key of the tokenizer
Assigning [MASK] to the mask_token key of the tokenizer
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 1000000
  Num Epochs = 20
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 156260
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
5000,2.0029,1.378969
10000,1.5308,1.325399
15000,1.4749,1.29093
20000,1.442,1.27084
25000,1.4229,1.261166
30000,1.412,1.249452
35000,1.4003,1.234677
40000,1.3962,1.234411
45000,1.386,1.239756
50000,1.3831,1.231708


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-5000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-5000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-5000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-10000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-10000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-10000/pytor

Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-60000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-60000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-60000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-60000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-50000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-65000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-65000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-65000/pytorch_model.bin
tokenizer config file sav

***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-115000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-115000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-115000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-115000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-115000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-105000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_2/checkpoint-120000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenl

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▆▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁
eval/runtime,▃▃▃██▆▂▄▃▂▁▃▁▅▃▂▇▅▅▁▆▂▄▂▃▄▇▃▁▄▄
eval/samples_per_second,▆▆▆▁▁▃▆▅▆▇▇▆█▄▆▇▂▄▄█▃▇▅▇▆▅▂▆█▅▅
eval/steps_per_second,▆▆▆▁▁▃▆▅▆▇▇▆█▄▆▇▂▄▄█▃▇▅▇▆▅▂▆█▅▅
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,███▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,1.19242
eval/runtime,13.7237
eval/samples_per_second,3643.345
eval/steps_per_second,28.491
train/epoch,20.0
train/global_step,156260.0
train/learning_rate,1e-05
train/loss,1.3368
train/total_flos,181982190094848.0
train/train_loss,1.39454


Assigning [UNK] to the unk_token key of the tokenizer
Assigning [SEP] to the sep_token key of the tokenizer
Assigning [PAD] to the pad_token key of the tokenizer
Assigning [CLS] to the cls_token key of the tokenizer
Assigning [MASK] to the mask_token key of the tokenizer
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 1000000
  Num Epochs = 20
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 156260
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
5000,2.6962,2.284791
10000,2.3266,2.024099
15000,2.1629,1.842095
20000,2.0399,1.693693
25000,1.9313,1.583185
30000,1.8404,1.508486
35000,1.7741,1.463975
40000,1.7334,1.437089
45000,1.6953,1.418838
50000,1.6641,1.396953


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-5000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-5000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-5000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-10000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-10000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-10000/pytor

Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-60000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-60000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-60000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-60000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-50000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-65000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-65000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-65000/pytorch_model.bin
tokenizer config file sav

***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-115000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-115000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-115000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-115000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-115000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-105000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_3/checkpoint-120000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenl

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▆▅▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,█▆▆▄▃▄▄▄▆▅▄▅▃▃▅▇▃▃▃▃▄▇▄▃▂█▆▄▃▂▁
eval/samples_per_second,▁▃▃▅▆▅▅▄▃▄▅▄▆▆▄▂▆▆▆▆▅▂▅▆▆▁▃▅▆▇█
eval/steps_per_second,▁▃▃▅▆▅▅▄▃▄▅▄▆▆▄▂▆▆▆▆▅▂▅▆▆▁▃▅▆▇█
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,███▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▆▅▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,1.29021
eval/runtime,14.0638
eval/samples_per_second,3555.218
eval/steps_per_second,27.802
train/epoch,20.0
train/global_step,156260.0
train/learning_rate,0.0
train/loss,1.4727
train/total_flos,268745747212800.0
train/train_loss,1.66373


Assigning [UNK] to the unk_token key of the tokenizer
Assigning [SEP] to the sep_token key of the tokenizer
Assigning [PAD] to the pad_token key of the tokenizer
Assigning [CLS] to the cls_token key of the tokenizer
Assigning [MASK] to the mask_token key of the tokenizer
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 1000000
  Num Epochs = 20
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 156260
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
5000,2.2421,1.600359
10000,1.7178,1.378072
15000,1.535,1.3154
20000,1.4681,1.282308
25000,1.4318,1.262799
30000,1.412,1.247404
35000,1.3927,1.227851
40000,1.3825,1.226134
45000,1.3692,1.226744
50000,1.3636,1.219057


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-5000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-5000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-5000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-10000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-10000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-10000/pytor

Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-60000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-60000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-60000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-60000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-50000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-65000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-65000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-65000/pytorch_model.bin
tokenizer config file sav

***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-115000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-115000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-115000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-115000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-115000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-105000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_4/checkpoint-120000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenl

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▄▃▃▃▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▃▂▂▂▂▂▂▂▂▁▂▂▂▂▂▅▃▃▃▃▃█▃▃▃█▃▃▁▁▂
eval/samples_per_second,▅▇▆▇▇▇▇▇▇▇▇▇▇▇▇▄▆▆▆▆▆▁▆▆▆▁▆▆██▇
eval/steps_per_second,▅▇▆▇▇▇▇▇▇▇▇▇▇▇▇▄▆▆▆▆▆▁▆▆▆▁▆▆██▇
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,███▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,1.17217
eval/runtime,14.1117
eval/samples_per_second,3543.147
eval/steps_per_second,27.707
train/epoch,20.0
train/global_step,156260.0
train/learning_rate,0.0
train/loss,1.3032
train/total_flos,268745747212800.0
train/train_loss,1.38826


Assigning [UNK] to the unk_token key of the tokenizer
Assigning [SEP] to the sep_token key of the tokenizer
Assigning [PAD] to the pad_token key of the tokenizer
Assigning [CLS] to the cls_token key of the tokenizer
Assigning [MASK] to the mask_token key of the tokenizer
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 1000000
  Num Epochs = 20
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 156260
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
5000,2.0728,1.410241
10000,1.5463,1.3087
15000,1.4491,1.265374
20000,1.4021,1.245277
25000,1.3773,1.233929
30000,1.3637,1.221093
35000,1.3495,1.200732
40000,1.3442,1.203284
45000,1.3337,1.203784
50000,1.3294,1.199546


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-5000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-5000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-5000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-10000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-10000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-10000/pytor

Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-60000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-60000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-60000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-60000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-50000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-65000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-65000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-65000/pytorch_model.bin
tokenizer config file sav

***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-115000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-115000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-115000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-115000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-115000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-105000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_5/checkpoint-120000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenl

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▅▄▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▁
eval/runtime,▃▁▁▂▂▂▂▅▂▃▂▆▂▂▃▆█▂▁▁▄▃▂▂▃▅▁▂▂▂▂
eval/samples_per_second,▆██▇▇▇▇▄▇▆▇▃▇▇▆▂▁▇██▅▆▇▇▆▄█▇▇▇▇
eval/steps_per_second,▆██▇▇▇▇▄▇▆▇▃▇▇▆▂▁▇██▅▆▇▇▆▄█▇▇▇▇
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,███▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,1.15312
eval/runtime,14.3395
eval/samples_per_second,3486.874
eval/steps_per_second,27.267
train/epoch,20.0
train/global_step,156260.0
train/learning_rate,1e-05
train/loss,1.2767
train/total_flos,268745747212800.0
train/train_loss,1.34646


Assigning [UNK] to the unk_token key of the tokenizer
Assigning [SEP] to the sep_token key of the tokenizer
Assigning [PAD] to the pad_token key of the tokenizer
Assigning [CLS] to the cls_token key of the tokenizer
Assigning [MASK] to the mask_token key of the tokenizer
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 1000000
  Num Epochs = 20
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 156260
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
5000,2.6742,2.247338
10000,2.2888,1.983894
15000,2.1238,1.802521
20000,1.9904,1.647753
25000,1.8813,1.552505
30000,1.797,1.480862
35000,1.7346,1.437792
40000,1.6945,1.410568
45000,1.6541,1.392488
50000,1.6205,1.368134


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-5000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-5000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-5000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-10000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-10000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-10000/pytor

Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-60000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-60000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-60000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-60000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-50000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-65000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-65000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/pos_mlm_6/checkpoint-65000/pytorch_model.bin
tokenizer config file sav

## Retrain with the correct tokenizer

In [None]:
# Hyperparameter sweep: trains one small BertForMaskedLM per (num_layers, mlm_p, lr)
# combination on POS-tagged rows and logs every run to Weights & Biases.
# Relies on names defined in earlier cells: get_pos_vocab, get_tokenizer,
# tagger2func and dataset.

# data related args
tagset = 'xpos'
select = 1000000   # number of leading rows of dataset['train'] used for training
reprocess = False  # True forces re-tagging even when a cached copy exists on disk

# bert related args
model_max_length = 128
hidden_size = 32
# num_hidden_layers = 3 
num_attention_heads = 4
intermediate_size = 128

# training related args
control_steps = 5000 # num_steps to log and save
num_train_epochs = 20
batchsize = 128
dropout_prob = 0.1
tagger = 'corenlp'

# sweep variables
# TAGGER = ['corenlp']
# TAGGER = ['hunpos', 'corenlp', 'stanza']
NUM_LAYERS = [4]
MLM_P = [0.15]
LR = [5e-4, 8e-4]

# Expand the per-parameter lists into the full grid, one flat entry per run.
NUM_LAYERS, MLM_P, LR = np.meshgrid(NUM_LAYERS, MLM_P, LR)
NUM_LAYERS, MLM_P, LR = NUM_LAYERS.flatten(), MLM_P.flatten(), LR.flatten()
num_runs = len(LR)


def _load_or_tag_split(cache_path, row_indices):
    """Return the tagged split cached at ``cache_path``, building it first if needed.

    When the cache directory is missing, or the module-level ``reprocess`` flag is
    set, the rows ``row_indices`` of ``dataset['train']`` are mapped through
    ``tagger2func[tagger]`` (batched) and the result is saved to ``cache_path``.
    Otherwise the cached copy is loaded from disk.

    Returns the split with its 'text' column removed.
    """
    if reprocess or not os.path.exists(cache_path):
        split = dataset['train'].select(row_indices).map(tagger2func[tagger], batched=True)
        split.save_to_disk(cache_path)
    else:
        split = load_from_disk(cache_path)
    return split.remove_columns(['text'])


# The tokenizer and both datasets do not depend on any sweep variable, so they are
# prepared once here instead of once per run. This also keeps `reprocess = True`
# from re-tagging and re-saving the same data on every iteration.

# create tokenizer
vocab = get_pos_vocab(tagger, tagset=tagset)
tokenizer = get_tokenizer(vocab, model_max_length = model_max_length)

# transform or load dataset
cache_location = f'/scratch/data_jz17d/data/bookcorpus/bookcorpus_{numerize.numerize(select).lower()}_{tagger}_transformed.hf'
trainset = _load_or_tag_split(cache_location, range(select))

# Evaluation split: the 50000 rows immediately following the training rows.
# NOTE(review): this cache path does not encode `select`, so a copy cached under a
# different `select` would be reused as-is and could overlap the training rows —
# confirm, or set `reprocess = True` after changing `select`.
select2 = range(select,select+50000)
cache_location = f'/scratch/data_jz17d/data/bookcorpus/bookcorpus_50k_{tagger}_transformed_test.hf'
testset = _load_or_tag_split(cache_location, select2)

for i_run in trange(num_runs):

    # Cast numpy scalars to plain Python types for the config objects below.
    num_hidden_layers = int(NUM_LAYERS[i_run])
    mlm_probability = float(MLM_P[i_run])
    lr = float(LR[i_run])

    # mlm data collater
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=True,
        mlm_probability=mlm_probability
    )

    # model config
    config = BertConfig(vocab_size = len(tokenizer.get_vocab()),
                        hidden_size = hidden_size,
                        num_hidden_layers = num_hidden_layers,
                        num_attention_heads = num_attention_heads,
                        intermediate_size = intermediate_size,
                        hidden_act = 'gelu',
                        hidden_dropout_prob = dropout_prob,
                        attention_probs_dropout_prob = dropout_prob,
                        max_position_embeddings = model_max_length,
                        type_vocab_size = 2,
                        initializer_range = 0.02,
                        layer_norm_eps = 1e-12,
                        pad_token_id = tokenizer.pad_token_id)
    # init model (fresh weights for every run)
    bert = BertForMaskedLM(config)

    # trainer config: log, evaluate and checkpoint every `control_steps` steps,
    # keeping only the two most recent checkpoints.
    training_args = TrainingArguments(
        learning_rate=lr,
        output_dir= f"/scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_{i_run}",
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batchsize,
        per_device_eval_batch_size=batchsize,
        evaluation_strategy='steps',
        save_steps=control_steps,
        logging_steps=control_steps,
        eval_steps=control_steps,
        save_total_limit=2,
        prediction_loss_only=True,
        remove_unused_columns=False,
#         report_to='wandb',
        )

    # wandb config
    wconfig = {
        'num_hidden_layers': num_hidden_layers,
        'mlm_probability': mlm_probability,
        'lr': lr,
    }

    # NOTE(review): the run name below does not carry the "retrained" prefix used in
    # output_dir, so these runs may be hard to tell apart from earlier
    # 'pos mlm {i}' runs in the same W&B project — confirm whether that is intended.
    # The context manager closes the W&B run even when training raises, so a failed
    # run is not left open.
    with wandb.init(project="POS MLM CoreNLP", 
                    entity="fsu-dsc-cil", 
                    dir='/scratch/data_jz17d/wandb_tmp/', 
                    config=wconfig,
                    name=f'pos mlm {i_run}',
                    reinit=True) as run:

        trainer = Trainer(
            model=bert,
            args=training_args,
            tokenizer=tokenizer,
            data_collator=data_collator,
            train_dataset=trainset,
            eval_dataset=testset,
        )
        trainer.train()

  0%|          | 0/2 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mcpuyyp[0m ([33mfsu-dsc-cil[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 1000000
  Num Epochs = 20
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 156260
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
5000,2.1977,1.503165
10000,1.6216,1.332788
15000,1.4802,1.282856
20000,1.4228,1.256439
25000,1.3911,1.235706
30000,1.3717,1.220973
35000,1.3515,1.200947
40000,1.3414,1.197688
45000,1.3279,1.200239
50000,1.3202,1.194722


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-5000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-5000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-5000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-10000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-10000/config.json
Model weights saved in /scra

Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-55000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-55000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-55000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-45000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-60000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-60000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-60000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/po

Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-105000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-105000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-105000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-95000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-110000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-110000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-110000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/res

Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-155000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-155000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-155000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_0/checkpoint-145000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▅▄▃▃▃▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▃▁▁▂▂▂▁▂▂▁▂▂▂▅▂█▆▂▁▂▃▆▂▂▂▁▂▃▂▂▂
eval/samples_per_second,▆██▇▇▇█▇▇█▇▇▇▄▇▁▃▇█▇▆▂▇▇▇█▇▆▇▇▇
eval/steps_per_second,▆██▇▇▇█▇▇█▇▇▇▄▇▁▃▇█▇▆▂▇▇▇█▇▆▇▇▇
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,███▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,1.14494
eval/runtime,14.433
eval/samples_per_second,3464.28
eval/steps_per_second,27.091
train/epoch,20.0
train/global_step,156260.0
train/learning_rate,0.0
train/loss,1.2571
train/total_flos,355516133955840.0
train/train_loss,1.34217


Assigning [UNK] to the unk_token key of the tokenizer
Assigning [SEP] to the sep_token key of the tokenizer
Assigning [PAD] to the pad_token key of the tokenizer
Assigning [CLS] to the cls_token key of the tokenizer
Assigning [MASK] to the mask_token key of the tokenizer
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 1000000
  Num Epochs = 20
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 156260
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
5000,2.0299,1.427583
10000,1.5466,1.307512
15000,1.4388,1.266698
20000,1.3867,1.239994
25000,1.357,1.218349
30000,1.3403,1.204897
35000,1.3243,1.189534
40000,1.3167,1.187331
45000,1.3049,1.189532
50000,1.299,1.185083


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-5000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-5000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-5000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-10000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-10000/config.json
Model weights saved in /scra

Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-55000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-55000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-55000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-45000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-60000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-60000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-60000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/po

Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-105000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-105000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-105000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-95000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-110000
Configuration saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-110000/config.json
Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-110000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/res

Model weights saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-155000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-155000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-155000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/pos_mlm_corenlp/retrained_pos_mlm_1/checkpoint-145000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▅▄▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁
eval/runtime,▆▇▆▆▆▆▆▆█▂▃▂▁▂▂▁▅▂▁▁▂▅▁▁▆▁▁▁▁▂▁
eval/samples_per_second,▃▂▃▃▃▃▃▃▁▇▆▇█▇▇█▄▇██▇▄██▃████▇█
eval/steps_per_second,▃▂▃▃▃▃▃▃▁▇▆▇█▇▇█▄▇██▇▄██▃████▇█
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,███▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▄▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,1.13185
eval/runtime,14.4942
eval/samples_per_second,3449.664
eval/steps_per_second,26.976
train/epoch,20.0
train/global_step,156260.0
train/learning_rate,1e-05
train/loss,1.2366
train/total_flos,355516133955840.0
train/train_loss,1.31382


In [None]:
# Build the path of the cached dataset directory. The name embeds the numerized,
# lower-cased value of `select` and the `tagger` name; both must already be
# defined by earlier cells. Presumably this is the POS-transformed BookCorpus
# subset written earlier in the notebook — confirm against the cell that saves it.
cache_location = (
    f'/scratch/data_jz17d/data/bookcorpus/bookcorpus_{numerize.numerize(select).lower()}_{tagger}_transformed.hf'
)

# Reload the saved dataset from disk into `trainset`.
trainset = load_from_disk(dataset_path=cache_location)