In [None]:
from tokenizers import Tokenizer
from tokenizers.trainers import WordLevelTrainer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers import BertForMaskedLM, BertConfig
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser 
from nltk.tag.hunpos import HunposTagger
from nltk.tokenize import word_tokenize
import stanza
import nltk
import numpy as np
from datasets import load_dataset, load_from_disk
from numerize import numerize
import wandb
import os 
import typing
import tokenizers
from tqdm.auto import trange, tqdm

In [None]:
# get tagset 
def get_pos_vocab(tagger, tagset='xpos'):
    """Return the list of POS tags used as the tokenizer vocabulary.

    Args:
        tagger: Name of the tagger that produced the corpus. Accepted so
            existing call sites keep working; it does not influence the
            returned vocabulary.
        tagset: ``'xpos'`` returns the sorted keys of NLTK's
            ``help/tagsets/upenn_tagset.pickle`` resource; any other value
            returns the reduced UPOS list below.

    Returns:
        A new list of tag strings.
    """
    if tagset == 'xpos':
        # Imported and loaded only on this branch so that building the UPOS
        # vocabulary does not require NLTK or its tagset data.
        from nltk.data import load
        tagdict = load('help/tagsets/upenn_tagset.pickle')
        return sorted(tagdict.keys())

    # if use the simple conversion, upos vocab is smaller: the full 17-tag
    # UPOS inventory also contains AUX and SCONJ, which the simple
    # XPOS -> UPOS mapping never produces.
    return ['ADJ',
            'ADP',
            'ADV',
            'CCONJ',
            'DET',
            'INTJ',
            'NOUN',
            'NUM',
            'PART',
            'PRON',
            'PROPN',
            'PUNCT',
            'SYM',
            'VERB',
            'X']


In [None]:
# Context-free Penn Treebank (XPOS) -> Universal POS (UPOS) lookup table.
# Source table: https://universaldependencies.org/tagset-conversion/en-penn-uposf.html
# A faithful conversion is impossible without looking at the context, see Manning's
# remark: https://github.com/UniversalDependencies/docs/issues/212#issuecomment-148846154
# How to use Manning's converter instead:
# https://github.com/clulab/processors/wiki/Converting-from-Penn-Treebank-to-Basic-Stanford-Dependencies
#
# The table is written grouped by UPOS target and then flattened; sorting the
# (xpos, upos) pairs keeps the keys in ascending order.
xpos2upos = dict(sorted(
    (xpos_tag, upos_tag)
    for upos_tag, xpos_tags in {
        'SYM': ('#', '$', 'SYM'),
        # 'NFP' was manually added.
        'PUNCT': ("''", ',', '-LRB-', '-RRB-', '.', ':', 'HYPH', 'NFP', '``'),
        'ADJ': ('AFX', 'JJ', 'JJR', 'JJS'),
        'CCONJ': ('CC',),
        'NUM': ('CD',),
        'DET': ('DT', 'PDT', 'PRP$', 'WDT', 'WP$'),
        'PRON': ('EX', 'PRP', 'WP'),
        'X': ('FW', 'LS', 'NIL'),
        'ADP': ('IN', 'RP'),
        'VERB': ('MD', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
        'NOUN': ('NN', 'NNS'),
        'PROPN': ('NNP', 'NNPS'),
        'PART': ('POS', 'TO'),
        'ADV': ('RB', 'RBR', 'RBS', 'WRB'),
        'INTJ': ('UH',),
    }.items()
    for xpos_tag in xpos_tags
))

In [None]:
def get_tokenizer(vocab, model_max_length = 128,
                  tokenizer_file='/home/jz17d/Desktop/upos_tokenizer.json'):
    """Build a word-level tokenizer over POS tags and wrap it for transformers.

    Args:
        vocab: Iterable of tag strings to add to the tokenizer as tokens.
        model_max_length: Truncation length for the underlying tokenizer and
            ``model_max_length`` of the returned wrapper.
        tokenizer_file: Path the intermediate ``tokenizers`` JSON is written
            to and reloaded from. Defaults to the location used previously.

    Returns:
        A ``PreTrainedTokenizerFast`` with the BERT-style special tokens
        ``[UNK]``, ``[SEP]``, ``[PAD]``, ``[CLS]`` and ``[MASK]`` registered.

    Side effects:
        Writes (overwrites) ``tokenizer_file``.
    """
    # Tokenizer is from tokenizers package. PreTrainedTokenizerFast is from transformers package.
    # PreTrainedTokenizerFast can load vocab saved/trained by Tokenizer
    t = Tokenizer(WordLevel(unk_token="[UNK]"))
    t.pre_tokenizer = Whitespace()
    t.add_special_tokens(["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]",])

    # The tags are added directly instead of being learned with a
    # WordLevelTrainer, because the trainer makes "-LRB-" 3 tokens.
    t.add_tokens(vocab)

    # Wrap every sequence as "[CLS] ... [SEP]" (and the BERT pair layout).
    t.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", t.token_to_id("[CLS]")),
            ("[SEP]", t.token_to_id("[SEP]")),
        ],
    )

    t.enable_padding(pad_id=t.token_to_id("[PAD]"), pad_token="[PAD]")
    t.enable_truncation(max_length=model_max_length)
    t.save(tokenizer_file)

    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file, unk_token="[UNK]")
    # add_special_tokens links each token to the matching tokenizer attribute,
    # so pad/mask/unk do not need to be assigned separately beforehand.
    special_tokens = {
         "unk_token": "[UNK]",
         "sep_token": "[SEP]",
         "pad_token": "[PAD]",
         "cls_token": "[CLS]",
         "mask_token": "[MASK]" }
    tokenizer.add_special_tokens(special_tokens)
    tokenizer.model_max_length=model_max_length
    return tokenizer

In [None]:
# load xpos tokenizer, use its convert_ids_to_tokens function later
# model_max_length is defined here because this cell runs before the
# configuration cell that assigns it; without it a fresh kernel hits a
# NameError on the last line. Keep the value in sync with that cell.
model_max_length = 128

xpos_tokenizer = PreTrainedTokenizerFast(tokenizer_file="/home/jz17d/Desktop/my_tokenizer.json", unk_token="[UNK]")
# add_special_tokens links each token to the matching tokenizer attribute
# (pad_token, mask_token, ...), so no separate attribute assignment is needed.
special_tokens = {
     "unk_token": "[UNK]",
     "sep_token": "[SEP]",
     "pad_token": "[PAD]",
     "cls_token": "[CLS]",
     "mask_token": "[MASK]" }
xpos_tokenizer.add_special_tokens(special_tokens)
xpos_tokenizer.model_max_length=model_max_length

In [None]:
def xpos2upos_batch(samples):
    """Re-encode a batch of XPOS id sequences as UPOS ids.

    Each sequence in ``samples['input_ids']`` is decoded to tokens with the
    module-level ``xpos_tokenizer``, every token found in ``xpos2upos`` is
    replaced by its UPOS tag (other tokens are kept unchanged), and the
    result is encoded again with the module-level ``tokenizer``.

    Args:
        samples: Batch mapping with an ``'input_ids'`` entry holding one id
            sequence per example.

    Returns:
        The same ``samples`` object, with ``'input_ids'`` replaced.
    """
    converted = []
    for xpos_ids in samples['input_ids']:
        xpos_tags = xpos_tokenizer.convert_ids_to_tokens(xpos_ids)
        upos_tags = [xpos2upos.get(tag, tag) for tag in xpos_tags]
        converted.append(tokenizer.convert_tokens_to_ids(upos_tags))
    samples['input_ids'] = converted
    return samples

In [None]:
# collect tagset and create tokenizer
# Settings shared by the cells below.
tagger, tagset = 'corenlp', 'upos'
select = 1_000_000  # used to build the training-corpus cache file name
model_max_length = 128

vocab = get_pos_vocab(tagger, tagset)
tokenizer = get_tokenizer(vocab, model_max_length=model_max_length)

In [None]:
# load processed xpos corpus, convert it to upos corpus
# Source (XPOS) and destination (UPOS) cache directories for the training split.
xpos_cache_location = f'/scratch/data_jz17d/data/bookcorpus/bookcorpus_{numerize.numerize(select).lower()}_{tagger}_transformed.hf'
upos_cache_location = f'/scratch/data_jz17d/data/bookcorpus/bookcorpus_{numerize.numerize(select).lower()}_{tagger}_upos_transformed.hf'

# Re-encode every batch, then persist the converted corpus.
trainset = load_from_disk(xpos_cache_location).map(xpos2upos_batch, batched=True)
trainset.save_to_disk(upos_cache_location)

# The 'text' column is dropped only after saving, so the cached copy keeps it.
trainset = trainset.remove_columns(['text'])

  0%|          | 0/1000 [00:00<?, ?ba/s]

In [None]:
# same for test set
# Source (XPOS) and destination (UPOS) cache directories for the test split.
xpos_cache_location = f'/scratch/data_jz17d/data/bookcorpus/bookcorpus_50k_{tagger}_transformed_test.hf'
upos_cache_location = f'/scratch/data_jz17d/data/bookcorpus/bookcorpus_50k_{tagger}_upos_transformed_test.hf'

# Re-encode every batch, then persist the converted corpus.
testset = load_from_disk(xpos_cache_location).map(xpos2upos_batch, batched=True)
testset.save_to_disk(upos_cache_location)

# The 'text' column is dropped only after saving, so the cached copy keeps it.
testset = testset.remove_columns(['text'])

  0%|          | 0/50 [00:00<?, ?ba/s]

# training

In [None]:

# Hyper-parameter sweep: trains one small BERT masked-LM per grid point on the
# UPOS corpus and logs each run to Weights & Biases.

# bert related args
model_max_length = 128
hidden_size = 32
num_attention_heads = 4
intermediate_size = 128

# training related args
control_steps = 5000 # num_steps to log and save
num_train_epochs = 20
batchsize = 128
dropout_prob = 0.1
tagger = 'corenlp'  # not referenced in this cell; the output dir below is hard-coded

# Values to sweep over; every combination becomes one run.
NUM_LAYERS = [4]
MLM_P = [0.15]
LR = [5e-4, 7e-4, 9e-4]

# meshgrid + flatten expands the three lists into aligned 1-D arrays with one
# entry per combination.
NUM_LAYERS, MLM_P, LR = np.meshgrid(NUM_LAYERS, MLM_P, LR)
NUM_LAYERS, MLM_P, LR = NUM_LAYERS.flatten(), MLM_P.flatten(), LR.flatten()
num_runs = len(LR)

for i_run in trange(num_runs):
    
    # Cast numpy scalars to plain Python numbers before handing them on.
    num_hidden_layers = int(NUM_LAYERS[i_run])
    mlm_probability = float(MLM_P[i_run])
    lr = float(LR[i_run])
    
    # mlm data collater
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=True,
        mlm_probability=mlm_probability
    )

    # model config
    config = BertConfig(vocab_size = len(tokenizer.get_vocab()),
                        hidden_size = hidden_size,
                        num_hidden_layers = num_hidden_layers,
                        num_attention_heads = num_attention_heads,
                        intermediate_size = intermediate_size,
                        hidden_act = 'gelu',
                        hidden_dropout_prob = dropout_prob,
                        attention_probs_dropout_prob = dropout_prob,
                        max_position_embeddings = model_max_length,
                        type_vocab_size = 2,
                        initializer_range = 0.02,
                        layer_norm_eps = 1e-12,
                        pad_token_id = tokenizer.pad_token_id)
    # init model (fresh random weights for every run)
    bert = BertForMaskedLM(config)

    # trainer config: log, evaluate and checkpoint every `control_steps` steps
    training_args = TrainingArguments(
        learning_rate=lr,
        output_dir= f"/scratch/data_jz17d/result/upos_mlm_corenlp/run_{i_run}",
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batchsize,
        per_device_eval_batch_size=batchsize,
        evaluation_strategy='steps',
        save_steps=control_steps,
        logging_steps=control_steps,
        eval_steps=control_steps,
        save_total_limit=2,
        prediction_loss_only=True,
        remove_unused_columns=False,
#         report_to='wandb',
        )
    
    # wandb config
    wconfig = {}
    wconfig['num_hidden_layers'] = num_hidden_layers
    wconfig['mlm_probability'] = mlm_probability
    wconfig['lr'] = lr

    # The run is used as a context manager so it is closed even when building
    # the Trainer or training raises; previously run.finish() was skipped on
    # any exception and the run stayed open.
    with wandb.init(project="POS MLM UPOS (simple conversion)", 
                    entity="fsu-dsc-cil", 
                    dir='/scratch/data_jz17d/wandb_tmp/', 
                    config=wconfig,
                    name=f'run {i_run}',
                    reinit=True) as run:

        trainer = Trainer(
            model=bert,
            args=training_args,
            tokenizer=tokenizer,
            data_collator=data_collator,
            train_dataset=trainset,
            eval_dataset=testset,
        )
        trainer.train()

  0%|          | 0/3 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mcpuyyp[0m ([33mfsu-dsc-cil[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 1000000
  Num Epochs = 20
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 156260
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
5000,1.6974,1.271106
10000,1.3782,1.181196
15000,1.2911,1.128774
20000,1.2348,1.113014
25000,1.2053,1.093791
30000,1.1889,1.081578
35000,1.1743,1.068646
40000,1.1647,1.06644
45000,1.1554,1.069607
50000,1.1486,1.061451


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-5000
Configuration saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-5000/config.json
Model weights saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-5000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-10000
Configuration saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-10000/config.json
Model weights saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-10000/pytorch_model.bin
tokenizer c

Model weights saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-60000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-60000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-60000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-50000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-65000
Configuration saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-65000/config.json
Model weights saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-65000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-65000/tokenizer_config.json
Special tokens file saved in /s

tokenizer config file saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-115000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-115000/special_tokens_map.json
Deleting older checkpoint [/scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-105000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-120000
Configuration saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-120000/config.json
Model weights saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-120000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-120000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_0/checkpoint-120000/special_tokens_map.json
Deleting ol

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▅▄▄▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▃▅▂▁▃▅▂▆▇▆▅▆▂▁▁▃▇▅▃▂▃▂▂▁▄▄▂▃█▃▆
eval/samples_per_second,▆▄▇█▆▃▆▃▂▃▄▂▇██▆▂▄▆▇▆▇▇█▅▅▇▆▁▆▃
eval/steps_per_second,▆▄▇█▆▃▆▃▂▃▄▂▇██▆▂▄▆▇▆▇▇█▅▅▇▆▁▆▃
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,███▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,1.01978
eval/runtime,15.3638
eval/samples_per_second,3254.393
eval/steps_per_second,25.449
train/epoch,20.0
train/global_step,156260.0
train/learning_rate,0.0
train/loss,1.0974
train/total_flos,355277097077760.0
train/train_loss,1.16102


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


***** Running training *****
  Num examples = 1000000
  Num Epochs = 20
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 156260
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
5000,1.654,1.226765
10000,1.3146,1.14055
15000,1.2207,1.099715
20000,1.1828,1.084653
25000,1.1631,1.070345
30000,1.1521,1.059517
35000,1.1406,1.0481
40000,1.1348,1.048019
45000,1.1273,1.051631
50000,1.1223,1.043344


***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/upos_mlm_corenlp/run_1/checkpoint-5000
Configuration saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_1/checkpoint-5000/config.json
Model weights saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_1/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_1/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_1/checkpoint-5000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
Saving model checkpoint to /scratch/data_jz17d/result/upos_mlm_corenlp/run_1/checkpoint-10000
Configuration saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_1/checkpoint-10000/config.json
Model weights saved in /scratch/data_jz17d/result/upos_mlm_corenlp/run_1/checkpoint-10000/pytorch_model.bin
tokenizer c