In [1]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

In [2]:
# Directory holding the raw text corpora, relative to the notebook location.
corpora_path = Path("..", "data", "corpora")
corpora_fname = "corpus"

# Collect every Quenya corpus file as a plain string path.
# The list is sorted because glob order depends on the filesystem, and the
# tokenizer should see the training files in the same order on every run.
paths = sorted(str(x) for x in corpora_path.glob("*_qya.txt"))
paths

['..\\data\\corpora\\corpus_qya.txt']

In [2]:
# Directory where the trained BPE tokenizer files are stored.
bpe_path = Path("..") / "data" / "models" / "bpe"

In [4]:
# Special tokens handed to the trainer, in the order they are listed here.
bpe_special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# Start from an untrained byte-level BPE tokenizer ...
tokenizer = ByteLevelBPETokenizer()

# ... and fit it on the corpus files collected in `paths`.
tokenizer.train(
    files=paths,
    vocab_size=10_000,
    min_frequency=2,
    special_tokens=bpe_special_tokens,
    show_progress=True,
)

In [13]:
# Target file for the single-file (JSON) serialization of the tokenizer.
modelpath = bpe_path / "quenya.json"

In [6]:
# Make sure the target directory exists before anything is written into it.
modelpath.parent.mkdir(parents=True, exist_ok=True)

# Single-file serialization of the tokenizer (quenya.json).
tokenizer.save(modelpath.as_posix())

# Also write the vocab/merges pair ("quenya-vocab.json", "quenya-merges.txt").
# Later cells load the tokenizer from those two files by name, and `save`
# alone only writes the single JSON file above.
tokenizer.save_model(modelpath.parent.as_posix(), "quenya")

In [47]:
# Show the files in the tokenizer directory whose names contain a hyphen.
model_dir = modelpath.parent
list(model_dir.glob("*-*"))

[WindowsPath('../data/models/bpe/quenya-merges.txt'),
 WindowsPath('../data/models/bpe/quenya-vocab.json')]

In [18]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Re-create the byte-level BPE tokenizer from the vocab/merges files on disk.
vocab_file = Path(bpe_path, "quenya-vocab.json").as_posix()
merges_file = Path(bpe_path, "quenya-merges.txt").as_posix()
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)

### Train model from scratch

In [3]:
import torch
# Display whether PyTorch reports an available CUDA device.
torch.cuda.is_available()

True

In [8]:
from transformers import RobertaConfig

# Settings for the RoBERTa configuration. `vocab_size` is the same value that
# is passed to `tokenizer.train` when the BPE vocabulary is built.
roberta_settings = {
    "vocab_size": 10_000,
    "max_position_embeddings": 514,
    "num_attention_heads": 12,
    "num_hidden_layers": 6,
    "type_vocab_size": 1,
}
config = RobertaConfig(**roberta_settings)

In [5]:
from tokenizers.implementations import ByteLevelBPETokenizer as Tokenizer

In [37]:
from transformers import PreTrainedTokenizerFast, RobertaTokenizer

# Wrap the trained vocab/merges files in a transformers RobertaTokenizer.
# An alternative tried earlier was loading the single-file tokenizer with
# PreTrainedTokenizerFast(tokenizer_file=modelpath.as_posix()).
roberta_vocab = Path(bpe_path, "quenya-vocab.json").as_posix()
roberta_merges = Path(bpe_path, "quenya-merges.txt").as_posix()
tokenizer = RobertaTokenizer(
    vocab_file=roberta_vocab,
    merges_file=roberta_merges,
)

In [15]:
from transformers import RobertaForMaskedLM

# Build a masked-LM RoBERTa model directly from the configuration object
# (no pretrained checkpoint is loaded here).
model = RobertaForMaskedLM(config)

### Build training dataset

In [16]:
from transformers import LineByLineTextDataset

# Corpus file used for training.
corpus_file = Path("..", "data", "corpora", "corpus_qya.txt")

# Build the training dataset from the corpus file with the tokenizer above.
dataset = LineByLineTextDataset(
    file_path=corpus_file,
    tokenizer=tokenizer,
    block_size=128,
)

In [17]:
# Role -> token mapping registered on the tokenizer. The value displayed by
# this cell is whatever `add_special_tokens` returns.
special_token_roles = dict(
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    sep_token="</s>",
    pad_token="<pad>",
    cls_token="<s>",
    mask_token="<mask>",
)
tokenizer.add_special_tokens(special_token_roles)

0

In [24]:
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
#from transformers.utils import 

In [132]:
from transformers.file_utils import PaddingStrategy

In [25]:
# Batch collator for masked-language-model training: selects tokens for
# masking with probability 0.15 and builds the matching labels.
#
# DataCollatorForLanguageModeling is used instead of
# DataCollatorForWholeWordMask because the whole-word variant groups sub-tokens
# via BERT's "##" continuation prefix. The byte-level BPE vocabulary used here
# has no such prefix, so token-level masking is the collator that matches this
# tokenizer.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [53]:
from transformers import Trainer, TrainingArguments

# Training settings; the output directory is the same path that later cells
# use for saving and re-loading the model.
trainer_settings = dict(
    output_dir="../data/models/trained",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    save_steps=1_000,
    save_total_limit=2,
    prediction_loss_only=True,
)
training_args = TrainingArguments(**trainer_settings)

# Tie together the model, the dataset and the batch collator.
trainer = Trainer(
    train_dataset=dataset,
    data_collator=data_collator,
    args=training_args,
    model=model,
)

In [143]:
import torch

In [145]:
# Ask PyTorch to release its cached CUDA memory before training starts.
torch.cuda.empty_cache()

In [54]:
# Run training with the Trainer configured in the previous cells.
trainer.train()

Step,Training Loss
500,6.2926
1000,6.3618
1500,6.2004
2000,6.0398


TrainOutput(global_step=2480, training_loss=6.17513417889995, metrics={'train_runtime': 158.3334, 'train_samples_per_second': 15.663, 'total_flos': 207235437936960.0, 'epoch': 10.0})

In [55]:
# Save the model to the same directory that was passed as the Trainer's
# `output_dir`; the fill-mask pipeline loads it from this path.
trainer.save_model("../data/models/trained")

In [56]:
# Drop the Trainer reference; the model is loaded again from disk by path when
# the pipeline is created. Presumably this is meant to free memory — confirm.
del trainer

### Check the model

In [18]:
from transformers import pipeline

# Fill-mask pipeline over the model saved on disk, using the in-memory
# tokenizer object.
trained_model_dir = "../data/models/trained"
fill_mask = pipeline("fill-mask", model=trained_model_dir, tokenizer=tokenizer)



In [19]:
# Encode a sample sentence to inspect the resulting token ids.
# NOTE(review): the recorded output holds five ids and none of them is 0 or 2,
# so it does not appear to include <s>/</s> markers — confirm which tokenizer
# object produced it and re-run on a fresh kernel.
tokenizer.encode("mal Eru antane i alie")

[819, 337, 857, 264, 7413]

In [20]:
# Display the special-token roles currently registered on the tokenizer.
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [23]:
# Index of the word to hide; 0 masks the first word of the sentence.
replace_at = 0

# Split on single spaces, swap the chosen word for the tokenizer's mask token,
# then ask the pipeline for its candidate fillers.
sentence = "mal Eru antane i alie".split(" ")
sentence[replace_at] = tokenizer.mask_token
masked_text = " ".join(sentence)

fill_mask(masked_text)

[{'sequence': 'An Eru antane i alie',
  'score': 0.1441660374403,
  'token': 406,
  'token_str': 'An'},
 {'sequence': 'Mal Eru antane i alie',
  'score': 0.09933653473854065,
  'token': 357,
  'token_str': 'Mal'},
 {'sequence': 'Ar Eru antane i alie',
  'score': 0.09197397530078888,
  'token': 352,
  'token_str': 'Ar'},
 {'sequence': 'ar Eru antane i alie',
  'score': 0.059607118368148804,
  'token': 261,
  'token_str': 'ar'},
 {'sequence': 'Etta Eru antane i alie',
  'score': 0.03197641298174858,
  'token': 488,
  'token_str': 'Etta'}]