In [None]:
%pip install tokenizers

from pathlib import Path

from tokenizers import ByteLevelBPETokenizer # type: ignore

paths = [str(x) for x in Path("./eo_data/").glob("**/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

# Save files to disk
tokenizer.save_model(".", "esperberto")


In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    "./esperberto-vocab.json",
    "./esperberto-merges.txt",
)

In [None]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)  # type: ignore
tokenizer.enable_truncation(max_length=256)

In [None]:
tokenizer.encode("Mi estas Julien.")
tokenizer.encode("Mi estas Julien.").tokens

In [6]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=26_000,
    max_position_embeddings=257,
    num_attention_heads=6,
    num_hidden_layers=3,
    type_vocab_size=1,
)

In [2]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained(".", max_len=512)

In [7]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [None]:
model.num_parameters()

In [3]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./eo_data/oscar.eo.txt",
    block_size=128,
)



In [4]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [8]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./EsperBERTo",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


In [9]:
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


  0%|          | 0/15228 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


IndexError: index out of range in self

In [None]:
trainer.save_model("./esperBERTo")

In [None]:
%pip install datasets

In [None]:
import torch

file_path = "./eo_data/oscar.eo.txt"
block_size = 128
with open(file_path, encoding="utf-8") as f:
    lines = [
        line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())
    ]

batch_encoding = tokenizer(
    lines, add_special_tokens=True, truncation=True, max_length=block_size
)
examples = batch_encoding["input_ids"]
examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in examples]

In [None]:
from transformers import pipeline, PreTrainedTokenizerFast, BertModel

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
fill_mask("La suno <mask>.")

In [None]:
fill_mask("Jen la komenco de bela <mask>.")