In [1]:
#!pip install --ignore-installed PyYAML
#!pip install transformers
#!pip install tensorflow
#!pip install datasets

# Train a model from scratch

## Train Tokenizer

In [2]:
# in this notebook we'll only get one of the files (the Oscar one) for the sake of simplicity and performance
#!wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt

In [3]:
%%time 

from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path(".").glob("**/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

CPU times: user 15min 55s, sys: 2min 36s, total: 18min 32s
Wall time: 41.2 s


In [4]:
!mkdir EsperBERTo
tokenizer.save_model("EsperBERTo")

mkdir: cannot create directory ‘EsperBERTo’: File exists


['EsperBERTo/vocab.json', 'EsperBERTo/merges.txt']

### Use Tokenizer

In [5]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    "./EsperBERTo/vocab.json",
    "./EsperBERTo/merges.txt",
)

In [6]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [7]:
tokenizer.encode("Mi estas Julien.")

Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [8]:
tokenizer.encode("Mi estas Julien.").tokens

['<s>', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '</s>']

## Train a model

In [9]:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

True

In [10]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [11]:
!nvidia-smi

Thu Sep  9 11:07:54 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro M2000        Off  | 00000000:AF:00.0  On |                  N/A |
| 56%   43C    P0    23W /  75W |    798MiB /  4040MiB |      2%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [12]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("./EsperBERTo", max_len=512)

In [13]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [14]:
model.num_parameters()
# => 84 million parameters

83504416

In [15]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./oscar.eo.txt",
    block_size=128,
)



CPU times: user 29min 20s, sys: 37.9 s, total: 29min 58s
Wall time: 1min 10s


In [16]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [17]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./EsperBERTo",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [19]:
import wandb
wandb.login()

True

In [None]:
%%time
trainer.train()

In [None]:
trainer.save_model("./EsperBERTo")

In [None]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./EsperBERTo",
    tokenizer="./EsperBERTo"
)

In [None]:
# The sun <mask>.
# =>

fill_mask("La suno <mask>.")

In [None]:
fill_mask("Jen la komenco de bela <mask>.")

# This is the beginning of a beautiful <mask>.
# =>