# Train language model from scratch using Transformers and Tokenizers

We’ll use the Esperanto portion of the OSCAR corpus from INRIA

In [10]:
!wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt

--2021-02-16 17:40:02--  https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt
Resolving cdn-datasets.huggingface.co (cdn-datasets.huggingface.co)... 13.225.103.105, 13.225.103.115, 13.225.103.85, ...
Connecting to cdn-datasets.huggingface.co (cdn-datasets.huggingface.co)|13.225.103.105|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [12]:
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-h0vkuc94
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-h0vkuc94
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (PEP 517) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.4.0.dev0-cp36-none-any.whl size=1824852 sha256=4249f53c1ef6666919db9cdc8119f0a463890e87f597b44b5a3100f427296ead
  Stored in directory: /tmp/pip-ephem-wheel-cache-a78rq4p1/wheels/70/d3/52/b3fa4f8b8ef04167ac62e5bb2accb62ae764db2a378247490e
Successfully built transformers
tokenizers                    0.10.1         
transformers                  4.4.0.dev0     


We are training a byte-level Byte-pair encoding tokenizer. 

In [2]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

In [3]:
paths = [str(x) for x in Path(".").glob("**/*.txt")]

In [4]:
paths

['oscar.eo.txt', 'oscarBERT/merges.txt']

In [5]:
tokenizer = ByteLevelBPETokenizer()

In [None]:
%%time
tokenizer.train(files=paths, vocab_size = 20000, min_frequency=2, special_tokens=[
      "<s>",
      "<pad>",
      "</s>",
      "<unk>",
      "<mask>",                                                                         
])

In [None]:
!mkdir oscarBERT
tokenizer.save_model("oscarBERT")

vocab.json: List of the most frequent tokens ranked by frequency.
merges.txt : list of merges

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [None]:
tokenizer = ByteLevelBPETokenizer(
    "./oscarBERT/vocab.json",
    "./oscarBERT/merges.txt"
)

In [None]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

In [None]:
tokenizer.encode("Mi estas Julien.")

In [None]:
tokenizer.encode("Mi estas Julien.").tokens

In [None]:
import torch
from transformers import RobertaConfig
from transformers import RobertaTokenizerFast

In [None]:
config = RobertaConfig(
    vocab_size = 20000,
    max_position_embeddings = 514,
    num_attention_heads = 12,
    num_hidden_layers = 6,
    type_vocab_size = 1,
)

In [None]:
tokenizer = RobertaTokenizerFast.from_pretrained('./oscarBERT',max_len=512)

Initialize our model only from Config as we are training it from scratch

In [None]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [None]:
model.num_parameters()

Build Training Dataset

In [None]:
%%time

from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = "./oscar.eo.txt",
    block_size=64,
)

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenzier = tokenizer, mlm=True, mlm_probability=0.20
)

Initialize the Trainer

In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
training_args = TrainingArguments(
    output_dir = './oscarBERT',
    overwrite_output_dir = True,
    num_train_epochs = 1,
    per_gpu_train_batch_size = 64,
    save_steps = 5000,
    save_total_limit = 2,
)

trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = dataset,
    prediction_loss_only = True,
)

# Start Training

In [None]:
%%time
trainer.train()

In [None]:
trainer.save_model("./oscarBERT")

Check the model

In [None]:
from transformers import pipeline

In [None]:
fill_mask = pipeline(
    "fill-mask",
    model = "./oscarBERT",
    tokenizer = "./oscarBERT"
)

In [None]:
fill_mask("Just lister <mask")