In [None]:
from google.colab import drive

# Mount Google Drive into the Colab file system; the working directory is set inside it below
mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
# Change the present working directory to the parent directory for WikiText
import os
os.chdir('/content/drive/MyDrive/Data/')
!ls

In [None]:
!pip install transformers

In [None]:
!pip install datasets

### **Section 1: Generate byte-level BPE tokens for a corpus**
**Section 1.1: Train the tokenizer**

In [None]:
# Download the Esperanto data set
!wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Start from an untrained Byte-Pair Encoding model; "<unk>" is registered as its unknown token
bpe_model = BPE(unk_token="<unk>")
tokenizer = Tokenizer(bpe_model)

In [None]:
import tokenizers

# Pre-tokenize at byte level, which turns the BPE model above into a byte-level BPE tokenizer
byte_level_pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel()
tokenizer.pre_tokenizer = byte_level_pre_tokenizer

In [None]:
# Cap every encoded sequence at 512 tokens
tokenizer.enable_truncation(max_length=512)

In [None]:
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

trainer = BpeTrainer(
    # Stated explicitly (30,000 is also the library default) because RobertaConfig.vocab_size
    # in Section 2 has to match the size of the trained vocabulary
    vocab_size=30000,
    # Order matters: the tokens receive ids 0..4 in this order
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    # Seed the vocabulary with every byte-level symbol so that characters which never occur
    # in the training corpus (e.g. emoji) can still be encoded instead of mapping to <unk>
    initial_alphabet=ByteLevel.alphabet(),
)

In [None]:
# The tokenizer is trained on the single corpus file "oscar.eo.txt",
# which is expected to be present in the directory "Esperanto_Input"
corpus_path = "./Esperanto_Input/oscar.eo.txt"
files = [corpus_path]

In [None]:
# Learn the BPE merges and vocabulary from the corpus file(s)
tokenizer.train(files=files, trainer=trainer)

In [None]:
from tokenizers import decoders

# Attach a byte-level decoder so that decoded text is shown in the original language.
# Without it, the garbled byte-level symbols visible when "tokenizer.json" is opened
# in a text editor would also appear in the decoded output.
byte_level_decoder = decoders.ByteLevel()
tokenizer.decoder = byte_level_decoder

In [None]:
import os
from tokenizers.processors import RobertaProcessing

# Store RoBERTa's <s> ... </s> post-processing inside tokenizer.json itself, so that every
# consumer of the saved file (including the tokenizer loaded in Section 2) wraps sequences in
# the special tokens; otherwise only `tokenizer_load` in Section 1.2 would have it attached
tokenizer.post_processor = RobertaProcessing(
    sep=("</s>", tokenizer.token_to_id("</s>")),
    cls=("<s>", tokenizer.token_to_id("<s>")),
)

# Create the output directory first: it may not exist yet on a fresh Drive folder
os.makedirs("./Esperanto_Output", exist_ok=True)
tokenizer.save("./Esperanto_Output/tokenizer.json")

**Section 1.2: Load and test the tokenizer**


In [None]:
# Reload the tokenizer from the file written in Section 1.1
saved_tokenizer_path = "./Esperanto_Output/tokenizer.json"
tokenizer_load = Tokenizer.from_file(saved_tokenizer_path)

In [None]:
# Add RoBERTa's special tokens at the beginning (<s>) and end (</s>) of each sentence
sep_token = ("</s>", tokenizer_load.token_to_id("</s>"))
cls_token = ("<s>", tokenizer_load.token_to_id("<s>"))
tokenizer_load.post_processor = tokenizers.processors.RobertaProcessing(sep=sep_token, cls=cls_token)

In [None]:
# Encode a sentence pair and inspect the resulting tokens
first_sentence = "Hello, y'all!"
second_sentence = "How are you 😁 ?"
output = tokenizer_load.encode(first_sentence, second_sentence)
print(output.tokens)

In [None]:
# Type ids of the encoded sentence pair, one per token
pair_type_ids = output.type_ids
print(pair_type_ids)

In [None]:
# Decode the ids back to text.
# If the output shows garbled byte-level characters, attach a byte-level decoder first:
#   from tokenizers import decoders
#   tokenizer_load.decoder = decoders.ByteLevel()
encoded_ids = output.ids
tokenizer_load.decode(encoded_ids)

### **Section 2: Train a masked LM using the tokenizer trained & saved in Section 1**

In [None]:
from transformers import RobertaConfig

# Architecture settings of the small RoBERTa model trained below
roberta_settings = dict(
    # 30K matches the tokenizer, which was trained with its default vocabulary size of 30K
    vocab_size=30000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
config = RobertaConfig(**roberta_settings)

In [None]:
# Write the model configuration to config.json next to the saved tokenizer
config_path = './Esperanto_Output/config.json'
config.to_json_file(config_path)

In [None]:
from transformers import RobertaTokenizerFast

# class transformers.RobertaTokenizerFast(vocab_file=None, merges_file=None, tokenizer_file=None, errors='replace'
#                                       , bos_token='<s>', eos_token='</s>', sep_token='</s>', cls_token='<s>', unk_token='<unk>', pad_token='<pad>'
#                                       , mask_token='<mask>', add_prefix_space=False, **kwargs)

In [None]:
# Load the tokenizer trained in Section 1 from its output folder
tokenizer_dir = "./Esperanto_Output"
tokenizer_new = RobertaTokenizerFast.from_pretrained(tokenizer_dir)

In [None]:
from transformers import RobertaForMaskedLM

# Build the masked-LM model from the configuration alone (no pretrained checkpoint is loaded here)
model = RobertaForMaskedLM(config)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked language modelling, with a masking probability of 0.15
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_new,
    mlm=True,
    mlm_probability=0.15,
)

In [None]:
# Split the file into multple sub-files so that the model can be trained in less than 1 hr using a free Google Colab account
!mkdir ./shards
!split -a 40 -l 25600 -d "./Esperanto_Input/oscar.eo.txt" ./shards/shard_

In [None]:
import glob

# glob returns matches in arbitrary order; sort them so that files[0] is always the same
# (first) shard and the run is reproducible
files = sorted(glob.glob('./shards/*'))
# files

In [None]:
from datasets import load_dataset

# Train on a single shard only (each shard was split at 25,600 lines).
# To use the whole corpus instead: load_dataset('text', data_files=files, split='train')
single_shard = files[0]
dataset = load_dataset('text', data_files=single_shard, split='train')

In [None]:
def encode(examples):
    """Tokenize a batch of text lines, truncating and padding each one to 512 tokens."""
    batch_texts = examples['text']
    return tokenizer_new(batch_texts, truncation=True, padding='max_length', max_length=512)

# Run "encode" over the whole dataset in batches; each batch is passed in as "examples"
dataset = dataset.map(encode, batched=True)
# Expose only the tokenizer outputs listed here, as torch tensors
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters of the training run; checkpoints are written to ./Esperanto_Output
training_args = TrainingArguments(
    output_dir="./Esperanto_Output",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,  # lowered the batch size from 64
    prediction_loss_only=True,
    save_steps=10_000,
    save_total_limit=2,
)

# Wire the model, the tokenized dataset and the masked-LM collator together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
%%time
# Run the training configured above; %%time reports how long the cell took
# (an example log from an earlier run is kept in the next cell)
trainer.train()

In [None]:
# EXAMPLE OUTPUT for the above cell

# The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: text.
# ***** Running training *****
#   Num examples = 25603
#   Num Epochs = 1
#   Instantaneous batch size per device = 8
#   Total train batch size (w. parallel, distributed & accumulation) = 8
#   Gradient Accumulation steps = 1
#   Total optimization steps = 3201

# [1425/3201 24:12 < 30:12, 0.98 it/s, Epoch 0.44/1]
# Step 	Training Loss
# 500 	7.665600
# 1000 	7.513300

# [3201/3201 54:26, Epoch 1/1]
# Step 	Training Loss
# 500 	7.665600
# 1000 	7.513300
# 1500 	7.345800
# 2000 	7.225700
# 2500 	7.166000
# 3000 	7.074200

# Training completed. Do not forget to share your model on huggingface.co/models =)

# CPU times: user 54min 2s, sys: 11.4 s, total: 54min 14s
# Wall time: 54min 27s

# TrainOutput(global_step=3201, training_loss=7.3185122203022495, metrics={'train_runtime': 3267.2692, 'train_samples_per_second': 7.836, 'train_steps_per_second': 0.98, 'total_flos': 3393922222964736.0, 'train_loss': 7.3185122203022495, 'epoch': 1.0})

In [None]:
# Save the trained weights and configuration
trainer.save_model("./Esperanto_Model")
# Save the tokenizer into the same folder explicitly: the Trainer above was created without a
# tokenizer, so save_model() is not relied on to write the tokenizer files that the fill-mask
# pipeline in Section 3 loads from "./Esperanto_Model"
tokenizer_new.save_pretrained("./Esperanto_Model")

### **Section 3: Load and test the trained masked LM**

In [None]:
from transformers import pipeline

In [None]:
# Build a fill-mask pipeline from the saved model folder.
# Alternative: load the tokenizer from "./Esperanto_Output" instead, i.e.
#   pipeline("fill-mask", model="./Esperanto_Model", tokenizer="./Esperanto_Output")
model_dir = "./Esperanto_Model"
fill_mask = pipeline("fill-mask", model=model_dir, tokenizer=model_dir)

In [None]:
# Ask the model to complete the masked token of a first Esperanto prompt
first_prompt = "La suna <mask>"
fill_mask(first_prompt)

# Example output from an earlier run:
# [{'score': 0.013835701160132885,
#   'sequence': 'La Ġsuna Ġ,',
#   'token': 16,
#   'token_str': ','},
#  {'score': 0.01227512676268816,
#   'sequence': 'La Ġsuna Ġ -',
#   'token': 17,
#   'token_str': '-'},
#  {'score': 0.009938908740878105,
#   'sequence': 'La Ġsuna Ġ :',
#   'token': 30,
#   'token_str': ':'},
#  {'score': 0.008791058324277401,
#   'sequence': 'La Ġsuna Ġ Ġla',
#   'token': 228,
#   'token_str': 'Ġla'},
#  {'score': 0.008423665538430214,
#   'sequence': 'La Ġsuna Ġ Ġkaj',
#   'token': 252,
#   'token_str': 'Ġkaj'}]

In [None]:
# Ask the model to complete the masked token of a second Esperanto prompt
second_prompt = "Mi estas <mask>"
fill_mask(second_prompt)