In [None]:
import wget

In [None]:
# Download the Kant corpus that the tokenizer and the model are trained on.
import os

corpus_url = 'https://raw.githubusercontent.com/Denis2054/Transformers-for-NLP-2nd-Edition/master/Chapter04/kant.txt'
corpus_file = 'F:/datasets/pre-trained-roberta/kant.txt'

# The download cannot be written if the target folder is missing, so create it first.
os.makedirs(os.path.dirname(corpus_file), exist_ok=True)

# Skip the download when the corpus is already on disk, so re-running this
# cell (Restart & Run All) does not fetch the file a second time.
if not os.path.exists(corpus_file):
    wget.download(corpus_url, corpus_file)

In [None]:
from pathlib import Path

In [None]:
from tokenizers import ByteLevelBPETokenizer

In [None]:
# Location of the downloaded Kant corpus; passed to tokenizer.train() further down.
path = 'F:/datasets/pre-trained-roberta/kant.txt'

In [None]:
# List the text files in the dataset folder.
# Path.glob only accepts patterns relative to the path it is called on and
# returns a lazy generator, so glob from the dataset folder itself and
# materialise the matches before printing. The folder is on F:, matching
# every other path in this notebook.
print([str(txt_file) for txt_file in sorted(Path("F:/datasets/pre-trained-roberta").glob("*.txt"))])

In [None]:
# Create a byte-level BPE tokenizer with no vocabulary or merges supplied; it is trained in the next cell.
tokenizer = ByteLevelBPETokenizer()

In [None]:
# Train the tokenizer on the corpus file, reserving the five special tokens
# that the model and the fill-mask pipeline rely on later.
tokenizer.train(
    files=path,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [None]:
# Save the trained tokenizer's files (vocab.json and merges.txt are read back
# from this folder later in the notebook).
import os

# Forward slashes avoid the invalid "\d", "\p" and "\K" escape sequences that a
# backslash-separated, non-raw string literal contains, and match the paths
# used everywhere else in this notebook.
tokenizer_dir = "F:/datasets/pre-trained-roberta/KantaiBERT"

# The folder has to exist before the tokenizer files can be written into it.
os.makedirs(tokenizer_dir, exist_ok=True)

tokenizer.save_model(tokenizer_dir)

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer

In [None]:
from tokenizers.processors import BertProcessing

In [None]:
# Rebuild the tokenizer from the vocabulary and merges files saved above.
tokenizer = ByteLevelBPETokenizer(
    'F:/datasets/pre-trained-roberta/KantaiBERT/vocab.json',
    'F:/datasets/pre-trained-roberta/KantaiBERT/merges.txt',
)

In [None]:
# Let's test the tokenizer: show the tokens it produces for a sample sentence.
tokenizer.encode("Hi, everyone! How you are enjoying this presentation!").tokens

In [None]:
# Display the encoding object for a sample sentence.
# NOTE(review): its repr is expected to include the number of tokens — confirm.
tokenizer.encode("Hello, everyone! How you are enjoying this presentation!")

In [None]:
# Attach a post-processor so encodings get the start ("<s>") and end ("</s>")
# tokens, using the ids the trained tokenizer assigned to them.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)


In [None]:
# Truncate encodings to a maximum length of 512.
tokenizer.enable_truncation(max_length=512)

In [None]:
# Let's encode the sentence again, now that the post-processor and truncation are configured.
tokenizer.encode("Hello, everyone! How you are enjoying this presentation!")

In [None]:
# List the tokens of the same sentence after the post-processor was attached.
tokenizer.encode("Hello, everyone! How you are enjoying this presentation!").tokens

In [None]:
import torch

# Check the specs of the available GPU.
# torch.cuda.get_device_name raises on a machine without a usable CUDA device,
# so only query it when CUDA is available and otherwise report that no GPU
# was found instead of aborting the notebook run.
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device available"
gpu_name

In [None]:
# CUDA (Compute Unified Device Architecture) was developed by NVIDIA to use the
# parallel computing power of GPUs for general-purpose computing.
# Check whether PyTorch reports CUDA as available.
torch.cuda.is_available()


In [None]:
from transformers import RobertaConfig


In [None]:
# Configuration of the RoBERTa model that is built from scratch further down.
config = RobertaConfig(
    vocab_size=52_000,  # same vocabulary size the tokenizer was trained with
    max_position_embeddings=514,  # NOTE(review): presumably 512 tokens plus RoBERTa's position offset — confirm
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [None]:
from transformers import RobertaTokenizer

In [None]:
# Reload the trained tokenizer files through the transformers tokenizer class.
# `model_max_length` is the documented keyword for the maximum sequence length;
# `max_len` is its legacy spelling.
tokenizer = RobertaTokenizer.from_pretrained('F:/datasets/pre-trained-roberta/KantaiBERT', model_max_length=512)

In [None]:
from transformers import RobertaForMaskedLM

In [None]:
# Build the masked-language model from the configuration alone (nothing is loaded from disk here).
model = RobertaForMaskedLM(config=config)

In [None]:
# Print the configuration held by the model.
print(model.config)

In [None]:
# Build the training dataset from the corpus file, tokenized with the tokenizer
# loaded above and using a block size of 128.
# NOTE(review): LineByLineTextDataset is flagged as deprecated by recent
# transformers releases in favour of the `datasets` library — confirm against
# the installed version.
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path='F:/datasets/pre-trained-roberta/kant.txt',
    block_size=128,
)

In [None]:
# Data collator: prepares batches for masked-language modelling (mlm=True)
# with a masking probability of 0.15.
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
# trainer
from transformers import Trainer, TrainingArguments

In [None]:
# Training settings handed to the Trainer in the next cell.
training_args = TrainingArguments(
    output_dir='F:/datasets/pre-trained-roberta/KantaiBERT',  # same folder the tokenizer files were saved to
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
)

In [None]:
# Wire the model, the training settings, the masking collator and the dataset into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
# Pretrain the model on the dataset with the settings configured above.

trainer.train()

In [None]:
# Save the trained model into the same folder that holds the tokenizer files.
trainer.save_model('F:/datasets/pre-trained-roberta/KantaiBERT')

In [1]:
# Load the saved model back from disk.
# NOTE(review): the fill-mask pipeline below is given the folder path directly
# and does not use this `model` variable.
from transformers import RobertaForMaskedLM
model = RobertaForMaskedLM.from_pretrained('F:/datasets/pre-trained-roberta/KantaiBERT')


In [2]:
# Fill-mask pipeline: both the model and the tokenizer are loaded from the
# folder they were saved to.
from transformers import pipeline
fill_mask = pipeline(
    "fill-mask",
    model="F:/datasets/pre-trained-roberta/KantaiBERT", 
    tokenizer="F:/datasets/pre-trained-roberta/KantaiBERT"
)

In [None]:
# Ask our model to think like Immanuel Kant: let it complete the masked final word of the prompt.
fill_mask("Immanuel Kant was a German philosopher who is a central figure in modern philosophy. In his writings, the term \"metaphysics\" refers to: <mask>")

In [3]:
# Ask the model for the masked adjective; the scored candidates are shown in the cell output.
fill_mask("Human thinking is a <mask> phenomenon.")

[{'score': 0.032270532101392746,
  'token': 610,
  'token_str': ' practical',
  'sequence': 'Human thinking is a practical phenomenon.'},
 {'score': 0.021122438833117485,
  'token': 468,
  'token_str': ' pure',
  'sequence': 'Human thinking is a pure phenomenon.'},
 {'score': 0.019380025565624237,
  'token': 589,
  'token_str': ' empirical',
  'sequence': 'Human thinking is a empirical phenomenon.'},
 {'score': 0.017984643578529358,
  'token': 483,
  'token_str': ' mere',
  'sequence': 'Human thinking is a mere phenomenon.'},
 {'score': 0.017538659274578094,
  'token': 758,
  'token_str': ' transcendental',
  'sequence': 'Human thinking is a transcendental phenomenon.'}]

In [None]:
# Same probe with a differently worded prompt.
fill_mask("The human mind is a <mask> phenomenon.")