In [1]:
import wget

In [2]:
# download file from url
wget.download('https://raw.githubusercontent.com/Denis2054/Transformers-for-NLP-2nd-Edition/master/Chapter04/kant.txt', 'D:/datasets/pre-trained-roberta/kant.txt')

'D:/datasets/pre-trained-roberta/kant.txt'

In [2]:
from pathlib import Path

In [3]:
from tokenizers import ByteLevelBPETokenizer

In [4]:
path = 'D:/datasets/pre-trained-roberta/kant.txt'

In [8]:
print(str(Path(".").glob("D:/datasets/pre-trained-roberta/*.txt")))

<generator object Path.glob at 0x000001C229D0C3C0>


In [5]:
tokenizer = ByteLevelBPETokenizer()

In [6]:
tokenizer.train(files=path, vocab_size=52_000, min_frequency=2, special_tokens=[ "<s>", "<pad>", "</s>", "<unk>", "<mask>" ])

In [7]:
tokenizer.save_model("D:\datasets\pre-trained-roberta\KantaiBERT")

['D:\\datasets\\pre-trained-roberta\\KantaiBERT\\vocab.json',
 'D:\\datasets\\pre-trained-roberta\\KantaiBERT\\merges.txt']

In [8]:
from tokenizers.implementations import ByteLevelBPETokenizer

In [9]:
from tokenizers.processors import BertProcessing

In [10]:
tokenizer = ByteLevelBPETokenizer('D:/datasets/pre-trained-roberta/KantaiBERT/vocab.json', 'D:/datasets/pre-trained-roberta/KantaiBERT/merges.txt')

In [11]:
# let's test the tokenizer
tokenizer.encode("Hi, everyone! How you are enjoying this presentation!").tokens

['H',
 'i',
 ',',
 'Ġeveryone',
 '!',
 'ĠHow',
 'Ġyou',
 'Ġare',
 'Ġenjoying',
 'Ġthis',
 'Ġpresentation',
 '!']

In [12]:
# see number of tokens
tokenizer.encode("Hello, everyone! How you are enjoying this presentation!")

Encoding(num_tokens=13, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [13]:
# add start and end tokens
tokenizer._tokenizer.post_processor = BertProcessing(("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),)


In [14]:
tokenizer.enable_truncation(max_length=512)

In [15]:
# lets encode the sentence
tokenizer.encode("Hello, everyone! How you are enjoying this presentation!")

Encoding(num_tokens=15, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [16]:
tokenizer.encode("Hello, everyone! How you are enjoying this presentation!").tokens

['<s>',
 'H',
 'ell',
 'o',
 ',',
 'Ġeveryone',
 '!',
 'ĠHow',
 'Ġyou',
 'Ġare',
 'Ġenjoying',
 'Ġthis',
 'Ġpresentation',
 '!',
 '</s>']

In [17]:
import torch
# check specs of available GPU
torch.cuda.get_device_name(0)

'NVIDIA RTX A6000'

In [18]:
# Compute Unified Device Architecture (CUDA) was developed by NVIDIA to use the parallel computing power of GPUs for general purpose computing.
torch.cuda.is_available()


True

In [19]:
from transformers import RobertaConfig


In [20]:
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [21]:
from transformers import RobertaTokenizer

In [23]:
tokenizer = RobertaTokenizer.from_pretrained('D:/datasets/pre-trained-roberta/KantaiBERT', max_len=512)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


In [24]:
from transformers import RobertaForMaskedLM

In [25]:
model = RobertaForMaskedLM(config=config)

In [26]:
print(model.config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}



In [27]:
# build dataset
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path='D:/datasets/pre-trained-roberta/kant.txt',
    block_size=128,
)



In [28]:
# COLATOR
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [29]:
# trainer
from transformers import Trainer, TrainingArguments

In [30]:
training_args = TrainingArguments(
    output_dir='D:/datasets/pre-trained-roberta/KantaiBERT',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
)

In [31]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [34]:
#pretraining the model

trainer.train()



  0%|          | 0/2672 [00:00<?, ?it/s]



{'loss': 6.6144, 'learning_rate': 4.06437125748503e-05, 'epoch': 0.19}
{'loss': 5.7477, 'learning_rate': 3.12874251497006e-05, 'epoch': 0.37}
{'loss': 5.2769, 'learning_rate': 2.1931137724550898e-05, 'epoch': 0.56}
{'loss': 5.0147, 'learning_rate': 1.2574850299401197e-05, 'epoch': 0.75}
{'loss': 4.8651, 'learning_rate': 3.218562874251497e-06, 'epoch': 0.94}
{'train_runtime': 545.9491, 'train_samples_per_second': 313.15, 'train_steps_per_second': 4.894, 'train_loss': 5.45996922932699, 'epoch': 1.0}


TrainOutput(global_step=2672, training_loss=5.45996922932699, metrics={'train_runtime': 545.9491, 'train_samples_per_second': 313.15, 'train_steps_per_second': 4.894, 'train_loss': 5.45996922932699, 'epoch': 1.0})

In [35]:
# save model
trainer.save_model('D:/datasets/pre-trained-roberta/KantaiBERT')

In [36]:
# fill mask pipeline
from transformers import pipeline
fill_mask = pipeline(
    "fill-mask",
    model="D:/datasets/pre-trained-roberta/KantaiBERT", 
    tokenizer="D:/datasets/pre-trained-roberta/KantaiBERT"
)

In [38]:
# ask our model to think like immanuel Kant
fill_mask("Immanuel Kant was a German philosopher who is a central figure in modern philosophy. In his writings, the term \"metaphysics\" refers to: <mask>")

[{'score': 0.030743876472115517,
  'token': 16,
  'token_str': ',',
  'sequence': 'Immanuel Kant was a German philosopher who is a central figure in modern philosophy. In his writings, the term "metaphysics" refers to:,'},
 {'score': 0.010728741064667702,
  'token': 18,
  'token_str': '.',
  'sequence': 'Immanuel Kant was a German philosopher who is a central figure in modern philosophy. In his writings, the term "metaphysics" refers to:.'},
 {'score': 0.01028077770024538,
  'token': 339,
  'token_str': ' not',
  'sequence': 'Immanuel Kant was a German philosopher who is a central figure in modern philosophy. In his writings, the term "metaphysics" refers to: not'},
 {'score': 0.007070682942867279,
  'token': 470,
  'token_str': ' other',
  'sequence': 'Immanuel Kant was a German philosopher who is a central figure in modern philosophy. In his writings, the term "metaphysics" refers to: other'},
 {'score': 0.006622942630201578,
  'token': 322,
  'token_str': ' as',
  'sequence': 'Imman

In [39]:
fill_mask("Human thinking is a <mask> phenomenon.")

[{'score': 0.01841389574110508,
  'token': 610,
  'token_str': ' practical',
  'sequence': 'Human thinking is a practical phenomenon.'},
 {'score': 0.014414318837225437,
  'token': 666,
  'token_str': ' moral',
  'sequence': 'Human thinking is a moral phenomenon.'},
 {'score': 0.013597640208899975,
  'token': 468,
  'token_str': ' pure',
  'sequence': 'Human thinking is a pure phenomenon.'},
 {'score': 0.009760517627000809,
  'token': 569,
  'token_str': ' possible',
  'sequence': 'Human thinking is a possible phenomenon.'},
 {'score': 0.008659623563289642,
  'token': 758,
  'token_str': ' transcendental',
  'sequence': 'Human thinking is a transcendental phenomenon.'}]