In [1]:
import wget

In [2]:
# download file from url
wget.download('https://raw.githubusercontent.com/Denis2054/Transformers-for-NLP-2nd-Edition/master/Chapter04/kant.txt', 'D:/datasets/pre-trained-roberta/kant.txt')

'D:/datasets/pre-trained-roberta/kant.txt'

In [3]:
from pathlib import Path

In [4]:
from tokenizers import ByteLevelBPETokenizer

In [5]:
path = 'D:/datasets/pre-trained-roberta/kant.txt'

In [6]:
print(str(Path(".").glob("D:/datasets/pre-trained-roberta/*.txt")))

<generator object Path.glob at 0x000001AB05703990>


In [7]:
tokenizer = ByteLevelBPETokenizer()

In [8]:
tokenizer.train(files=path, vocab_size=52_000, min_frequency=2, special_tokens=[ "<s>", "<pad>", "</s>", "<unk>", "<mask>" ])

In [9]:
tokenizer.save_model("D:\datasets\pre-trained-roberta\KantaiBERT")

['D:\\datasets\\pre-trained-roberta\\KantaiBERT\\vocab.json',
 'D:\\datasets\\pre-trained-roberta\\KantaiBERT\\merges.txt']

In [10]:
from tokenizers.implementations import ByteLevelBPETokenizer

In [11]:
from tokenizers.processors import BertProcessing

In [12]:
tokenizer = ByteLevelBPETokenizer('D:/datasets/pre-trained-roberta/KantaiBERT/vocab.json', 'D:/datasets/pre-trained-roberta/KantaiBERT/merges.txt')

In [13]:
# let's test the tokenizer
tokenizer.encode("Hi, everyone! How you are enjoying this presentation!").tokens

['H',
 'i',
 ',',
 'Ġeveryone',
 '!',
 'ĠHow',
 'Ġyou',
 'Ġare',
 'Ġenjoying',
 'Ġthis',
 'Ġpresentation',
 '!']

In [14]:
# see number of tokens
tokenizer.encode("Hello, everyone! How you are enjoying this presentation!")

Encoding(num_tokens=13, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [15]:
# add start and end tokens
tokenizer._tokenizer.post_processor = BertProcessing(("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),)


In [16]:
tokenizer.enable_truncation(max_length=512)

In [17]:
# lets encode the sentence
tokenizer.encode("Hello, everyone! How you are enjoying this presentation!")

Encoding(num_tokens=15, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [18]:
tokenizer.encode("Hello, everyone! How you are enjoying this presentation!").tokens

['<s>',
 'H',
 'ell',
 'o',
 ',',
 'Ġeveryone',
 '!',
 'ĠHow',
 'Ġyou',
 'Ġare',
 'Ġenjoying',
 'Ġthis',
 'Ġpresentation',
 '!',
 '</s>']

In [19]:
import torch
# check specs of available GPU
torch.cuda.get_device_name(0)

'NVIDIA RTX A6000'

In [20]:
# Compute Unified Device Architecture (CUDA) was developed by NVIDIA to use the parallel computing power of GPUs for general purpose computing.
torch.cuda.is_available()


True

In [21]:
from transformers import RobertaConfig


In [22]:
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [23]:
from transformers import RobertaTokenizer

In [24]:
tokenizer = RobertaTokenizer.from_pretrained('D:/datasets/pre-trained-roberta/KantaiBERT', max_len=512)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


In [25]:
from transformers import RobertaForMaskedLM

In [26]:
model = RobertaForMaskedLM(config=config)

In [27]:
print(model.config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}



In [28]:
# build dataset
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path='D:/datasets/pre-trained-roberta/kant.txt',
    block_size=128,
)



In [29]:
# COLATOR
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [30]:
# trainer
from transformers import Trainer, TrainingArguments

In [31]:
training_args = TrainingArguments(
    output_dir='D:/datasets/pre-trained-roberta/KantaiBERT',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
)

In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [33]:
#pretraining the model

trainer.train()



  0%|          | 0/26720 [00:00<?, ?it/s]



{'loss': 6.5853, 'learning_rate': 4.906437125748503e-05, 'epoch': 0.19}
{'loss': 5.6499, 'learning_rate': 4.812874251497006e-05, 'epoch': 0.37}
{'loss': 5.0541, 'learning_rate': 4.7193113772455094e-05, 'epoch': 0.56}
{'loss': 4.7111, 'learning_rate': 4.625748502994012e-05, 'epoch': 0.75}
{'loss': 4.4615, 'learning_rate': 4.532185628742515e-05, 'epoch': 0.94}
{'loss': 4.2873, 'learning_rate': 4.438622754491018e-05, 'epoch': 1.12}
{'loss': 4.1694, 'learning_rate': 4.345059880239521e-05, 'epoch': 1.31}
{'loss': 4.0066, 'learning_rate': 4.251497005988024e-05, 'epoch': 1.5}
{'loss': 3.9312, 'learning_rate': 4.157934131736527e-05, 'epoch': 1.68}
{'loss': 3.8512, 'learning_rate': 4.06437125748503e-05, 'epoch': 1.87}
{'loss': 3.7586, 'learning_rate': 3.970808383233533e-05, 'epoch': 2.06}
{'loss': 3.6657, 'learning_rate': 3.877245508982036e-05, 'epoch': 2.25}
{'loss': 3.6165, 'learning_rate': 3.783682634730539e-05, 'epoch': 2.43}
{'loss': 3.5435, 'learning_rate': 3.6901197604790425e-05, 'epoch'



{'loss': 3.2041, 'learning_rate': 3.035179640718563e-05, 'epoch': 3.93}
{'loss': 3.1774, 'learning_rate': 2.9416167664670658e-05, 'epoch': 4.12}
{'loss': 3.1393, 'learning_rate': 2.8480538922155693e-05, 'epoch': 4.3}
{'loss': 3.0893, 'learning_rate': 2.754491017964072e-05, 'epoch': 4.49}
{'loss': 3.0979, 'learning_rate': 2.660928143712575e-05, 'epoch': 4.68}
{'loss': 3.0665, 'learning_rate': 2.5673652694610778e-05, 'epoch': 4.87}
{'loss': 3.0426, 'learning_rate': 2.473802395209581e-05, 'epoch': 5.05}
{'loss': 2.9744, 'learning_rate': 2.3802395209580838e-05, 'epoch': 5.24}
{'loss': 2.9572, 'learning_rate': 2.286676646706587e-05, 'epoch': 5.43}
{'loss': 2.9397, 'learning_rate': 2.1931137724550898e-05, 'epoch': 5.61}
{'loss': 2.9151, 'learning_rate': 2.099550898203593e-05, 'epoch': 5.8}
{'loss': 2.906, 'learning_rate': 2.0059880239520957e-05, 'epoch': 5.99}
{'loss': 2.8419, 'learning_rate': 1.912425149700599e-05, 'epoch': 6.18}
{'loss': 2.8293, 'learning_rate': 1.818862275449102e-05, 'epo



{'loss': 2.7237, 'learning_rate': 1.1639221556886227e-05, 'epoch': 7.67}
{'loss': 2.6915, 'learning_rate': 1.0703592814371257e-05, 'epoch': 7.86}
{'loss': 2.676, 'learning_rate': 9.767964071856289e-06, 'epoch': 8.05}
{'loss': 2.6774, 'learning_rate': 8.832335329341319e-06, 'epoch': 8.23}
{'loss': 2.6576, 'learning_rate': 7.896706586826349e-06, 'epoch': 8.42}
{'loss': 2.6603, 'learning_rate': 6.961077844311377e-06, 'epoch': 8.61}
{'loss': 2.6357, 'learning_rate': 6.0254491017964076e-06, 'epoch': 8.79}
{'loss': 2.6322, 'learning_rate': 5.0898203592814375e-06, 'epoch': 8.98}
{'loss': 2.6148, 'learning_rate': 4.1541916167664675e-06, 'epoch': 9.17}
{'loss': 2.6155, 'learning_rate': 3.218562874251497e-06, 'epoch': 9.36}
{'loss': 2.5936, 'learning_rate': 2.282934131736527e-06, 'epoch': 9.54}
{'loss': 2.6005, 'learning_rate': 1.3473053892215569e-06, 'epoch': 9.73}
{'loss': 2.5991, 'learning_rate': 4.116766467065869e-07, 'epoch': 9.92}
{'train_runtime': 5402.8054, 'train_samples_per_second': 31

TrainOutput(global_step=26720, training_loss=3.2903394824730423, metrics={'train_runtime': 5402.8054, 'train_samples_per_second': 316.436, 'train_steps_per_second': 4.946, 'train_loss': 3.2903394824730423, 'epoch': 10.0})

In [34]:
# save model
trainer.save_model('D:/datasets/pre-trained-roberta/KantaiBERT')

In [35]:
# fill mask pipeline
from transformers import pipeline
fill_mask = pipeline(
    "fill-mask",
    model="D:/datasets/pre-trained-roberta/KantaiBERT", 
    tokenizer="D:/datasets/pre-trained-roberta/KantaiBERT"
)

In [36]:
# ask our model to think like immanuel Kant
fill_mask("Immanuel Kant was a German philosopher who is a central figure in modern philosophy. In his writings, the term \"metaphysics\" refers to: <mask>")

[{'score': 0.3568747639656067,
  'token': 16,
  'token_str': ',',
  'sequence': 'Immanuel Kant was a German philosopher who is a central figure in modern philosophy. In his writings, the term "metaphysics" refers to:,'},
 {'score': 0.05340297147631645,
  'token': 30,
  'token_str': ':',
  'sequence': 'Immanuel Kant was a German philosopher who is a central figure in modern philosophy. In his writings, the term "metaphysics" refers to::'},
 {'score': 0.03968842700123787,
  'token': 263,
  'token_str': ' a',
  'sequence': 'Immanuel Kant was a German philosopher who is a central figure in modern philosophy. In his writings, the term "metaphysics" refers to: a'},
 {'score': 0.03458130732178688,
  'token': 17,
  'token_str': '-',
  'sequence': 'Immanuel Kant was a German philosopher who is a central figure in modern philosophy. In his writings, the term "metaphysics" refers to:-'},
 {'score': 0.023171335458755493,
  'token': 18,
  'token_str': '.',
  'sequence': 'Immanuel Kant was a German 

In [37]:
fill_mask("Human thinking is a <mask> phenomenon.")

[{'score': 0.13386596739292145,
  'token': 483,
  'token_str': ' mere',
  'sequence': 'Human thinking is a mere phenomenon.'},
 {'score': 0.08646190166473389,
  'token': 629,
  'token_str': ' given',
  'sequence': 'Human thinking is a given phenomenon.'},
 {'score': 0.07142633944749832,
  'token': 973,
  'token_str': ' real',
  'sequence': 'Human thinking is a real phenomenon.'},
 {'score': 0.05899018421769142,
  'token': 1393,
  'token_str': ' simple',
  'sequence': 'Human thinking is a simple phenomenon.'},
 {'score': 0.030353426933288574,
  'token': 468,
  'token_str': ' pure',
  'sequence': 'Human thinking is a pure phenomenon.'}]

In [38]:
fill_mask("The human mind is a <mask> phenomenon.")

[{'score': 0.07098536938428879,
  'token': 692,
  'token_str': ' necessary',
  'sequence': 'The human mind is a necessary phenomenon.'},
 {'score': 0.05616537481546402,
  'token': 1393,
  'token_str': ' simple',
  'sequence': 'The human mind is a simple phenomenon.'},
 {'score': 0.0540444515645504,
  'token': 483,
  'token_str': ' mere',
  'sequence': 'The human mind is a mere phenomenon.'},
 {'score': 0.05046757683157921,
  'token': 756,
  'token_str': ' certain',
  'sequence': 'The human mind is a certain phenomenon.'},
 {'score': 0.04848719388246536,
  'token': 2228,
  'token_str': ' single',
  'sequence': 'The human mind is a single phenomenon.'}]

In [39]:
fill_mask("The good or bad consequences arising from the performance of an <mask> action")

[{'score': 0.07824818044900894,
  'token': 663,
  'token_str': ' same',
  'sequence': 'The good or bad consequences arising from the performance of an same action'},
 {'score': 0.02895955927670002,
  'token': 774,
  'token_str': ' end',
  'sequence': 'The good or bad consequences arising from the performance of an end action'},
 {'score': 0.022305356338620186,
  'token': 394,
  'token_str': ' object',
  'sequence': 'The good or bad consequences arising from the performance of an object action'},
 {'score': 0.02161231078207493,
  'token': 1465,
  'token_str': ' infinite',
  'sequence': 'The good or bad consequences arising from the performance of an infinite action'},
 {'score': 0.02143501304090023,
  'token': 2114,
  'token_str': ' actual',
  'sequence': 'The good or bad consequences arising from the performance of an actual action'}]