https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb#scrollTo=YpvnFFmZJD-N

https://zablo.net/blog/post/training-roberta-from-scratch-the-missing-guide-polish-language-model/

https://huggingface.co/blog/how-to-train

https://blog.naver.com/PostView.naver?blogId=sooftware&logNo=222494375953&parentCategoryNo=&categoryNo=13&viewDate=&isShowPopularPosts=false&from=postView

In [11]:
import os
import datasets

from transformers import AlbertConfig
from tokenizers import CharBPETokenizer
from tokenizers.processors import RobertaProcessing
from tokenizers import SentencePieceBPETokenizer
from tokenizers import BertWordPieceTokenizer
from transformers import AlbertForMaskedLM

In [2]:
tokenizer = BertWordPieceTokenizer(handle_chinese_chars=False)

In [3]:
data_path = '../data/preprocessed/'
data_files = [os.path.join(data_path, path) for path in os.listdir(data_path)]

In [4]:
tokenizer.train(files=data_files, vocab_size=10000, min_frequency=2)

In [5]:
# tokenizer.train(files=data_files[0], vocab_size=10000, min_frequency=2, special_tokens=[
#     "<s>",
#     "<pad>",
#     "</s>",
#     "<unk>",
#     "<mask>",
# ])

In [6]:
os.makedirs("tk", exist_ok=True)

In [7]:
tokenizer.save_model("tk")

['tk/vocab.txt']

In [8]:
tokenizer._tokenizer.post_processor = RobertaProcessing(
    ('[SEP]', tokenizer.token_to_id('[SEP]')), 
    ('[CLS]', tokenizer.token_to_id('[CLS]')))
tokenizer.enable_truncation(max_length=512)

In [9]:
tokenizer.save('tk/tokenizer.json')

In [54]:
# tokenizer._tokenizer.post_processor = RobertaProcessing(
#     ("</s>", tokenizer.token_to_id("</s>")),
#     ("<s>", tokenizer.token_to_id("<s>")),
# )
# tokenizer.enable_truncation(max_length=512)

In [10]:
tokenizer.encode('안녕하세요. 저는 이은식입니다. 만나서 반갑습니다.').tokens

['[CLS]',
 '안',
 '##녕',
 '##하',
 '##세',
 '##요',
 '.',
 '저',
 '##는',
 '이',
 '##은',
 '##식',
 '##입',
 '##니다',
 '.',
 '만나',
 '##서',
 '반',
 '##갑',
 '##습',
 '##니다',
 '.',
 '[SEP]']

In [13]:
config = AlbertConfig(
    vocab_size=10000,
    embedding_size=128,
    hidden_size=768,
    num_hidden_layers=4,
    num_hidden_groups=1,
    num_attention_heads=12,
    intermediate_size=768*4,
    inner_group_num=1,
    hidden_act='gelu_new',
    hidden_dropout_prob=0,
    attention_probs_dropout_prob=0,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    classifier_dropout_prob=0.1,
    position_embedding_type='absolute'
)

In [14]:
model = AlbertForMaskedLM(config=config)

In [15]:
model.num_parameters()

8641680

In [17]:
# from transformers import LineByLineWithSOPTextDataset

In [18]:
# dataset = LineByLineWithSOPTextDataset(tokenizer=tokenizer, file_dir=data_path, block_size=128)

In [16]:
tokenizer

Tokenizer(vocabulary_size=10000, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=False, strip_accents=None, lowercase=True, wordpieces_prefix=##)

In [29]:
tokenizer.mask_token = '[MASK]'

In [30]:
# from transformers import DataCollatorForLanguageModeling

# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer, mlm=True, mlm_probability=0.15
# )

In [17]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file='tk/tokenizer.json',
    model_max_length=512,
    unk_token='[UNK]',
    sep_token='[SEP]',
    pad_token='[PAD]',
    cls_token='[CLS]',
    mask_token='[MASK]'
)
# tokenizer.save_pretrained('SAVE_DIR')

In [18]:
tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=10000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [34]:
dataset = datasets.load_dataset('text', data_files=data_files, split='train')

Using custom data configuration default-9bff3161a5c9f6fe
Reusing dataset text (/root/.cache/huggingface/datasets/text/default-9bff3161a5c9f6fe/0.0.0/d86c40dad297bdddf277b406c6a59f0250b5318c400bf23d420a31aff88c84c4)


In [35]:
%%time
def encode(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=150)
dataset = dataset.map(encode, batched=True, batch_size=2000, num_proc=8)

CPU times: user 2.07 s, sys: 509 ms, total: 2.58 s
Wall time: 5min 38s


In [52]:
dataset.data['input_ids'][151]

<pyarrow.ListScalar: [2, 3076, 2235, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]>

In [37]:
os.makedirs("results", exist_ok=True)

In [38]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

In [39]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `AlbertForMaskedLM.forward` and have been ignored: text.
***** Running training *****
  Num examples = 10071586
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 629475


KeyError: 'loss'

https://huggingface.co/docs/datasets/master/en/loading#text-files

In [75]:
dataset

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10071586
})