https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb#scrollTo=YpvnFFmZJD-N

https://zablo.net/blog/post/training-roberta-from-scratch-the-missing-guide-polish-language-model/

https://huggingface.co/blog/how-to-train

https://blog.naver.com/PostView.naver?blogId=sooftware&logNo=222494375953&parentCategoryNo=&categoryNo=13&viewDate=&isShowPopularPosts=false&from=postView

In [1]:
import os
import datasets

from transformers import AlbertConfig
from tokenizers import CharBPETokenizer
from tokenizers.processors import RobertaProcessing
from tokenizers import SentencePieceBPETokenizer
from tokenizers import BertWordPieceTokenizer
from transformers import AlbertForMaskedLM

In [11]:
data_path = './korean/'
data_path = './trainable_corpus/'
data_files = [os.path.join(data_path, path) for path in os.listdir(data_path)]

# Get Ready

In [2]:
tokenizer = BertWordPieceTokenizer(handle_chinese_chars=False)

In [4]:
tokenizer.train(files=data_files, vocab_size=10000, min_frequency=2)

In [5]:
# tokenizer.train(files=data_files[0], vocab_size=10000, min_frequency=2, special_tokens=[
#     "<s>",
#     "<pad>",
#     "</s>",
#     "<unk>",
#     "<mask>",
# ])

In [5]:
os.makedirs("tk", exist_ok=True)

In [6]:
tokenizer.save_model("tk")

['tk/vocab.txt']

In [7]:
tokenizer._tokenizer.post_processor = RobertaProcessing(
    ('[SEP]', tokenizer.token_to_id('[SEP]')), 
    ('[CLS]', tokenizer.token_to_id('[CLS]')))
tokenizer.enable_truncation(max_length=512)

In [8]:
tokenizer.save('tk/tokenizer.json')

In [9]:
# tokenizer._tokenizer.post_processor = RobertaProcessing(
#     ("</s>", tokenizer.token_to_id("</s>")),
#     ("<s>", tokenizer.token_to_id("<s>")),
# )
# tokenizer.enable_truncation(max_length=512)

In [12]:
tokenizer.encode('안녕하세요. 저는 이은식입니다. 만나서 반갑습니다. 오늘 회의는 8시 대회의실에서 개최됩니다.').tokens

['[CLS]',
 '안',
 '##녕',
 '##하',
 '##세',
 '##요',
 '.',
 '저',
 '##는',
 '이',
 '##은',
 '##식',
 '##입',
 '##니다',
 '.',
 '만나',
 '##서',
 '반',
 '##갑',
 '##습',
 '##니다',
 '.',
 '오늘',
 '회의',
 '##는',
 '8',
 '##시',
 '대회',
 '##의',
 '##실',
 '##에',
 '##서',
 '개최',
 '##되',
 '##ᆸ니다',
 '.',
 '[SEP]']

In [13]:
dataset = datasets.load_dataset('text', data_files=data_files, split='train')

Using custom data configuration default-ef8b9a452d4493ad


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-ef8b9a452d4493ad/0.0.0/d86c40dad297bdddf277b406c6a59f0250b5318c400bf23d420a31aff88c84c4...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-ef8b9a452d4493ad/0.0.0/d86c40dad297bdddf277b406c6a59f0250b5318c400bf23d420a31aff88c84c4. Subsequent calls will reuse this data.


In [14]:
%%time
def encode(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
dataset = dataset.map(encode, batched=True, batch_size=1000, num_proc=8)

CPU times: user 235 ms, sys: 88.1 ms, total: 323 ms
Wall time: 1min 21s


In [15]:
os.makedirs('dataset_v2')
dataset.save_to_disk('dataset_v2')

In [35]:
import numpy as np
max([np.count_nonzero(i.values.to_numpy()) for i in dataset.data['input_ids']])

# EDA

In [3]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file='tk/tokenizer.json',
    model_max_length=512,
    unk_token='[UNK]',
    sep_token='[SEP]',
    pad_token='[PAD]',
    cls_token='[CLS]',
    mask_token='[MASK]'
)

In [4]:
dataset = datasets.load_from_disk('dataset')

In [5]:
# dataset.remove_columns('text')

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4213106
})

In [15]:
tokenizer.decode()

PreTrainedTokenizerFast(name_or_path='', vocab_size=10000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [None]:
len(dataset['input_ids'])

In [5]:
dataset

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4213106
})

In [4]:
import re
import os

In [5]:
def line_preprocess(line):
    line = line.strip()
    line = line.replace('\n\n', '\n').replace('&lt;', '').replace('&gt;', '')
    line = re.sub('<.*?>', '', line)
    line = re.sub('\(.*?\)', '', line)
    hangul = re.compile('[^ ,.!?0-9a-zA-Zㄱ-ㅎ가-힣]')
    line = hangul.sub('', line)
    line = line.strip()
    return line

In [6]:
os.makedirs('trainable_corpus', exist_ok=True)

In [7]:
input_files = [path for path in os.listdir('../../data/text')]

In [8]:
data_path = '../../data/text'
corpus_path = './trainable_corpus/'
for i in input_files:
    path = os.path.join(data_path, i)
    files = os.listdir(path)
    os.makedirs(os.path.join(corpus_path, i), exist_ok=True)
    for j in files:
        with open(os.path.join(data_path, i, j)) as fr:
            with open(os.path.join(corpus_path, i, j), 'w') as fw:
                paragraphs = []
                paragraph = ""
                for line in fr.readlines():
                    if '<doc id' in line:
                        paragraph = ""
                        continue

                    if '</doc>' in line:
                        paragraphs.append(paragraph)
                        continue

                    line = line_preprocess(line)
                    if not line:
                        continue

                    paragraph += line


                for paragraph in paragraphs:
                    texts = paragraph.split(".")

                    new_texts = []
                    tmp = ""
                    for text in texts:
                        text = text.strip()
                        tokenized_text = tokenizer.tokenize(text)
                        if len(tmp) + len(tokenized_text) > 510:
                            new_texts.append(tmp.rstrip())
                            tmp = ""
                        tmp += text + '. '


                    fw.write('\n'.join(new_texts))
                
os.chdir(corpus_path)
for i in os.listdir():
    os.chdir(i)
    os.system('cat wiki_* > merged_' + i)
    os.system('mv merged_'+i+' ../')
    os.chdir('../')
    os.system('rm -rf '+i)   

Token indices sequence length is longer than the specified maximum sequence length for this model (769 > 512). Running this sequence through the model will result in indexing errors


In [None]:
input_files = [path for path in os.listdir('../../data/preprocessed/') if 'txt' in path]

os.makedirs('korean', exist_ok=True)
hangul = re.compile('[^ ,.!?0-9a-zA-Zㄱ-ㅎ가-힣]')
for file in input_files:
    with open('../../data/preprocessed/' + file) as fr:
        with open('korean/'+file, 'w') as fw:
            for line in fr.readlines():
                if not line.strip():
                    continue
                line = hangul.sub('', line)
                line += '\n'
                fw.write(line)

In [17]:
with open('../../data/text/AA/wiki_00') as f:
    paragraphs = []
    paragraph = ""
    for line in f.readlines():
        if '<doc id' in line:
            paragraph = ""
            continue
        
        if '</doc>' in line:
            paragraphs.append(paragraph)
            continue
        
        line = line_preprocess(line)
        if not line:
            continue
        
        paragraph += line

In [21]:
texts = paragraphs[0].split(".")

In [23]:
tokenizer.tokenize(texts[0])

['지미',
 '카',
 '##터',
 '##제',
 '##임스',
 '얼',
 '카',
 '##터',
 '주니어',
 '##는',
 '민주당',
 '출신',
 '미국',
 '39',
 '##대',
 '대통령',
 '이',
 '##며',
 ',',
 '독재',
 '##자',
 '##의',
 '사신',
 '##이',
 '##라',
 '##는',
 '별명',
 '##을',
 '가지',
 '##고',
 '있',
 '##다']

In [27]:
new_texts = []
tmp = ""
for text in texts:
    text = text.strip()
    tokenized_text = tokenizer.tokenize(text)
    if len(tmp) + len(tokenized_text) > 510:
        new_texts.append(tmp.rstrip())
        tmp = ""
    tmp += text + '. '

In [28]:
new_texts[0]

'지미 카터제임스 얼 카터 주니어는 민주당 출신 미국 39대 대통령 이며, 독재자의 사신이라는 별명을 가지고 있다. 생애. 어린 시절. 지미 카터는 조지아주 섬터 카운티 플레인스 마을에서 태어났다. 조지아 공과대학교를 졸업하였다. 그 후 해군에 들어가 전함·원자력·잠수함의 승무원으로 일하였다. 1953년 미국 해군 대위로 예편하였고 이후 땅콩·면화 등을 가꿔 많은 돈을 벌었다. 그의 별명이 "땅콩 농부" 로 알려졌다. 정계 입문. 1962년 조지아 주 상원 의원 선거에서 낙선하나 그 선거가 부정선거 였음을 입증하게 되어 당선되고, 1966년 조지아 주지사 선거에 낙선하지만, 1970년 조지아 주지사를 역임했다. 대통령이 되기 전 조지아주 상원의원을 두번 연임했으며, 1971년부터 1975년까지 조지아 지사로 근무했다. 조지아 주지사로 지내면서, 미국에 사는 흑인 등용법을 내세웠다. 대통령 재임. 1976년 미합중국 제39대 대통령 선거에 민주당 후보로 출마하여 도덕주의 정책으로 내세워서, 많은 지지를 받고 제럴드 포드 대통령을 누르고 당선되었다.'

# Jump

In [3]:
config = AlbertConfig(
    vocab_size=10000,
    embedding_size=128,
    hidden_size=768,
    num_hidden_layers=3,
    num_hidden_groups=1,
    num_attention_heads=8,
    intermediate_size=768*4,
    inner_group_num=1,
    hidden_act='gelu_new',
    hidden_dropout_prob=0,
    attention_probs_dropout_prob=0,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    classifier_dropout_prob=0.1,
    position_embedding_type='absolute'
)

In [4]:
model = AlbertForMaskedLM(config=config)

In [5]:
model.num_parameters()

8641680

In [6]:
# from transformers import LineByLineWithSOPTextDataset
# dataset = LineByLineWithSOPTextDataset(tokenizer=tokenizer, file_dir=data_path, block_size=128)
# tokenizer.mask_token = '[MASK]'

In [7]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file='tk/tokenizer.json',
    model_max_length=512,
    unk_token='[UNK]',
    sep_token='[SEP]',
    pad_token='[PAD]',
    cls_token='[CLS]',
    mask_token='[MASK]'
)
# tokenizer.save_pretrained('SAVE_DIR')

In [8]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [9]:
tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=10000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [10]:
import datasets
dataset = datasets.load_from_disk('dataset')

In [11]:
os.makedirs("results", exist_ok=True)

In [12]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=True,
    gradient_accumulation_steps=8,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator
)

Using amp fp16 backend


In [13]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `AlbertForMaskedLM.forward` and have been ignored: text.
***** Running training *****
  Num examples = 4213106
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 8
  Total optimization steps = 32915


Step,Training Loss
500,7.9959
1000,7.0319
1500,6.9063


KeyboardInterrupt: 

In [15]:
del trainer

NameError: name 'trainer' is not defined

https://huggingface.co/docs/datasets/master/en/loading#text-files