In [None]:
from transformers import PreTrainedTokenizerFast

In [None]:
# Reload a previously exported fast tokenizer from its save directory.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    './model_save/tokenizer',
)

In [None]:
# Display the size of the loaded tokenizer as the cell output.
len(tokenizer)

# 1. 训练tokenizer（可选）

In [2]:
import os
from itertools import islice
from typing import Iterator, List, Optional

import tokenizers
from rich import progress
from tokenizers import Tokenizer, decoders
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC
from tokenizers.pre_tokenizers import Punctuation, Digits, Metaspace, ByteLevel
from tokenizers.trainers import BpeTrainer

# 2. 定义tokenizer训练语料来源

In [3]:
# Plain-text corpus the tokenizer is trained on.
# (The name keeps its historical spelling because other cells reference it.)
cropus_file = './data/wiki.simple.txt'

# Where the trained `tokenizers.Tokenizer` is serialized as JSON.
tokenizer_save_path = './model_save/hf_bpe_tokenizer.json'

# 3. 训练tokenizer的函数
`get_training_corpus`函数将多个短句子拼接成长度不小于`chunk_len=2048`的长句子，每次迭代返回`buffer_size=1000`个这样的长句子

In [None]:
def train_my_huggingface_wiki_tokenizer(max_train_line: Optional[int]=None, token_type: str='char') -> None:
    '''
    Train a BPE tokenizer with the Hugging Face `tokenizers` library on the
    corpus at `cropus_file` and save it as JSON to `tokenizer_save_path`.

    Args:
        max_train_line: maximum number of corpus lines to read. `None` (the
            default) reads the whole file.
        token_type: `'char'` builds a character-level BPE with NFKC
            normalization and Punctuation/Digits/Metaspace pre-tokenization;
            `'byte'` builds a byte-level BPE.

    Raises:
        ValueError: if `token_type` is neither `'char'` nor `'byte'`.

    Original author's note: needs at least 32 GB of RAM and takes roughly
    half an hour to run.
    '''

    # Negative limits are clamped to 0 so that islice() never raises.
    line_limit = None if max_train_line is None else max(int(max_train_line), 0)

    def get_training_corpus(buffer_size: int=1000, chunk_len: int=2048) -> Iterator[List[str]]:
        '''
        Yield batches of text chunks read from `cropus_file`.

        Consecutive lines are joined into one chunk until the chunk holds at
        least `chunk_len` characters; chunks are handed out in lists of
        `buffer_size`. The final (possibly shorter) chunk and batch are
        yielded as well, so no text is dropped.
        '''
        buffer = []
        with open(cropus_file, 'r', encoding='utf-8') as f_read:
            cur_chunk_txt, txt_len = [], 0

            # islice(f, None) walks the whole file; islice(f, k) stops after
            # exactly k lines.
            for line in islice(f_read, line_limit):
                cur_chunk_txt.append(line)
                txt_len += len(line)

                if txt_len >= chunk_len:
                    buffer.append(''.join(cur_chunk_txt))
                    cur_chunk_txt, txt_len = [], 0

                    if len(buffer) >= buffer_size:
                        yield buffer
                        buffer = []

            # Flush the trailing partial chunk instead of discarding it.
            if cur_chunk_txt:
                buffer.append(''.join(cur_chunk_txt))

            # Flush the last, possibly incomplete, batch.
            if buffer:
                yield buffer

    special_tokens = ["[PAD]","[EOS]","[SEP]","[BOS]", "[CLS]", "[MASK]", "[UNK]"]
    trainer_kwargs = dict(
        vocab_size=40960,
        min_frequency=100,
        show_progress=True,
        special_tokens=special_tokens,
    )

    if token_type == 'char':
        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

        # NFKC compatibility normalization folds equivalent code points
        # together, e.g. a full-width "A" becomes a half-width "A".
        tokenizer.normalizer = tokenizers.normalizers.Sequence([NFKC()])

        # Pre-split on punctuation, single digits and Metaspace (without
        # Metaspace the decoded text would lose its spaces).
        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
            [Punctuation(), Digits(individual_digits=True), Metaspace()]
        )

        tokenizer.add_special_tokens(special_tokens)
        tokenizer.decoder = decoders.Metaspace()
    elif token_type == 'byte':
        # Byte-level BPE does not need an unk_token.
        tokenizer = Tokenizer(BPE())
        tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=True)

        tokenizer.add_special_tokens(special_tokens)
        tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, use_regex=True)
        tokenizer.post_processor = tokenizers.processors.ByteLevel(trim_offsets=False)

        # Seed the vocabulary with every byte symbol. Without this, a byte
        # that never occurs in the corpus has no token, and since there is no
        # unk_token it cannot be represented at encode time.
        trainer_kwargs['initial_alphabet'] = tokenizers.pre_tokenizers.ByteLevel.alphabet()
    else:
        raise ValueError('token type must be `char` or `byte`')

    # Create the output directory before the long training run so a bad path
    # fails early rather than after training has finished.
    save_dir = os.path.dirname(tokenizer_save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    trainer = BpeTrainer(**trainer_kwargs)
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

    # Make sure tab and newline exist as tokens of their own.
    vocab = tokenizer.get_vocab()
    missing_whitespace = [tok for tok in ('\t', '\n') if tok not in vocab]
    if missing_whitespace:
        tokenizer.add_tokens(missing_whitespace)

    tokenizer.save(tokenizer_save_path)

# 4. 开始训练tokenizer
1亿个字符至少需要`32G`内存（其实`32G`还是不太够，会频繁触发swap），CPU`13600k`训练时长大概1个小时。

In [None]:
# Train the byte-level BPE variant on the full corpus (no line limit) and
# write the result to `tokenizer_save_path`.
train_my_huggingface_wiki_tokenizer(token_type='byte')

# 5. 将训练的tokenizer转换为PreTrainedTokenizerFast并保存
转换是为了方便作为`AutoTokenizer`传到其他`huggingface`组件使用。

转换时要手动指定`pad_token`、`eos_token`等特殊token，因为它不知道你原来的tokenizer中哪些token是这些特殊token

In [None]:
# Load the raw `tokenizers.Tokenizer` that training wrote to disk.
slow_tokenizer = Tokenizer.from_file(tokenizer_save_path)

# The wrapper has to be told explicitly which vocabulary entries play each
# special-token role.
fast_tokenizer_special_tokens = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
    "bos_token": "[BOS]",
    "eos_token": "[EOS]",
}

tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=slow_tokenizer,
    **fast_tokenizer_special_tokens,
)

# Export in the `save_pretrained` directory layout.
tokenizer.save_pretrained('./model_save/fast_tokenizer/')