# 这里演示三种获取tokenizer的方法

+ 现有词表构建tokenizer
+ BPE训练tokenizer
+ WordPiece训练tokenizer

## 基于现有词表进行构建

In [1]:
import pandas as pd

In [None]:
# Character table used as the base vocabulary, downloaded from:
# https://github.com/bedlate/cn-corpus/blob/master/现代汉语常用字表.xls
# skiprows=4 drops the first four rows of the sheet before parsing.
xls_path = r"C:/Users/Administrator/Downloads/现代汉语常用字表.xls"
data = pd.read_excel(xls_path, skiprows=4)
data.head()

In [None]:
# Build the two lookup tables (id -> token and token -> id) for a
# character-level vocabulary. Id 0 is reserved for the padding token.
# NOTE(review): assumes no row of the table uses ID 0 — confirm against the sheet.
id2label = {0: "[PAD]"}
label2id = {"[PAD]": 0}

# One entry per row of the character table: the "ID" column is the token id,
# the "汉字" column is the character itself.
for _, row in data.iterrows():
    id_ = row["ID"]
    label = row["汉字"]
    id2label[id_] = label
    label2id[label] = id_

# Continue numbering after the largest id used so far. Taking the maximum
# (instead of the last row's ID) also works for an empty or unsorted table.
index = max(id2label)

# Special tokens come right after the character table.
for special_token in ("[UNK]", "[BOS]", "[EOS]"):
    index += 1
    id2label[index] = special_token
    label2id[special_token] = index

# Extra symbols added by hand, one vocabulary entry per character.
# Note: a Chinese ellipsis is typed as two characters; it has to be stored
# here as the single character "…".
# The backslash is written as "\\" so the literal contains exactly one
# backslash followed by "#" without relying on an invalid escape sequence.
# NOTE(review): the digit "0" is not in this list — extend as needed.
sign = "。，？《》；“”：、…！123456789,.?/;'\"!~()（）*&^￥@|\\#ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
for item in sign:
    index += 1
    id2label[index] = item
    label2id[item] = index


# Both tables should report the same size; a mismatch means a duplicate
# token or id was inserted.
print(id2label)
print(len(id2label))
print(label2id)
print(len(label2id))

In [16]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import BertNormalizer
from tokenizers.decoders import Strip
from transformers import PreTrainedTokenizerFast

In [None]:
# Assemble a tokenizer around the hand-built vocabulary: tokens missing from
# label2id are mapped to "[UNK]".
model = WordLevel(label2id, unk_token="[UNK]")
normalizer = BertNormalizer(clean_text=True, handle_chinese_chars=True)

tokenizer = Tokenizer(model)
tokenizer.pre_tokenizer = Whitespace()
tokenizer.normalizer = normalizer
tokenizer.decoder = Strip()

# Cap sequences at 512 tokens and turn on padding with the library defaults.
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding()
print(tokenizer)


In [None]:
# Wrap the `tokenizers` object in the transformers fast-tokenizer interface.
# The name `tokenizer` is rebound to the wrapper from this point on.
tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
# Bare expression: shown as the cell output.
tokenizer

In [None]:
# Print the vocabulary size, then decode the id equal to that size.
# NOTE(review): if ids run contiguously from 0, the largest valid id is
# len(id2label) - 1, so this id would fall outside the vocabulary — confirm
# whether probing an out-of-range id is intended here.
print(len(id2label))
tokenizer.decode([len(id2label)])

In [None]:
# Register the end-of-sequence, beginning-of-sequence and padding tokens on
# the wrapper. The call's return value is shown as the cell output.
special_tokens_dict = dict(
    eos_token="[EOS]",
    bos_token="[BOS]",
    pad_token="[PAD]",
)
tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
# Round trip: encode a sample that mixes special tokens, Chinese characters,
# punctuation and a Latin letter, then decode the ids back to text.
text = "[BOS]你好。我是A昐[EOS][PAD]"
encoded = tokenizer(text)
input_ids = encoded["input_ids"]
tokenizer.decode(input_ids)

In [None]:
# Display the end-of-sequence token together with its id.
tokenizer.eos_token, tokenizer.eos_token_id

## BPE训练

In [1]:
# Training corpus: text files obtained from
# https://github.com/ciaoyizhen/crawler_for_generate_model
data_file_list = [
    "C:/Users/Administrator/Downloads/斗破苍穹.txt",
    "C:/Users/Administrator/Downloads/武动乾坤.txt",
]

In [None]:
# Read every corpus file into one flat list of lines (line endings are kept).
data = []
for data_file in data_file_list:
    with open(data_file, "r", encoding="utf-8") as f:
        # Iterating the file object yields the same lines as readlines()
        # without first building a separate list for each file.
        data.extend(f)
# Total number of lines collected; shown as the cell output.
len(data)

In [3]:
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import ByteLevel as PreByteLevel
from tokenizers.decoders import ByteLevel as PostByteLevel
from tokenizers.trainers import BpeTrainer
from tokenizers.models import BPE

In [4]:
# Train a byte-level BPE tokenizer on the corpus lines held in `data`.
tokenizer = Tokenizer(BPE())
# Byte-level pre-tokenization turns text into byte symbols so that words
# never seen during training can still be handled.
tokenizer.pre_tokenizer = PreByteLevel(add_prefix_space=False)
# Without the matching byte-level decoder, decoded output stays as
# unreadable byte symbols.
tokenizer.decoder = PostByteLevel()
special_tokens = [
    "<assistant>",
    "<user>",
    "<system>",
    "<eos_token>",
]
trainer = BpeTrainer(
    vocab_size=25000,
    min_frequency=2,
    special_tokens=special_tokens,
    show_progress=True,
    # Seed the vocabulary with every byte-level symbol, including those that
    # never occur in the corpus; otherwise a character whose bytes were not
    # seen during training cannot be encoded.
    initial_alphabet=PreByteLevel.alphabet(),
)
tokenizer.train_from_iterator(data, trainer=trainer)

In [None]:
# Round trip through the trained BPE tokenizer: encode a sample sentence,
# then decode its ids back to text (shown as the cell output).
text = "萧玄跟古元同辈，萧炎是萧玄的后代，熏儿是古元的女儿，萧炎跟熏儿谈恋爱？"
encoding = tokenizer.encode(text)
ids = encoding.ids
tokenizer.decode(ids)

In [None]:
from transformers import PreTrainedTokenizerFast

In [7]:
# Wrap the trained BPE tokenizer in the transformers fast-tokenizer interface;
# `tokenizer` now refers to the wrapper.
tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)

In [None]:
# Same round trip as before, this time through the transformers wrapper.
text = "萧玄跟古元同辈，萧炎是萧玄的后代，熏儿是古元的女儿，萧炎跟熏儿谈恋爱？"
encoded = tokenizer(text)
input_ids = encoded["input_ids"]
tokenizer.decode(input_ids)

In [None]:
# Encode a single character to inspect how the byte-level BPE tokenizer
# represents it (presumably a character that is rare or absent in the
# training corpus — verify against the data).
tokenizer("囍")

## WordPiece训练

In [1]:
# Training corpus: text files obtained from
# https://github.com/ciaoyizhen/crawler_for_generate_model
data_file_list = [
    "C:/Users/Administrator/Downloads/斗破苍穹.txt",
    "C:/Users/Administrator/Downloads/武动乾坤.txt",
]

In [None]:
# Read every corpus file into one flat list of lines (line endings are kept).
data = []
for data_file in data_file_list:
    with open(data_file, "r", encoding="utf-8") as f:
        # Iterating the file object yields the same lines as readlines()
        # without first building a separate list for each file.
        data.extend(f)
# Total number of lines collected; shown as the cell output.
len(data)

In [3]:
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.trainers import WordPieceTrainer
from tokenizers.decoders import WordPiece as DecoderWordPiece

In [4]:
# Train a WordPiece tokenizer on the corpus lines held in `data`.
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
# handle_chinese_chars is switched off: that option would insert spaces
# around Chinese characters, and the pre-tokenizer below splits on whitespace.
tokenizer.normalizer = BertNormalizer(handle_chinese_chars=False)
tokenizer.pre_tokenizer = WhitespaceSplit()
tokenizer.decoder = DecoderWordPiece()

wordpiece_special_tokens = ["[UNK]", "[BOS]", "[EOS]"]
trainer = WordPieceTrainer(
    vocab_size=25000,
    show_progress=True,
    special_tokens=wordpiece_special_tokens,
)
tokenizer.train_from_iterator(data, trainer=trainer)

In [None]:
# Encode a string made of two of the special tokens passed to the trainer and
# inspect the resulting encoding.
tokenizer.encode("[UNK][BOS]")

In [None]:
# Round trip through the trained WordPiece tokenizer: encode a sample
# sentence and print the text decoded from its ids.
text = "萧玄跟古元同辈，萧炎是萧玄的后代，熏儿是古元的女儿，萧炎跟熏儿谈恋爱？"
encoding = tokenizer.encode(text)
ids = encoding.ids
print(tokenizer.decode(ids))

In [None]:
# Size of the trained vocabulary (the trainer was asked for vocab_size=25000).
tokenizer.get_vocab_size()

In [None]:
from transformers import PreTrainedTokenizerFast

In [None]:
# Wrap the trained WordPiece tokenizer in the transformers fast-tokenizer
# interface, then register its beginning- and end-of-sequence tokens.
# The return value of add_special_tokens is shown as the cell output.
tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
special_tokens_dict = dict(
    bos_token="[BOS]",
    eos_token="[EOS]",
)
tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
# Encode a single character to inspect how the WordPiece tokenizer handles it
# (presumably a character that is rare or absent in the training corpus —
# verify against the data).
tokenizer("囍")

In [None]:
# Show the token ids produced for a short two-character string.
tokenizer.encode("萧炎")

## 题外话内容

In [None]:
from transformers import AutoTokenizer

# Load a pretrained Chinese BERT tokenizer and print the result of an
# encode -> decode round trip on a sample sentence.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-chinese")
text = "如何解决编码回来自带的空格"
token_ids = tokenizer.encode(text)
print(tokenizer.decode(token_ids))

In [None]:
from tokenizers.decoders import Strip

# Replace the backend tokenizer's decoder with Strip() and repeat the
# encode -> decode round trip so the printed text can be compared with the
# output produced before the decoder was swapped.
tokenizer.backend_tokenizer.decoder = Strip()
text = "如何解决编码回来自带的空格"
token_ids = tokenizer.encode(text)
print(tokenizer.decode(token_ids))