## Tokenization示例

In [1]:
# English tokenization example: split a sentence into word-level tokens with NLTK.
# NOTE(review): nltk.word_tokenize normally needs the NLTK tokenizer data to be
# downloaded beforehand — confirm the environment provides it.
import nltk

text = "ChatGPT is a free-to-use AI system."
tokens = nltk.word_tokenize(text)
print("原始文本：", text)
print("分词文本：", tokens)

原始文本： ChatGPT is a free-to-use AI system.
分词文本： ['ChatGPT', 'is', 'a', 'free-to-use', 'AI', 'system', '.']


In [4]:
# Chinese tokenization example: segment a sentence into words with jieba.
import jieba

text = "ChatGPT是一个免费使用的人工智能系统。"
# list() materializes the segments produced by jieba.cut so they can be printed.
tokens = list(jieba.cut(text))
print("原始文本：", text)
print("分词文本：", tokens)

原始文本： ChatGPT是一个免费使用的人工智能系统。
分词文本： ['ChatGPT', '是', '一个', '免费', '使用', '的', '人工智能', '系统', '。']


## BPE示例

In [63]:
from collections import defaultdict
from tokenizers import pre_tokenizers

corpus = [  # Opening sentences of the abstract of "<Attention Is All You Need>"
    "The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks that include an encoder and a decoder.",
    "The bestperforming models also connect the encoder and decoder through an attentionmechanism.",
    "We propose a new simple network architecture, the Transformer,based solely on attention mechanisms, dispensing with recurrence and convolutionsentirely."
]
# NOTE: a few words are fused without a space (e.g. "orconvolutional"); the text is
# kept verbatim so the word counts stay as recorded in this notebook's output.

#################### Step 1: word frequencies ################
pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
word_freqs = defaultdict(int)

for text in corpus:
    # pre_tokenize_str yields (word, offsets) pairs; only the word is counted.
    for word, _offsets in pre_tokenizer.pre_tokenize_str(text):
        word_freqs[word] += 1

print(word_freqs)
# defaultdict(<class 'int'>, {'The': 2, 'Ġdominant': 1, 'Ġsequence': 1, 'Ġtransduction': 1, ...})

#################### Step 2: alphabet ################
# Sorted list of every distinct character that occurs in any pre-tokenized word.
alphabet = sorted({letter for word in word_freqs for letter in word})

print(alphabet)  # 'Ġ' stands for the space character
# [',', '.', 'T', 'W', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'Ġ']
vocab = ["<|endoftext|>", *alphabet]  # add special token for GPT-2

#################### Step 3: split each word into characters ################
splits = {word: list(word) for word in word_freqs}
print(splits)  # every character starts out as its own subword


# {'The': ['T', 'h', 'e'], 'Ġdominant': ['Ġ', 'd', 'o', 'm', 'i', 'n', 'a', 'n', 't'],...}

#################### Step4: find most freq and merge ################

def compute_pair_freqs(splits):
    """Count how often each pair of adjacent subwords occurs across the corpus.

    Each word contributes its adjacent subword pairs weighted by the word's
    frequency, which is read from the module-level ``word_freqs`` mapping.

    :param splits: mapping word -> current list of subwords for that word
    :return: defaultdict mapping (left, right) subword pair -> total frequency
    """
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        pieces = splits[word]
        # zip over the list and its one-step shift yields every adjacent pair;
        # a word that is already a single subword yields none.
        for left, right in zip(pieces, pieces[1:]):
            pair_freqs[left, right] += freq
    return pair_freqs


def find_most_freq(pair_freqs):
    """Return the subword pair with the highest frequency and log it.

    Ties go to the pair that appears first in ``pair_freqs``. For an empty
    mapping the result is the empty string (and the logged frequency is None).

    :param pair_freqs: mapping (left, right) subword pair -> frequency
    :return: the most frequent pair, or "" when there are no pairs
    """
    best_pair, max_freq = max(
        pair_freqs.items(),
        key=lambda entry: entry[1],
        default=("", None),
    )
    print("\t Find most freq: pair[%s], freq[%s]" % (best_pair, max_freq))
    return best_pair


def merge_pair(a, b, splits):
    """Merge every adjacent occurrence of subwords ``a`` and ``b`` into ``ab``.

    The words to process are taken from the module-level ``word_freqs``;
    ``splits`` is updated in place and also returned.

    :param a: left subword of the pair to merge
    :param b: right subword of the pair to merge
    :param splits: mapping word -> current list of subwords for that word
    :return: the same ``splits`` mapping, with the pair merged
    """
    merged_token = f"{a}{b}"

    for word in word_freqs:
        pieces = splits[word]
        if len(pieces) == 1:  # the word is already a single subword
            continue

        rebuilt = []
        idx = 0
        last = len(pieces) - 1
        while idx <= last:
            if idx < last and pieces[idx] == a and pieces[idx + 1] == b:
                # a and b are adjacent: emit the merged subword and skip both
                rebuilt.append(merged_token)
                idx += 2
            else:
                rebuilt.append(pieces[idx])
                idx += 1
        splits[word] = rebuilt
    return splits


merges = {}  # learned merge rules: (left, right) pair -> merged subword
vocab_size = 50  # target vocabulary size

# Repeatedly merge the most frequent adjacent pair until the vocabulary is full.
while len(vocab) < vocab_size:
    print("Current vocab size:%s" % len(vocab))
    pair_freqs = compute_pair_freqs(splits)
    if not pair_freqs:
        # Every word is already a single subword, so nothing is left to merge.
        # Without this guard find_most_freq returns "" and indexing it below
        # raises IndexError.
        print("\t No pair left to merge, stop at vocab size:%s" % len(vocab))
        break
    print("\t Top3 Pair freq:%s" % sorted(pair_freqs.items(), key=lambda x: -x[1])[:3])  # sorted by descending frequency
    current_pair = find_most_freq(pair_freqs)
    new_subword = "%s%s" % (current_pair[0], current_pair[1])
    splits = merge_pair(current_pair[0], current_pair[1], splits)
    print("\t Merge '%s %s' to '%s'" % (current_pair[0], current_pair[1], new_subword))
    merges[current_pair] = new_subword
    vocab.append(new_subword)
# Current vocab size:30
#    Top3 Pair freq:[(('Ġ', 'm'), 3), (('l', 's'), 3), (('Ġ', 'c'), 3)]
#    Find most freq: pair[('Ġ', 'm')], freq[3]
#    Merge 'Ġ m' to 'Ġm'
# Current vocab size:31
#    Top3 Pair freq:[(('l', 's'), 3), (('Ġ', 'c'), 3), (('l', 'e'), 3)]
#    Find most freq: pair[('l', 's')], freq[3]
#    Merge 'l s' to 'ls'
# ...

print(merges)  # 20 merge rules
# {('Ġ', 'm'): 'Ġm', ('l', 's'): 'ls', ('Ġ', 'c'): 'Ġc', ('l', 'e'): 'le', ...}
print(vocab)  # the vocab consists of the special token, the initial alphabet and the merge results
# ['<|endoftext|>', ',', '.', 'T', 'W', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'Ġ', 'Ġm', 'ls', 'Ġc', 'le', 'lu', 'Ġand', 'is', 'The', 'Ġd', 'om', 'ence', 'ran', 'rans', 'Ġmode', 'Ġmodels', 'Ġar', 'Ġb', 'ase', 'ased', 'Ġon']

defaultdict(<class 'int'>, {'The': 2, 'Ġdominant': 1, 'Ġsequence': 1, 'Ġtransduction': 1, 'Ġmodels': 2, 'Ġare': 1, 'Ġbased': 1, 'Ġon': 2, 'Ġcomplex': 1, 'Ġrecurrent': 1, 'Ġorconvolutional': 1, 'Ġneural': 1, 'Ġnetworks': 1, 'Ġthat': 1, 'Ġinclude': 1, 'Ġan': 2, 'Ġencoder': 2, 'Ġand': 3, 'Ġa': 2, 'Ġdecoder': 2, '.': 3, 'Ġbestperforming': 1, 'Ġalso': 1, 'Ġconnect': 1, 'Ġthe': 2, 'Ġthrough': 1, 'Ġattentionmechanism': 1, 'We': 1, 'Ġpropose': 1, 'Ġnew': 1, 'Ġsimple': 1, 'Ġnetwork': 1, 'Ġarchitecture': 1, ',': 3, 'ĠTransformer': 1, 'based': 1, 'Ġsolely': 1, 'Ġattention': 1, 'Ġmechanisms': 1, 'Ġdispensing': 1, 'Ġwith': 1, 'Ġrecurrence': 1, 'Ġconvolutionsentirely': 1})
[',', '.', 'T', 'W', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'Ġ']
{'The': ['T', 'h', 'e'], 'Ġdominant': ['Ġ', 'd', 'o', 'm', 'i', 'n', 'a', 'n', 't'], 'Ġsequence': ['Ġ', 's', 'e', 'q', 'u', 'e', 'n', 'c', 'e'], 'Ġtransduction': ['Ġ', 't', 'r', 'a', 'n'

In [60]:
import re, collections


def get_vocab(filename):
    """Build the initial BPE vocabulary from a UTF-8 text file.

    Every whitespace-separated word becomes a key made of its characters
    joined by single spaces and terminated by the end-of-word marker
    ``</w>`` (e.g. ``low`` -> ``l o w </w>``); the value is the word count.

    :param filename: path of the text file to read
    :return: defaultdict(int) mapping spaced-out word -> frequency
    """
    word_counts = collections.defaultdict(int)
    with open(filename, 'r', encoding='utf-8') as corpus_file:
        for raw_line in corpus_file:
            # split() with no argument already ignores surrounding whitespace
            for token in raw_line.split():
                word_counts[' '.join([*token, '</w>'])] += 1
    return word_counts


def get_stats(vocab):  # vocab: dict mapping spaced-out word -> freq
    """Count adjacent symbol pairs (2-grams) over the vocabulary.

    Each key of ``vocab`` is split on whitespace into symbols, and every
    adjacent pair is credited with that word's frequency.

    :return: defaultdict(int) mapping (symbol, next_symbol) -> frequency
    """
    pair_counts = collections.defaultdict(int)
    for spaced_word, count in vocab.items():
        symbols = spaced_word.split()
        for left, right in zip(symbols, symbols[1:]):
            pair_counts[left, right] += count
    return pair_counts


def merge_vocab(pair, v_in):  # pair: the most frequent 2-gram, v_in: the current vocab
    """Apply one merge rule to every word of the vocabulary.

    Each occurrence of the two symbols of ``pair`` as adjacent,
    space-separated symbols is replaced by their concatenation.

    :param pair: tuple (left_symbol, right_symbol) to merge
    :param v_in: dict mapping spaced-out word -> frequency
    :return: new dict with the merged keys and the same frequencies
    """
    v_out = {}
    merged_symbol = ''.join(pair)  # compact form, without the separating space
    bigram = re.escape(' '.join(pair))  # spaced form, escaped so regex metacharacters match literally
    # (?<!\S) is a negative lookbehind: the bigram must not be preceded by a non-space character.
    # (?!\S) is a negative lookahead: the bigram must not be followed by a non-space character.
    # Together they make the pattern match whole symbols only.
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        # A callable replacement inserts the merged symbol literally. Passing the
        # string itself would make re.sub interpret backslashes in it as escapes
        # or group references (e.g. the pair ('\\', 'd') raises "bad escape").
        w_out = p.sub(lambda _match: merged_symbol, word)
        v_out[w_out] = v_in[word]
    return v_out


def get_tokens_from_vocab(vocab):
    """Collect token frequencies and per-word tokenizations from the vocabulary.

    :param vocab: dict mapping spaced-out word -> frequency
    :return: tuple ``(tokens_frequencies, vocab_tokenization)`` where the
        first maps token -> total frequency and the second maps the word with
        its spaces removed -> list of its tokens
    """
    tokens_frequencies = collections.defaultdict(int)
    vocab_tokenization = {}
    for segmented_word, count in vocab.items():
        pieces = segmented_word.split()
        vocab_tokenization[''.join(pieces)] = pieces
        for piece in pieces:
            tokens_frequencies[piece] += count
    return tokens_frequencies, vocab_tokenization


# Initial vocab right after pre-tokenization (characters separated by spaces, '</w>' appended)
vocab = get_vocab('../data/All Around the Moon.txt')

print('==========')
print('Tokens Before BPE')
tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)
# print('All tokens: {}'.format(tokens_frequencies.keys()))
print('Number of tokens: {}'.format(len(tokens_frequencies.keys())))
print('==========')

num_merges = 1500
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:  # nothing left to merge
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print('Iter: {}'.format(i))
    print('Best pair: {}'.format(best))
    tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)
    # print('All tokens: {}'.format(tokens_frequencies.keys()))
    print('Number of tokens: {}'.format(len(tokens_frequencies.keys())))
    print('==========')


def _token_sort_key(item):
    """Sort key for a (token, freq) item: (token length, frequency).

    The end-of-word marker '</w>' counts as a single character. The length is
    computed here rather than through measure_token_length because that helper
    is only defined in a later cell, which makes this cell fail with NameError
    on a fresh kernel.
    """
    token, freq = item
    length = len(token) - 3 if token.endswith('</w>') else len(token)
    return (length, freq)


# Collect the tokens left after the final merge: longest first, then most frequent
sorted_tokens_tuple = sorted(tokens_frequencies.items(), key=_token_sort_key, reverse=True)
sorted_tokens = [token for (token, freq) in sorted_tokens_tuple]
top_K = 10
print(f"打印前{top_K}个频次的token：", sorted_tokens[:top_K])

Tokens Before BPE
Number of tokens: 104
Iter: 0
Best pair: ('e', '</w>')
Number of tokens: 105
Iter: 1
Best pair: ('t', 'h')
Number of tokens: 106
Iter: 2
Best pair: ('t', '</w>')
Number of tokens: 107
Iter: 3
Best pair: ('s', '</w>')
Number of tokens: 108
Iter: 4
Best pair: ('d', '</w>')
Number of tokens: 109
Iter: 5
Best pair: ('i', 'n')
Number of tokens: 110
Iter: 6
Best pair: ('e', 'r')
Number of tokens: 111
Iter: 7
Best pair: (',', '</w>')
Number of tokens: 112
Iter: 8
Best pair: ('th', 'e</w>')
Number of tokens: 113
Iter: 9
Best pair: ('a', 'n')
Number of tokens: 114
Iter: 10
Best pair: ('y', '</w>')
Number of tokens: 115
Iter: 11
Best pair: ('o', 'n')
Number of tokens: 116
Iter: 12
Best pair: ('e', 'n')
Number of tokens: 117
Iter: 13
Best pair: ('o', 'u')
Number of tokens: 118
Iter: 14
Best pair: ('a', 'r')
Number of tokens: 119
Iter: 15
Best pair: ('f', '</w>')
Number of tokens: 120
Iter: 16
Best pair: ('o', 'r')
Number of tokens: 121
Iter: 17
Best pair: ('o', '</w>')
Number of

In [59]:
def measure_token_length(token):
    """Return the length of ``token``, counting a trailing '</w>' as one character."""
    end_marker = '</w>'
    if not token.endswith(end_marker):
        return len(token)
    return len(token) - len(end_marker) + 1


def tokenize_word(string, sorted_tokens, unknown_token='</u>'):
    """Tokenize a single word greedily with a priority-ordered token list.

    The first token of ``sorted_tokens`` that occurs in ``string`` is matched
    everywhere it appears; the text between and around those matches is
    tokenized recursively with the tokens that follow it in the list.

    :param string: the word to tokenize (may end with the '</w>' marker)
    :param sorted_tokens: candidate tokens, highest priority first
    :param unknown_token: token emitted for text that no candidate covers
    :return: list of tokens covering ``string``
    """
    if string == '':
        return []
    if not sorted_tokens:
        return [unknown_token]

    for i, token in enumerate(sorted_tokens):
        # Match the token literally. The previous pattern rewrote '.' to '[.]'
        # *before* escaping, so the brackets were escaped too and tokens that
        # contain '.' could never match.
        token_reg = re.escape(token)
        match_starts = [m.start(0) for m in re.finditer(token_reg, string)]
        if not match_starts:
            continue

        lower_priority_tokens = sorted_tokens[i + 1:]
        string_tokens = []
        cursor = 0  # start of the text not yet tokenized
        for match_start in match_starts:
            # text before this match can only use lower-priority tokens
            string_tokens += tokenize_word(string=string[cursor:match_start],
                                           sorted_tokens=lower_priority_tokens,
                                           unknown_token=unknown_token)
            string_tokens.append(token)
            cursor = match_start + len(token)
        # text after the last match
        string_tokens += tokenize_word(string=string[cursor:],
                                       sorted_tokens=lower_priority_tokens,
                                       unknown_token=unknown_token)
        return string_tokens

    # No candidate occurs in the non-empty string: report it as unknown
    # instead of silently dropping the text.
    return [unknown_token]


# Tokenize one word that occurs in the training vocabulary and one that does not
word_given_known = 'mountains</w>'
word_given_unknown = 'Ilikeeatingapples!</w>'

for word_given in (word_given_known, word_given_unknown):
    print('Tokenizing word: {}...'.format(word_given))
    if word_given in vocab_tokenization:
        # Known word: show the tokenization learned during training first,
        # then what the greedy tokenizer produces for the same word.
        print('Tokenization of the known word:')
        print(vocab_tokenization[word_given])
        print('Tokenization treating the known word as unknown:')
    else:
        print('Tokenizating of the unknown word:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))

Tokenizing word: mountains</w>...
Tokenization of the known word:
['mountain', 's</w>']
Tokenization treating the known word as unknown:
['mountain', 's</w>']
Tokenizing word: Ilikeeatingapples!</w>...
Tokenizating of the unknown word:
['I', 'li', 'ke', 'e', 'at', 'ing', 'ap', 'ple', 's', '!</w>']


## Hugging Face tokenizers 使用示例

In [64]:
# Initialize a BPE model (with "[UNK]" as its unknown token) and wrap it in a Tokenizer.
from tokenizers import Tokenizer
from tokenizers.models import BPE
model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(model)

In [65]:
# Define the pre-tokenizer: attach a Whitespace pre-tokenizer to the tokenizer.
from tokenizers.pre_tokenizers import Whitespace
tokenizer.pre_tokenizer = Whitespace()

In [66]:
# Define the BPE trainer and hand it the list of special tokens.
from tokenizers.trainers import BpeTrainer
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

In [69]:
# Train the tokenizer on the listed text files using the trainer defined above.
# NOTE(review): the file name here starts with "ALL" while the hand-written BPE
# cell opens '../data/All Around the Moon.txt' — on a case-sensitive file system
# only one spelling can exist; confirm the real file name and use it in both places.
files = ["../data/ALL Around the Moon.txt"]
tokenizer.train(files, trainer)






In [71]:
# Save the trained tokenizer to a JSON file.
tokenizer.save("../data/tokenizer-book.json")

In [74]:
# Load the trained tokenizer back from the JSON file written by tokenizer.save.
tokenizer = Tokenizer.from_file("../data/tokenizer-book.json")

In [73]:
# Display the vocabulary as a token -> id mapping (shown as the cell's output).
tokenizer.get_vocab()

{'6th': 4850,
 'even': 341,
 'weaker': 13955,
 '_Indian': 10187,
 'occ': 655,
 'miracul': 13152,
 'prom': 2545,
 'Five': 3830,
 'eye': 818,
 'Moons': 11696,
 'eteen': 3980,
 'lead': 2711,
 '%': 9,
 'plagiarist': 16279,
 'track': 3349,
 'yrene': 5932,
 'softened': 8890,
 'Cooled': 15382,
 'gl': 353,
 'acles': 3991,
 'berately': 12055,
 'GLI': 9762,
 'disconcerting': 14171,
 'REPUBLIC': 16421,
 'fare': 10347,
 'million': 2644,
 'extend': 4572,
 'terstellar': 11493,
 'terminated': 14909,
 'Lussac_': 15662,
 'Bast': 9640,
 'homet': 5156,
 'anticipation': 9238,
 'imitate': 9225,
 'Columbiad': 2013,
 'Himalayahs': 4837,
 'arn': 10955,
 'poetical': 14913,
 'deck': 3063,
 '\'?"': 9534,
 'yourself': 3100,
 '_Rill': 8881,
 'Clos': 15567,
 'occupied': 3152,
 'alogue': 13835,
 'coaxing': 16177,
 'came': 968,
 'oon': 2314,
 'this': 263,
 'ically': 1190,
 'wondrous': 6699,
 '_m': 3516,
 'Brit': 9654,
 'tal': 962,
 'pp': 742,
 'securely': 14631,
 'grating': 8078,
 'tingling': 12293,
 'ively': 1103,
 