Build the WordPiece tokenization algorithm step by step, combine the steps into a single class at the end, and compare the results with tokenizers trained through the `transformers` and `tokenizers` libraries.

WordPiece is used in BERT and in models based on it, such as DistilBERT, MobileBERT, Funnel Transformers, and MPNet.

Reference:
https://huggingface.co/learn/nlp-course/chapter6/6?fw=pt

step by step

In [1]:
from transformers import AutoTokenizer
from collections import defaultdict

In [None]:
# Load a pretrained tokenizer; the cells below only use its pre-tokenizer
# to split raw text into words.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [3]:
# Tiny training corpus used by every experiment in this notebook.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [4]:
# Count how often each pre-tokenized word occurs across the whole corpus.
word_freqs = defaultdict(int)
for text in corpus:
    # pre_tokenize_str yields (word, offset) pairs; only the word is counted.
    for word, _offset in tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text):
        word_freqs[word] += 1
print(word_freqs)

defaultdict(<class 'int'>, {'This': 3, 'is': 2, 'the': 1, 'Hugging': 1, 'Face': 1, 'Course': 1, '.': 4, 'chapter': 1, 'about': 1, 'tokenization': 1, 'section': 1, 'shows': 1, 'several': 1, 'tokenizer': 1, 'algorithms': 1, 'Hopefully': 1, ',': 1, 'you': 1, 'will': 1, 'be': 1, 'able': 1, 'to': 1, 'understand': 1, 'how': 1, 'they': 1, 'are': 1, 'trained': 1, 'and': 1, 'generate': 1, 'tokens': 1})


In [5]:
# Initial alphabet: the first character of every word as-is, and every
# following character with the "##" continuation prefix.
unique_symbols = set()
for word in word_freqs:
    unique_symbols.add(word[0])
    unique_symbols.update(f"##{letter}" for letter in word[1:])
alphabet = sorted(unique_symbols)

# The vocabulary starts with the special tokens followed by the alphabet.
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", *alphabet]
print(vocab)

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y']


In [7]:
# Split every word into single characters; all characters but the first
# carry the "##" continuation prefix.
splits = {}
for word in word_freqs:
    splits[word] = list(word[:1]) + [f"##{char}" for char in word[1:]]
print(splits)

{'This': ['T', '##h', '##i', '##s'], 'is': ['i', '##s'], 'the': ['t', '##h', '##e'], 'Hugging': ['H', '##u', '##g', '##g', '##i', '##n', '##g'], 'Face': ['F', '##a', '##c', '##e'], 'Course': ['C', '##o', '##u', '##r', '##s', '##e'], '.': ['.'], 'chapter': ['c', '##h', '##a', '##p', '##t', '##e', '##r'], 'about': ['a', '##b', '##o', '##u', '##t'], 'tokenization': ['t', '##o', '##k', '##e', '##n', '##i', '##z', '##a', '##t', '##i', '##o', '##n'], 'section': ['s', '##e', '##c', '##t', '##i', '##o', '##n'], 'shows': ['s', '##h', '##o', '##w', '##s'], 'several': ['s', '##e', '##v', '##e', '##r', '##a', '##l'], 'tokenizer': ['t', '##o', '##k', '##e', '##n', '##i', '##z', '##e', '##r'], 'algorithms': ['a', '##l', '##g', '##o', '##r', '##i', '##t', '##h', '##m', '##s'], 'Hopefully': ['H', '##o', '##p', '##e', '##f', '##u', '##l', '##l', '##y'], ',': [','], 'you': ['y', '##o', '##u'], 'will': ['w', '##i', '##l', '##l'], 'be': ['b', '##e'], 'able': ['a', '##b', '##l', '##e'], 'to': ['t', '##o'],

In [8]:
def compute_pair_scores(splits):
    """Score every adjacent token pair found in the current splits.

    The score of a pair is ``pair_freq / (freq(first) * freq(second))``, where
    all frequencies are weighted by the word counts in the module-level
    ``word_freqs``.

    Args:
        splits: mapping of word -> list of tokens the word is currently split
            into. Every split is expected to be non-empty.

    Returns:
        dict mapping ``(first, second)`` token pairs to their score, in the
        order the pairs are first encountered.
    """
    token_counts = defaultdict(int)
    pair_counts = defaultdict(int)
    for word, count in word_freqs.items():
        pieces = splits[word]
        # Every token of the word contributes the word's frequency once.
        for piece in pieces:
            token_counts[piece] += count
        # Adjacent tokens form the candidate pairs.
        for left, right in zip(pieces, pieces[1:]):
            pair_counts[(left, right)] += count

    return {
        (left, right): count / (token_counts[left] * token_counts[right])
        for (left, right), count in pair_counts.items()
    }

pair_scores = compute_pair_scores(splits)
# Preview the first six scored pairs.
for pair in list(pair_scores)[:6]:
    print(f"{pair} : {pair_scores[pair]}")

('T', '##h') : 0.125
('##h', '##i') : 0.03409090909090909
('##i', '##s') : 0.02727272727272727
('i', '##s') : 0.1
('t', '##h') : 0.03571428571428571
('##h', '##e') : 0.011904761904761904


In [9]:
# Pick the highest-scoring pair; on ties the first pair encountered wins.
# With no pairs at all, fall back to the same ("", None) placeholders.
best_pair, max_score = max(
    pair_scores.items(),
    key=lambda item: item[1],
    default=("", None),
)
print(best_pair, max_score)

('a', '##b') 0.2


In [10]:
# Add the merged form of the best pair ('a', '##b') found above to the vocabulary.
vocab.append('ab')

In [11]:
def merge_pair(a, b, splits):
    """Merge every adjacent occurrence of the pair ``(a, b)`` in ``splits``.

    Only the words listed in the module-level ``word_freqs`` are visited.

    Args:
        a: first token of the pair.
        b: second token of the pair; a leading "##" is dropped when merging.
        splits: mapping of word -> list of tokens; updated in place.

    Returns:
        The same ``splits`` mapping, for convenience.
    """
    # The merged token keeps the prefix state of `a` and drops the
    # continuation marker of `b`.
    merged = a + b[2:] if b.startswith('##') else a + b
    for word in word_freqs:
        pieces = splits[word]
        pos = 0
        while pos + 1 < len(pieces):
            if (pieces[pos], pieces[pos + 1]) == (a, b):
                # Replace the two tokens by the merged one; stay at the same
                # position so the new token is compared with its successor.
                pieces = [*pieces[:pos], merged, *pieces[pos + 2:]]
            else:
                pos += 1
        splits[word] = pieces
    return splits

# Apply the first merge and check its effect on a word containing the pair.
splits = merge_pair('a', '##b', splits)
print(splits['about'])

['ab', '##o', '##u', '##t']


In [12]:
# Keep merging the best-scoring pair until the vocabulary reaches the target size.
vocab_size = 70
while len(vocab) < vocab_size:
    scores = compute_pair_scores(splits)
    if not scores:
        # Every word is already a single token, so nothing is left to merge;
        # stop instead of failing on an empty best pair.
        break
    # max() returns the first maximal pair, matching a strict ">" scan.
    best_pair = max(scores, key=scores.get)
    max_score = scores[best_pair]
    splits = merge_pair(*best_pair, splits)
    first, second = best_pair
    # The merged token drops the "##" continuation prefix of the second part.
    new_token = first + second[2:] if second.startswith('##') else first + second
    vocab.append(new_token)
print(vocab)

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y', 'ab', '##fu', 'Fa', 'Fac', '##ct', '##ful', '##full', '##fully', 'Th', 'ch', '##hm', 'cha', 'chap', 'chapt', '##thm', 'Hu', 'Hug', 'Hugg', 'sh', 'th', 'is', '##thms', '##za', '##zat', '##ut']


In [13]:
def encode_word(word):
    """Tokenize one word by greedy longest-prefix matching.

    The longest prefix found in the module-level ``vocab`` is taken as a
    token, the remainder is prefixed with "##", and the process repeats.

    Args:
        word: the pre-tokenized word to encode.

    Returns:
        The list of tokens, or ``["[UNK]"]`` as soon as no prefix of the
        remaining text is in the vocabulary.
    """
    tokens = []
    remaining = word
    while remaining:
        # Try the longest candidate first and shrink it one character at a time.
        for end in range(len(remaining), 0, -1):
            if remaining[:end] in vocab:
                break
        else:
            # No prefix matched: the whole word is unknown.
            return ["[UNK]"]
        tokens.append(remaining[:end])
        remaining = remaining[end:]
        if remaining:
            # What is left continues the word, so mark it as a continuation.
            remaining = f"##{remaining}"
    return tokens

# A word covered by the vocabulary versus one containing an unseen character ('O').
print(encode_word('Hugging'))
print(encode_word('HOgging'))

['Hugg', '##i', '##n', '##g']
['[UNK]']


In [14]:
def tokenize(text):
    """Tokenize ``text`` with the vocabulary learned above.

    The text is split into words by the pre-tokenizer of the module-level
    ``tokenizer``; each word is then encoded with ``encode_word``.

    Args:
        text: the raw string to tokenize.

    Returns:
        A flat list of tokens for the whole text.
    """
    # Use the public `backend_tokenizer` accessor, as the training cells do,
    # rather than the private `_tokenizer` attribute.
    pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
    words = [word for word, _offset in pre_tokenizer.pre_tokenize_str(text)]
    # Flatten in one pass; sum(list_of_lists, []) copies the result repeatedly.
    return [token for word in words for token in encode_word(word)]

# '!' never occurs in the training corpus, so it is expected to map to [UNK].
print(tokenize("This is the Hugging Face course!"))

['Th', '##i', '##s', 'is', 'th', '##e', 'Hugg', '##i', '##n', '##g', 'Fac', '##e', 'c', '##o', '##u', '##r', '##s', '##e', '[UNK]']


Combine them into one class

In [None]:
from transformers import AutoTokenizer
from collections import defaultdict

In [18]:
class MyTokenizer:
    """Minimal WordPiece trainer and tokenizer.

    A pretrained tokenizer is used only for pre-tokenization (splitting text
    into words). All state (word frequencies, splits, vocabulary) lives on the
    instance, so the class does not depend on any notebook-level globals.

    Attributes set by ``cal_info`` / ``train``:
        word_freqs: mapping of word -> number of occurrences in the corpus.
        splits: mapping of word -> list of tokens it is currently split into.
        vocab: list of special tokens, alphabet symbols, and learned merges.
    """

    def __init__(self, base_tokenizer):
        """Load the pretrained tokenizer named ``base_tokenizer``."""
        self.tokenizer = AutoTokenizer.from_pretrained(base_tokenizer)

    @staticmethod
    def _merge_tokens(first, second):
        """Concatenate two tokens, dropping the "##" prefix of the second."""
        return first + second[2:] if second.startswith("##") else first + second

    def _pre_tokenize(self, text):
        """Split ``text`` into words with this instance's own pre-tokenizer."""
        pre_tokenizer = self.tokenizer.backend_tokenizer.pre_tokenizer
        return [word for word, _offset in pre_tokenizer.pre_tokenize_str(text)]

    def cal_info(self, corpus):
        """Initialise word frequencies, vocabulary, and splits from ``corpus``.

        Args:
            corpus: iterable of raw text strings.
        """
        self.word_freqs = defaultdict(int)
        for text in corpus:
            for word in self._pre_tokenize(text):
                self.word_freqs[word] += 1

        # Initial alphabet: first characters as-is, following characters with
        # the "##" continuation prefix.
        alphabet = set()
        for word in self.word_freqs:
            alphabet.add(word[0])
            alphabet.update(f"##{letter}" for letter in word[1:])

        self.vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", *sorted(alphabet)]

        self.splits = {
            word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
            for word in self.word_freqs
        }

    def find_best_pair(self):
        """Return the adjacent token pair with the highest WordPiece score.

        The score of a pair is ``pair_freq / (freq(first) * freq(second))``.

        Returns:
            The best ``(first, second)`` pair (the first one on ties), or
            ``None`` when no word has more than one token left.
        """
        token_freqs = defaultdict(int)
        pair_freqs = defaultdict(int)
        for word, freq in self.word_freqs.items():
            split = self.splits[word]
            for token in split:
                token_freqs[token] += freq
            for pair in zip(split, split[1:]):
                pair_freqs[pair] += freq

        if not pair_freqs:
            return None

        # Scores are computed from this instance's current splits, so each
        # call reflects the merges performed so far.
        scores = {
            pair: freq / (token_freqs[pair[0]] * token_freqs[pair[1]])
            for pair, freq in pair_freqs.items()
        }
        return max(scores, key=scores.get)

    def merge_pair(self, pair):
        """Merge ``pair`` in every split and add the new token to the vocab.

        Args:
            pair: ``(first, second)`` tuple of adjacent tokens to merge.
        """
        first, second = pair
        merged = self._merge_tokens(first, second)
        for word in self.word_freqs:
            split = self.splits[word]
            i = 0
            while i < len(split) - 1:
                if split[i] == first and split[i + 1] == second:
                    split = split[:i] + [merged] + split[i + 2:]
                else:
                    i += 1
            self.splits[word] = split
        self.vocab.append(merged)

    def train(self, corpus, vocab_size):
        """Learn merges from ``corpus`` until the vocab has ``vocab_size`` entries.

        Training stops early when no mergeable pair is left.

        Args:
            corpus: iterable of raw text strings.
            vocab_size: target vocabulary size, special tokens included.
        """
        self.cal_info(corpus)
        while len(self.vocab) < vocab_size:
            pair = self.find_best_pair()
            if pair is None:
                # Every word is a single token; nothing more can be merged.
                break
            self.merge_pair(pair)

    def encode_word(self, word):
        """Tokenize one word by greedy longest-prefix matching against the vocab.

        Args:
            word: the pre-tokenized word to encode.

        Returns:
            The list of tokens, or ``["[UNK]"]`` if some part of the word
            cannot be matched.
        """
        tokens = []
        while len(word) > 0:
            i = len(word)
            while i > 0 and word[:i] not in self.vocab:
                i -= 1
            if i == 0:
                return ["[UNK]"]
            tokens.append(word[:i])
            word = word[i:]
            if len(word) > 0:
                # The remainder continues the word.
                word = f"##{word}"
        return tokens

    def tokenize(self, text):
        """Tokenize ``text`` with this instance's pre-tokenizer and vocabulary.

        Args:
            text: the raw string to tokenize.

        Returns:
            A flat list of tokens for the whole text.
        """
        return [
            token
            for word in self._pre_tokenize(text)
            for token in self.encode_word(word)
        ]

In [19]:
# Same corpus as in the step-by-step section.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

# Train the class-based tokenizer to the same vocabulary size (70) used in
# the step-by-step section, then tokenize the same sample sentence.
new_tokenizer = MyTokenizer('bert-base-cased')
new_tokenizer.train(corpus, 70)
print(new_tokenizer.tokenize('This is the Hugging Face course!'))

['Th', '##i', '##s', 'is', 'th', '##e', 'Hugg', '##i', '##n', '##g', 'Fac', '##e', 'c', '##o', '##u', '##r', '##s', '##e', '[UNK]']


Use the Hugging Face libraries. Two methods are shown here: the high-level API (`transformers`) and the low-level API (`tokenizers`).

In [None]:
#Method 1

from transformers import AutoTokenizer

In [21]:
old_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

# Train a new tokenizer from the pretrained one on our corpus; 70 is the
# target vocabulary size, matching the manual implementation.
# NOTE: this rebinds the global `tokenizer` that the step-by-step cells use.
tokenizer = old_tokenizer.train_new_from_iterator(corpus, 70)
print(tokenizer.tokenize('This is the Hugging Face course!'))

['This', 'is', 't', '##h', '##e', 'H', '##u', '##g', '##g', '##i', '##n', '##g', 'F', '##a', '##c', '##e', 'c', '##ou', '##r', '##s', '##e', '[UNK]']


In [23]:
#Method 2

from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

In [24]:
# Assemble a WordPiece tokenizer from individual `tokenizers` components.
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
# lowercase=True is why the tokens printed for this method are lowercased,
# unlike the cased results of the previous sections.
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
# Alternative normalizer, built from individual steps:
#tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()])
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
# Alternative pre-tokenizers:
#tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
#tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
#tokenizer.pre_tokenizer = pre_tokenizers.Sequence([pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()])
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
# Same target vocabulary size (70) as the other methods.
trainer = trainers.WordPieceTrainer(vocab_size=70, special_tokens=special_tokens)
tokenizer.train_from_iterator(corpus, trainer=trainer)

In [25]:
# encode() returns an encoding object; `.tokens` holds the token strings.
print(tokenizer.encode('This is the Hugging Face course!').tokens)

['this', 'is', 'th', '##e', 'h', '##u', '##g', '##g', '##in', '##g', 'f', '##a', '##c', '##e', 'c', '##ou', '##r', '##s', '##e', '[UNK]']
