Build the Unigram tokenization algorithm step by step, then combine the steps into a single class, and finally compare the result with tokenizers trained through the transformers and tokenizers libraries.

Used in ALBERT, T5, mBART, BigBird, and XLNet.

Reference:
https://huggingface.co/learn/nlp-course/chapter6/7?fw=pt

step by step

In [1]:
from transformers import AutoTokenizer
from collections import defaultdict
from math import log
import copy

In [2]:
# Load the pretrained tokenizer; the cells below reuse its pre-tokenizer.
checkpoint = 'xlnet-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
# Toy corpus: the word frequencies below are counted over these sentences.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [4]:
# Count how often each pre-tokenized word occurs in the corpus.
pre_tokenize = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str
word_freqs = defaultdict(int)
for sentence in corpus:
    # Each item is a (word, offsets) pair; only the word is counted.
    for piece, _span in pre_tokenize(sentence):
        word_freqs[piece] += 1
print(word_freqs)

defaultdict(<class 'int'>, {'▁This': 3, '▁is': 2, '▁the': 1, '▁Hugging': 1, '▁Face': 1, '▁Course.': 1, '▁chapter': 1, '▁about': 1, '▁tokenization.': 1, '▁section': 1, '▁shows': 1, '▁several': 1, '▁tokenizer': 1, '▁algorithms.': 1, '▁Hopefully,': 1, '▁you': 1, '▁will': 1, '▁be': 1, '▁able': 1, '▁to': 1, '▁understand': 1, '▁how': 1, '▁they': 1, '▁are': 1, '▁trained': 1, '▁and': 1, '▁generate': 1, '▁tokens.': 1})


In [5]:
# Seed vocabulary: every single character plus the most frequent longer
# substrings, up to `initial_vocab_size` entries in total.
initial_vocab_size = 300

char_freqs = defaultdict(int)
subwords_freqs = defaultdict(int)
for word, freq in word_freqs.items():
    for i, char in enumerate(word):
        char_freqs[char] += freq
        # All substrings of length >= 2 that start at position i.
        for j in range(i + 2, len(word) + 1):
            subwords_freqs[word[i:j]] += freq

sorted_subwords = sorted(subwords_freqs.items(), key=lambda x: x[1], reverse=True)
# Clamp at zero: a negative slice bound would keep almost every substring
# (instead of none) when the characters alone already fill the budget.
subword_budget = max(0, initial_vocab_size - len(char_freqs))
token_freqs = dict(list(char_freqs.items()) + sorted_subwords[:subword_budget])

In [6]:
# Turn raw counts into costs: the negative log of each token's relative frequency.
total_sum = sum(token_freqs.values())
model = {}
for token, freq in token_freqs.items():
    model[token] = -log(freq / total_sum)

In [7]:
def encode_word(word, model):
    """Split ``word`` into the lowest-cost sequence of tokens found in ``model``.

    Args:
        word: The string to segment.
        model: Mapping from token to its cost (lower is better).

    Returns:
        A ``(tokens, score)`` tuple. ``score`` is the sum of the token costs
        plus the initial value of 1 that the running score starts from. When
        no segmentation exists the result is ``(["<unk>"], None)``.
    """
    n = len(word)
    # best[k] holds the cheapest known segmentation of word[:k]: the start
    # index of its last token and the accumulated score.
    best = [{"start": 0, "score": 1}]
    best.extend({"start": None, "score": None} for _ in range(n))

    for begin in range(n):
        prefix_cost = best[begin]["score"]
        if prefix_cost is None:
            # word[:begin] cannot be segmented, so nothing can extend it.
            continue
        for stop in range(begin + 1, n + 1):
            piece = word[begin:stop]
            if piece not in model:
                continue
            candidate = model[piece] + prefix_cost
            current = best[stop]["score"]
            # Strict comparison: on ties the first segmentation found wins.
            if current is None or current > candidate:
                best[stop] = {"start": begin, "score": candidate}

    final = best[-1]
    if final["score"] is None:
        return ["<unk>"], None

    # Follow the back-pointers from the end of the word to position 0,
    # collecting tokens right-to-left, then restore reading order.
    pieces = []
    stop = n
    begin = final["start"]
    while begin != 0:
        pieces.append(word[begin:stop])
        stop, begin = begin, best[begin]["start"]
    pieces.append(word[begin:stop])
    pieces.reverse()
    return pieces, final["score"]

# Try the encoder on two sample words.
for sample in ("Hopefully", "This"):
    print(encode_word(sample, model))

(['H', 'o', 'p', 'e', 'f', 'u', 'll', 'y'], 41.5157494601402)
(['This'], 6.288267030694535)


In [8]:
def compute_loss(model):
    """Return the corpus loss under ``model``.

    Each word in the module-level ``word_freqs`` is encoded with
    ``encode_word`` and its score is weighted by the word's frequency.
    A word that cannot be segmented yields a ``None`` score, which makes
    the multiplication fail.
    """
    total = 0
    for word in word_freqs:
        word_cost = encode_word(word, model)[1]
        total += word_cost * word_freqs[word]
    return total

# Corpus loss under the current model.
current_loss = compute_loss(model)
print(current_loss)

413.10377642940875


In [9]:
def compute_scores(model):
    """Score every multi-character token by how much the loss grows without it.

    Args:
        model: Mapping from token to cost.

    Returns:
        Dict mapping each token longer than one character to
        ``compute_loss(model without that token) - compute_loss(model)``.
        Single-character tokens are skipped, so they are never candidates
        for removal.
    """
    baseline_loss = compute_loss(model)
    scores = {}
    for token in model:
        if len(token) == 1:
            continue
        # The model is a flat token -> number mapping, so a shallow copy is
        # enough to leave the caller's dict untouched.
        reduced = dict(model)
        del reduced[token]
        scores[token] = compute_loss(reduced) - baseline_loss
    return scores

# Inspect the score of two sample tokens.
scores = compute_scores(model)
print(*(scores[sample] for sample in ('ll', 'his')))

6.376412403623874 0.0


In [10]:
# Repeatedly drop the tokens whose removal hurts the loss the least until
# the vocabulary is no larger than 100 tokens.
percent_to_remove = 0.1
while len(model) > 100:
    scores = compute_scores(model)
    sorted_scores = sorted(scores.items(), key=lambda x: x[1])
    # Remove at least one token per round (otherwise a small vocabulary would
    # never shrink) and never more than the removable candidates that exist.
    num_to_remove = min(max(1, int(len(model) * percent_to_remove)), len(sorted_scores))
    if num_to_remove == 0:
        # Only single-character tokens are left; they are never removed.
        break
    for token, _score in sorted_scores[:num_to_remove]:
        del token_freqs[token]

    # Re-derive the costs from the remaining tokens' frequencies.
    total_sum = sum([freq for token, freq in token_freqs.items()])
    model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}

In [11]:
def tokenize(text, model):
    """Tokenize ``text`` with the Unigram ``model``.

    The text is split into words by the module-level ``tokenizer``'s
    pre-tokenizer, each word is segmented with ``encode_word``, and the
    per-word token lists are concatenated in order.

    Args:
        text: The string to tokenize.
        model: Mapping from token to cost.

    Returns:
        A flat list of tokens.
    """
    words_with_offset = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    # Flatten in a single pass; sum(list_of_lists, []) re-copies the
    # accumulated list for every word.
    return [
        token
        for word, _offset in words_with_offset
        for token in encode_word(word, model)[0]
    ]

# Tokenize a sample sentence with the pruned model.
sample_text = "This is the Hugging Face course."
print(tokenize(sample_text, model))

['▁This', '▁is', '▁the', '▁Hugging', '▁Face', '▁', 'c', 'ou', 'r', 's', 'e', '.']


Combine them into one class

In [2]:
from transformers import AutoTokenizer
from collections import defaultdict
from math import log
import copy

In [6]:
class MyTokenizer:
    """Toy Unigram tokenizer trained by pruning a seed vocabulary.

    Text is split into words by the pre-tokenizer of a pretrained tokenizer.
    ``train`` builds a seed vocabulary from the corpus and repeatedly removes
    the tokens whose removal increases the corpus loss the least.

    Attributes set during training:
        word_freqs: word -> number of occurrences in the corpus.
        token_freqs: token -> frequency, for the tokens still in the vocabulary.
        model: token -> cost (negative log relative frequency) after training.
    """

    def __init__(self, base_tokenizer):
        """Load the pretrained tokenizer that provides the pre-tokenizer.

        Args:
            base_tokenizer: Value passed to ``AutoTokenizer.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(base_tokenizer)

    def _pre_tokenize(self, text):
        """Return the pre-tokenized words of ``text`` without their offsets."""
        pairs = self.tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
        return [word for word, _offsets in pairs]

    def cal_info(self, corpus, base_size):
        """Count word frequencies in ``corpus`` and build the seed vocabulary.

        Args:
            corpus: Iterable of strings.
            base_size: Maximum number of tokens in the seed vocabulary. All
                single characters are always kept, even beyond this size.
        """
        self.word_freqs = defaultdict(int)
        for text in corpus:
            for word in self._pre_tokenize(text):
                self.word_freqs[word] += 1
        self._build_seed_vocab(base_size)

    def _build_seed_vocab(self, base_size):
        """Fill ``token_freqs`` with all characters plus the top substrings."""
        char_freqs = defaultdict(int)
        subwords_freqs = defaultdict(int)
        for word, freq in self.word_freqs.items():
            for i, char in enumerate(word):
                char_freqs[char] += freq
                # All substrings of length >= 2 that start at position i.
                for j in range(i + 2, len(word) + 1):
                    subwords_freqs[word[i:j]] += freq
        sorted_subwords = sorted(subwords_freqs.items(), key=lambda x: x[1], reverse=True)
        # Clamp at zero: a negative slice bound would keep almost every
        # substring (instead of none) when characters already fill base_size.
        subword_budget = max(0, base_size - len(char_freqs))
        self.token_freqs = dict(list(char_freqs.items()) + sorted_subwords[:subword_budget])

    def cal_model(self):
        """Return token -> cost, the negative log of each relative frequency."""
        total_sum = sum(self.token_freqs.values())
        return {token: -log(freq / total_sum) for token, freq in self.token_freqs.items()}

    def encode_word(self, word, model):
        """Split ``word`` into the lowest-cost sequence of tokens in ``model``.

        Args:
            word: The string to segment.
            model: Mapping from token to cost (lower is better).

        Returns:
            A ``(tokens, score)`` tuple. ``score`` is the sum of the token
            costs plus the initial value of 1 that the running score starts
            from. When no segmentation exists the result is
            ``(["<unk>"], None)``.
        """
        n = len(word)
        # best[k] holds the cheapest known segmentation of word[:k]: the
        # start index of its last token and the accumulated score.
        best = [{"start": 0, "score": 1}]
        best.extend({"start": None, "score": None} for _ in range(n))

        for begin in range(n):
            prefix_cost = best[begin]["score"]
            if prefix_cost is None:
                # word[:begin] cannot be segmented, so nothing can extend it.
                continue
            for stop in range(begin + 1, n + 1):
                piece = word[begin:stop]
                if piece not in model:
                    continue
                candidate = model[piece] + prefix_cost
                current = best[stop]["score"]
                # Strict comparison: on ties the first segmentation wins.
                if current is None or current > candidate:
                    best[stop] = {"start": begin, "score": candidate}

        final = best[-1]
        if final["score"] is None:
            return ["<unk>"], None

        # Follow the back-pointers from the end of the word to position 0;
        # append-then-reverse avoids repeated front insertion.
        pieces = []
        stop = n
        begin = final["start"]
        while begin != 0:
            pieces.append(word[begin:stop])
            stop, begin = begin, best[begin]["start"]
        pieces.append(word[begin:stop])
        pieces.reverse()
        return pieces, final["score"]

    def compute_loss(self, model):
        """Return the frequency-weighted sum of word scores under ``model``.

        Every word in ``word_freqs`` must be segmentable with ``model``;
        ``encode_word`` returns a ``None`` score otherwise and the
        multiplication fails.
        """
        loss = 0
        for word, freq in self.word_freqs.items():
            _, word_loss = self.encode_word(word, model)
            loss += word_loss * freq
        return loss

    def compute_scores(self, model):
        """Score each multi-character token by the loss increase without it.

        Returns:
            Dict mapping each token longer than one character to
            ``compute_loss(model without it) - compute_loss(model)``.
            Single-character tokens are skipped so they are never removed.
        """
        baseline_loss = self.compute_loss(model)
        scores = {}
        for token in model:
            if len(token) == 1:
                continue
            # Values are plain numbers, so a shallow copy is enough to leave
            # the caller's dict untouched.
            reduced = dict(model)
            del reduced[token]
            scores[token] = self.compute_loss(reduced) - baseline_loss
        return scores

    def _prune(self, target_size, percent_remove):
        """Shrink ``token_freqs`` towards ``target_size`` and return the model.

        Each round removes the ``percent_remove`` fraction of the current
        vocabulary with the lowest scores. Pruning stops early when only
        single-character tokens remain.
        """
        model = self.cal_model()
        while len(model) > target_size:
            scores = self.compute_scores(model)
            sorted_scores = sorted(scores.items(), key=lambda x: x[1])
            # Remove at least one token per round (otherwise a small
            # vocabulary would never shrink) and never more than the
            # removable candidates that exist.
            num_to_remove = min(max(1, int(len(model) * percent_remove)), len(sorted_scores))
            if num_to_remove == 0:
                break
            for token, _score in sorted_scores[:num_to_remove]:
                del self.token_freqs[token]
            model = self.cal_model()
        return model

    def train(self, corpus, base_size, target_size, percent_remove=0.1):
        """Train the tokenizer on ``corpus`` and store the result in ``model``.

        Args:
            corpus: Iterable of strings.
            base_size: Size of the seed vocabulary.
            target_size: Pruning continues while the vocabulary is larger
                than this; the final size may end up below it.
            percent_remove: Fraction of the vocabulary removed per round.
        """
        self.cal_info(corpus, base_size)
        self.model = self._prune(target_size, percent_remove)

    def tokenize(self, text):
        """Tokenize ``text`` with the trained model and return a flat token list."""
        # Flatten in a single pass; sum(list_of_lists, []) re-copies the
        # accumulated list for every word.
        return [
            token
            for word in self._pre_tokenize(text)
            for token in self.encode_word(word, self.model)[0]
        ]

In [9]:
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

# Train the class-based tokenizer (seed vocabulary 300, target 100) and try it out.
new_tokenizer = MyTokenizer('xlnet-base-cased')
new_tokenizer.train(corpus, base_size=300, target_size=100)
sample_tokens = new_tokenizer.tokenize('This is the Hugging Face course.')
print(sample_tokens)

['▁This', '▁is', '▁the', '▁Hugging', '▁Face', '▁', 'c', 'ou', 'r', 's', 'e', '.']


Using the Hugging Face libraries: two approaches are shown here, the high-level API (transformers) and the low-level API (tokenizers).

In [None]:
#Method 1

from transformers import AutoTokenizer

In [25]:
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

# Retrain a new tokenizer on the corpus, starting from the pretrained one.
old_tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')
tokenizer = old_tokenizer.train_new_from_iterator(corpus, vocab_size=100)

# Report the resulting vocabulary size and a sample tokenization.
print(
    tokenizer.vocab_size,
    tokenizer.tokenize('This is the Hugging Face course.'),
    sep="\n",
)

62
['▁', 'T', 'h', 'i', 's', '▁', 'i', 's', '▁the', '▁H', 'u', 'g', 'g', 'in', 'g', '▁', 'F', 'a', 'c', 'e', '▁', 'c', 'ou', 'rs', 'e', '.']


In [28]:
#Method 2

from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer, Regex

In [29]:
# Normalizers, listed in the order they are passed to normalizers.Sequence.
normalization_steps = [
    normalizers.Replace("``", '"'),
    normalizers.Replace("''", '"'),
    normalizers.NFKD(),
    normalizers.StripAccents(),
    normalizers.Replace(Regex(" {2,}"), " "),
]
special_tokens = ["<cls>", "<sep>", "<unk>", "<pad>", "<mask>", "<s>", "</s>"]

# Assemble a Unigram tokenizer from its parts.
tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.Sequence(normalization_steps)
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()

# Train it on the corpus with a vocabulary budget of 100 tokens.
trainer = trainers.UnigramTrainer(
    vocab_size=100,
    special_tokens=special_tokens,
    unk_token="<unk>",
)
tokenizer.train_from_iterator(corpus, trainer=trainer)

In [33]:
# Report the resulting vocabulary size and a sample tokenization.
encoding = tokenizer.encode('This is the Hugging Face course.')
print(tokenizer.get_vocab_size(), encoding.tokens, sep="\n")

60
['▁', 'T', 'h', 'i', 's', '▁', 'i', 's', '▁the', '▁H', 'u', 'g', 'g', 'in', 'g', '▁', 'F', 'a', 'c', 'e', '▁', 'c', 'ou', 'rs', 'e', '.']
