In [200]:
from collections import defaultdict, Counter
import re

# Byte Pair Encoding (BPE)

In [201]:
# Define the example corpus
corpus = "low low low low low lower lower newest newest newest newest newest newest widest widest widest"
example_sentence = "lowering the newest wide"

In [202]:
def get_subword_pairs(vocab) -> dict:
    """Get frequency of adjacent subword pairs in the vocabulary."""
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs


def merge_vocab(pair, v_in) -> dict:
    v_out = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out


def byte_pair_encoding(corpus, num_merges) -> tuple[dict, list]:
    # Initialize vocabulary with individual characters
    vocab = {' '.join(word) + ' </w>': count for word, count in Counter(corpus.split()).items()}
    final_vocab = []

    for i in range(num_merges):
        pairs = get_subword_pairs(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        final_vocab.append(best[0] + best[1])
        print(f"Iteration {i + 1}: Merged {best}")

    return vocab, final_vocab

num_merges = 5
merged_vocab, final_bpe_vocab = byte_pair_encoding(corpus, num_merges)
print("\nFinal vocabulary:", final_bpe_vocab)
print("Merged vocabulary:", merged_vocab)

Iteration 1: Merged ('e', 's')
Iteration 2: Merged ('es', 't')
Iteration 3: Merged ('est', '</w>')
Iteration 4: Merged ('l', 'o')
Iteration 5: Merged ('lo', 'w')

Final vocabulary: ['es', 'est', 'est</w>', 'lo', 'low']
Merged vocabulary: {'low </w>': 5, 'low e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}


# Wordpiece Tokenization

In [203]:
def get_subword_pairs(vocab) -> dict:
    """Get frequency of adjacent subword pairs in the vocabulary."""
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs


def merge_vocab(pair, v_in) -> dict:
    """Merge the most frequent pair in the vocabulary."""
    v_out = {}
    bigram = re.escape(" ".join(pair))
    p = re.compile(r"(?<!\S)" + bigram + r"(?!\S)")  # Match full token pairs

    for word in v_in:
        # Merge the pair into a single token
        w_out = p.sub("".join(pair), word)
        v_out[w_out] = v_in[word]

    return v_out


def compute_score(pair, pairs, vocab) -> float:
    """Compute the WordPiece score: P(xy) / (P(x) * P(y))"""
    xy = pairs[pair]  # Frequency of merged token
    x = sum(vocab[word] for word in vocab if pair[0] in word.split())
    y = sum(vocab[word] for word in vocab if pair[1] in word.split())
    return xy / (x * y) if x * y > 0 else 0  # Avoid division by zero


def wordpiece_tokenization(corpus, vocab_size) -> tuple[dict, set]:
    """Train WordPiece tokenization."""
    # Step 1: Initialize vocabulary with characters + [UNK]
    word_freqs = Counter(corpus.split())
    vocab = {" ".join(word) + " </w>": freq for word, freq in word_freqs.items()}
    final_vocab = set(char for word in word_freqs for char in word) | {"[UNK]"}

    while len(final_vocab) < vocab_size:
        pairs = get_subword_pairs(vocab)
        if not pairs:
            break

        # Step 2: Select merge based on WordPiece scoring function
        best_pair = max(pairs, key=lambda p: compute_score(p, pairs, vocab))
        merged_token = best_pair[0] + best_pair[1]

        # Add to vocabulary (use ## if not at word start)
        if best_pair[0] in final_vocab:
            merged_token = "##" + merged_token if not best_pair[0].endswith("</w>") else merged_token
        final_vocab.add(merged_token)

        # Step 3: Merge the vocabulary
        vocab = merge_vocab(best_pair, vocab)
        print(f"Merged: {best_pair} -> {merged_token}")

    return vocab, final_vocab

vocab_size = 20
merged_vocab, final_wp_vocab = wordpiece_tokenization(corpus, vocab_size)

print("\nFinal vocabulary:", final_wp_vocab)
print("Merged vocabulary:", merged_vocab)

Merged: ('i', 'd') -> ##id
Merged: ('l', 'o') -> ##lo
Merged: ('s', 't') -> ##st
Merged: ('e', 'r') -> ##er
Merged: ('n', 'e') -> ##ne
Merged: ('e', 'st') -> ##est
Merged: ('id', 'est') -> idest
Merged: ('lo', 'w') -> low
Merged: ('low', 'er') -> ##lower

Final vocabulary: {'##est', 'low', 's', 'n', 't', 'o', '##lo', 'idest', '##er', 'r', 'i', '##ne', 'e', '[UNK]', '##lower', '##st', '##id', 'd', 'w', 'l'}
Merged vocabulary: {'low </w>': 5, 'lower </w>': 2, 'ne w est </w>': 6, 'w idest </w>': 3}


### Tokenize sequence with vocabulary

In [204]:
def tokenize_word_for_bpe(word, start_char_index, vocab) -> tuple[str, int]:
    start_char = word[start_char_index]
    end_char_index = None
    chars = ""
    token = None
    index = start_char_index
    for char in word[start_char_index:]:
        index += 1
        chars += char
        if chars in vocab or chars == "</w>":
            token = chars
            end_char_index = index
    if token is None:
        return start_char, start_char_index + 1
    else:
        return token, end_char_index


def apply_bpe_tokenizer(sequence, vocab) -> list:
    encoded = []
    words = [word + "</w>" for word in sequence.split()]

    for word in words:
        word_char_index = 0
        while word_char_index is not len(word):
            token, word_char_index = tokenize_word_for_bpe(word, word_char_index, vocab)
            encoded.append(token)

    return encoded

In [205]:
def tokenize_word_for_wp(word, start_char_index, vocab) -> tuple[str, int]:
    start_char = word[start_char_index]
    end_char_index = None
    chars = "##"
    token = None
    index = start_char_index
    for char in word[start_char_index:]:
        index += 1
        chars += char
        if chars in vocab or chars == "</w>":
            token = chars
            end_char_index = index
    if token is None:
        return start_char, start_char_index + 1
    else:
        return token, end_char_index

def apply_wp_tokenizer(sequence, vocab) -> list:
    encoded = []
    words = [word for word in sequence.split()]
    for word in words:
            word_char_index = 0
            while word_char_index is not len(word):
                token, word_char_index = tokenize_word_for_wp(word, word_char_index, vocab)
                encoded.append(token)
    return encoded

# Other implemplementations

### BPE

In [206]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Initialize BPE tokenizer
bpw_tokenizer = Tokenizer(BPE())

# Use whitespace to split words
bpw_tokenizer.pre_tokenizer = Whitespace()

# Train on a corpus
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
bpw_tokenizer.train_from_iterator([corpus], trainer=trainer)

# Show the final vocab
vocab = bpw_tokenizer.get_vocab()
print("Final vocab:", vocab)

# Tokenize a sentence
tokenized = bpw_tokenizer.encode(example_sentence)
print(f"\nTokenization of '{example_sentence}': {tokenized.tokens}")




Final vocab: {'[SEP]': 2, 'd': 5, 'n': 9, 'new': 20, 'est': 16, '[CLS]': 1, 'r': 11, 'e': 6, 'newest': 21, 'l': 8, 'es': 15, 'idest': 23, 't': 13, 'widest': 24, '[PAD]': 3, 'ew': 19, 'lo': 17, 'lower': 26, 'dest': 22, 'low': 18, 'w': 14, '[MASK]': 4, '[UNK]': 0, 's': 12, 'o': 10, 'er': 25, 'i': 7}

Tokenization of 'lowering the newest wide': ['lower', 'i', 'n', 't', 'e', 'newest', 'w', 'i', 'd', 'e']


### WordPiece

In [207]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

# Initialize WordPiece tokenizer
wp_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

wp_tokenizer.pre_tokenizer = Whitespace()

# Train on a corpus
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
wp_tokenizer.train_from_iterator([corpus], trainer=trainer)

# Show the final vocab
vocab = wp_tokenizer.get_vocab()
print("Final vocab:", vocab)

# Tokenize a sentence
tokenized = wp_tokenizer.encode(example_sentence)
print(f"\nTokenization of '{example_sentence}': {tokenized.tokens}")




Final vocab: {'[CLS]': 1, '##d': 22, 'i': 7, '##e': 15, '[UNK]': 0, '##west': 28, 'e': 6, '##dest': 31, 'n': 9, '##s': 17, '##i': 21, 'newest': 29, 't': 13, '[MASK]': 4, 'lower': 34, '##est': 24, '[SEP]': 2, 'wi': 30, 'o': 10, 'w': 14, 'r': 11, '##o': 19, 'l': 8, '##r': 20, 'low': 26, 'widest': 32, '##t': 18, '##w': 16, '##es': 23, 'ne': 27, 'd': 5, 'lo': 25, '##er': 33, '[PAD]': 3, 's': 12}

Tokenization of 'lowering the newest wide': ['[UNK]', '[UNK]', 'newest', 'wi', '##d', '##e']


# Hugging Face Implementations
BertTokenizer, RobertaTokenizer

In [208]:
from transformers import BertTokenizer, RobertaTokenizer

# Load pre-trained tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize using BERT tokenizer
bert_tokens = bert_tokenizer.tokenize(example_sentence)

# Tokenize using RoBERTa tokenizer
roberta_tokens = roberta_tokenizer.tokenize(example_sentence)

print("BERT Tokens:", bert_tokens)
print("RoBERTa Tokens:", roberta_tokens)

BERT Tokens: ['lowering', 'the', 'newest', 'wide']
RoBERTa Tokens: ['lower', 'ing', 'Ġthe', 'Ġnewest', 'Ġwide']


# Compare the tokenization using different tokenizers

In [209]:
print(f"Original Sentence: {example_sentence}")

custom_bpe_tokens = apply_bpe_tokenizer(example_sentence, final_bpe_vocab)
custom_wordpiece_tokens = apply_wp_tokenizer(example_sentence, final_wp_vocab)

lib_bpe_tokenized = bpw_tokenizer.encode(example_sentence)
lib_wp_tokenized = wp_tokenizer.encode(example_sentence)

bert_tokens = bert_tokenizer.tokenize(example_sentence)
roberta_tokens = roberta_tokenizer.tokenize(example_sentence)

print("\nCustom BPE Tokens:", custom_bpe_tokens)
print("Library BPE Tokens:", lib_bpe_tokenized.tokens)

print("\nCustom WordPiece Tokens:", custom_wordpiece_tokens)
print("Library WordPiece Tokens:", lib_wp_tokenized.tokens)
print("\nBERT Tokens:", bert_tokens)
print("RoBERTa Tokens:", roberta_tokens)

Original Sentence: lowering the newest wide

Custom BPE Tokens: ['low', 'e', 'r', 'i', 'n', 'g', '</w>', 't', 'h', 'e', '</w>', 'n', 'e', 'w', 'est</w>', 'w', 'i', 'd', 'e', '</w>']
Library BPE Tokens: ['lower', 'i', 'n', 't', 'e', 'newest', 'w', 'i', 'd', 'e']

Custom WordPiece Tokens: ['##lower', 'i', 'n', 'g', 't', 'h', 'e', '##ne', 'w', '##est', 'w', '##id', 'e']
Library WordPiece Tokens: ['[UNK]', '[UNK]', 'newest', 'wi', '##d', '##e']

BERT Tokens: ['lowering', 'the', 'newest', 'wide']
RoBERTa Tokens: ['lower', 'ing', 'Ġthe', 'Ġnewest', 'Ġwide']
