In [6]:
from collections import defaultdict, Counter
import re

# Byte Pair Encoding (BPE)

In [7]:
def get_stats(vocab):
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out

def byte_pair_encoding(corpus, num_merges):
    # Initialize vocabulary with individual characters
    vocab = {' '.join(word) + ' </w>': count for word, count in Counter(corpus.split()).items()}

    for i in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
        print(f"Iteration {i+1}: Merged {best}")

    return vocab

# Example usage
corpus = "low lower lowest"
num_merges = 10
vocab = byte_pair_encoding(corpus, num_merges)
print("Final vocabulary:", vocab)

Iteration 1: Merged ('l', 'o')
Iteration 2: Merged ('lo', 'w')
Iteration 3: Merged ('low', 'e')
Iteration 4: Merged ('low', '</w>')
Iteration 5: Merged ('lowe', 'r')
Iteration 6: Merged ('lower', '</w>')
Iteration 7: Merged ('lowe', 's')
Iteration 8: Merged ('lowes', 't')
Iteration 9: Merged ('lowest', '</w>')
Final vocabulary: {'low</w>': 1, 'lower</w>': 1, 'lowest</w>': 1}


In [8]:
# Define the example corpus
corpus = "low lower lowest"

# Wordpiece Tokenization

In [9]:
from collections import defaultdict, Counter
import re

def get_pairs(vocab):
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out

def wordpiece_tokenization(corpus, vocab_size):
    # Initialize vocabulary with individual characters
    vocab = {' '.join(word) + ' </w>': count for word, count in Counter(corpus.split()).items()}

    while len(vocab) < vocab_size:
        pairs = get_pairs(vocab)
        if not pairs:
            break
        best_pair = max(pairs, key=pairs.get)
        vocab = merge_vocab(best_pair, vocab)
        print(f"Merged {best_pair}")

    return vocab

# Example usage
vocab_size = 20
vocab = wordpiece_tokenization(corpus, vocab_size)
print("Final vocabulary:", vocab)


Merged ('l', 'o')
Merged ('lo', 'w')
Merged ('low', 'e')
Merged ('low', '</w>')
Merged ('lowe', 'r')
Merged ('lower', '</w>')
Merged ('lowe', 's')
Merged ('lowes', 't')
Merged ('lowest', '</w>')
Final vocabulary: {'low</w>': 1, 'lower</w>': 1, 'lowest</w>': 1}


In [10]:
from transformers import BertTokenizer, RobertaTokenizer

# Load pre-trained tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


# Tokenize using BERT tokenizer
bert_tokens = bert_tokenizer.tokenize(corpus)
bert_ids = bert_tokenizer.convert_tokens_to_ids(bert_tokens)

# Tokenize using RoBERTa tokenizer
roberta_tokens = roberta_tokenizer.tokenize(corpus)
roberta_ids = roberta_tokenizer.convert_tokens_to_ids(roberta_tokens)

print("BERT Tokens:", bert_tokens)
print("BERT Token IDs:", bert_ids)
print("RoBERTa Tokens:", roberta_tokens)
print("RoBERTa Token IDs:", roberta_ids)

BERT Tokens: ['low', 'lower', 'lowest']
BERT Token IDs: [2659, 2896, 7290]
RoBERTa Tokens: ['low', 'Ġlower', 'Ġlowest']
RoBERTa Token IDs: [5481, 795, 3912]
