# Naive BPE implementation

In [8]:
import regex as re

# A humble corpus
texts = [
    "low low low low low",
    "lower lower widest widest widest",
    "newest newest newest newest newest newest"
]


In [9]:
vocabulary = ["<EOS>"] + [chr(k) for k in range(ord('a'), ord('z')+1)]

print(" - ".join(vocabulary))

<EOS> - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z


## Pretokenize texts


In [21]:
# Gpt-2 pre-tokenization regex
PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

def pre_tokenize(text: str) -> list[str]:
    return re.findall(PAT, text)

def whitespace_pretokenize(text: str) -> list[str]:
    return text.split()

In [22]:
pre_tokenize("hola mundo...")

['hola', ' mundo', '...']

In [23]:
pre_tokenize("hello! こんにちは!")

['hello', '!', ' こんにちは', '!']

In [35]:
list(word.encode("utf-8"))

[110, 101, 119, 101, 115, 116]

In [32]:
from collections import defaultdict
frequency_table: defaultdict[bytes, int] = defaultdict(int)

for text in texts:
    words = whitespace_pretokenize(text)

    for word in words:
        frequency_table[tuple(word.encode("utf-8"))] += 1

frequency_table


defaultdict(int,
            {(108, 111, 119): 5,
             (108, 111, 119, 101, 114): 2,
             (119, 105, 100, 101, 115, 116): 3,
             (110, 101, 119, 101, 115, 116): 6})

In [26]:
b'low'[0:2]

b'lo'

In [31]:
# Get all merges

pair_count = defaultdict(int)

current_max_value = None
current_max_count = None
for word in frequency_table:
    for i in range(len(word)-1):
        pair = word[i:i+2]
        count = pair_count[pair] + 1
        pair_count[pair] = count

        if current_max_count is None:
            current_max_value = pair
            current_max_count = pair_count[pair]
        elif count > current_max_count or (current_max_count == count and pair < current_max_value):
            current_max_value = pair
            current_max_count = count

current_max_value, current_max_count

(b'es', 2)