In [None]:
s = "नमस्ते 👋 (hello in hindi!)"
s

In [None]:
# ord() returns the Unicode code point for a character
[ord(c) for c in s]

In [None]:
# encode() returns the encoded bytes of the string
# utf-8 is by far the most common text encoding on the web; of the Unicode encodings (utf-8, utf-16, utf-32) it is the only one that is backwards compatible with ASCII
s.encode('utf-8')

In [None]:
# variable length (1-4 bytes) utf-8 encodings, here each integer in the list is a byte (decimal representation) in the utf-8 encoding
for c in s:
    print(c, list(c.encode('utf-8')))

In [None]:
# taken from https://www.reedbeta.com/blog/programmers-intro-to-unicode/
text = "Ｕｎｉｃｏｄｅ! 🅤🅝🅘🅒🅞🅓🅔‽ 🇺‌🇳‌🇮‌🇨‌🇴‌🇩‌🇪! 😄 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to “support Unicode” in our software (whatever that means—like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don’t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode’s inception."
# utf-8 encode the text and keep every raw byte as an integer in 0..255
tokens = list(text.encode('utf-8'))
# one item per output line: the text and its length in code points,
# then the byte list and its length in bytes
print(
    '----',
    text,
    len(text),
    '----',
    tokens,
    len(tokens),
    '----',
    sep='\n',
)

In [None]:
# find the frequency of each byte pair
def get_freqs(ids):
    """Count how often each pair of adjacent elements occurs in `ids`.

    Args:
        ids: a sliceable sequence of token ids (e.g. a list of ints or a
            bytes object).

    Returns:
        A dict mapping each (left, right) pair to its number of occurrences.
        Keys are inserted in order of first occurrence.
    """
    counts = {}
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
    return counts
stats = get_freqs(tokens)
print(stats)


In [None]:
print(sorted(((v,k) for k,v in stats.items()), reverse=True))

In [None]:
chr(101), chr(32)

In [None]:

# max() finds the key in stats dict that has the highest value
# key=stats.get tells max() to compare keys by their corresponding values in the dict
top_pair = max(stats, key=stats.get)
top_pair

In [None]:
def merge(ids, pair, idx):
    """Replace every non-overlapping occurrence of `pair` in `ids` with `idx`.

    The scan runs left to right, so in a run such as [a, a, a] with pair
    (a, a) the first two elements are merged and the third is kept as is.

    Args:
        ids: indexable sequence of token ids (a list, tuple or bytes object).
        pair: the two consecutive token ids to look for.
        idx: the token id that replaces each occurrence of `pair`.

    Returns:
        A new list of token ids; `ids` itself is not modified.
    """
    first, second = pair
    new_ids = []
    n = len(ids)
    i = 0
    while i < n:
        # Compare element-wise rather than `ids[i:i+2] == list(pair)`: a slice
        # of a bytes or tuple input never compares equal to a list, so merges
        # on such inputs were silently skipped.
        if i + 1 < n and ids[i] == first and ids[i + 1] == second:
            new_ids.append(idx)
            i += 2
        else:
            new_ids.append(ids[i])
            i += 1
    return new_ids

#print(merge([5,6,7,6,5,4,3,2,1,0], (6,7), 99))
tokens2 = merge(tokens, top_pair, 256)
print(tokens2)
print("length:", len(tokens2))

In [None]:
# scrape all the text from the link https://www.reedbeta.com/blog/programmers-intro-to-unicode/
from pathlib import Path

import requests
from bs4 import BeautifulSoup

url = "https://www.reedbeta.com/blog/programmers-intro-to-unicode/"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from tokenizing an HTTP error page by mistake.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Drop non-content elements (scripts, styles) before extracting text
for script in soup(["script", "style"]):
    script.decompose()

# Get all remaining text on the page
text = soup.get_text()

# Clean up the text - strip each line, split on double spaces, drop empty pieces
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
text = ' '.join(chunk for chunk in chunks if chunk)

print(f"Scraped text length: {len(text)} characters")
print(f"First 500 characters:\n{text[:500]}...")

# save text to a file
# The page contains emoji and other non-ASCII characters, so write explicitly
# as utf-8 (the platform default encoding may not be able to represent them),
# and make sure the target directory exists first.
out_path = Path('data/text.txt')
out_path.parent.mkdir(parents=True, exist_ok=True)
with open(out_path, 'w', encoding='utf-8') as f:
    f.write(text)

In [None]:
tokens = text.encode('utf-8')
tokens = list(map(int, tokens)) 
print("Number of tokens:", len(tokens))

In [None]:
# find the frequency of each byte pair
def get_freqs(ids):
    """Count how often each pair of adjacent elements occurs in `ids`.

    NOTE(review): this re-defines the `get_freqs` helper from an earlier
    cell; consider keeping a single definition.

    Args:
        ids: a sliceable sequence of token ids (e.g. a list of ints or a
            bytes object).

    Returns:
        A dict mapping each (left, right) pair to its number of occurrences.
        Keys are inserted in order of first occurrence.
    """
    counts = {}
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
    return counts

def merge(ids, pair, idx):
    """Replace every non-overlapping occurrence of `pair` in `ids` with `idx`.

    NOTE(review): this re-defines the `merge` helper from an earlier cell;
    being the later definition, it is the one that cells further down
    (such as `encode`) end up calling. Consider keeping a single definition.

    The scan runs left to right, so in a run such as [a, a, a] with pair
    (a, a) the first two elements are merged and the third is kept as is.

    Args:
        ids: indexable sequence of token ids (a list, tuple or bytes object).
        pair: the two consecutive token ids to look for.
        idx: the token id that replaces each occurrence of `pair`.

    Returns:
        A new list of token ids; `ids` itself is not modified.
    """
    first, second = pair
    new_ids = []
    n = len(ids)
    i = 0
    while i < n:
        # Compare element-wise rather than `ids[i:i+2] == list(pair)`: a slice
        # of a bytes or tuple input never compares equal to a list, so merges
        # on such inputs were silently skipped.
        if i + 1 < n and ids[i] == first and ids[i + 1] == second:
            new_ids.append(idx)
            i += 2
        else:
            new_ids.append(ids[i])
            i += 1
    return new_ids

########################################################
vocab_size = 276  # target vocabulary size: 256 raw byte values + learned merges
num_merges = vocab_size - 256
ids = list(tokens) # make a copy of the tokens
# merges will be our trained tokenizer vocabulary
merges = {} # (int, int) -> int

for i in range(num_merges):
    stats = get_freqs(ids)
    if not stats:
        # fewer than two tokens left: there is no pair to merge, and
        # max() on an empty dict would raise ValueError
        break
    # most frequent pair; on ties max() returns the first key in dict order
    pair = max(stats, key=stats.get)
    idx = 256 + i  # new token ids start right after the 256 byte values
    print(f"Merging {pair} at index {idx}")
    ids = merge(ids, pair, idx)
    merges[pair] = idx


In [None]:
# get compression ratio
print(f"tokens length: {len(tokens)}")
print(f"ids length: {len(ids)}")
print(f"Compression ratio: {len(tokens) / len(ids):.2f}x")


Decode the tokens using the trained tokenizer

In [None]:
# bytes([idx]) - creates bytes from a list containing one integer
# whereas bytes(idx) - creates a bytes object of length idx, filled with zeros
# Base vocabulary: each of the 256 possible byte values maps to that single byte.
vocab = {idx: bytes([idx]) for idx in range(256)}
# Each merged token is the concatenation of the byte strings of its two parts.
# This relies on iterating `merges` in insertion order, so that vocab[p0] and
# vocab[p1] already exist when a later merge refers to an earlier merged token.
for (p0, p1), idx in merges.items():
    vocab[idx] = vocab[p0] + vocab[p1]

def decode(ids):
    """Turn a sequence of token ids (integers) back into a Python string.

    Each id is looked up in the global `vocab` table and the resulting byte
    strings are concatenated. Not every byte sequence is valid utf-8 (e.g. a
    lone byte 128), so invalid bytes are decoded as the replacement
    character instead of raising.
    """
    raw_bytes = b"".join(vocab[token_id] for token_id in ids)
    return raw_bytes.decode('utf-8', errors='replace')

print(decode([128]))
print(decode([97, 116]))

In [None]:
# Demonstration of the bytes(idx) pitfall: it builds idx zero bytes, not the byte with value idx.
# Use a separate name so the real `vocab` (which decode() reads) is not overwritten.
zero_filled = {idx: bytes(idx) for idx in range(256)}
# Show only the first few entries; the full dict holds tens of thousands of zero bytes.
print({idx: zero_filled[idx] for idx in range(4)})

Encode the text using the trained tokenizer

In [None]:
merges

In [None]:
def encode(text):
    """Encode a string into a list of token ids using the trained `merges`.

    Args:
        text: the string to tokenize.

    Returns:
        A list of ints: raw utf-8 byte values (0..255) and/or merged token
        ids taken from `merges`. Always a list, including for empty input
        or when no merge applies.
    """
    # Start from the raw utf-8 bytes as a *list* of ints. Returning the bytes
    # object itself for short or unmergeable input would give callers an
    # inconsistent type.
    tokens = list(text.encode('utf-8'))
    while len(tokens) >= 2:
        freqs = get_freqs(tokens)
        # Find the pair with the lowest merge index (earliest merge in training)
        # min() finds the pair with the smallest value returned by the key function
        # lambda p: merges.get(p, float('inf')) returns:
        #   - the merge index if the pair exists in merges (lower index = earlier merge)
        #   - float('inf') if the pair doesn't exist in merges (ensures it won't be selected)
        # This ensures we apply merges in the same order they were learned during training
        pair = min(freqs, key=lambda p: merges.get(p, float('inf')))
        if pair not in merges:
            break  # nothing left that can be merged
        tokens = merge(tokens, pair, merges[pair])
    return tokens

print(encode("hello world"))


In [None]:
text2 = decode(encode(text))
print(text2 == text)

Forced splits using the regex patterns (GPT series)

In [None]:
import regex as re

# GPT-2 split pattern: common contractions, runs of letters, runs of digits,
# runs of other non-space symbols (each optionally preceded by one space), and whitespace
gpt2pat = re.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)
print(gpt2pat.findall("Hello've world123 how's are you!!!?"))