In [1]:
import tiktoken
import regex # using regex, not re

Create a tiktoken `Encoding` from my own "hardcoded" (not learned) tokens, then try encoding and decoding with it.

In [2]:
# Pre-tokenization pattern taken from
# https://github.com/karpathy/nanochat/blob/master/nanochat/tokenizer.py
# Written here one alternative per line; joining the pieces with "|" yields
# exactly the same pattern string as the upstream one-liner.
SPLIT_PATTERN = "|".join([
    r"'(?i:[sdmt]|ll|ve|re)",       # apostrophe + s/d/m/t/ll/ve/re, case-insensitive
    r"[^\r\n\p{L}\p{N}]?+\p{L}+",   # run of letters, optionally led by ONE char that is not a letter, digit, \r or \n
    r"\p{N}{1,2}",                  # digits, at most two per chunk
    r" ?[^\s\p{L}\p{N}]++[\r\n]*",  # optional space, then a run of chars that are not whitespace/letter/digit, then any \r or \n
    r"\s*[\r\n]",                   # whitespace ending in \r or \n
    r"\s+(?!\S)",                   # whitespace that is not followed by a non-whitespace char
    r"\s+",                         # any remaining whitespace
])

In [3]:
# Try the splitter on a sentence that mixes words, numbers, a symbol and CJK text
# ("真的" consists of letters, so it is matched by the \p{L}+ alternative).
regex.findall(SPLIT_PATTERN, "The cat is 12 + 231 真的.")

['The', ' cat', ' is', ' ', '12', ' +', ' ', '23', '1', ' 真的', '.']

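Why does the leading space get included in the words? Does this mean that when a tokenizer is trained we end up with lots of multi-byte tokens that start with a space? It comes from the `[^\r\n\p{L}\p{N}]?+\p{L}+` alternative in the pattern: one optional character that is not a letter, digit or newline (in practice usually a space) is attached to the run of letters that follows it. The chunks handed to BPE therefore look like `' cat'`, so merges learned from them will typically produce many tokens that begin with a space.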

In [4]:
# Hand-written toy vocabulary: each token's rank is simply its position in the list.
mergeable_ranks = dict(
    (token.encode('utf-8'), rank)
    for rank, token in enumerate(['a', 'b', 'c', 'd', 'e', 'f', '.', ' ', 'ab'])
)
mergeable_ranks

{b'a': 0,
 b'b': 1,
 b'c': 2,
 b'd': 3,
 b'e': 4,
 b'f': 5,
 b'.': 6,
 b' ': 7,
 b'ab': 8}

In [5]:
# Build the toy encoding from the hand-written ranks.
# The ranks run 0..len(mergeable_ranks)-1, so '<bos>' takes the next free id,
# len(mergeable_ranks) (9 for the nine-entry vocabulary). Deriving it instead of
# typing the number keeps the special token from colliding with a rank if the
# vocabulary changes or this cell is re-run after `mergeable_ranks` is rebound.
enc = tiktoken.Encoding(
    name="my-toy-encoding",
    pat_str=SPLIT_PATTERN,
    mergeable_ranks=mergeable_ranks,
    special_tokens={'<bos>': len(mergeable_ranks)})

In [6]:
# Expect 'a' -> 0, ' ' -> 7, 'c' -> 2, and the trailing "ab" as the single merged token 8.
enc.encode_ordinary("a cab")

[0, 7, 2, 8]

In [7]:
# Round-trip check: decoding the ids from the encode cell should give back "a cab".
enc.decode([0, 7, 2, 8])

'a cab'

In [8]:
# Same ids with 9 prepended: 9 is the id given to the '<bos>' special token when
# `enc` was built, so it decodes to the literal text '<bos>'.
enc.decode([9, 0, 7, 2, 8])

'<bos>a cab'

Now try a vocabulary that has all 256 single bytes plus a few merged tokens ('ab', 'bo', '12', '23', '123').

In [9]:
# Byte-level vocabulary: every single byte is its own token with rank == byte value,
# followed by a handful of hand-picked merged tokens numbered from 256.
mergeable_ranks = {bytes([byte_value]): byte_value for byte_value in range(256)}
mergeable_ranks.update(
    (merged, rank)
    for rank, merged in enumerate([b'ab', b'bo', b'12', b'23', b'123'], start=256)
)
# Peek at three windows of the table: the first entries, the ones at b'A', and the tail with the merges.
tuple(
    list(mergeable_ranks.items())[window]
    for window in (slice(None, 3), slice(65, 68), slice(254, None))
)

([(b'\x00', 0), (b'\x01', 1), (b'\x02', 2)],
 [(b'A', 65), (b'B', 66), (b'C', 67)],
 [(b'\xfe', 254),
  (b'\xff', 255),
  (b'ab', 256),
  (b'bo', 257),
  (b'12', 258),
  (b'23', 259),
  (b'123', 260)])

In [10]:
# Rebuild `enc` on top of the byte-level vocabulary.
# The ranks run 0..len(mergeable_ranks)-1, so '<bos>' takes the next free id,
# len(mergeable_ranks) (261 for 256 bytes + 5 merges). Deriving it avoids a
# hand-typed id that would silently collide if another merged token were added.
enc = tiktoken.Encoding(
    name="my-toy-encoding",
    pat_str=SPLIT_PATTERN,
    mergeable_ranks=mergeable_ranks,
    special_tokens={'<bos>': len(mergeable_ranks)})

In [11]:
# With the byte-level vocabulary the ids are raw byte values (97='a', 32=' ', 99='c'),
# plus 256 for the merged token 'ab'.
enc.encode_ordinary("a cab")

[97, 32, 99, 256]

In [12]:
# Round-trip check: decoding the ids from the encode cell should give back "a cab".
enc.decode([97, 32, 99, 256])

'a cab'

In [13]:
# Non-ASCII input: no merged token covers the emoji or the CJK characters, so each
# of them comes out as its individual UTF-8 bytes; only "ab" is merged (256).
enc.encode_ordinary("👋 你好 cab!")

[240, 159, 145, 139, 32, 228, 189, 160, 229, 165, 189, 32, 99, 256, 33]

In [14]:
# Round-trip check for the multi-byte text: these ids are the output of the encode
# cell, so decoding them should reproduce the original string.
enc.decode([240, 159, 145, 139, 32, 228, 189, 160, 229, 165, 189, 32, 99, 256, 33])

'👋 你好 cab!'

In [15]:
# UTF-8 bytes of the waving-hand emoji, to compare with the first four token ids
# produced when encoding the emoji sentence.
'👋'.encode('utf-8')

b'\xf0\x9f\x91\x8b'

In [16]:
# Iterating over a bytes object yields ints, so this shows the four emoji bytes
# in decimal: the same numbers as the first four token ids.
tuple(b'\xf0\x9f\x91\x8b')

(240, 159, 145, 139)

In [17]:
# The stored token b'123' (rank 260) is never produced here: SPLIT_PATTERN's
# `\p{N}{1,2}` alternative cuts digit runs into chunks of at most two digits, so
# "123" is pre-split into "12" and "3" before any merging -> [258, 51].
enc.encode_ordinary("123")

[258, 51]

In [18]:
# Result is 'ab' (256) + 'o', 'v', 'e', not 'a' + 'bo' (257): the numerically
# larger rank does not win.
# NOTE(review): this is consistent with BPE applying the lowest-rank merge first
# ('ab' before 'bo', after which the 'b' is no longer available). Confirm against
# the tiktoken documentation.
enc.encode_ordinary("above")

[256, 111, 118, 101]

In [19]:
# A single byte is its own token, with id equal to the byte value ('a' -> 97).
enc.encode_single_token('a')

97

In [20]:
# 'ab' is in the vocabulary as a merged token, so it maps to a single id (256).
enc.encode_single_token('ab')

256

In [21]:
# 'abc' is not a single token in this vocabulary, so the lookup raises KeyError.
# The error is the point of this cell: catch it and print it, so that
# Restart & Run All can continue past this cell instead of stopping here.
try:
    print(enc.encode_single_token('abc'))
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: b'abc'

In [22]:
# Special tokens work here too: '<bos>' maps to the id it was given when `enc` was built (261).
enc.encode_single_token('<bos>')

261