In [1]:
# to construct byte-level base tokenizer
from magicab.etoken import TokenTrie 
from magicab import ETokenizer 

# Build the tokenizer in byte mode; `tok` is the object every later cell inspects.
tok = ETokenizer(mode="byte")

# Keep a handle on the tokenizer's byte vocabulary (indexing it with 1 displays b'\x01' further down).
byte_vocab = tok.byte_vocab
# TokenTrie(byte_vocab=byte_vocab)

In [2]:
# Save the tokenizer's token trie to test.json, passing the tokenizer's mode along with it.
tok.token_trie.save(path="test.json", mode=tok.mode)

In [None]:
from magicab.etoken import TokenTrie 

# Load a trie from test.json, the same path the save call in this notebook writes to.
token_trie = TokenTrie.load("test.json") 


In [9]:
# tok.encode("I am super duper") # bug 

from magicab.etoken import encode_bytes
# Alias the tokenizer as `self`; later cells read self.special_tokens, self.byte2idx, etc.
# (presumably so code copied from the tokenizer's methods runs unchanged -- confirm).
self = tok
text = "I am super duper"

# encoding 
# Three encode entry points are called in turn. Each assignment overwrites `ids`,
# so only the result of tok.encode(text) reaches the decode call below.
ids = encode_bytes(text, self.special_tokens, self.special2idx, self.byte2idx)
ids = tok.encode_id(text)
ids = tok.encode(text)

# decoding 
# Last expression of the cell, so its value is displayed; the recorded output equals `text`.
tok.decode(ids)


'I am super duper'

In [21]:
# Look up entry 1 of the byte vocabulary and display it.
b = tok.byte_vocab[1]
b

b'\x01'

In [22]:
# Interpret the bytes object `b` as a big-endian unsigned integer.
int_value = int.from_bytes(b, byteorder='big')

In [23]:
# Display the integer obtained from the bytes object.
int_value

1

In [35]:
# Display the trie's id -> token mapping.
tok.token_trie.id2token

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [33]:
# Bare expression that is not the last statement of the cell, so nothing is displayed for it.
tok.byte_vocab

import json 

def save_byte_vocab(byte_vocab: dict, save_dir: str): 
    """Write a byte vocabulary to ``<save_dir>/byte_vocab.json``.

    JSON cannot represent ``bytes``, so every single-byte value is stored as
    its integer byte value (0-255). JSON also stores object keys as strings,
    so non-string keys are stringified in the file.

    Args:
        byte_vocab: Mapping whose values are ``bytes`` objects of length one.
        save_dir: Existing directory that receives ``byte_vocab.json``.

    Raises:
        KeyError: If a value is not one of the 256 single-byte ``bytes`` objects.
    """
    byte_to_int = {bytes([i]): i for i in range(256)}
    # Serialise the mapping that was passed in rather than a notebook-global
    # tokenizer, so the function works for any vocabulary and in a fresh kernel.
    converted_vocab = {k: byte_to_int[v] for k, v in byte_vocab.items()}
    with open(save_dir + "/byte_vocab.json", "w") as f: 
        json.dump(converted_vocab, f)

def load_byte_vocab(save_dir: str): 
    """Read ``<save_dir>/byte_vocab.json`` and rebuild the byte vocabulary.

    Each stored integer (0-255) is turned back into a single-byte ``bytes``
    object. JSON object keys always come back as strings, so keys that are the
    canonical decimal form of an integer (what ``json.dump`` writes for an
    ``int`` key) are converted back to ``int``; any other key is left as is.

    Args:
        save_dir: Directory containing ``byte_vocab.json``.

    Returns:
        dict mapping each key to a ``bytes`` object of length one.

    Raises:
        ValueError: If a stored value is outside ``range(256)``.
    """
    with open(save_dir + "/byte_vocab.json", "r") as f: 
        raw_vocab = json.load(f)

    def restore_key(key):
        # Only undo the int -> str coercion done by JSON; keys such as "pad"
        # or "01" were not produced from an int and stay untouched.
        try:
            as_int = int(key)
        except (TypeError, ValueError):
            return key
        return as_int if str(as_int) == key else key

    byte_vocab = {restore_key(k): bytes([v]) for k, v in raw_vocab.items()}
    return byte_vocab


# Round-trip the tokenizer's byte vocabulary through byte_vocab.json in the current directory.
save_dir = "."
save_byte_vocab(tok.byte_vocab, save_dir)
# Rebinds the notebook-level name `byte_vocab` to the mapping read back from disk.
byte_vocab = load_byte_vocab(save_dir)



In [13]:
# Display the trie's id -> token mapping; the recorded output holds only the four special tokens.
self.token_trie.id2token # missing byte elements --> initialization wrong

{0: '<|endoftext|>', 1: '<pad>', 2: '<USER> ', 3: '<ASSISTANT> '}

In [5]:
# Manual walk-through of byte-level encoding with special-token handling.
# Reads `text` and `self` (the tokenizer alias) from earlier cells.
# -------- input -------------
import re 
special_tokens = self.special_tokens
special2idx = self.special2idx
byte2idx = self.byte2idx  # displayed in this notebook with single-byte bytes keys
# -------- end ----------------

# Iterating over a bytes object yields ints, while byte2idx (as displayed in this
# notebook) is keyed by single-byte bytes objects, so a direct byte2idx[b] lookup
# raises KeyError. Re-key the table by integer byte value; keys that are already
# ints (or are not single bytes) pass through unchanged.
byte_value2idx = {
    (key[0] if isinstance(key, bytes) and len(key) == 1 else key): idx
    for key, idx in byte2idx.items()
}

# The capture group makes re.split keep the matched special tokens as segments.
pattern = f"({'|'.join(re.escape(token) for token in special_tokens)})"
segments = re.split(pattern, text)
ids = []
for seg in segments:
    if not seg:  # Skip empty segments
        continue
    if seg in special2idx:  # Handle special tokens
        ids.append(special2idx[seg])
    else:  # Handle regular text as bytes
        # Convert text to UTF-8 bytes and map each byte value to its token id
        byte_data = seg.encode('utf-8')
        ids.extend(byte_value2idx[b] for b in byte_data)

In [5]:
# Display the tokenizer's byte -> id table; its keys are bytes objects in the recorded output.
self.byte2idx

{b'\x00': 0,
 b'\x01': 1,
 b'\x02': 2,
 b'\x03': 3,
 b'\x04': 4,
 b'\x05': 5,
 b'\x06': 6,
 b'\x07': 7,
 b'\x08': 8,
 b'\t': 9,
 b'\n': 10,
 b'\x0b': 11,
 b'\x0c': 12,
 b'\r': 13,
 b'\x0e': 14,
 b'\x0f': 15,
 b'\x10': 16,
 b'\x11': 17,
 b'\x12': 18,
 b'\x13': 19,
 b'\x14': 20,
 b'\x15': 21,
 b'\x16': 22,
 b'\x17': 23,
 b'\x18': 24,
 b'\x19': 25,
 b'\x1a': 26,
 b'\x1b': 27,
 b'\x1c': 28,
 b'\x1d': 29,
 b'\x1e': 30,
 b'\x1f': 31,
 b' ': 32,
 b'!': 33,
 b'"': 34,
 b'#': 35,
 b'$': 36,
 b'%': 37,
 b'&': 38,
 b"'": 39,
 b'(': 40,
 b')': 41,
 b'*': 42,
 b'+': 43,
 b',': 44,
 b'-': 45,
 b'.': 46,
 b'/': 47,
 b'0': 48,
 b'1': 49,
 b'2': 50,
 b'3': 51,
 b'4': 52,
 b'5': 53,
 b'6': 54,
 b'7': 55,
 b'8': 56,
 b'9': 57,
 b':': 58,
 b';': 59,
 b'<': 60,
 b'=': 61,
 b'>': 62,
 b'?': 63,
 b'@': 64,
 b'A': 65,
 b'B': 66,
 b'C': 67,
 b'D': 68,
 b'E': 69,
 b'F': 70,
 b'G': 71,
 b'H': 72,
 b'I': 73,
 b'J': 74,
 b'K': 75,
 b'L': 76,
 b'M': 77,
 b'N': 78,
 b'O': 79,
 b'P': 80,
 b'Q': 81,
 b'R': 82,
 b'S': 

In [16]:
# Call the tokenizer's private trie initialiser with the notebook's `byte_vocab`;
# the value it returns is displayed (an id -> bytes mapping in the recorded output).
tok._init_token_trie(byte_vocab=byte_vocab)

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [6]:
# Characters versus UTF-8 bytes: one character may occupy several bytes.
# (ASCII-only alternative: text = "I am super duper")
text = "I am super duper 😊 áéíóú 你好"
byte_list = text.encode("utf-8")  # reused by later cells
print("Characters: ", list(text))
print("Bytes (utf-8 encoding): ", list(byte_list))
print("Text (utf-8 decoding): ", list(byte_list.decode("utf-8")))


Characters:  ['I', ' ', 'a', 'm', ' ', 's', 'u', 'p', 'e', 'r', ' ', 'd', 'u', 'p', 'e', 'r', ' ', '😊', ' ', 'á', 'é', 'í', 'ó', 'ú', ' ', '你', '好']
Bytes (utf-8 encoding):  [73, 32, 97, 109, 32, 115, 117, 112, 101, 114, 32, 100, 117, 112, 101, 114, 32, 240, 159, 152, 138, 32, 195, 161, 195, 169, 195, 173, 195, 179, 195, 186, 32, 228, 189, 160, 229, 165, 189]
Text (utf-8 decoding):  ['I', ' ', 'a', 'm', ' ', 's', 'u', 'p', 'e', 'r', ' ', 'd', 'u', 'p', 'e', 'r', ' ', '😊', ' ', 'á', 'é', 'í', 'ó', 'ú', ' ', '你', '好']


In [2]:
# Experiment: does the `encoding` argument of open() change what gets saved?
# The same list of byte values is dumped twice, once with an explicit utf-8
# encoding and once with the platform default.
import json 
utf_file = "utf_file.json"
char_file = "char_file.json"
data = [byte_value for byte_value in byte_list]
for target, open_kwargs in ((utf_file, {"encoding": "utf-8"}), (char_file, {})):
    with open(target, 'w', **open_kwargs) as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [4]:
# Write `data` to utf_file again, this time without an explicit encoding.
# NOTE(review): the file is never read back in this cell, so the string shown
# below comes from the in-memory list only -- confirm whether a read with
# json.load was intended.
with open(utf_file, 'w') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)
# Treat the integer list as raw bytes and decode them as UTF-8 (displayed as the cell result).
bytes(data).decode("utf-8")

'I am super duper 😊 áéíóú 你好'

In [5]:
# Write `data` to char_file again using the default encoding.
# NOTE(review): the file is never read back in this cell, so the string shown
# below comes from the in-memory list only -- confirm whether a read with
# json.load was intended.
with open(char_file, 'w') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)
# Treat the integer list as raw bytes and decode them as UTF-8 (displayed as the cell result).
bytes(data).decode("utf-8")

'I am super duper 😊 áéíóú 你好'