In [1]:
# Sample text mixing ASCII with Japanese so that both single-byte and
# multi-byte UTF-8 sequences appear in the encoded result.
test_string = "hello! こんにちは!"
utf8_encoded = bytes(test_string, encoding="utf-8")
print(utf8_encoded)

b'hello! \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf!'


In [2]:
# The generic "utf-16" codec writes a byte-order mark first (the leading
# \xff\xfe in the recorded output), then two bytes per BMP character.
utf16_encoded = bytes(test_string, encoding="utf-16")
print(utf16_encoded)

b'\xff\xfeh\x00e\x00l\x00l\x00o\x00!\x00 \x00S0\x930k0a0o0!\x00'


In [3]:
# The generic "utf-32" codec also starts with a byte-order mark
# (\xff\xfe\x00\x00 in the recorded output), then four bytes per character.
utf32_encoded = bytes(test_string, encoding="utf-32")
print(utf32_encoded)

b'\xff\xfe\x00\x00h\x00\x00\x00e\x00\x00\x00l\x00\x00\x00l\x00\x00\x00o\x00\x00\x00!\x00\x00\x00 \x00\x00\x00S0\x00\x00\x930\x00\x00k0\x00\x00a0\x00\x00o0\x00\x00!\x00\x00\x00'


In [4]:
# 1. Storage Efficiency

# UTF-8 wins for most content: Especially for ASCII-heavy text (English, code, markup), UTF-8 uses 1 byte per character vs 2 bytes for UTF-16 and 4 bytes for UTF-32
# Web and programming contexts: Most internet content and code is ASCII-heavy, making UTF-8 optimal
# Variable-length advantage: UTF-8 only uses more bytes when needed (2-4 bytes for non-ASCII)

# 2. Vocabulary Size and Range

# UTF-8: Fixed vocabulary of exactly 256 possible byte values (0-255), since each code unit is a single byte
# UTF-16: Potentially 65,536 different 16-bit code units (0-65535)
# UTF-32: Over 1 million possible values, because each 32-bit code unit holds a whole code point (0-1,114,111)

# This is crucial because:

# Smaller vocabulary = more efficient tokenizer training
# Bounded vocabulary size makes model architecture simpler
# Every possible UTF-8 byte sequence can be handled with just 256 base tokens

# 3. Universality and Robustness

# UTF-8 can represent any Unicode text with just byte-level tokens
# Handles invalid/corrupted data gracefully - invalid byte sequences don't break the tokenizer
# No endianness issues (unlike UTF-16/32)
# Backward compatible with ASCII

# 4. Training Data Efficiency
# From our analysis:

# Most training data (English, code, markup) is heavily ASCII
# UTF-8 provides the best compression for this common case
# Multilingual content still works, just uses more bytes when needed

# 5. Practical Considerations

# Internet standard: Most web content is UTF-8 encoded
# File size: Training datasets are smaller in UTF-8
# Processing speed: Fewer total bytes to process for typical training corpora
# Compatibility: Works with existing text processing pipelines

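# The key insight is that while UTF-16 is occasionally more compact for specific scripts (e.g. Japanese, where most characters take 2 bytes instead of 3; most emoji take 4 bytes in both encodings),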
# UTF-8's combination of efficiency for common cases, bounded vocabulary size, and universal compatibility 
# makes it the optimal choice for tokenizer training on diverse text corpora.

In [5]:
def decode_utf8_bytes_to_str_wrong(bytestring: bytes):
    """Intentionally incorrect UTF-8 decoder, kept as a counter-example.

    Each byte is decoded in isolation, so the function only works for pure
    ASCII input. Any byte >= 0x80 (i.e. any part of a multi-byte character)
    cannot be decoded on its own.

    Args:
        bytestring: UTF-8 encoded data.

    Returns:
        The decoded text (only reachable when every byte is ASCII).

    Raises:
        UnicodeDecodeError: if the input contains any non-ASCII byte.
    """
    return "".join([bytes([b]).decode("utf-8") for b in bytestring])

# "é" is encoded as the two bytes 0xC3 0xA9, so decoding 0xC3 alone fails.
# The error is the point of this cell: catch and print it so that
# "Restart & Run All" continues instead of stopping here.
try:
    decode_utf8_bytes_to_str_wrong("café".encode("utf-8"))
except UnicodeDecodeError as exc:
    print(f"{type(exc).__name__}: {exc}")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 0: unexpected end of data

In [6]:
def decode_utf8_bytes_to_str_correct(bytestring: bytes):
    """Decode a complete UTF-8 byte string into text.

    UTF-8 is a variable-length encoding: a single Unicode character may be
    spread over several bytes that only make sense together. The whole
    sequence is therefore handed to the decoder in one call instead of being
    decoded byte by byte.
    """
    decoded_text = bytestring.decode(encoding="utf-8")
    return decoded_text

decode_utf8_bytes_to_str_correct(bytes("café", encoding="utf-8"))

'café'

In [7]:
# Give a two-byte sequence that does not decode to any Unicode character(s).

In [8]:
# Decode the pair as a whole with the correct decoder: the byte-by-byte
# variant also fails on valid multi-byte pairs, so it cannot show that this
# particular sequence is undecodable. The input is built as real `bytes`
# to match the function's annotation.
# The error is the expected result; print it so "Run All" is not interrupted.
try:
    decode_utf8_bytes_to_str_correct(bytes([192, 127]))
except UnicodeDecodeError as exc:
    print(f"{type(exc).__name__}: {exc}")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc0 in position 0: invalid start byte

In [9]:
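# First byte 0xC0 (192) matches the lead-byte pattern of a 2-byte UTF-8 sequence (110xxxxx), but 0xC0 and 0xC1 are
# never valid in UTF-8 because they could only start an overlong encoding of an ASCII character;
# this is why the decoder reports "invalid start byte"
# Second byte 0x7F (127) is an ASCII character, not a UTF-8 continuation byte, so the pair would be rejected even after a valid 2-byte lead byte
# Continuation bytes must be in range 0x80-0xBF (pattern 10xxxxxx)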