# 深入理解 UTF-8 编码

In [1]:
# Hexadecimal string form of the decimal integer 123
hex(123)

'0x7b'

In [2]:
# A hexadecimal integer literal; it evaluates to the decimal value 20005
0x4e25

20005

In [57]:
def hex_to_utf8(hex_str):
    """
    Convert a hexadecimal code point into its UTF-8 encoding as a bit string.

    Args:
        hex_str: str, the code point in hexadecimal, with or without a
            leading '0x' (e.g. '0x4e25' or '4e25').
    Returns:
        str: the UTF-8 bytes written out as '0'/'1' characters. The length is
            always a whole number of bytes (8, 16, 24 or 32 bits). An empty
            string is returned when the value needs more than 21 bits, which
            is more than the 4-byte form can carry.
    Raises:
        ValueError: if hex_str is not valid hexadecimal or is negative.
    """
    code_point = int(hex_str, 16)
    if code_point < 0:
        raise ValueError('code point must be non-negative: ' + repr(hex_str))
    bit_count = code_point.bit_length()

    # UTF-8 layout for an n-byte sequence:
    # - 1 byte : 0xxxxxxx
    # - 2 bytes: 110xxxxx 10xxxxxx
    # - 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    # - 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    # The code point bits fill the x positions from the right; unused x
    # positions on the left are 0. Payload capacity: 7, 11, 16, 21 bits.
    if bit_count <= 7:
        # Pad to a full byte so small values such as 0x0a still yield 8 bits.
        return format(code_point, 'b').zfill(8)

    if bit_count <= 11:
        byte_count, payload_bits = 2, 11
    elif bit_count <= 16:
        byte_count, payload_bits = 3, 16
    elif bit_count <= 21:
        byte_count, payload_bits = 4, 21
    else:
        return ''

    # Left-pad to the full payload width first, so that every 6-bit
    # continuation group is complete even when the value is short
    # (e.g. 17-bit code points in the 4-byte form).
    payload = format(code_point, 'b').zfill(payload_bits)
    lead_bits = payload_bits - 6 * (byte_count - 1)

    # Leading byte: byte_count ones, a zero, then the top payload bits.
    pieces = ['1' * byte_count + '0' + payload[:lead_bits]]
    # Continuation bytes: '10' followed by the next 6 payload bits.
    for start in range(lead_bits, payload_bits, 6):
        pieces.append('10' + payload[start:start + 6])
    return ''.join(pieces)


def print_utf8(binary_utf8):
    """
    Print a UTF-8 bit string in a grouped, readable layout.

    Args:
        binary_utf8: str, the UTF-8 encoding as '0'/'1' characters
    """
    # Separator placed before each bit, by position:
    # - every 16 bits: a line break
    # - every 8 bits: a space
    # - every 4 bits: an underscore
    # Nothing is printed before the very first bit or after the last one.
    for position, bit in enumerate(binary_utf8):
        if position == 0:
            separator = ''
        elif position % 16 == 0:
            separator = '\n'
        elif position % 8 == 0:
            separator = ' '
        elif position % 4 == 0:
            separator = '_'
        else:
            separator = ''
        print(separator, bit, sep='', end='')
    

In [75]:
# Walk through the conversion steps for 0x4e25: the hex string, its integer
# value, the '0b'-prefixed binary form, and the bare binary digits.
hex_str = '0x4e25'
code_point = int(hex_str, 16)
binary_repr = bin(code_point)
for step in (hex_str, code_point, binary_repr, binary_repr[2:]):
    print(step)

0x4e25
20005
0b100111000100101
100111000100101


In [78]:
# Encode the code point 0x4e25 as a UTF-8 bit string
hex_to_utf8('0x4e25')

'111001001011100010100101'

In [79]:
# Encode 0x4e25 and print the bits grouped by nibble and byte, two bytes per line
print_utf8(hex_to_utf8('0x4e25'))

1110_0100 1011_1000
1010_0101

In [80]:
# Look up the code point of '严' and show it in hexadecimal
hex(ord('严'))

'0x4e25'

In [81]:
# Round trip: character -> code point -> character
chr(ord('严'))

'严'

In [83]:
# Bit string ('0'/'1' characters) holding the UTF-8 encoding of '严'.
# Alternative input: the bit string below is the ASCII text 'The'.
# zero_one_code = '010101000110100001100101'
zero_one_code = hex_to_utf8(hex(ord('严')))

# Pack the bit string into bytes. The byte count comes from the bit-string
# length, so leading zero bits are kept; going through hex() would drop
# them and make bytes.fromhex() fail on an odd number of hex digits.
byte_count = (len(zero_one_code) + 7) // 8
binary_data = int(zero_one_code, 2).to_bytes(byte_count, 'big')

# Write the raw bytes to a file
with open('output.bin', 'wb') as f:
    f.write(binary_data)

# Read the raw bytes back and decode them as UTF-8
with open('output.bin', 'rb') as f:
    utf8_text = f.read().decode('utf-8')

print(utf8_text)

严
