In [8]:
import unicodedata
# Round trip: character -> official Unicode name -> character.
name = unicodedata.name('A')
name, unicodedata.lookup(name)

('LATIN CAPITAL LETTER A', 'A')

In [10]:
import unicodedata
# Look up the official name of '$', then map that name back to the character.
name = unicodedata.name('$')
(name, unicodedata.lookup(name))

('DOLLAR SIGN', '$')

In [13]:
import unicodedata
# '\u' followed by four hexadecimal digits selects a character by code point.
name = unicodedata.name('\u00a2')
name, unicodedata.lookup(name)

('CENT SIGN', '¢')

In [15]:
import unicodedata
# '\u' followed by four hexadecimal digits selects a character by code point.
name = unicodedata.name('\u2603')
(name, unicodedata.lookup(name))

('SNOWMAN', '☃')

In [2]:
import unicodedata
# '\u' followed by four hexadecimal digits selects a character by code point.
name = unicodedata.name('\u00e9')
name, unicodedata.lookup(name)

('LATIN SMALL LETTER E WITH ACUTE', 'é')

In [5]:
# '\N{...}' embeds a character in a string literal by its official Unicode name.
place = 'caf\N{LATIN SMALL LETTER E WITH ACUTE}'
place

'café'

In [3]:
# The escaped code point and the literal character are each one character long.
len('\u00e9'),len('é')

(1, 1)

In [9]:
import unicodedata
# Encode the same character with three codecs to compare the resulting bytes.
# The plain 'utf-16' / 'utf-32' codecs prepend a byte order mark to the output.
snowman = '\u2603'
tuple(snowman.encode(codec) for codec in ('utf-8', 'utf-16', 'utf-32'))

(b'\xe2\x98\x83', b'\xff\xfe\x03&', b'\xff\xfe\x00\x00\x03&\x00\x00')

In [14]:
import unicodedata
# Encode with each codec, then decode with the matching codec: every round
# trip gives back the original character.
snowman = '\u2603'
a, b, c = (snowman.encode(codec) for codec in ('utf-8', 'utf-16', 'utf-32'))
(
    a.decode('utf-8'),
    b.decode('utf-16'),
    c.decode('utf-32'),
)

('☃', '☃', '☃')

In [10]:
# Little endian: the low-order byte of each 16-bit code unit comes first.
u16LittleEndian = '今天好餓'.encode('utf_16le')
[byte for byte in u16LittleEndian]

[202, 78, 41, 89, 125, 89, 19, 153]

In [1]:
# Big endian: the high-order byte of each 16-bit code unit comes first.
u16BigEndian = '今天好餓'.encode('utf_16be')
[byte for byte in u16BigEndian]

[78, 202, 89, 41, 89, 125, 153, 19]

In [2]:
#Endianness only matters for encodings whose code units are wider than one byte, such as UTF-16 and UTF-32; UTF-8 is not affected.

In [1]:
import sys, locale

# When a file is opened without an explicit encoding, the encoding used is
# whatever locale.getpreferredencoding() returns.
# Changing what locale.getpreferredencoding() returns is not recommended;
# pass an explicit encoding every time you encode or decode instead.

# One expression per line; each is evaluated and reported by the loop below.
expressions = """
        locale.getpreferredencoding()
        type(my_file)
        my_file.encoding
        sys.stdout.isatty()
        sys.stdout.encoding
        sys.stdin.isatty()
        sys.stdin.encoding
        sys.stderr.isatty()
        sys.stderr.encoding
        sys.getdefaultencoding()
        sys.getfilesystemencoding()
    """

# encoding= is omitted on purpose so that my_file.encoding reports the default.
# The with-block closes the file handle once the report is printed; the
# original left the handle open for the lifetime of the kernel.
with open('000_dummy', 'w') as my_file:
    for expression in expressions.split():
        # eval() is used only on the fixed literal above, never on external input.
        value = eval(expression)
        print(expression.rjust(30), '->', repr(value))

 locale.getpreferredencoding() -> 'cp950'
                 type(my_file) -> <class '_io.TextIOWrapper'>
              my_file.encoding -> 'cp950'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'utf-8'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


In [2]:
#Normalization
#Normalization solves the problem of text or symbols that look identical but compare as unequal

from unicodedata import normalize

def nfc_equal(str1, str2):
    """Return True when str1 and str2 are equal after NFC normalization."""
    left, right = (normalize('NFC', text) for text in (str1, str2))
    return left == right

# Two spellings that render the same; s2 ends with 'e' followed by the
# combining acute accent U+0301.
s1, s2 = 'café', 'cafe\u0301'

(
    s1,
    s2,
    s1 == s2,              # plain comparison of the raw strings
    nfc_equal(s1, s2),     # comparison after NFC normalization
    nfc_equal('A', 'a'),   # strings that differ only in case
)

('café', 'café', False, True, False)

In [4]:
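#The first argument of normalize() can be 'NFC', 'NFD', 'NFKC' or 'NFKD'.
#NFC is the most commonly used form and the first choice for the vast majority of applications.
#NFKC and NFKD are stricter (compatibility) normalization forms, but they may distort or lose information.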

In [1]:
#Unicode character metadata
import unicodedata
import re

# Pattern used to test whether the re module treats a character as a digit.
re_digit = re.compile(r'\d')

# Characters to inspect, one table row each.
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

# Print one tab-separated row per character.
for char in sample:
    print('\t'.join([
        'U+{:04x}'.format(ord(char)),                     # <1> code point in hex
        char.center(6),                                   # <2> the character, centred
        're_dig' if re_digit.match(char) else '-',        # <3> matches r'\d'?
        'isdig' if char.isdigit() else '-',               # <4> str.isdigit()
        'isnum' if char.isnumeric() else '-',             # <5> str.isnumeric()
        '{:5.2f}'.format(unicodedata.numeric(char)),      # <6> numeric value
        unicodedata.name(char),                           # <7> official Unicode name
    ]))

U+0031	  1   	re_dig	isdig	isnum	 1.00	DIGIT ONE
U+00bc	  ¼   	-	-	isnum	 0.25	VULGAR FRACTION ONE QUARTER
U+00b2	  ²   	-	isdig	isnum	 2.00	SUPERSCRIPT TWO
U+0969	  ३   	re_dig	isdig	isnum	 3.00	DEVANAGARI DIGIT THREE
U+136b	  ፫   	-	isdig	isnum	 3.00	ETHIOPIC DIGIT THREE
U+216b	  Ⅻ   	-	-	isnum	12.00	ROMAN NUMERAL TWELVE
U+2466	  ⑦   	-	isdig	isnum	 7.00	CIRCLED DIGIT SEVEN
U+2480	  ⒀   	-	-	isnum	13.00	PARENTHESIZED NUMBER THIRTEEN
U+3285	  ㊅   	-	-	isnum	 6.00	CIRCLED IDEOGRAPH SIX
