In [5]:
s = 'cafe'
print(len(s))

b = s.encode('utf8')
print(b)
b.decode('utf8')
b

4
b'cafe'


b'cafe'

In [10]:
cafe = bytes('café', encoding='utf_8')
print(cafe[0])
print(cafe[:1])

cafe_arr = bytearray(cafe)
cafe_arr[-1:]

99
b'c'


bytearray(b'\xa9')

**Обработка UnicodeEncodeError**


In [18]:
city = 'São Paulo'
print(city.encode('utf_8'))
print(city.encode('utf_16'))
print(city.encode('iso8859_1'))
# city.encode('cp437')
print(city.encode('cp437', errors='ignore'))
print(city.encode('cp437', errors='replace'))
print(city.encode('cp437', errors='xmlcharrefreplace'))

b'S\xc3\xa3o Paulo'
b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'
b'S\xe3o Paulo'
b'So Paulo'
b'S?o Paulo'
b'S&#227;o Paulo'


**Обработка UnicodeDecodeError**

In [23]:
octets = b'Montr\xe9al'
print(octets.decode('cp1252'))
print(octets.decode('iso8859_7'))
print(octets.decode('koi8_r'))
print(octets.decode('utf_8', errors='replace'))

Montréal
Montrιal
MontrИal
Montr�al


**Исключение SyntaxError при загрузке модулей
с неожиданной кодировкой**

In [24]:
# coding: cp1252

print('Olá, Mundo!')

Olá, Mundo!


## Как определить кодировку последовательности байтов

In [29]:
from chardet import chardetect

ImportError: cannot import name 'chardetect' from 'chardet' (/usr/lib/python3.10/site-packages/chardet/__init__.py)

**BOM: полезный крокозябр**

In [31]:
u16 = 'El Niño'.encode('utf_16')
u16

''' b'\xff\xfe' - порядок байтов прямой''' 


" b'ÿþ' "

In [36]:
u16le = 'El Niño'.encode('utf_16le')
print(list(u16))
print(list(u16le))

u16be = 'El Niño'.encode('utf_16be')

list(u16be)

[255, 254, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]
[69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]


[0, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111]

In [37]:
u8e = 'El Niño'.encode('utf_8')
list(u8e)

[69, 108, 32, 78, 105, 195, 177, 111]

## Обработка текстовых файлов

In [41]:
fp = open('cafe.txt', 'w', encoding='utf_8').write('café')
open('cafe.txt').read()

'café'

In [49]:
import os

os.stat('cafe.txt').st_size

fp2 = open('cafe.txt')
fp2.encoding
fp2.read()

fp3 = open('cafe.txt', encoding='cp1252')
fp3.encoding
fp3.read()

'cafÃ©'

In [50]:
fp4 = open('cafe.txt', 'rb')
fp4.read()

b'caf\xc3\xa9'

**Остерегайтесь кодировок по умолчанию**

In [57]:
import locale
import sys

expressions = """
 locale.getpreferredencoding()
 type(my_file)
 my_file.encoding
 sys.stdout.isatty()
 sys.stdout.encoding
 sys.stdin.isatty()
 sys.stdin.encoding
 sys.stderr.isatty()
 sys.stderr.encoding
 sys.getdefaultencoding()
 sys.getfilesystemencoding()
 """

my_file = open('dummy', 'w')

for expression in expressions.split():
    value = eval(expression)
    print(f'{expression:>30} -> {value!r}')

 locale.getpreferredencoding() -> 'UTF-8'
                 type(my_file) -> <class '_io.TextIOWrapper'>
              my_file.encoding -> 'UTF-8'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'utf-8'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


In [60]:
import sys
from unicodedata import name

print(sys.version)
print()
print('sys.stout.isatty():', sys.stdout.isatty())
print('sys.stdout.encoding:', sys.stdout.encoding)
print()

test_chars = [
    '\N{HORIZONTAL ELLIPSIS}',
    '\N{INFINITY}',
    '\N{CIRCLED NUMBER FORTY TWO}',
    
]

for char in test_chars:
    print(f'Trying to output {name(char)}:')
    print(char)

3.10.10 (main, Mar  5 2023, 22:26:53) [GCC 12.2.1 20230201]

sys.stout.isatty(): False
sys.stdout.encoding: UTF-8

Trying to output HORIZONTAL ELLIPSIS:
…
Trying to output INFINITY:
∞
Trying to output CIRCLED NUMBER FORTY TWO:
㊷


In [63]:
locale.getpreferredencoding()

'UTF-8'

## Нормализация Unicode для надежного сравнения

In [66]:
s1 = 'café'
s2 = 'cafe\N{COMBINING ACUTE ACCENT}'
print(s1, s2)
print(len(s1), len(s2))
print(s1 == s2)

café café
4 5
False


In [68]:
from unicodedata import normalize

print(len(normalize('NFC', s1)), len(normalize('NFC', s2)))
print(len(normalize('NFD', s1)), len(normalize('NFD', s2)))

4 4
5 5


In [73]:
from unicodedata import normalize, name
ohm = '\u2126'
print(name(ohm))
ohm_c = normalize('NFC', ohm)
print(name(ohm_c))
print(ohm == ohm_c)
normalize('NFC', ohm) == normalize('NFC', ohm_c)

OHM SIGN
GREEK CAPITAL LETTER OMEGA
False


True

In [82]:
from unicodedata import normalize, name

half = '\N{VULGAR FRACTION ONE HALF}'
print(half)
print(normalize('NFKC', half))
for char in normalize('NFKC', half):
    print(char, name(char), sep='\t')
    
four_squared = '4²'
normalize('NFKC', four_squared)

micro = 'μ'
micro_kc = normalize('NFKC', micro)
print(micro, micro_kc)
print(ord(micro), ord(micro_kc))
print(name(micro), name(micro_kc))

½
1⁄2
1	DIGIT ONE
⁄	FRACTION SLASH
2	DIGIT TWO
μ μ
956 956
GREEK SMALL LETTER MU GREEK SMALL LETTER MU


## Сворачивание регистра

In [86]:
micro = 'μ'
print(name(micro))
micro_cf = micro.casefold()
name(micro_cf)

eszett = 'ß'
print(name(eszett))

eszett_cf = eszett.casefold()
print(eszett_cf)

GREEK SMALL LETTER MU
LATIN SMALL LETTER SHARP S
ss


## Служебные функции для сравнения нормализованного
## текста

In [97]:
s1 = 'cafe'
s2 = 'cafe\u0301'
print(s1, s2)


from unicodedata import normalize

def nfc_equal(str1, str2):
    return normalize('NFC', str1) == normalize('NFC', str2)
    
def fold_equal(str1, str2):
    return (normalize('NFC',str1).casefold() == 
           normalize('NFC',str2).casefold()) 

fold_equal('A', 'a')

normalize('NFC', s2)

cafe café


'café'

**Экстремальная «нормализация»: удаление диакритических
знаков**

In [108]:
import unicodedata
import string

def shave_marks(txt):
    """Remove all diacritic marks"""
    norm_txt = unicodedata.normalize('NFD', txt)
#     print(norm_txt)
#     norm_txt = normalize('NFC', txt)
    shaved = ''.join(c for c in norm_txt
                    if not unicodedata.combining(c))
    return unicodedata.normalize('NFC', shaved)

order = '"Herr Voß: • ½ cup of OEtker™ caffè latte • bowl of açaí."'

shave_marks(order)

Greek = 'Ζέφυρος, Zéfiro'
shave_marks(Greek)


'Ζεφυρος, Zefiro'

In [115]:
def shave_marks_latin(txt):
    """Удалить все диактрические знаки для базовых символов набора Latin"""
    norm_txt = unicodedata.normalize('NFD', txt)
    latin_base = False
    preserve = []
    for c in norm_txt:
        if unicodedata.combining(c) and latin_base:
            continue
        preserve.append(c)
        
        if not unicodedata.combining(c):
            latin_base = c in string.ascii_letters
    
    shaved = ''.join(preserve)
    
    return unicodedata.normalize('NFC', shaved)

shave_marks_latin('"Herr Voß: • ½ cup of OEtker™ caffè latte • bowl of açaí."')

'"Herr Voß: • ½ cup of OEtker™ caffe latte • bowl of acai."'

In [116]:
def shave_marks_latin(txt):
    norm_txt = unicodedata.normalize('UFD', txt)
    latin_base = False
    preserve = []
    for char in norm_txt:
        if unicodedata.combining(char) and latin_base:
            continue
        preverse.append(char)
        
        if not unicodedata.combining(char):
            latin_base = char in string.ascii_letters
            
    shaved = ''.join(preverse)
    
    return unicodedata.normalize('NFC', shaved)

In [135]:
single_map = str.maketrans("""‚ƒ„ˆ‹''""•––˜›""",
                            """'f"^<''""---~>""")

multi_map = str.maketrans({
'€': 'EUR',
 '…': '...',
 'Æ': 'AE',
 'æ': 'ae',
 'O': 'OE',
 'o': 'oe',
 '™': '(TM)',
 '‰': '<per mille>',
 '†': '**',
 '‡': '***',   
})

multi_map.update(single_map)

print(multi_map)

def dewinize(txt):
    return txt.translate(multi_map)

def asciize(txt):
    no_marks = shave_marks_latin(dewinize(txt))
    no_marks = no_marks.replace('ß', 'ss') 
    return unicodedata.normalize('NFK C', no_marks)

order = '"Herr Voß: • ½ cup of OEtker™ caffè latte • bowl of açaí."'

dewinize(order)

asciize(order)

{8364: 'EUR', 8230: '...', 198: 'AE', 230: 'ae', 79: 'OE', 111: 'oe', 8482: '(TM)', 8240: '<per mille>', 8224: '**', 8225: '***', 8218: 39, 402: 102, 8222: 34, 710: 94, 8249: 60, 39: 39, 34: 34, 8226: 45, 8211: 45, 732: 126, 8250: 62}


ValueError: invalid normalization form

## Сортировка Unicode-текстов

In [143]:
import locale

my_locale = locale.setlocale(locale.LC_COLLATE, 'en_US.UTF8')
print(my_locale)
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=locale.strxfrm)
print(sorted_fruits)

en_US.UTF8
['açaí', 'acerola', 'atemoia', 'cajá', 'caju']


In [139]:


locale.getdefaultlocale()

('en_US', 'UTF-8')

In [None]:
## Сортировка с помощью алгоритма упорядочивания
## Unicode

In [147]:
import pyuca
coll = pyuca.Collator()
fuits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=coll.sort_key)
sorted_fruits


['açaí', 'acerola', 'atemoia', 'cajá', 'caju']

**База данных Unicode**

***Поиск символов по имени***

In [148]:
from unicodedata import name
print(name('A'))

LATIN CAPITAL LETTER A


In [None]:
import sys
import unicodedata

START, END = ord(' '), sys.maxunicode + 1

def find(*query_words, start=START, end=END):
    query = {w.upper() for w in query_words}
    for code in range(start, end):
        char = chr(code)
        name = unicodedata.name(char, None)
        if name and query.issubset(name.split()):
            print(f'U+{code:04X}\t{char}\t{name})
                  
def main(words):
      if words:
          find(*words)
      else:
          print('Please provide words to find')

In [155]:
START, END = ord(' '), sys.maxunicode + 1

def find(*query_words, start=START, end=END):
    query = {w.upper() for w in query_words}
    for code in range(start, end):
        char = chr(code)
        name = unicodedata.name(char, None)
        if name and query.issubset(name.split()):
            print(f'U+{code:04X}\t{char}\t{name}')
        

## Символы, связанные с числами

In [163]:
import unicodedata
import re

re_digit = re.compile(r'\d')

sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

for char in sample:
    print(f'U+{ord(char):04x}',
         char.center(6),
         're_dig' if re_digit.match(char) else '-',
         'isdig' if char.isdigit() else '-',
         'isnum' if char.isnumeric() else '-',
         f'{unicodedata.numeric(char):5.2f}',
         unicodedata.name(char),
         sep='\t')
    

U+0031	  1   	re_dig	isdig	isnum	 1.00	DIGIT ONE
U+00bc	  ¼   	-	-	isnum	 0.25	VULGAR FRACTION ONE QUARTER
U+00b2	  ²   	-	isdig	isnum	 2.00	SUPERSCRIPT TWO
U+0969	  ३   	re_dig	isdig	isnum	 3.00	DEVANAGARI DIGIT THREE
U+136b	  ፫   	-	isdig	isnum	 3.00	ETHIOPIC DIGIT THREE
U+216b	  Ⅻ   	-	-	isnum	12.00	ROMAN NUMERAL TWELVE
U+2466	  ⑦   	-	isdig	isnum	 7.00	CIRCLED DIGIT SEVEN
U+2480	  ⒀   	-	-	isnum	13.00	PARENTHESIZED NUMBER THIRTEEN
U+3285	  ㊅   	-	-	isnum	 6.00	CIRCLED IDEOGRAPH SIX


## Двухрежимный API

**str и bytes в регулярных выражениях**

In [166]:
import re

re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
 " as 1729 = 1³ + 12³ = 9³ + 10³.")

text_bytes = text_str.encode('utf_8')

print(f'Text\n {text_str!r}')
print('Numbers')
print(' str :', re_numbers_str.findall(text_str))
print(' bytes ', re_numbers_bytes.findall(text_bytes))
print('Words')
print(' str : ', re_words_str.findall(text_str))
print(' bytes :', re_words_bytes.findall(text_bytes))
      

Text
 'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'
Numbers
 str : ['௧௭௨௯', '1729', '1', '12', '9', '10']
 bytes  [b'1729', b'1', b'12', b'9', b'10']
Words
 str :  ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1³', '12³', '9³', '10³']
 bytes : [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']


**str и bytes в функциях из модуля os**

In [176]:
import os

os.listdir('.')

os.listdir(b'.')



[b'Python Cookbook \xd0\xa7\xd0\xb0\xd1\x81\xd1\x82\xd1\x8c \xd0\xbf\xd0\xb5\xd1\x80\xd0\xb2\xd0\xb0\xd1\x8f.ipynb',
 b'floats.bin',
 b'dummy',
 b'\xd0\x93\xd0\xbb\xd0\xb0\xd0\xb2\xd0\xb0 4 Unicode-\xd1\x82\xd0\xb5\xd0\xba\xd1\x81\xd1\x82 \xd0\xb8 \xd0\xb1\xd0\xb0\xd0\xb9\xd1\x82\xd1\x8b.ipynb',
 b'cafe.txt',
 b'\xd0\x9f\xd0\xbe\xd0\xbb\xd1\x8c\xd0\xb7\xd0\xbe\xd0\xb2\xd0\xb0\xd1\x82\xd0\xb5\xd0\xbb\xd1\x8c\xd1\x81\xd0\xba\xd0\xb8\xd0\xb5 \xd0\xb0\xd1\x82\xd1\x80\xd0\xb8\xd0\xb1\xd1\x83\xd1\x82\xd1\x8b \xd0\xb2 Python.ipynb',
 b'Untitled.ipynb',
 b'.ipynb_checkpoints']

In [175]:
from pathlib import Path

for path in Path.cwd().iterdir():
    print(path)

/home/clarence/Python  к вершинам мастерства/Python Cookbook Часть первая.ipynb
/home/clarence/Python  к вершинам мастерства/floats.bin
/home/clarence/Python  к вершинам мастерства/dummy
/home/clarence/Python  к вершинам мастерства/Глава 4 Unicode-текст и байты.ipynb
/home/clarence/Python  к вершинам мастерства/cafe.txt
/home/clarence/Python  к вершинам мастерства/Пользовательские атрибуты в Python.ipynb
/home/clarence/Python  к вершинам мастерства/Untitled.ipynb
/home/clarence/Python  к вершинам мастерства/.ipynb_checkpoints
