### Characters & Bytes

In [1]:
# len() on a str counts characters (code points), not bytes: 'café' has 4.
s = 'café'
len(s)

4

In [2]:
# Encoding to UTF-8 turns the single character 'é' into two bytes (\xc3\xa9),
# so the bytes object is one element longer than the string.
b = s.encode('utf8')
b

b'caf\xc3\xa9'

In [3]:
# Build the same two bytes twice: once from a bytes literal and once from a
# list of integer byte values.
b2 = bytes(b'\xc3\xa9')
b3 = bytes([0xc3, 0xa9])
b2

b'\xc3\xa9'

In [4]:
# Both byte sequences decode to the same character; print one result per line.
print(b2.decode('utf8'), b3.decode('utf8'), sep='\n')

é
é


In [5]:
# NOTE: rebinds b3 (previously the two-byte encoding of 'é') to a single ASCII byte.
b3 = b'c'
b3

b'c'

In [6]:
type(b3)

bytes

In [7]:
# b3 is the one-byte object b'c'; look at it from several angles.
print(b3)
print(b3.decode('utf8'))
# Hexadecimal digits of the byte value
print(b3.hex())
# Integer value of the byte (ord() accepts a bytes object of length 1)
print(ord(b3))
# The same byte rebuilt from its integer value ...
b4 = bytes([0x63])
print(b4.decode('utf8'))
# ... and from a hex escape inside a bytes literal
b5 = b'\x63'
print(type(b5))
print(b5)
# Converting bytes to a list yields the integer byte values
print(list(b3))

b'c'
c
63
99
c
<class 'bytes'>
b'c'
[99]


In [8]:
# NOTE: rebinds b4; 0x64 is the byte value right after 0x63 ('c').
b4 = bytes([0x64])
b4

b'd'

In [9]:
type(b)

bytes

In [10]:
len(b)

5

In [11]:
b.decode('utf8')

'café'

In [12]:
# UTF-8 bytes of the text, obtained through str.encode.
cafe = 'café'.encode('utf_8')
cafe

b'caf\xc3\xa9'

In [13]:
cafe[0]

99

In [14]:
# Slicing a bytes object yields bytes (the first slice cuts the two-byte 'é' in half) ...
print(cafe[:4])
print(cafe[:5])
# ... while indexing yields the integer value of a single byte.
print(cafe[3], cafe[4], sep='\n')

b'caf\xc3'
b'caf\xc3\xa9'
195
169


In [15]:
cafe_arr = bytearray(cafe)
cafe_arr

bytearray(b'caf\xc3\xa9')

In [16]:
bytearray(b'caf\xc3\xa9')

bytearray(b'caf\xc3\xa9')

In [17]:
cafe_arr[4:5]

bytearray(b'\xa9')

In [18]:
# Every byte value from 0 to 127 decodes under UTF-8 to a single character,
# so the result is a 128-character string.
bytes(range(0,128)).decode('utf8')

'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f'

In [19]:
import array

# Five signed 16-bit integers (typecode 'h'). tobytes() copies their raw
# in-memory representation, so each number contributes two bytes and the
# byte order is whatever the host machine uses.
numbers = array.array('h', (-2, -1, 0, 1, 2))
octets = numbers.tobytes()
octets

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

In [20]:
type(octets)

bytes

### Encoder / Decoder

In [21]:
# Encode the same text with three codecs and print "<codec><TAB><bytes>" for each.
for codec in ('latin_1', 'utf_8', 'utf_16'):
    print(f"{codec}\t{'El Niño'.encode(codec)}")

latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


In [22]:
# NOTE: rebinds b. From here on it no longer holds the UTF-8 bytes of 'café',
# only the two bytes that encode 'é'.
b = b'\xc3\xa9'
b

b'\xc3\xa9'

In [23]:
b.decode('utf_8')

'é'

In [24]:
b.decode('cp1252')

'Ã©'

In [25]:
b = b'\xe9'
b

b'\xe9'

In [26]:
b.decode('cp1252')

'é'

In [27]:
b.decode('iso8859_7')

'ι'

In [28]:
# b is b'\xe9' here. A lone 0xE9 byte is not valid UTF-8, so errors='replace'
# substitutes the replacement character instead of raising UnicodeDecodeError.
b.decode('utf8', errors='replace')

'�'

In [29]:
u16 = 'El Niño'.encode('utf16')
u16

b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'

In [30]:
list(u16)

[255, 254, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]

In [31]:
b6 = bytes([69])
b6

b'E'

In [34]:
# Explicit little-endian and big-endian UTF-16 encodings of the same text,
# shown as lists of byte values, one list per line.
u16le, u16be = 'El Niño'.encode('utf_16le'), 'El Niño'.encode('utf_16be')
print(list(u16le), list(u16be), sep='\n')

[69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]
[0, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111]


### Handling Text Files

In [35]:
fp = open('cafe.txt', 'w', encoding='utf_8')
fp

<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='utf_8'>

In [36]:
fp.write('café')

4

In [37]:
fp.close()

In [39]:
import os
os.stat('cafe.txt').st_size

5

In [40]:
# No encoding argument: open() falls back to the platform's default text
# encoding, so the result of reading this file depends on the machine.
# Pass encoding='utf_8' explicitly to make the read portable.
fp2 = open('cafe.txt')
fp2

<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='UTF-8'>

In [41]:
# Read inside a context manager so the file handle is closed right after the
# text is read, rather than staying open once fp2 is rebound later on.
with fp2:
    cafe_text = fp2.read()
cafe_text

'café'

In [42]:
fp2 = open('cafe.txt', encoding='cp1252')
fp2

<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='cp1252'>

In [43]:
# Reading UTF-8 data as cp1252 decodes each byte of the two-byte 'é'
# separately, which produces mojibake. The context manager closes the handle
# as soon as the text has been read.
with fp2:
    cafe_mojibake = fp2.read()
cafe_mojibake

'cafÃ©'

In [46]:
fp3 = open('cafe.txt', 'rb')
fp3

<_io.BufferedReader name='cafe.txt'>

In [47]:
# Binary mode returns the raw bytes with no decoding. The context manager
# closes the handle as soon as the bytes have been read.
with fp3:
    cafe_bytes = fp3.read()
cafe_bytes

b'caf\xc3\xa9'

### Normalizing Unicode for Comparison

In [48]:
from unicodedata import normalize, name

In [49]:
ohm = '\u2126'
ohm

'Ω'

In [50]:
name(ohm)

'OHM SIGN'

In [51]:
# NFC normalization replaces OHM SIGN with the visually identical
# GREEK CAPITAL LETTER OMEGA (compare the name() results of ohm and ohm_c).
ohm_c = normalize('NFC', ohm)
ohm_c

'Ω'

In [52]:
name(ohm_c)

'GREEK CAPITAL LETTER OMEGA'

In [53]:
# ohm_c is already the NFC form of ohm, so normalizing both sides compares
# the Greek omega with itself.
normalize('NFC', ohm) == normalize('NFC', ohm_c)

True

In [69]:
bytes([0xE2, 0x84, 0xA6]).decode('utf-8')

'Ω'

In [78]:
ord(bytes([0xE2, 0x84, 0xA6]).decode('utf-8'))

8486

In [79]:
u"\u2126"

'Ω'

In [93]:
"\u2126"

'Ω'

In [94]:
"\u2126" == u"\u2126"

True

In [80]:
ord(u"\u2126")

8486

In [92]:
'e\u0301'

'é'

In [99]:
'''abc
def
ghi'''

'abc\ndef\nghi'

In [100]:
s = ''
# str.maketrans is a static method and requires at least one argument;
# calling it with none raises TypeError. With two equal-length strings it
# returns a table mapping the code point of each character in the first
# string to the code point of the character at the same position in the second.
accent_table = s.maketrans('é', 'e')
accent_table