# CHAPTER 37 - UNICODE AND BYTE STRINGS

## STRING BASICS

In [2]:
ord('a')

97

In [3]:
hex(97)

'0x61'

In [4]:
chr(97)

'a'

In [5]:
0xC4

196

In [6]:
chr(196)

'Ä'

In [7]:
S = ' ni'

In [8]:
S.encode('ascii'), S.encode('latin1'), S.encode('utf8')

(b' ni', b' ni', b' ni')

In [9]:
# utf-16 output starts with a 2-byte byte-order mark and spends 2 bytes on each of
# these characters (8 bytes total); utf-8 needs only 1 byte per ASCII character (3)
S.encode('utf16'), len(S.encode('utf16')), len(S.encode('utf8'))

(b'\xff\xfe \x00n\x00i\x00', 8, 3)

In [10]:
B = b'spam'

In [11]:
S = 'eggs'

In [12]:
type(B), type(S)

(bytes, str)

In [13]:
B[1:], S[1:]

(b'pam', 'ggs')

In [14]:
list(B), list(S)

([115, 112, 97, 109], ['e', 'g', 'g', 's'])

In [16]:
# bytes objects are immutable: this assignment is expected to raise TypeError
B[0] = 'x'

TypeError: 'bytes' object does not support item assignment

In [17]:
S[0] = 'x'

TypeError: 'str' object does not support item assignment

In [19]:
# bytes prefix (b or B) works on single, double, triple quotes, raw;
# a triple-quoted bytes literal keeps its embedded newlines as \n bytes
B = B"""
xxxx
yyyy
"""

In [20]:
B

b'\nxxxx\nyyyy\n'

In [21]:
U = u'spam'

In [22]:
type(U)

str

In [23]:
U[0]

's'

In [24]:
list(U)

['s', 'p', 'a', 'm']

In [25]:
S = 'eggs'

In [26]:
S.encode()

b'eggs'

In [27]:
bytes(S, encoding='ascii')

b'eggs'

In [28]:
# rebind B to a plain ASCII bytes literal for the decode examples that follow
B = b'spam'

In [29]:
B

b'spam'

In [30]:
B.decode()

'spam'

In [31]:
str(B, encoding='ascii')

'spam'

In [32]:
import sys

In [33]:
sys.platform

'linux'

In [34]:
sys.getdefaultencoding()

'utf-8'

In [35]:
# str -> bytes needs an explicit encoding; omitting it is expected to raise TypeError
bytes(S)

TypeError: string argument without an encoding

In [36]:
# without an encoding argument, str() does not decode: it returns the printable
# repr of the bytes object (including the b'...' wrapper) as text
str(B)

"b'spam'"

In [37]:
len(str(B))

7

## CODING UNICODE STRINGS 

In [44]:
ord('X')

88

In [45]:
chr(88)

'X'

In [46]:
S = 'XYZ'

In [47]:
len(S)

3

In [48]:
[ord(c) for c in S]

[88, 89, 90]

In [49]:
S.encode('ascii')

b'XYZ'

In [50]:
S.encode('latin-1')

b'XYZ'

In [51]:
S.encode('utf-8')

b'XYZ'

In [52]:
chr(0xc4)

'Ä'

In [53]:
chr(0xe8)

'è'

In [54]:
S = '\xc4\xe8'

In [55]:
S

'Äè'

In [56]:
S = '\u00c4\u00e8'

In [57]:
S

'Äè'

In [58]:
len(S)

2

In [59]:
S = '\U000000c4\U000000e8'

In [60]:
S

'Äè'

In [61]:
len(S)

2

In [62]:
S = '\u00c4\u00e8'

In [63]:
S

'Äè'

In [64]:
len(S)

2

In [65]:
# both characters have code points above 127, so the ascii codec cannot encode
# them; this call is expected to raise UnicodeEncodeError
S.encode('ascii')

UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128)

In [66]:
S.encode('latin-1')

b'\xc4\xe8'

In [67]:
S.encode('utf-8')

b'\xc3\x84\xc3\xa8'

In [68]:
len(S.encode('latin-1'))

2

In [69]:
len(S.encode('utf-8'))

4

In [70]:
B = b'\xc4\xe8'

In [71]:
B

b'\xc4\xe8'

In [72]:
len(B)

2

In [73]:
B.decode('latin-1')

'Äè'

In [74]:
B = b'\xc3\x84\xc3\xa8'

In [75]:
len(B)

4

In [76]:
B.decode('utf-8')

'Äè'

In [77]:
len(B.decode('utf-8'))

2

In [78]:
S = 'A\u00c4B\U000000e8C'

In [79]:
S

'AÄBèC'

In [80]:
len(S)

5

In [81]:
S.encode('latin-1')

b'A\xc4B\xe8C'

In [82]:
len(S.encode('latin-1'))

5

In [83]:
S.encode('utf-8')

b'A\xc3\x84B\xc3\xa8C'

In [84]:
len(S.encode('utf-8'))

7

In [85]:
S = 'A' + chr(0xC4) + 'B' + chr(0xE8) + 'C'

In [86]:
S

'AÄBèC'

In [87]:
S.encode('cp500')

b'\xc1c\xc2T\xc3'

In [88]:
S.encode('cp850')

b'A\x8eB\x8aC'

In [89]:
S = 'spam'

In [90]:
S.encode('latin-1')

b'spam'

In [91]:
S.encode('utf-8')

b'spam'

In [92]:
S.encode('cp500')

b'\xa2\x97\x81\x94'

In [93]:
S.encode('cp850')

b'spam'

In [94]:
S = 'A\u00c4B\U000000e8C'

In [95]:
S.encode('utf-16')

b'\xff\xfeA\x00\xc4\x00B\x00\xe8\x00C\x00'

In [97]:
S

'AÄBèC'

In [98]:
S.encode('utf-32')

b'\xff\xfe\x00\x00A\x00\x00\x00\xc4\x00\x00\x00B\x00\x00\x00\xe8\x00\x00\x00C\x00\x00\x00'