# CHAPTER 37 - UNICODE AND BYTE STRINGS

## STRING BASICS

In [2]:
ord('a')

97

In [3]:
hex(97)

'0x61'

In [4]:
chr(97)

'a'

In [5]:
0xC4

196

In [6]:
chr(196)

'Ä'

In [7]:
S = ' ni'

In [8]:
S.encode('ascii'), S.encode('latin1'), S.encode('utf8')

(b' ni', b' ni', b' ni')

In [9]:
S.encode('utf16'), len(S.encode('utf16')), len(S.encode('utf8'))

(b'\xff\xfe \x00n\x00i\x00', 8, 3)

In [10]:
B = b'spam'

In [11]:
S = 'eggs'

In [12]:
type(B), type(S)

(bytes, str)

In [13]:
B[1:], S[1:]

(b'pam', 'ggs')

In [14]:
list(B), list(S)

([115, 112, 97, 109], ['e', 'g', 'g', 's'])

In [16]:
B[0]= 'x'

TypeError: 'bytes' object does not support item assignment

In [17]:
S[0] = 'x'

TypeError: 'str' object does not support item assignment

In [19]:
# The bytes prefix (b or B) works on single-, double-, and triple-quoted
# literals, and can be combined with the raw prefix.
# Note: no leading ">>> " prompt -- a pasted prompt is only tolerated by
# IPython's input cleanup and is a SyntaxError in plain Python.
B = B"""
xxxx
yyyy
"""

In [20]:
B

b'\nxxxx\nyyyy\n'

In [21]:
U = u'spam'

In [22]:
type(U)

str

In [23]:
U[0]

's'

In [24]:
list(U)

['s', 'p', 'a', 'm']

In [25]:
S = 'eggs'

In [26]:
S.encode()

b'eggs'

In [27]:
bytes(S, encoding='ascii')

b'eggs'

In [28]:
B = b'spam'  # plain bytes literal (a stray trailing "B" made this a SyntaxError)

In [29]:
B

b'spam'

In [30]:
B.decode()

'spam'

In [31]:
str(B, encoding='ascii')

'spam'

In [32]:
import sys

In [33]:
sys.platform

'linux'

In [34]:
sys.getdefaultencoding()

'utf-8'

In [35]:
bytes(S)

TypeError: string argument without an encoding

In [36]:
str(B)

"b'spam'"

In [37]:
len(str(B))

7

## CODING UNICODE STRINGS 

In [44]:
ord('X')

88

In [45]:
chr(88)

'X'

In [46]:
S = 'XYZ'

In [47]:
len(S)

3

In [48]:
[ord(c) for c in S]

[88, 89, 90]

In [49]:
S.encode('ascii')

b'XYZ'

In [50]:
S.encode('latin-1')

b'XYZ'

In [51]:
S.encode('utf-8')

b'XYZ'

In [52]:
chr(0xc4)

'Ä'

In [53]:
chr(0xe8)

'è'

In [54]:
S = '\xc4\xe8'

In [55]:
S

'Äè'

In [56]:
S = '\u00c4\u00e8'

In [57]:
S

'Äè'

In [58]:
len(S)

2

In [59]:
S = '\U000000c4\U000000e8'

In [60]:
S

'Äè'

In [61]:
len(S)

2

In [62]:
S = '\u00c4\u00e8'

In [63]:
S

'Äè'

In [64]:
len(S)

2

In [65]:
S.encode('ascii')

UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128)

In [66]:
S.encode('latin-1')

b'\xc4\xe8'

In [67]:
S.encode('utf-8')

b'\xc3\x84\xc3\xa8'

In [68]:
len(S.encode('latin-1'))

2

In [69]:
len(S.encode('utf-8'))

4

In [70]:
B = b'\xc4\xe8'

In [71]:
B

b'\xc4\xe8'

In [72]:
len(B)

2

In [73]:
B.decode('latin-1')

'Äè'

In [74]:
B = b'\xc3\x84\xc3\xa8'

In [75]:
len(B)

4

In [76]:
B.decode('utf-8')

'Äè'

In [77]:
len(B.decode('utf-8'))

2

In [78]:
S = 'A\u00c4B\U000000e8C'

In [79]:
S

'AÄBèC'

In [80]:
len(S)

5

In [81]:
S.encode('latin-1')

b'A\xc4B\xe8C'

In [82]:
len(S.encode('latin-1'))

5

In [83]:
S.encode('utf-8')

b'A\xc3\x84B\xc3\xa8C'

In [84]:
len(S.encode('utf-8'))

7

In [85]:
S = 'A' + chr(0xC4) + 'B' + chr(0xE8) + 'C'

In [86]:
S

'AÄBèC'

In [87]:
S.encode('cp500')

b'\xc1c\xc2T\xc3'

In [88]:
S.encode('cp850')

b'A\x8eB\x8aC'

In [89]:
S = 'spam'

In [90]:
S.encode('latin-1')

b'spam'

In [91]:
S.encode('utf-8')

b'spam'

In [92]:
S.encode('cp500')

b'\xa2\x97\x81\x94'

In [93]:
S.encode('cp850')

b'spam'

In [94]:
S = 'A\u00c4B\U000000e8C'

In [95]:
S.encode('utf-16')

b'\xff\xfeA\x00\xc4\x00B\x00\xe8\x00C\x00'

In [97]:
S

'AÄBèC'

In [98]:
S.encode('utf-32')

b'\xff\xfe\x00\x00A\x00\x00\x00\xc4\x00\x00\x00B\x00\x00\x00\xe8\x00\x00\x00C\x00\x00\x00'

In [99]:
S = 'A\xC4B\xE8C'

In [103]:
S

'AÄBèC'

In [100]:
S = 'A\u00C4B\U000000E8C'

In [104]:
S

'AÄBèC'

In [101]:
B = b'A\xC4B\xE8C'

In [105]:
B

b'A\xc4B\xe8C'

In [102]:
B = b'A\u00C4B\U000000E8C'

In [106]:
B

b'A\\u00C4B\\U000000E8C'

In [107]:
B = b'A\xC4B\xE8C'

In [109]:
B

b'A\xc4B\xe8C'

In [110]:
print(B)

b'A\xc4B\xe8C'


In [111]:
B.decode('latin-1')

'AÄBèC'

In [112]:
S = 'AÄBèC'

In [113]:
B = b'AÄBèC'

SyntaxError: bytes can only contain ASCII literal characters. (<ipython-input-113-796aad809d0f>, line 1)

In [114]:
B = b'A\xC4B\xE8C'

In [115]:
B

b'A\xc4B\xe8C'

In [119]:
B.decode()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc4 in position 1: invalid continuation byte

In [116]:
B.decode('latin-1')

'AÄBèC'

In [120]:
B = b'A\xc3\x84B\xc3\xa8C'


In [121]:
S = B.decode('utf-8')

In [122]:
S

'AÄBèC'

In [124]:
T = S.encode('cp500')

In [125]:
T

b'\xc1c\xc2T\xc3'

In [126]:
U = T.decode('cp500')

In [127]:
U

'AÄBèC'

In [128]:
U.encode()

b'A\xc3\x84B\xc3\xa8C'

In [129]:
S = 'A\xC4B\xE8C'

In [130]:
U = u'A\xC4B\xE8C'

In [131]:
S + U

'AÄBèCAÄBèC'

In [132]:
'abc' + U

'abcAÄBèC'

In [7]:
# Attributes in str but not bytes
set(dir('abc')) - set(dir(b'abc'))

{'casefold',
 'encode',
 'format',
 'format_map',
 'isdecimal',
 'isidentifier',
 'isnumeric',
 'isprintable'}

In [6]:
# Attributes in bytes but not str
set(dir(b'abc')) - set(dir('abc'))

{'decode', 'fromhex', 'hex'}

In [8]:
B = b'spam'

In [9]:
B

b'spam'

In [10]:
type(B)

bytes

In [11]:
B[0]

115

In [14]:
chr(B[0])

's'

In [15]:
list(B)

[115, 112, 97, 109]

In [16]:
B[1:]

b'pam'

In [17]:
B[:-1]

b'spa'

In [18]:
B + b'1mn'

b'spam1mn'

## OTHER WAYS TO MAKE BYTES OBJECTS

In [20]:
B = b'abc'

In [21]:
B

b'abc'

In [22]:
B = bytes('abc', 'ascii')

In [23]:
B

b'abc'

In [24]:
B = 'spam'.encode()

In [25]:
B

b'spam'

In [26]:
S = B.decode()

In [27]:
S

'spam'

## MIXING STRING TYPES

In [28]:
B = b'spam'

In [29]:
B.replace(b'pa', b'XY')

b'sXYm'

In [30]:
B = B'spam'

In [31]:
B

b'spam'

In [32]:
B.replace(bystes('pa'), bytes('xy'))

NameError: name 'bystes' is not defined

In [33]:
B.replace(bytes('pa', 'ascii'), bytes('xy', 'utf-8'))

b'sxym'

In [34]:
b'ab' + 'cd'

TypeError: can't concat str to bytes

In [35]:
b'ab'.decode() + 'cd'

'abcd'

In [36]:
b'ab' + 'cd'.encode()

b'abcd'

In [37]:
b'ab' + bytes('cd', 'ascii')

b'abcd'

## BYTEARRAYS IN ACTION

In [39]:
S = 'spam'

In [41]:
C = bytearray(S, 'ascii')

In [42]:
C

bytearray(b'spam')

In [43]:
B = b'spam'

In [49]:
C = bytearray(B)

In [50]:
C[0]

115

In [51]:
C[0] = 'x'

TypeError: 'str' object cannot be interpreted as an integer

In [52]:
C[0] = b'x'

TypeError: 'bytes' object cannot be interpreted as an integer

In [58]:
C[0] = ord('x')

In [59]:
C

bytearray(b'xpam')

In [60]:
C[1] = b'Y'[0]

In [61]:
C

bytearray(b'xYam')

In [62]:
b'Y'[0]

89

In [63]:
b'Y'

b'Y'

In [64]:
type(b'Y')

bytes

In [67]:
type(b'Y'[0])

int

In [70]:
# in bytes but not bytearray
set(dir(b'abc')) - set(dir(bytearray(b'abc')))

{'__getnewargs__'}

In [71]:
# in bytearray but not bytes
set(dir(bytearray(b'abc'))) - set(dir(b'abc'))

{'__alloc__',
 '__delitem__',
 '__iadd__',
 '__imul__',
 '__setitem__',
 'append',
 'clear',
 'copy',
 'extend',
 'insert',
 'pop',
 'remove',
 'reverse'}

In [73]:
C

bytearray(b'xYam')

In [74]:
C.append(b'LMN')

TypeError: 'bytes' object cannot be interpreted as an integer

In [75]:
C.append(ord('L'))

In [76]:
C

bytearray(b'xYamL')

In [77]:
C.extend(b'MNO')

In [78]:
C

bytearray(b'xYamLMNO')

In [79]:
C + b'!#'

bytearray(b'xYamLMNO!#')

In [80]:
C[0]

120

In [81]:
C[1:]

bytearray(b'YamLMNO')

In [82]:
len(C)

8

In [83]:
C.replace('xY', 'sp')

TypeError: a bytes-like object is required, not 'str'

In [84]:
C.replace(b'xY', b'sp')

bytearray(b'spamLMNO')

In [85]:
C

bytearray(b'xYamLMNO')

In [86]:
C * 4

bytearray(b'xYamLMNOxYamLMNOxYamLMNOxYamLMNO')

In [87]:
B

b'spam'

In [88]:
list(B)

[115, 112, 97, 109]

In [89]:
C

bytearray(b'xYamLMNO')

In [90]:
list(C)

[120, 89, 97, 109, 76, 77, 78, 79]

In [91]:
S

'spam'

In [92]:
list(S)

['s', 'p', 'a', 'm']

## USING TEXT AND BINARY FILES

In [94]:
file = open('files/temp', 'w')  # text-mode output; the default mode 'r' would make file.write() fail

In [95]:
size = file.write('abxc\n')

In [96]:
file.close()

In [97]:
file = open('files/temp')

In [98]:
text = file.read()

In [99]:
text

'abxc\n'

In [106]:
open('files/temp', 'w').write('abd\n') # Write in text mode: '\n' becomes os.linesep (adds \r only on Windows; unchanged on this Linux run)

4

In [107]:
open('files/temp', 'r').read() # Read in text mode: line ends normalized to \n (would drop \r if the file had any)

'abd\n'

In [108]:
open('files/temp', 'rb').read() # Read in binary mode: verbatim

b'abd\n'

In [109]:
open('files/temp', 'wb').write('abc\n') # Write in binary mode

TypeError: a bytes-like object is required, not 'str'

In [112]:
open('files/temp', 'wb').write(b'abc\n') # Binary mode output, provide a bytes

4

In [113]:
open('files/temp', 'r').read() # Text mode input, returns a str

'abc\n'

In [115]:
open('files/temp', 'rb').read() # Binary mode input, returns a bytes

b'abc\n'

In [116]:
open('files/temp', 'wb').write(b'a\x00c')

3

In [117]:
open('files/temp', 'r').read()

'a\x00c'

In [118]:
open('files/temp', 'rb').read()

b'a\x00c'

## USING UNICODE FILES

In [120]:
S = 'A\xc4B\xe8C'

In [121]:
S

'AÄBèC'

In [122]:
len(S)

5

In [123]:
L = S.encode('latin-1')

In [125]:
L

b'A\xc4B\xe8C'

In [126]:
len(L)

5

In [127]:
U = S.encode('utf-8')

In [128]:
len(U)

7

In [129]:
open('files/latindata', 'w', encoding='latin-1').write(S)

5

In [130]:
open('files/utf8data', 'w', encoding='utf-8').write(S)

5

In [131]:
open('files/latindata', 'rb').read()

b'A\xc4B\xe8C'

In [132]:
open('files/utf8data', 'rb').read()

b'A\xc3\x84B\xc3\xa8C'

In [134]:
open('files/latindata', 'r', encoding='latin-1').read()

'AÄBèC'

In [135]:
open('files/utf8data', 'r', encoding='utf-8').read()

'AÄBèC'

In [136]:
X = open('files/latindata', 'rb').read()

In [137]:
X.decode('latin-1')

'AÄBèC'

In [139]:
X = open('files/utf8data', 'rb').read()

In [140]:
X.decode()

'AÄBèC'

In [141]:
X

b'A\xc3\x84B\xc3\xa8C'

## UNICODE FILENAMES AND STREAMS

In [143]:
import sys

In [144]:
sys.getdefaultencoding(), sys.getfilesystemencoding()

('utf-8', 'utf-8')

In [146]:
f = open('files/xxx\u00A5', 'w')

In [147]:
f.write('\xA5999\n')

5

In [148]:
f.close()

In [149]:
print(open('files/xxx\u00A5').read())

¥999



In [150]:
import glob

In [159]:
glob.glob('files/*\u00A5*')

['files/xxx¥']

In [161]:
glob.glob(b'files/*\xc2\xa5*')  # bytes pattern (UTF-8 bytes of \u00A5) returns bytes filenames

[b'files/xxx\xc2\xa5']

## THE re PATTERN-MATCHING MODULE

In [163]:
import re

In [164]:
S = 'Bugger all down here on earth!'

In [165]:
B = b'Bugger all down here on earth!'

In [167]:
re.match('(.*) down (.*) on (.*)' ,S).groups()

('Bugger all', 'here', 'earth!')

In [169]:
re.match(b'(.*) down (.*) on (.*)' ,B).groups()

(b'Bugger all', b'here', b'earth!')

## THE STRUCT BINARY DATA MODULE

In [171]:
from struct import pack

In [172]:
# Format '>i4sh': '>' big-endian, 'i' 4-byte int, '4s' 4-byte bytes string,
# 'h' 2-byte short -- 10 bytes of packed binary data in total.
pack('>i4sh', 7, b'spam', 8)

b'\x00\x00\x00\x07spam\x00\x08'

In [173]:
import struct

In [174]:
B = struct.pack('>i4sh', 7, b'spam', 8)

In [175]:
B

b'\x00\x00\x00\x07spam\x00\x08'

In [176]:
vals = struct.unpack('>i4sh', B)

In [177]:
vals

(7, b'spam', 8)

In [179]:
vals = struct.unpack('>i4sh', B.decode())

TypeError: a bytes-like object is required, not 'str'

In [180]:
F = open('files/data.bin', 'wb')

In [181]:
data = struct.pack('>i4sh', 7, b'spam', 8)

In [182]:
F.write(data)

10

In [183]:
F.close()

In [185]:
F = open('files/data.bin', 'rb')

In [186]:
data = F.read()

In [187]:
data

b'\x00\x00\x00\x07spam\x00\x08'

In [188]:
values = struct.unpack('>i4sh', data)

In [189]:
values

(7, b'spam', 8)

In [190]:
values

(7, b'spam', 8)

In [191]:
bin(values[0])

'0b111'

In [192]:
values[0] & 0x01

1

In [193]:
values[0] | 0b1010

15

In [194]:
bin(values[0] | 0b1010)

'0b1111'

In [195]:
bin(values[0] ^ 0b1010)

'0b1101'

In [196]:
bool(values[0] & 0b100)

True

In [197]:
bool(values[0] & 0b1000)

False

In [198]:
values[1]

b'spam'

In [199]:
values[1][0]

115

In [200]:
values[1][1:]

b'pam'

In [201]:
bin(values[1][0])

'0b1110011'

In [202]:
bin(values[1][0] | 0b1100)

'0b1111111'

In [203]:
values[1][0] | 0b1100

127

## THE PICKLE OBJECT SERIALIZATION MODULE

In [206]:
import pickle

In [207]:
pickle.dumps([1, 2, 3])

b'\x80\x04\x95\x0b\x00\x00\x00\x00\x00\x00\x00]\x94(K\x01K\x02K\x03e.'

In [209]:
pickle.dumps([1, 2, 3], protocol=0)

b'(lp0\nI1\naI2\naI3\na.'

In [210]:
pickle.dump([1, 2, 3], open('files/temp', 'w'))

TypeError: write() argument must be str, not bytes

In [212]:
pickle.dump([1, 2, 3], open('files/temp', 'w'), protocol=0)

TypeError: write() argument must be str, not bytes

In [214]:
# Pickled data is bytes, so the file must be opened in binary mode; the
# with-block closes (and flushes) the file before it is read back.
with open('files/temp', 'wb') as pickle_file:
    pickle.dump([1, 2, 3], pickle_file)

In [215]:
open('files/temp', 'r').read()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte

In [216]:
# Binary-mode output for pickle; the with-block guarantees the file is
# closed (and flushed) before it is loaded again.
with open('files/temp', 'wb') as pickle_file:
    pickle.dump([1, 2, 3], pickle_file)

In [218]:
pickle.load(open('files/temp', 'rb'))

[1, 2, 3]

In [219]:
open('files/temp', 'rb').read()

b'\x80\x04\x95\x0b\x00\x00\x00\x00\x00\x00\x00]\x94(K\x01K\x02K\x03e.'

## XML PARSING TOOLS

In [221]:
import re

In [224]:
text = open('files/mybooks.xml').read()

In [225]:
found = re.findall('<title>(.*)</title>', text)

In [226]:
found

['Learning Python', 'Programming Python', 'Python Pocket Reference']

In [227]:
from xml.dom.minidom import parse, Node

In [229]:
xmltree = parse('files/mybooks.xml')

In [230]:
# Print the text content of every <title> element in the parsed document.
for title_elem in xmltree.getElementsByTagName('title'):
    text_children = [child for child in title_elem.childNodes
                     if child.nodeType == Node.TEXT_NODE]
    for text_node in text_children:
        print(text_node.data)

Learning Python
Programming Python
Python Pocket Reference


In [231]:
import xml.sax.handler