In [1]:
# Load some definitions
import binascii

def bytestring(s):
    """Render an iterable of byte values as space-separated hex pairs.

    Iterating over ``s`` must yield integers (as iterating a ``bytes``
    object does). Each value is turned into a single Latin-1 byte and
    hexlified; the two-digit results are joined with single spaces.

    Returns a ``bytes`` object such as ``b'dc f1 64'`` (empty input
    gives ``b''``).
    """
    hex_pairs = []
    for code in s:
        # chr() + Latin-1 maps an integer 0-255 onto exactly one byte.
        single_byte = chr(code).encode('latin1')
        hex_pairs.append(binascii.hexlify(single_byte))
    return b" ".join(hex_pairs)
    
def utf8(u):
    """Return the UTF-8 encoding of ``u`` as space-separated hex pairs."""
    encoded = u.encode('utf-8')
    return bytestring(encoded)
    
def utf16(u):
    """Return the little-endian UTF-16 encoding of ``u`` as space-separated hex pairs."""
    encoded = u.encode('utf-16le')
    return bytestring(encoded)

What is a string?

In [2]:
# A plain string literal evaluates to type `str`.
type("A String")

str

In [3]:
# The b"" prefix creates a `bytes` object; the u"" prefix still yields a plain `str`.
b = b"A Byte String"
u = u"A Unicode String"
type(b), type(u)

(bytes, str)

Adding `bytes` and `str`

In [4]:
# Intentional failure: `bytes` and `str` cannot be concatenated directly (raises TypeError).
b"bytes_" + "str"

TypeError: can't concat bytes to str

In [5]:
# Convert one operand so both sides share a type before concatenating.
# Decoding the bytes gives a `str` result...
s1 = b"bytes_".decode() + "str"
print(type(s1), s1)
# ...while encoding the str gives a `bytes` result.
s2 = b"bytes_" + "str".encode()
print(type(s2), s2)

<class 'str'> bytes_str
<class 'bytes'> b'bytes_str'


Encoding and Decoding

In [6]:
# Seven raw bytes, displayed by the bytestring() helper as space-separated hex pairs.
latin1_string = b"\xdc\xf1\xee\xe7\xf8d\xe9"
print(bytestring(latin1_string))

b'dc f1 ee e7 f8 64 e9'


In [7]:
# Intentional failure: `bytes` objects have no .encode() method (raises AttributeError).
latin1_string.encode('ascii')

AttributeError: 'bytes' object has no attribute 'encode'

In [8]:
# Decode the raw bytes as Latin-1 to obtain text, then re-encode that text as UTF-8.
unicode_string = latin1_string.decode('latin1')
utf8_string = unicode_string.encode('utf8')
# Show the text, the UTF-8 bytes, and those bytes as hex pairs.
print(unicode_string)
print(utf8_string)
print(bytestring(utf8_string))

Üñîçødé
b'\xc3\x9c\xc3\xb1\xc3\xae\xc3\xa7\xc3\xb8d\xc3\xa9'
b'c3 9c c3 b1 c3 ae c3 a7 c3 b8 64 c3 a9'


In [9]:
import sys, locale

# Expressions evaluated by get_text_settings(). The string is split on
# whitespace, so each expression must not contain any spaces.
expressions = """
        locale.getpreferredencoding()
        type(my_file)
        my_file.encoding
        sys.stdout.isatty()
        sys.stdout.encoding
        sys.stdin.isatty()
        sys.stdin.encoding
        sys.stderr.isatty()
        sys.stderr.encoding
        sys.getdefaultencoding()
        sys.getfilesystemencoding()
    """

def get_text_settings():
    """Print each entry of ``expressions`` next to the repr of its value.

    A scratch file is opened in text mode so that the ``my_file``
    expressions can report the type and encoding of a freshly opened
    file. One line is printed per expression, with the expression text
    right-justified to 30 columns.
    """
    # `with` closes the scratch file even if one of the expressions raises.
    # NOTE: the path is POSIX-style and the file is created/truncated.
    with open('/tmp/dummy', 'w') as my_file:
        for expression in expressions.split():
            # eval() is used only on the fixed literals defined above, never
            # on external input; it resolves `my_file` from this local scope.
            value = eval(expression)
            print(expression.rjust(30), '->', repr(value))

In [10]:
# Print the encoding-related settings as seen by this interpreter session.
get_text_settings()

 locale.getpreferredencoding() -> 'UTF-8'
                 type(my_file) -> <class '_io.TextIOWrapper'>
              my_file.encoding -> 'UTF-8'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'UTF-8'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


Gotcha: Normalization

In [11]:
# Two spellings of the same word that render identically on screen.
# Explicit escapes keep the difference intact even if an editor or a
# copy-paste normalizes the text of this cell.
# s1 is precomposed: it ends in U+00E9 LATIN SMALL LETTER E WITH ACUTE.
s1 = "caf\u00e9"
# s2 is decomposed: it ends in 'e' followed by U+0301 COMBINING ACUTE ACCENT.
s2 = "cafe\u0301"
# The lengths differ and the strings do not compare equal.
print(len(s1), len(s2))
print(s1 == s2)

4 5
False


In [12]:
# Show the UTF-8 bytes of both strings as hex to reveal how they differ.
print(utf8(s1))
print(utf8(s2))

b'63 61 66 c3 a9'
b'63 61 66 65 cc 81'


Handling Unicode in the Standard Library

In [13]:
import os
def find(path):
    """Print the repr of every file path found beneath ``path``.

    The tree is traversed with ``os.walk``; each file name is joined to
    the directory it was found in and the repr of the result is printed,
    one per line.
    """
    for root, _subdirs, names in os.walk(path):
        full_paths = (os.path.join(root, name) for name in names)
        for full_path in full_paths:
            print(repr(full_path))

In [14]:
# Walking from a `str` path prints `str` file paths.
find('.')

'./Helpers.ipynb'
'./Python 2 Strings.ipynb'
'./Python 3 Strings.ipynb'
'./.ipynb_checkpoints/Helpers-checkpoint.ipynb'
'./.ipynb_checkpoints/Python 2 Strings-checkpoint.ipynb'
'./.ipynb_checkpoints/Python 3 Strings-checkpoint.ipynb'


In [15]:
# Walking from a `bytes` path prints `bytes` file paths.
find(b'.')

b'./Helpers.ipynb'
b'./Python 2 Strings.ipynb'
b'./Python 3 Strings.ipynb'
b'./.ipynb_checkpoints/Helpers-checkpoint.ipynb'
b'./.ipynb_checkpoints/Python 2 Strings-checkpoint.ipynb'
b'./.ipynb_checkpoints/Python 3 Strings-checkpoint.ipynb'
