String* dropped StringsAsBytes and global encoding

construct · Feb 16, 2018 · 764d108 · 764d108
1 parent be182b8
commit 764d108
Show file tree

Hide file tree

Showing 11 changed files with 78 additions and 129 deletions.
diff --git a/construct/__init__.py b/construct/__init__.py
@@ -89,7 +89,6 @@
     'FormatFieldError',
     'FuncPath',
     'globalfullprinting',
-    'globalstringencoding',
     'GreedyBytes',
     'GreedyRange',
     'GreedyString',
@@ -154,7 +153,6 @@
     'SelectError',
     'Sequence',
     'setglobalfullprinting',
-    'setglobalstringencoding',
     'Short',
     'Single',
     'SizeofError',
@@ -166,7 +164,6 @@
     'StringError',
     'StringNullTerminated',
     'StringPaddedTrimmed',
-    'StringsAsBytes',
     'Struct',
     'Subconstruct',
     'sum_',

diff --git a/construct/core.py b/construct/core.py
@@ -1306,57 +1306,34 @@ def _emitdecompiled(self, code):
 #===============================================================================
 # strings
 #===============================================================================
-globalstringencoding = None
-
-
-@singleton
-class StringsAsBytes:
-    """
-    Used for marking String* classes to not encode/decode bytes (allows using `str` on Python 2).
-    """
-    pass
-
 
+#: Explicitly supported encodings (by String and CString classes).
+#:
 possiblestringencodings = dict(
-    StringsAsBytes=1,
     ascii=1,
-    utf8=1, utf_8=1, U8=1,
-    utf16=2, utf_16=2, U16=2, utf_16_be=2, utf_16_le=2,
-    utf32=4, utf_32=4, U32=4, utf_32_be=4, utf_32_le=4,
+    utf8=1, utf_8=1, u8=1,
+    utf16=2, utf_16=2, u16=2, utf_16_be=2, utf_16_le=2,
+    utf32=4, utf_32=4, u32=4, utf_32_be=4, utf_32_le=4,
 )
 
 
-def selectencoding(localencoding):
+def selectencoding(encoding):
     """Used internally."""
-    encoding = localencoding or globalstringencoding
     if not encoding:
         raise StringError("String* classes require explicit encoding")
     return encoding
 
 
 def calculateunits(encoding):
     """Used internally."""
-    if encoding is StringsAsBytes:
-        encoding = "StringsAsBytes"
+    encoding = encoding.replace("-","_").lower()
     if encoding not in possiblestringencodings:
-        raise StringError("encoding not implemented: %r" % (encoding,))
+        raise StringError("encoding %r not among %r" % (encoding, possiblestringencodings,))
     unitsize = possiblestringencodings[encoding]
     finalunit = b"\x00" * unitsize
     return unitsize, finalunit
 
 
-def setglobalstringencoding(encoding):
-    r"""
-    Sets the encoding globally for all String PascalString CString GreedyString instances. Note that encoding specified expiciltly in a particular construct supersedes it. Note also that global encoding is applied during parsing and building (not class instantiation).
-
-    See :class:`~construct.core.StringsAsBytes` for non-encoding, allowing using `str` on Python 2.
-
-    :param encoding: string like "utf8", or StringsAsBytes, or None (disable global override)
-    """
-    global globalstringencoding
-    globalstringencoding = encoding
-
-
 class StringEncoded(Adapter):
     """Used internally."""
     __slots__ = ["encoding"]
@@ -1366,29 +1343,15 @@ def __init__(self, subcon, encoding):
         self.encoding = selectencoding(encoding)
 
     def _decode(self, obj, context, path):
-        encoding = self.encoding
-        if isinstance(encoding, str):
-            return obj.decode(encoding)
-        if isinstance(encoding, StringsAsBytes.__class__):
-            return obj
+        return obj.decode(self.encoding)
 
     def _encode(self, obj, context, path):
-        encoding = self.encoding
-        if isinstance(encoding, str):
-            if not isinstance(obj, unicodestringtype):
-                raise StringError("string encoding failed, expected unicode string")
-            return obj.encode(encoding)
-        if isinstance(encoding, StringsAsBytes.__class__):
-            if not isinstance(obj, bytestringtype):
-                raise StringError("string encoding failed, expected byte string")
-            return obj
+        if not isinstance(obj, unicodestringtype):
+            raise StringError("string encoding failed, expected unicode string")
+        return obj.encode(self.encoding)
 
     def _emitparse(self, code):
-        encoding = self.encoding
-        if isinstance(encoding, str):
-            return "(%s).decode(%r)" % (self.subcon._compileparse(code), encoding, )
-        if isinstance(encoding, StringsAsBytes.__class__):
-            return "(%s)" % (self.subcon._compileparse(code), )
+        return "(%s).decode(%r)" % (self.subcon._compileparse(code), self.encoding, )
 
 
 class StringPaddedTrimmed(Construct):
@@ -1431,7 +1394,7 @@ def _sizeof(self, context, path):
 
     def _emitparse(self, code):
         unitsize, finalunit = calculateunits(self.encoding)
-        code.append(r"""
+        code.append("""
             def parse_paddedtrimmedstring(io, length, unitsize, finalunit):
                 if length % unitsize:
                     raise StringError
@@ -1484,19 +1447,18 @@ def parse_nullterminatedstring(io, unitsize, finalunit):
         return "parse_nullterminatedstring(io, %s, %r)" % (unitsize, finalunit, )
 
 
-def String(length, encoding=None):
+def String(length, encoding):
     r"""
     Configurable, fixed-length or variable-length string field.
 
     When parsing, the byte string is stripped of null bytes (per encoding unit), then decoded. Length is an integer or context lambda. When building, the string is encoded, then trimmed to specified length minus encoding unit, then padded to specified length. Size is same as length parameter.
 
-    .. warning:: String and CString only support encodings explicitly listed in :func:`~construct.core.possiblestringencodings` .
+    .. warning:: String and CString only support encodings explicitly listed in :class:`~construct.core.possiblestringencodings` .
 
     :param length: integer or context lambda, length in bytes (not unicode characters)
-    :param encoding: string like "utf8" "utf16" "utf32", or StringsAsBytes, or None (use global override)
+    :param encoding: string like "utf8" "utf16" "utf32"
 
-    :raises StringError: String* classes require explicit encoding
-    :raises StringError: building a unicode string but no encoding
+    :raises StringError: building a non-unicode string
     :raises StringError: specified length or object for building is not a multiple of unit
     :raises StringError: selected encoding is not on supported list
 
@@ -1531,17 +1493,16 @@ def String(length, encoding=None):
     return StringEncoded(StringPaddedTrimmed(length, encoding), encoding)
 
 
-def PascalString(lengthfield, encoding=None):
+def PascalString(lengthfield, encoding):
     r"""
     Length-prefixed string. The length field can be variable length (such as VarInt) or fixed length (such as Int64ub). VarInt is recommended when designing new protocols. Stored length is in bytes, not characters. Size is not defined.
 
     :class:`~construct.core.VarInt` is recommended for new protocols, as it is more compact and never overflows.
 
     :param lengthfield: Construct instance, field used to parse and build the length (like VarInt Int64ub)
-    :param encoding: string like "utf8" "utf16" "utf32", or StringsAsBytes, or None (use global override)
+    :param encoding: string like "utf8" "utf16" "utf32"
 
-    :raises StringError: String* classes require explicit encoding
-    :raises StringError: building a unicode string but no encoding
+    :raises StringError: building a non-unicode string
 
     Example::
 
@@ -1554,16 +1515,15 @@ def PascalString(lengthfield, encoding=None):
     return StringEncoded(Prefixed(lengthfield, GreedyBytes), encoding)
 
 
-def CString(encoding=None):
+def CString(encoding):
     r"""
     String ending in a terminating null byte (or null bytes in case of UTF16 UTF32).
 
-    .. warning:: String and CString only support encodings explicitly listed in :func:`~construct.core.possiblestringencodings` .
+    .. warning:: String and CString only support encodings explicitly listed in :class:`~construct.core.possiblestringencodings` .
 
-    :param encoding: string like "utf8" "utf16" "utf32", or StringsAsBytes, or None (use global override)
+    :param encoding: string like "utf8" "utf16" "utf32"
 
-    :raises StringError: String* classes require explicit encoding
-    :raises StringError: building a unicode string but no encoding
+    :raises StringError: building a non-unicode string
     :raises StringError: object for building is not a multiple of unit
     :raises StringError: selected encoding is not on supported list
 
@@ -1578,16 +1538,15 @@ def CString(encoding=None):
     return StringEncoded(StringNullTerminated(encoding), encoding)
 
 
-def GreedyString(encoding=None):
+def GreedyString(encoding):
     r"""
     String that reads entire stream until EOF, and writes a given string as-is. If no encoding is specified, this is essentially GreedyBytes.
 
     Analog to :class:`~construct.core.GreedyBytes` , and identical when no encoding is used.
 
-    :param encoding: string like "utf8" "utf16" "utf32", or StringsAsBytes, or None (use global override)
+    :param encoding: string like "utf8" "utf16" "utf32"
 
-    :raises StringError: String* classes require explicit encoding
-    :raises StringError: building a unicode string but no encoding
+    :raises StringError: building a non-unicode string
     :raises StreamError: stream failed when reading until EOF
 
     Example::

diff --git a/construct/examples/formats/executable/elf32.py b/construct/examples/formats/executable/elf32.py
@@ -31,7 +31,7 @@ def elf32_body(ElfInt16, ElfInt32):
 
     elf32_section_header = Struct(
         "name_offset" / ElfInt32,
-        "name" / Pointer(this._.strtab_data_offset + this.name_offset, CString(encoding=StringsAsBytes)),
+        "name" / Pointer(this._.strtab_data_offset + this.name_offset, CString(encoding="utf8")),
         "type" / Enum(ElfInt32, 
             NULL = 0,
             PROGBITS = 1,

diff --git a/construct/examples/formats/executable/pe32.py b/construct/examples/formats/executable/pe32.py
@@ -88,7 +88,7 @@ def _decode(self, obj, context, path):
 )
 
 symbol_table = "symbol_table" / Struct(
-    "name" / String(8, encoding=StringsAsBytes),
+    "name" / String(8, encoding="utf8"),
     "value" / Int32ul,
     "section_number" / Enum(
         ExprAdapter(Int16sl,
@@ -293,7 +293,7 @@ def _decode(self, obj, context, path):
 )
 
 section = "section" / Struct(
-    "name" / String(8, encoding=StringsAsBytes),
+    "name" / String(8, encoding="utf8"),
     "virtual_size" / Int32ul,
     "virtual_address" / Int32ul,
     "raw_data_size" / Int32ul,

diff --git a/construct/examples/formats/graphics/emf.py b/construct/examples/formats/graphics/emf.py
@@ -135,7 +135,8 @@
     "device_width_mm" / Int32sl,          # Width of reference device in millimeters
     "device_height_mm" / Int32sl,         # Height of reference device in millimeters
 
-    "description" / Pointer(this.description_offset, String(this.description_size * 2, encoding=StringsAsBytes)),
+    "description" / Pointer(this.description_offset, 
+        String(this.description_size * 2, "utf8")),
 
     # padding up to end of record
     Padding(this.record_size - 88),

diff --git a/construct/examples/formats/graphics/gif.py b/construct/examples/formats/graphics/gif.py
@@ -15,7 +15,7 @@
 from construct import *
 
 
-data_sub_block = PascalString(Int8ul, StringsAsBytes)
+data_sub_block = PascalString(Int8ul, "utf8")
 
 gif_logical_screen = Struct(
     "width" / Int16ul,
@@ -39,8 +39,8 @@
 
 application_extension = Struct(
     "block_size" / Const(11, Int8ul),
-    "application_identifier" / String(8, StringsAsBytes),
-    "application_auth_code" / String(3, StringsAsBytes),
+    "application_identifier" / String(8, "utf8"),
+    "application_auth_code" / String(3, "utf8"),
     "data_sub_block" / data_sub_block,
     "block_terminator" / Int8ul,
 )

diff --git a/construct/examples/formats/graphics/png.py b/construct/examples/formats/graphics/png.py
@@ -62,7 +62,7 @@
 # 11.3.3.3: iCCP - Embedded ICC profile
 #===============================================================================
 iccp_info = "iccp_info" / Struct(
-    "name" / CString(StringsAsBytes),
+    "name" / CString("utf8"),
     compression_method,
     "compressed_profile" / Bytes(this._.length - (len_(this.name) + 2)),
 )
@@ -95,15 +95,15 @@
 # 11.3.4.3: tEXt - Textual data
 #===============================================================================
 text_info = "text_info" / Struct(
-    "keyword" / CString(StringsAsBytes),
+    "keyword" / CString("utf8"),
     "text" / Bytes(this._.length - (len_(this.keyword) + 1)),
 )
 
 #===============================================================================
 # 11.3.4.4: zTXt - Compressed textual data
 #===============================================================================
 ztxt_info = "ztxt_info" / Struct(
-    "keyword" / CString(StringsAsBytes),
+    "keyword" / CString("utf8"),
     compression_method,
     # As with iCCP, length is chunk length, minus length of
     # keyword, minus two: one byte for the null terminator,
@@ -115,11 +115,11 @@
 # 11.3.4.5: iTXt - International textual data
 #===============================================================================
 itxt_info = "itxt_info" / Struct(
-    "keyword" / CString(StringsAsBytes),
+    "keyword" / CString("utf8"),
     "compression_flag" / Byte,
     compression_method,
-    "language_tag" / CString(StringsAsBytes),
-    "translated_keyword" / CString(StringsAsBytes),
+    "language_tag" / CString("utf8"),
+    "translated_keyword" / CString("utf8"),
     "text" / Bytes(this._.length - (len_(this.keyword) + len_(this.language_tag) + len_(this.translated_keyword) + 5)),
 )
 
@@ -158,7 +158,7 @@ def splt_info_data_length(ctx):
     return (ctx._.length - len(ctx.name) - 2) // entry_size
 
 splt_info = "data" / Struct(
-    "name" / CString(StringsAsBytes),
+    "name" / CString("utf8"),
     "sample_depth" / Byte,
     "table" / Array(splt_info_data_length,
         IfThenElse(this.sample_depth == 8,

diff --git a/docs/advanced.rst b/docs/advanced.rst
@@ -46,6 +46,12 @@ b'\x01\x00\x00\x00'
 Bytes and bits
 ==============
 
+.. warning::
+
+    Python 3 known problem:
+
+    Unprefixed string literals like "data" are interpreted as unicode on Python 3. This causes failures when using fields like `Bytes`.
+
 "Strings" of bytes (`str` in PY2 and `bytes` in PY3) can be moved around as-is. Bits are discussed in a later chapter.
 
 >>> Bytes(5).build(b"12345")
@@ -62,21 +68,15 @@ b'39217839219...'
 Strings
 ========
 
-.. warning::
-
-    Python 3 known problem:
-
-    Unprefixed string literals like "data" are on Python 3 interpreted as unicode (not bytes). If you look at the documentation on this site, you will notice that most examples use b"\\x00" literals (so called b-strings). Unicode strings are processed by String* classes, and require explicit encoding like "utf8".
-
 .. warning::
 
     Python 2 known problem:
 
-    Encoding needs to be specified explicitly, although  :func:`~construct.core.setglobalstringencoding` can be used for that as well. Encodings like UTF8 UTF16 UTF32 are recommended. `StringsAsBytes` can be used to specify non-encoding (to allow `str` on Python 2).
+    Unprefixed string literals like "text" are interpreted as bytes on Python 2. This causes failures when using fields that operate on unicode objects, like String* classes.
 
 .. note::
 
-    Encodings like UTF16 UTF32 (including little-endian) work fine with all String* classes.
+    Encodings like UTF8 UTF16 UTF32 (including little-endian) work fine with all String* classes. However, two of them, String and CString, support only the encodings explicitly listed in :class:`~construct.core.possiblestringencodings` .
 
 String is a fixed-length construct that pads built string with null bytes, and strips those same null bytes when parsing. Strings can also be trimmed when building. If you supply a string that is too long, the construct will truncate it instead of raising a StringError.
 

diff --git a/docs/api/strings.rst b/docs/api/strings.rst
@@ -2,8 +2,6 @@
 Core API: Strings
 ===================
 
-.. autofunction:: construct.setglobalstringencoding
-.. autofunction:: construct.StringsAsBytes
 .. autodata:: construct.possiblestringencodings
 .. autofunction:: construct.String
 .. autofunction:: construct.PascalString