Skip to content

Commit

Permalink
String* dropped StringsAsBytes and global encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
arekbulski committed Feb 16, 2018
1 parent be182b8 commit 764d108
Show file tree
Hide file tree
Showing 11 changed files with 78 additions and 129 deletions.
3 changes: 0 additions & 3 deletions construct/__init__.py
Expand Up @@ -89,7 +89,6 @@
'FormatFieldError',
'FuncPath',
'globalfullprinting',
'globalstringencoding',
'GreedyBytes',
'GreedyRange',
'GreedyString',
Expand Down Expand Up @@ -154,7 +153,6 @@
'SelectError',
'Sequence',
'setglobalfullprinting',
'setglobalstringencoding',
'Short',
'Single',
'SizeofError',
Expand All @@ -166,7 +164,6 @@
'StringError',
'StringNullTerminated',
'StringPaddedTrimmed',
'StringsAsBytes',
'Struct',
'Subconstruct',
'sum_',
Expand Down
97 changes: 28 additions & 69 deletions construct/core.py
Expand Up @@ -1306,57 +1306,34 @@ def _emitdecompiled(self, code):
#===============================================================================
# strings
#===============================================================================
globalstringencoding = None


@singleton
class StringsAsBytes:
"""
Used for marking String* classes to not encode/decode bytes (allows using `str` on Python 2).
"""
pass


#: Explicitly supported encodings (by String and CString classes).
#:
possiblestringencodings = dict(
StringsAsBytes=1,
ascii=1,
utf8=1, utf_8=1, U8=1,
utf16=2, utf_16=2, U16=2, utf_16_be=2, utf_16_le=2,
utf32=4, utf_32=4, U32=4, utf_32_be=4, utf_32_le=4,
utf8=1, utf_8=1, u8=1,
utf16=2, utf_16=2, u16=2, utf_16_be=2, utf_16_le=2,
utf32=4, utf_32=4, u32=4, utf_32_be=4, utf_32_le=4,
)


def selectencoding(localencoding):
def selectencoding(encoding):
"""Used internally."""
encoding = localencoding or globalstringencoding
if not encoding:
raise StringError("String* classes require explicit encoding")
return encoding


def calculateunits(encoding):
"""Used internally."""
if encoding is StringsAsBytes:
encoding = "StringsAsBytes"
encoding = encoding.replace("-","_").lower()
if encoding not in possiblestringencodings:
raise StringError("encoding not implemented: %r" % (encoding,))
raise StringError("encoding %r not among %r" % (encoding, possiblestringencodings,))
unitsize = possiblestringencodings[encoding]
finalunit = b"\x00" * unitsize
return unitsize, finalunit


def setglobalstringencoding(encoding):
r"""
Sets the encoding globally for all String PascalString CString GreedyString instances. Note that encoding specified expiciltly in a particular construct supersedes it. Note also that global encoding is applied during parsing and building (not class instantiation).
See :class:`~construct.core.StringsAsBytes` for non-encoding, allowing using `str` on Python 2.
:param encoding: string like "utf8", or StringsAsBytes, or None (disable global override)
"""
global globalstringencoding
globalstringencoding = encoding


class StringEncoded(Adapter):
"""Used internally."""
__slots__ = ["encoding"]
Expand All @@ -1366,29 +1343,15 @@ def __init__(self, subcon, encoding):
self.encoding = selectencoding(encoding)

def _decode(self, obj, context, path):
encoding = self.encoding
if isinstance(encoding, str):
return obj.decode(encoding)
if isinstance(encoding, StringsAsBytes.__class__):
return obj
return obj.decode(self.encoding)

def _encode(self, obj, context, path):
encoding = self.encoding
if isinstance(encoding, str):
if not isinstance(obj, unicodestringtype):
raise StringError("string encoding failed, expected unicode string")
return obj.encode(encoding)
if isinstance(encoding, StringsAsBytes.__class__):
if not isinstance(obj, bytestringtype):
raise StringError("string encoding failed, expected byte string")
return obj
if not isinstance(obj, unicodestringtype):
raise StringError("string encoding failed, expected unicode string")
return obj.encode(self.encoding)

def _emitparse(self, code):
encoding = self.encoding
if isinstance(encoding, str):
return "(%s).decode(%r)" % (self.subcon._compileparse(code), encoding, )
if isinstance(encoding, StringsAsBytes.__class__):
return "(%s)" % (self.subcon._compileparse(code), )
return "(%s).decode(%r)" % (self.subcon._compileparse(code), self.encoding, )


class StringPaddedTrimmed(Construct):
Expand Down Expand Up @@ -1431,7 +1394,7 @@ def _sizeof(self, context, path):

def _emitparse(self, code):
unitsize, finalunit = calculateunits(self.encoding)
code.append(r"""
code.append("""
def parse_paddedtrimmedstring(io, length, unitsize, finalunit):
if length % unitsize:
raise StringError
Expand Down Expand Up @@ -1484,19 +1447,18 @@ def parse_nullterminatedstring(io, unitsize, finalunit):
return "parse_nullterminatedstring(io, %s, %r)" % (unitsize, finalunit, )


def String(length, encoding=None):
def String(length, encoding):
r"""
Configurable, fixed-length or variable-length string field.
When parsing, the byte string is stripped of null bytes (per encoding unit), then decoded. Length is an integer or context lambda. When building, the string is encoded, then trimmed to specified length minus encoding unit, then padded to specified length. Size is same as length parameter.
.. warning:: String and CString only support encodings explicitly listed in :func:`~construct.core.possiblestringencodings` .
.. warning:: String and CString only support encodings explicitly listed in :class:`~construct.core.possiblestringencodings` .
:param length: integer or context lambda, length in bytes (not unicode characters)
:param encoding: string like "utf8" "utf16" "utf32", or StringsAsBytes, or None (use global override)
:param encoding: string like "utf8" "utf16" "utf32"
:raises StringError: String* classes require explicit encoding
:raises StringError: building a unicode string but no encoding
:raises StringError: building a non-unicode string
:raises StringError: specified length or object for building is not a multiple of unit
:raises StringError: selected encoding is not on supported list
Expand Down Expand Up @@ -1531,17 +1493,16 @@ def String(length, encoding=None):
return StringEncoded(StringPaddedTrimmed(length, encoding), encoding)


def PascalString(lengthfield, encoding=None):
def PascalString(lengthfield, encoding):
r"""
Length-prefixed string. The length field can be variable length (such as VarInt) or fixed length (such as Int64ub). VarInt is recommended when designing new protocols. Stored length is in bytes, not characters. Size is not defined.
:class:`~construct.core.VarInt` is recommended for new protocols, as it is more compact and never overflows.
:param lengthfield: Construct instance, field used to parse and build the length (like VarInt Int64ub)
:param encoding: string like "utf8" "utf16" "utf32", or StringsAsBytes, or None (use global override)
:param encoding: string like "utf8" "utf16" "utf32"
:raises StringError: String* classes require explicit encoding
:raises StringError: building a unicode string but no encoding
:raises StringError: building a non-unicode string
Example::
Expand All @@ -1554,16 +1515,15 @@ def PascalString(lengthfield, encoding=None):
return StringEncoded(Prefixed(lengthfield, GreedyBytes), encoding)


def CString(encoding=None):
def CString(encoding):
r"""
String ending in a terminating null byte (or null bytes in case of UTF16 UTF32).
.. warning:: String and CString only support encodings explicitly listed in :func:`~construct.core.possiblestringencodings` .
.. warning:: String and CString only support encodings explicitly listed in :class:`~construct.core.possiblestringencodings` .
:param encoding: string like "utf8" "utf16" "utf32", or StringsAsBytes, or None (use global override)
:param encoding: string like "utf8" "utf16" "utf32"
:raises StringError: String* classes require explicit encoding
:raises StringError: building a unicode string but no encoding
:raises StringError: building a non-unicode string
:raises StringError: object for building is not a multiple of unit
:raises StringError: selected encoding is not on supported list
Expand All @@ -1578,16 +1538,15 @@ def CString(encoding=None):
return StringEncoded(StringNullTerminated(encoding), encoding)


def GreedyString(encoding=None):
def GreedyString(encoding):
r"""
String that reads entire stream until EOF, and writes a given string as-is. If no encoding is specified, this is essentially GreedyBytes.
Analogous to :class:`~construct.core.GreedyBytes` , and identical when no encoding is used.
:param encoding: string like "utf8" "utf16" "utf32", or StringsAsBytes, or None (use global override)
:param encoding: string like "utf8" "utf16" "utf32"
:raises StringError: String* classes require explicit encoding
:raises StringError: building a unicode string but no encoding
:raises StringError: building a non-unicode string
:raises StreamError: stream failed when reading until EOF
Example::
Expand Down
2 changes: 1 addition & 1 deletion construct/examples/formats/executable/elf32.py
Expand Up @@ -31,7 +31,7 @@ def elf32_body(ElfInt16, ElfInt32):

elf32_section_header = Struct(
"name_offset" / ElfInt32,
"name" / Pointer(this._.strtab_data_offset + this.name_offset, CString(encoding=StringsAsBytes)),
"name" / Pointer(this._.strtab_data_offset + this.name_offset, CString(encoding="utf8")),
"type" / Enum(ElfInt32,
NULL = 0,
PROGBITS = 1,
Expand Down
4 changes: 2 additions & 2 deletions construct/examples/formats/executable/pe32.py
Expand Up @@ -88,7 +88,7 @@ def _decode(self, obj, context, path):
)

symbol_table = "symbol_table" / Struct(
"name" / String(8, encoding=StringsAsBytes),
"name" / String(8, encoding="utf8"),
"value" / Int32ul,
"section_number" / Enum(
ExprAdapter(Int16sl,
Expand Down Expand Up @@ -293,7 +293,7 @@ def _decode(self, obj, context, path):
)

section = "section" / Struct(
"name" / String(8, encoding=StringsAsBytes),
"name" / String(8, encoding="utf8"),
"virtual_size" / Int32ul,
"virtual_address" / Int32ul,
"raw_data_size" / Int32ul,
Expand Down
3 changes: 2 additions & 1 deletion construct/examples/formats/graphics/emf.py
Expand Up @@ -135,7 +135,8 @@
"device_width_mm" / Int32sl, # Width of reference device in millimeters
"device_height_mm" / Int32sl, # Height of reference device in millimeters

"description" / Pointer(this.description_offset, String(this.description_size * 2, encoding=StringsAsBytes)),
"description" / Pointer(this.description_offset,
String(this.description_size * 2, "utf8")),

# padding up to end of record
Padding(this.record_size - 88),
Expand Down
6 changes: 3 additions & 3 deletions construct/examples/formats/graphics/gif.py
Expand Up @@ -15,7 +15,7 @@
from construct import *


data_sub_block = PascalString(Int8ul, StringsAsBytes)
data_sub_block = PascalString(Int8ul, "utf8")

gif_logical_screen = Struct(
"width" / Int16ul,
Expand All @@ -39,8 +39,8 @@

application_extension = Struct(
"block_size" / Const(11, Int8ul),
"application_identifier" / String(8, StringsAsBytes),
"application_auth_code" / String(3, StringsAsBytes),
"application_identifier" / String(8, "utf8"),
"application_auth_code" / String(3, "utf8"),
"data_sub_block" / data_sub_block,
"block_terminator" / Int8ul,
)
Expand Down
14 changes: 7 additions & 7 deletions construct/examples/formats/graphics/png.py
Expand Up @@ -62,7 +62,7 @@
# 11.3.3.3: iCCP - Embedded ICC profile
#===============================================================================
iccp_info = "iccp_info" / Struct(
"name" / CString(StringsAsBytes),
"name" / CString("utf8"),
compression_method,
"compressed_profile" / Bytes(this._.length - (len_(this.name) + 2)),
)
Expand Down Expand Up @@ -95,15 +95,15 @@
# 11.3.4.3: tEXt - Textual data
#===============================================================================
text_info = "text_info" / Struct(
"keyword" / CString(StringsAsBytes),
"keyword" / CString("utf8"),
"text" / Bytes(this._.length - (len_(this.keyword) + 1)),
)

#===============================================================================
# 11.3.4.4: zTXt - Compressed textual data
#===============================================================================
ztxt_info = "ztxt_info" / Struct(
"keyword" / CString(StringsAsBytes),
"keyword" / CString("utf8"),
compression_method,
# As with iCCP, length is chunk length, minus length of
# keyword, minus two: one byte for the null terminator,
Expand All @@ -115,11 +115,11 @@
# 11.3.4.5: iTXt - International textual data
#===============================================================================
itxt_info = "itxt_info" / Struct(
"keyword" / CString(StringsAsBytes),
"keyword" / CString("utf8"),
"compression_flag" / Byte,
compression_method,
"language_tag" / CString(StringsAsBytes),
"translated_keyword" / CString(StringsAsBytes),
"language_tag" / CString("utf8"),
"translated_keyword" / CString("utf8"),
"text" / Bytes(this._.length - (len_(this.keyword) + len_(this.language_tag) + len_(this.translated_keyword) + 5)),
)

Expand Down Expand Up @@ -158,7 +158,7 @@ def splt_info_data_length(ctx):
return (ctx._.length - len(ctx.name) - 2) // entry_size

splt_info = "data" / Struct(
"name" / CString(StringsAsBytes),
"name" / CString("utf8"),
"sample_depth" / Byte,
"table" / Array(splt_info_data_length,
IfThenElse(this.sample_depth == 8,
Expand Down
16 changes: 8 additions & 8 deletions docs/advanced.rst
Expand Up @@ -46,6 +46,12 @@ b'\x01\x00\x00\x00'
Bytes and bits
==============

.. warning::

Python 3 known problem:

Unprefixed string literals like "data" are interpreted as unicode on Python 3. This causes failures when using fields like `Bytes`.

"Strings" of bytes (`str` in PY2 and `bytes` in PY3) can be moved around as-is. Bits are discussed in a later chapter.

>>> Bytes(5).build(b"12345")
Expand All @@ -62,21 +68,15 @@ b'39217839219...'
Strings
========

.. warning::

Python 3 known problem:

Unprefixed string literals like "data" are on Python 3 interpreted as unicode (not bytes). If you look at the documentation on this site, you will notice that most examples use b"\\x00" literals (so called b-strings). Unicode strings are processed by String* classes, and require explicit encoding like "utf8".

.. warning::

Python 2 known problem:

Encoding needs to be specified explicitly, although :func:`~construct.core.setglobalstringencoding` can be used for that as well. Encodings like UTF8 UTF16 UTF32 are recommended. `StringsAsBytes` can be used to specify non-encoding (to allow `str` on Python 2).
Unprefixed string literals like "text" are interpreted as bytes on Python 2. This causes failures when using fields that operate on unicode objects, like the String* classes.

.. note::

Encodings like UTF16 UTF32 (including little-endian) work fine with all String* classes.
Encodings like UTF8 UTF16 UTF32 (including little-endian) work fine with all String* classes. However, two of them, String and CString, support only the encodings explicitly listed in :class:`~construct.core.possiblestringencodings` .

String is a fixed-length construct that pads the built string with null bytes, and strips those same null bytes when parsing. Strings can also be trimmed when building. If you supply a string that is too long, the construct will truncate it instead of raising a StringError.

Expand Down
2 changes: 0 additions & 2 deletions docs/api/strings.rst
Expand Up @@ -2,8 +2,6 @@
Core API: Strings
===================

.. autofunction:: construct.setglobalstringencoding
.. autofunction:: construct.StringsAsBytes
.. autodata:: construct.possiblestringencodings
.. autofunction:: construct.String
.. autofunction:: construct.PascalString
Expand Down

0 comments on commit 764d108

Please sign in to comment.