Skip to content

Commit

Permalink
PaddedString CString reimplemented; StringNullTerminated StringPadded…
Browse files Browse the repository at this point in the history
…Trimmed removed
  • Loading branch information
arekbulski committed Mar 30, 2018
1 parent def7a82 commit b1d9c1f
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 123 deletions.
2 changes: 0 additions & 2 deletions construct/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,6 @@
'StreamError',
'StringEncoded',
'StringError',
'StringNullTerminated',
'StringPaddedTrimmed',
'Struct',
'Subconstruct',
'sum_',
Expand Down
132 changes: 12 additions & 120 deletions construct/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1469,29 +1469,22 @@ def _emitprimitivetype(self, ksy, bitwise):
)


def selectencoding(encoding):
    """Used internally. Returns the encoding unchanged, rejecting an empty or missing one."""
    if encoding:
        return encoding
    raise StringError("String* classes require explicit encoding")


def calculateunits(encoding):
def encodingunit(encoding):
"""Used internally."""
encoding = encoding.replace("-","_").lower()
if encoding not in possiblestringencodings:
raise StringError("encoding %r not among %r" % (encoding, possiblestringencodings,))
unitsize = possiblestringencodings[encoding]
finalunit = bytes(unitsize)
return unitsize, finalunit
raise StringError("encoding %r not found among %r" % (encoding, possiblestringencodings,))
return bytes(possiblestringencodings[encoding])


class StringEncoded(Adapter):
"""Used internally."""

def __init__(self, subcon, encoding):
super(StringEncoded, self).__init__(subcon)
self.encoding = selectencoding(encoding)
if not encoding:
raise StringError("String* classes require explicit encoding")
self.encoding = encoding

def _decode(self, obj, context, path):
return obj.decode(self.encoding)
Expand All @@ -1507,114 +1500,18 @@ def _emitparse(self, code):
return "(%s).decode(%r)" % (self.subcon._compileparse(code), self.encoding, )


class StringPaddedTrimmed(Construct):
    """
    Used internally.

    Fixed-length bytes field. Parsing reads the whole field and returns the bytes preceding the first terminator unit (as returned by calculateunits). Building trims the bytes to length minus one unit, pads with null bytes up to length, and writes the result.

    :param length: integer or context lambda, field length in bytes
    :param encoding: string naming the encoding, must be non-empty

    :raises StringError: encoding is empty, or rejected by calculateunits
    :raises StringError: length, or the object being built, is not a multiple of the encoding unit
    """

    def __init__(self, length, encoding):
        super(StringPaddedTrimmed, self).__init__()
        self.length = length
        self.encoding = selectencoding(encoding)

    def _parse(self, stream, context, path):
        length = evaluate(self.length, context)
        unitsize, finalunit = calculateunits(self.encoding)
        if length % unitsize:
            raise StringError("byte length must be multiple of encoding-unit, %s" % (unitsize,))
        obj = _read_stream(stream, length)
        # Scan unit by unit (not byte by byte) so a zero byte inside a
        # multi-byte unit is not mistaken for the terminator.
        endsat = 0
        objlen = len(obj)
        while endsat+unitsize <= objlen and obj[endsat:endsat+unitsize] != finalunit:
            endsat += unitsize
        return obj[:endsat]

    def _build(self, obj, stream, context, path):
        originalobj = obj
        length = evaluate(self.length, context)
        unitsize, finalunit = calculateunits(self.encoding)

        if length % unitsize:
            raise StringError("byte length must be multiple of encoding-unit, %s" % (unitsize,))
        if len(obj) % unitsize:
            raise StringError("string length must be multiple of encoding-unit, %s" % (unitsize,))
        # Keep room for at least one terminating unit, then pad to full length.
        if len(obj) > length-unitsize:
            obj = obj[:length-unitsize]
        obj = obj.ljust(length, b"\x00")
        _write_stream(stream, obj, length)
        # The caller gets back the object it passed in, not the trimmed/padded bytes.
        return originalobj

    def _sizeof(self, context, path):
        return evaluate(self.length, context)

    def _emitparse(self, code):
        unitsize, finalunit = calculateunits(self.encoding)
        # The emitted helper must be valid, indented Python source. Its `def`
        # is kept at column 0 so it compiles whether or not code.append
        # dedents the block (NOTE(review): confirm against code.append).
        code.append("""
def parse_paddedtrimmedstring(io, length, unitsize, finalunit):
    if length % unitsize:
        raise StringError
    obj = read_bytes(io, length)
    endsat = 0
    objlen = len(obj)
    while endsat+unitsize <= objlen and obj[endsat:endsat+unitsize] != finalunit:
        endsat += unitsize
    return obj[:endsat]
""")
        return "parse_paddedtrimmedstring(io, %s, %s, %r)" % (self.length, unitsize, finalunit, )


class StringNullTerminated(Construct):
    """
    Used internally.

    Variable-length bytes field ended by a terminator unit (as returned by calculateunits). Parsing reads one encoding unit at a time until the terminator is read, and returns the preceding units joined together (terminator excluded). Building writes the bytes followed by one terminator unit.

    :param encoding: string naming the encoding, must be non-empty

    :raises StringError: encoding is empty, or rejected by calculateunits
    :raises StringError: object being built is not a multiple of the encoding unit
    """

    def __init__(self, encoding):
        super(StringNullTerminated, self).__init__()
        self.encoding = selectencoding(encoding)

    def _parse(self, stream, context, path):
        unitsize, finalunit = calculateunits(self.encoding)
        result = []
        # Read whole units so a zero byte inside a multi-byte unit is not
        # mistaken for the terminator. The loop ends only on the terminator
        # (or on whatever _read_stream raises).
        while True:
            unit = _read_stream(stream, unitsize)
            if unit == finalunit:
                break
            result.append(unit)
        return b"".join(result)

    def _build(self, obj, stream, context, path):
        originalobj = obj
        unitsize, finalunit = calculateunits(self.encoding)
        if len(obj) % unitsize:
            raise StringError("string length must be multiple of encoding-unit, %s" % (unitsize,))
        data = obj + finalunit
        _write_stream(stream, data)
        # The caller gets back the object it passed in, without the terminator.
        return originalobj

    def _emitparse(self, code):
        unitsize, finalunit = calculateunits(self.encoding)
        # The emitted helper must be valid, indented Python source. Its `def`
        # is kept at column 0 so it compiles whether or not code.append
        # dedents the block (NOTE(review): confirm against code.append).
        code.append("""
def parse_nullterminatedstring(io, unitsize, finalunit):
    result = []
    while True:
        unit = read_bytes(io, unitsize)
        if unit == finalunit:
            break
        result.append(unit)
    return b"".join(result)
""")
        return "parse_nullterminatedstring(io, %s, %r)" % (unitsize, finalunit, )


def PaddedString(length, encoding):
r"""
Configurable, fixed-length or variable-length string field.
When parsing, the byte string is stripped of null bytes (per encoding unit), then decoded. Length is an integer or context lambda. When building, the string is encoded, then trimmed to specified length minus encoding unit, then padded to specified length. Size is same as length parameter.
When parsing, the byte string is stripped of null bytes (per encoding unit), then decoded. Length is an integer or context lambda. When building, the string is encoded and then padded to the specified length. If the encoded string is larger than the specified length, it fails with PaddingError. Size is the same as the length parameter.
.. warning:: PaddedString and CString only support encodings explicitly listed in :class:`~construct.core.possiblestringencodings` .
:param length: integer or context lambda, length in bytes (not unicode characters)
:param encoding: string like: utf8 utf16 utf32 ascii
:raises StringError: building a non-unicode string
:raises StringError: specified length or object for building is not a multiple of unit
:raises StringError: selected encoding is not on supported list
Can propagate any exception from the lambda, possibly non-ConstructError.
Expand All @@ -1627,7 +1524,7 @@ def PaddedString(length, encoding):
>>> d.parse(_)
u'Афон'
"""
macro = StringEncoded(StringPaddedTrimmed(length, encoding), encoding)
macro = StringEncoded(FixedSized(length, NullStripped(GreedyBytes, pad=encodingunit(encoding))), encoding)
def _emitfulltype(ksy, bitwise):
return dict(size=length, type="strz", encoding=encoding)
macro._emitfulltype = _emitfulltype
Expand All @@ -1636,9 +1533,7 @@ def _emitfulltype(ksy, bitwise):

def PascalString(lengthfield, encoding):
r"""
Length-prefixed string. The length field can be variable length (such as VarInt) or fixed length (such as Int64ub). VarInt is recommended when designing new protocols. Stored length is in bytes, not characters. Size is not defined.
:class:`~construct.core.VarInt` is recommended for new protocols, as it is more compact and never overflows.
Length-prefixed string. The length field can be variable length (such as VarInt) or fixed length (such as Int64ub). :class:`~construct.core.VarInt` is recommended when designing new protocols. Stored length is in bytes, not characters. Size is not defined.
:param lengthfield: Construct instance, field used to parse and build the length (like VarInt Int64ub)
:param encoding: string like: utf8 utf16 utf32 ascii
Expand Down Expand Up @@ -1672,7 +1567,6 @@ def CString(encoding):
:param encoding: string like: utf8 utf16 utf32 ascii
:raises StringError: building a non-unicode string
:raises StringError: object for building is not a multiple of unit
:raises StringError: selected encoding is not on supported list
Example::
Expand All @@ -1683,7 +1577,7 @@ def CString(encoding):
>>> d.parse(_)
u'Афон'
"""
macro = StringEncoded(StringNullTerminated(encoding), encoding)
macro = StringEncoded(NullTerminated(GreedyBytes, term=encodingunit(encoding)), encoding)
def _emitfulltype(ksy, bitwise):
return dict(type="strz", encoding=encoding)
macro._emitfulltype = _emitfulltype
Expand All @@ -1692,9 +1586,7 @@ def _emitfulltype(ksy, bitwise):

def GreedyString(encoding):
r"""
String that reads entire stream until EOF, and writes a given string as-is. If no encoding is specified, this is essentially GreedyBytes.
Analogous to :class:`~construct.core.GreedyBytes` , and identical when no encoding is used.
String that reads entire stream until EOF, and writes a given string as-is. Analog to :class:`~construct.core.GreedyBytes` but also applies unicode-to-bytes encoding.
:param encoding: string like: utf8 utf16 utf32 ascii
Expand Down Expand Up @@ -4695,7 +4587,7 @@ def _build(self, obj, stream, context, path):
data = stream2.getvalue()
pad = length - len(data)
if pad < 0:
raise PaddingError("subcon build %d bytes but was allowed only %d" % (offset2-offset1, length, ))
raise PaddingError("subcon build %d bytes but was allowed only %d" % (len(data), length))
_write_stream(stream, data)
_write_stream(stream, bytes(pad))
return buildret
Expand Down
2 changes: 2 additions & 0 deletions docs/transition29.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Compiled added (used internally)

String* require explicit encodings, all of them support UTF16 UTF32 encodings, but PaddedString CString dropped some parameters and support only encodings explicitly listed in `possiblestringencodings` (`see tutorial page <https://construct.readthedocs.io/en/latest/advanced.html#strings>`_)

PaddedString CString classes reimplemented using NullTerminated NullStripped

String* build empty strings into empty bytes (even though, for example, UTF16 encodes an empty string into a 2-byte marker)

String class renamed to PaddedString
Expand Down
3 changes: 2 additions & 1 deletion tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,8 @@ def test_paddedstring():

d = PaddedString(100, "ascii")
assert d.parse(b"X"*100) == u"X"*100
assert d.build(u"X"*100) == b"X"*99+b"\x00"
assert d.build(u"X"*100) == b"X"*100
assert raises(d.build, u"X"*200) == PaddingError

for e,us in [("utf8",1),("utf16",2),("utf_16_le",2),("utf32",4),("utf_32_le",4)]:
s = u"Афон"
Expand Down

0 comments on commit b1d9c1f

Please sign in to comment.