From 3987fb38f7cae2fb675ebe44f5aa645559f5ce2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Germ=C3=A1n=20Larra=C3=ADn?= Date: Fri, 30 Nov 2018 17:45:05 -0300 Subject: [PATCH 1/3] requirements: add 'lxml' "lxml is a Pythonic, mature binding for the libxml2 and libxslt libraries. It provides safe and convenient access to these libraries using the ElementTree API." https://lxml.de/ https://github.com/lxml/lxml --- requirements/base.txt | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements/base.txt b/requirements/base.txt index 1d19d0e6..aec4d6d9 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -2,6 +2,7 @@ # note: it is mandatory to register all dependencies of the required packages. # Required packages: +lxml==4.2.5 marshmallow==2.16.3 pytz==2018.9 diff --git a/setup.py b/setup.py index 6e939b35..2cabda76 100755 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ def get_version(*file_paths: Sequence[str]) -> str: # TODO: add reasonable upper-bound for some of these packages? requirements = [ + 'lxml>=4.2.5', 'marshmallow>=2.16.3', 'pytz>=2018.7', ] From 6bac1df59852b8cb519628a76b7adbd6b12e5efe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Germ=C3=A1n=20Larra=C3=ADn?= Date: Wed, 5 Dec 2018 11:39:14 -0300 Subject: [PATCH 2/3] requirements: add 'defusedxml' "XML bomb protection for Python stdlib modules". https://github.com/tiran/defusedxml --- requirements/base.txt | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements/base.txt b/requirements/base.txt index aec4d6d9..0cb7cfe3 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -2,6 +2,7 @@ # note: it is mandatory to register all dependencies of the required packages. # Required packages: +defusedxml==0.5.0 lxml==4.2.5 marshmallow==2.16.3 pytz==2018.9 diff --git a/setup.py b/setup.py index 2cabda76..da24544f 100755 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ def get_version(*file_paths: Sequence[str]) -> str: # TODO: add reasonable upper-bound for some of these packages? requirements = [ + 'defusedxml>=0.5.0', 'lxml>=4.2.5', 'marshmallow>=2.16.3', 'pytz>=2018.7', From 93de7f9b27983bfcd9ae07c1b18e2bb235cabdd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Germ=C3=A1n=20Larra=C3=ADn?= Date: Tue, 18 Dec 2018 15:44:14 -0300 Subject: [PATCH 3/3] libs: add module `xml_utils` Add helper functions (plus some related exception classes). - `parse_untrusted_xml`: handles the numerous issues and concerns related to parsing untrusted XML data. - `read_xml_schema`: instantiate an XML schema object from a file. - `validate_xml_doc`: validate an XML document against an XML schema. Sources of files in 'test_data/xml/attacks': - https://en.wikipedia.org/wiki/Billion_laughs_attack#Code_example - https://pypi.org/project/defusedxml/#billion-laughs-exponential-entity-expansion - https://pypi.org/project/defusedxml/#external-entity-expansion-remote - https://pypi.org/project/defusedxml/#quadratic-blowup-entity-expansion Some tests have not been implemented. --- cl_sii/libs/xml_utils.py | 239 ++++++++++++++++++ setup.cfg | 6 + .../xml/attacks/billion-laughs-1.xml | 16 ++ .../xml/attacks/billion-laughs-2.xml | 8 + .../external-entity-expansion-remote.xml | 5 + .../quadratic-blowup-entity-expansion.xml | 5 + tests/test_data/xml/trivial.xml | 5 + tests/test_xml_utils.py | 101 ++++++++ tests/utils.py | 15 ++ 9 files changed, 400 insertions(+) create mode 100644 cl_sii/libs/xml_utils.py create mode 100644 tests/test_data/xml/attacks/billion-laughs-1.xml create mode 100644 tests/test_data/xml/attacks/billion-laughs-2.xml create mode 100644 tests/test_data/xml/attacks/external-entity-expansion-remote.xml create mode 100644 tests/test_data/xml/attacks/quadratic-blowup-entity-expansion.xml create mode 100644 tests/test_data/xml/trivial.xml create mode 100644 tests/test_xml_utils.py create mode 100644 tests/utils.py diff --git a/cl_sii/libs/xml_utils.py b/cl_sii/libs/xml_utils.py new file mode 100644 index 00000000..1a9fd0a3 --- /dev/null +++ b/cl_sii/libs/xml_utils.py @@ -0,0 +1,239 @@ +import logging +import os + +import defusedxml +import defusedxml.lxml +import lxml.etree +import xml.parsers.expat +import xml.parsers.expat.errors + + +logger = logging.getLogger(__name__) + + +############################################################################### +# exceptions +############################################################################### + +class BaseXmlParsingError(Exception): + + """ + Base class for all XML parsing errors. + """ + + +class XmlSyntaxError(BaseXmlParsingError): + + """ + The value to be parsed is syntactically invalid XML. + + It is also possible that some cases of maliciously constructed data are + reported as syntactically invalid XML e.g. a "billion laughs" attack. + + """ + + +class XmlFeatureForbidden(BaseXmlParsingError): + + """ + The parsed XML contains/uses a feature that is forbidden. + + Usually an XML feature is forbidden for security reasons, to prevent + some attack vectors. + + .. seealso:: + https://docs.python.org/3/library/xml.html#xml-vulnerabilities + + """ + + +class UnknownXmlParsingError(BaseXmlParsingError): + + """ + An unkwnown XML parsing error or for which there is no handling implementation. + + It is useful because the XML parsing process indirectly uses many + (standard and 3rd party) libraries, some of them with native + implementations and/or with a lot of obscure Python magic. + + """ + + +class XmlSchemaDocValidationError(Exception): + + """ + XML document did not be validate against an XML schema. + + """ + + +############################################################################### +# functions +############################################################################### + +def parse_untrusted_xml(value: bytes) -> lxml.etree.ElementBase: + """ + Parse XML-encoded content in value. + + .. note:: + It is ok to use it for parsing untrusted or unauthenticated data. + See https://docs.python.org/3/library/xml.html#xml-vulnerabilities + + .. warning:: + It is possible that for some cases of maliciously constructed data an + ``XmlSyntaxError`` will be raised instead of a ``XmlFeatureForbidden`` + exception. + + :raises TypeError: + :raises XmlSyntaxError: if it is not syntactically valid XML + :raises XmlFeatureForbidden: if the parsed XML document contains/uses a + feature that is forbidden + :raises UnknownXmlParsingError: unkwnown XML parsing error or for which + there is no handling implementation + + """ + # TODO: limit input max size (it might not be straightforward if value is a generator, which + # would be desirable). + + if not isinstance(value, bytes): + raise TypeError("Value to be parsed as XML must be bytes.") + + # note: with this call, 'defusedxml' will + # - create a custom parser (instance of 'lxml.etree.XMLParser'), which is what will + # fundamentally add safety to the parsing (e.g. using 'defusedxml.lxml.RestrictedElement' + # as a custom version of 'lxml.etree.ElementBase'), + # - call the original 'lxml.etree.fromstring' (binary code), + # - run 'defusedxml.lxml.check_docinfo'. + + # warning: do NOT change the exception handling order. + try: + + xml_root_em = defusedxml.lxml.fromstring( + text=value, + parser=None, # default: None (a custom one will be created) + base_url=None, # default: None + forbid_dtd=False, # default: False (allow Document Type Definition) + forbid_entities=True, # default: True (forbid Entity definitions/declarations) + ) # type: lxml.etree.ElementBase + + except (defusedxml.DTDForbidden, + defusedxml.EntitiesForbidden, + defusedxml.ExternalReferenceForbidden) as exc: + # note: we'd rather use 'defusedxml.DefusedXmlException' but that would catch + # 'defusedxml.NotSupportedError' as well + + raise XmlFeatureForbidden("XML uses or contains a forbidden feature.") from exc + + except lxml.etree.XMLSyntaxError as exc: + # note: the MRO of this exception class is: + # - XMLSyntaxError: "Syntax error while parsing an XML document." + # - ParseError: "Syntax error while parsing an XML document." + # note: do not confuse it with the almost identically named 'lxml.etree.ParserError' + # ("Internal lxml parser error"), whose parent class *is not* 'LxmlSyntaxError'. + # - LxmlSyntaxError: "Base class for all syntax errors." + # - LxmlError: "Main exception base class for lxml. All other exceptions inherit from + # this one. + # - lxml.etree.Error: "Common base class for all non-exit exceptions." + + # 'exc.msg' is a user-friendly error msg and includes the reference to line and column + # e.g. "Detected an entity reference loop, line 1, column 7". + # Thus we do not need these attributes: (exc.position, exc.lineno, exc.offset) + exc_msg = "XML syntax error. {}.".format(exc.msg) + raise XmlSyntaxError(exc_msg) from exc + + except xml.parsers.expat.ExpatError as exc: + # TODO: if this is reached it means we should improve this exception handler (even if + # it is just to raise the same exception with a different message) because + # it is a good idea to determine whether the source of the problem really is the + # XML-encoded content. + + # https://docs.python.org/3/library/pyexpat.html#expaterror-exceptions + # https://docs.python.org/3/library/pyexpat.html#xml.parsers.expat.errors.messages + # e.g. + # "unknown encoding" + # "mismatched tag" + # "parsing aborted" + # "out of memory" + + # For sanity crop the XML-encoded content to max 1 KiB (arbitrary value). + log_msg = "Unexpected XML 'ExpatError' at line {} offset {}: {}. Content: %s".format( + exc.lineno, exc.offset, xml.parsers.expat.errors.messages[exc.code]) + logger.exception(log_msg, str(value[:1024])) + + exc_msg = "Unexpected error while parsing value as XML. Line {}, offset {}.".format( + exc.lineno, exc.offset) + raise UnknownXmlParsingError(exc_msg) from exc + + except lxml.etree.LxmlError as exc: + # TODO: if this is reached it means we should add another exception handler (even if + # it is just to raise the same exception with the same message) because it is a good + # idea to determine whether the source of the problem really is the response content. + + # For sanity crop the XML-encoded content to max 1 KiB (arbitrary value). + log_msg = "Unexpected 'LxmlError' that is not an 'XMLSyntaxError'. Content: %s" + logger.exception(log_msg, str(value[:1024])) + + exc_msg = "Unexpected error while parsing value as XML." + raise UnknownXmlParsingError(exc_msg) from exc + + except ValueError as exc: + # TODO: if this is reached it means we should add another exception handler (even if + # it is just to raise the same exception with the same message) because it is a good + # idea to determine whether the source of the problem really is the response content. + + # For sanity crop the XML-encoded content to max 1 KiB (arbitrary value). + log_msg = "Unexpected error while parsing value as XML. Content: %s" + logger.exception(log_msg, str(value[:1024])) + + exc_msg = "Unexpected error while parsing value as XML." + raise UnknownXmlParsingError(exc_msg) from exc + + return xml_root_em + + +def read_xml_schema(filename: str) -> lxml.etree.XMLSchema: + """ + Instantiate an XML schema object from a file. + + :raises ValueError: if there is no file at ``filename`` + + """ + if os.path.exists(filename) and os.path.isfile(filename): + return lxml.etree.XMLSchema(file=filename) + raise ValueError("XML schema file not found.", filename) + + +def validate_xml_doc(xml_schema: lxml.etree.XMLSchema, xml_doc: lxml.etree.ElementBase) -> None: + """ + Validate ``xml_doc`` against XML schema ``xml_schema``. + + :raises XmlSchemaDocValidationError: if ``xml_doc`` did not be validate + against ``xml_schema`` + + """ + # There are several ways to validate 'xml_doc' according to an 'xml_schema'. + # Different calls and what happens if validation passes or fails: + # - xml_schema.assert_(xml_doc): nothign / raises 'AssertionError' + # - xml_schema.assertValid(xml_doc): nothing / raises 'DocumentInvalid' + # - xml_schema.validate(xml_doc): returns True / returns False + + try: + xml_schema.assertValid(xml_doc) + except lxml.etree.DocumentInvalid as exc: + # note: 'exc.error_log' and 'xml_schema.error_log' are the same object + # (type 'lxml.etree._ListErrorLog'). + + # TODO: advanced error details parsing, without leaking too much information. + # xml_error_log = exc.error_log # type: lxml.etree._ListErrorLog + # last_xml_error = exc.error_log.last_error # type: lxml.etree._LogEntry + # last_xml_error_xml_doc_line = last_xml_error.line + + # TODO: does 'xml_schema.error_log' persist? is it necessary to clear it afterwards? + # `xml_schema._clear_error_log()` + + # Simplest and safest way to get the error message. + # Error example: + # "Element 'DTE': No matching global declaration available for the validation root., line 2" # noqa: E501 + validation_error_msg = str(exc) + + raise XmlSchemaDocValidationError(validation_error_msg) from exc diff --git a/setup.cfg b/setup.cfg index 93144517..79258364 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,9 +27,15 @@ disallow_untyped_defs = True check_untyped_defs = True warn_return_any = True +[mypy-defusedxml.*] +ignore_missing_imports = True + [mypy-django.*] ignore_missing_imports = True +[mypy-lxml.*] +ignore_missing_imports = True + [mypy-marshmallow.*] ignore_missing_imports = True diff --git a/tests/test_data/xml/attacks/billion-laughs-1.xml b/tests/test_data/xml/attacks/billion-laughs-1.xml new file mode 100644 index 00000000..0cb7b935 --- /dev/null +++ b/tests/test_data/xml/attacks/billion-laughs-1.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + +]> +&lol9; diff --git a/tests/test_data/xml/attacks/billion-laughs-2.xml b/tests/test_data/xml/attacks/billion-laughs-2.xml new file mode 100644 index 00000000..14c8ae80 --- /dev/null +++ b/tests/test_data/xml/attacks/billion-laughs-2.xml @@ -0,0 +1,8 @@ + + + + + +]> +&d; diff --git a/tests/test_data/xml/attacks/external-entity-expansion-remote.xml b/tests/test_data/xml/attacks/external-entity-expansion-remote.xml new file mode 100644 index 00000000..b5eb9a10 --- /dev/null +++ b/tests/test_data/xml/attacks/external-entity-expansion-remote.xml @@ -0,0 +1,5 @@ + + +]> + diff --git a/tests/test_data/xml/attacks/quadratic-blowup-entity-expansion.xml b/tests/test_data/xml/attacks/quadratic-blowup-entity-expansion.xml new file mode 100644 index 00000000..84bd11c1 --- /dev/null +++ b/tests/test_data/xml/attacks/quadratic-blowup-entity-expansion.xml @@ -0,0 +1,5 @@ + + +]> +&a;&a;&a;... repeat diff --git a/tests/test_data/xml/trivial.xml b/tests/test_data/xml/trivial.xml new file mode 100644 index 00000000..69a940b8 --- /dev/null +++ b/tests/test_data/xml/trivial.xml @@ -0,0 +1,5 @@ + + text + texttail + + diff --git a/tests/test_xml_utils.py b/tests/test_xml_utils.py new file mode 100644 index 00000000..bb04316e --- /dev/null +++ b/tests/test_xml_utils.py @@ -0,0 +1,101 @@ +import unittest + +import lxml.etree + +from cl_sii.libs.xml_utils import ( # noqa: F401 + XmlSyntaxError, XmlFeatureForbidden, + parse_untrusted_xml, read_xml_schema, validate_xml_doc, +) + +from .utils import read_test_file_bytes + + +class FunctionParseUntrustedXmlTests(unittest.TestCase): + + def test_parse_untrusted_xml_valid(self) -> None: + value = ( + b'\n' + b' text\n' + b' texttail\n' + b' \n' + b'') + xml = parse_untrusted_xml(value) + self.assertIsInstance(xml, lxml.etree.ElementBase) + # print(xml) + self.assertEqual( + lxml.etree.tostring(xml, pretty_print=False), + value) + + def test_bytes_text(self) -> None: + value = b'not xml' # type: ignore + with self.assertRaises(XmlSyntaxError) as cm: + parse_untrusted_xml(value) + + self.assertSequenceEqual( + cm.exception.args, + ("XML syntax error. Start tag expected, '<' not found, line 1, column 1.", ) + ) + + def test_attack_billion_laughs_1(self) -> None: + value = read_test_file_bytes('test_data/xml/attacks/billion-laughs-1.xml') + with self.assertRaises(XmlSyntaxError) as cm: + parse_untrusted_xml(value) + + self.assertSequenceEqual( + cm.exception.args, + ("XML syntax error. Detected an entity reference loop, line 1, column 7.", ) + ) + + def test_attack_billion_laughs_2(self) -> None: + value = read_test_file_bytes('test_data/xml/attacks/billion-laughs-2.xml') + with self.assertRaises(XmlSyntaxError) as cm: + parse_untrusted_xml(value) + + self.assertSequenceEqual( + cm.exception.args, + ("XML syntax error. Detected an entity reference loop, line 1, column 4.", ) + ) + + def test_attack_quadratic_blowup(self) -> None: + value = read_test_file_bytes('test_data/xml/attacks/quadratic-blowup-entity-expansion.xml') + with self.assertRaises(XmlFeatureForbidden) as cm: + parse_untrusted_xml(value) + + self.assertSequenceEqual( + cm.exception.args, + ("XML uses or contains a forbidden feature.", ) + ) + + def test_attack_external_entity_expansion_remote(self) -> None: + value = read_test_file_bytes('test_data/xml/attacks/external-entity-expansion-remote.xml') + with self.assertRaises(XmlFeatureForbidden) as cm: + parse_untrusted_xml(value) + + self.assertSequenceEqual( + cm.exception.args, + ("XML uses or contains a forbidden feature.", ) + ) + + def test_type_error(self) -> None: + value = 1 # type: ignore + with self.assertRaises(TypeError) as cm: + parse_untrusted_xml(value) + + self.assertSequenceEqual( + cm.exception.args, + ("Value to be parsed as XML must be bytes.", ) + ) + + +class FunctionReadXmlSchemaTest(unittest.TestCase): + + # TODO: implement + + pass + + +class FunctionValidateXmlDocTest(unittest.TestCase): + + # TODO: implement + + pass diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 00000000..ae424d5d --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,15 @@ +import os + + +_TESTS_DIR_PATH = os.path.dirname(__file__) + + +def read_test_file_bytes(path: str) -> bytes: + filepath = os.path.join( + _TESTS_DIR_PATH, + path, + ) + with open(filepath, mode='rb') as file: + content = file.read() + + return content