Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 87 additions & 16 deletions cl_sii/dte/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
>>> from cl_sii.dte import parse
>>> from cl_sii.libs import xml_utils

>>> with open('/dir/my_file.xml', mode='rb') as f:
>>> xml_file_path = '/dir/my_file.xml'
>>> with open(xml_file_path, mode='rb') as f:
... xml_doc = xml_utils.parse_untrusted_xml(f.read())

>>> parse.clean_dte_xml(xml_doc)
Expand All @@ -16,11 +17,12 @@
>>> dte_struct = parse.parse_dte_xml(xml_doc)

"""
import io
import logging
import os
from dataclasses import MISSING, _MISSING_TYPE
from datetime import date
from typing import Optional, Union
from typing import Optional, Tuple, Union

import lxml.etree

Expand All @@ -33,8 +35,18 @@
logger = logging.getLogger(__name__)


DTE_XMLNS = 'http://www.sii.cl/SiiDte'
"""
XML namespace for DTE element in DTE XML schema.

Ref: target namespace in 'DTE_v10.xsd' and 'EnvioDTE_v10.xsd'.

* cl_sii/data/ref/factura_electronica/schemas-xml/DTE_v10.xsd#L19 (f57a326)
* cl_sii/data/ref/factura_electronica/schemas-xml/EnvioDTE_v10.xsd#L14 (f57a326)
"""

# note: the namespace is referenced through the constant (single source of truth) instead of
#   repeating the literal; a dict literal must not list the same key twice.
DTE_XMLNS_MAP = {
    'sii-dte': DTE_XMLNS,
}
"""
Mapping from XML namespace prefix to full name, for DTE processing.
Expand All @@ -59,29 +71,36 @@
# main functions
###############################################################################

def clean_dte_xml(
    xml_doc: lxml.etree.ElementBase,
    set_missing_xmlns: bool = False,
    remove_doc_personalizado: bool = True,
) -> Tuple[lxml.etree.ElementBase, bool]:
    """
    Apply changes to ``xml_doc`` towards compliance to DTE XML schema.

    .. seealso:: :data:`DTE_XML_SCHEMA_OBJ`

    There is a kwarg to enable/disable each kind of change.

    .. warning::
        Do not assume the ``xml_doc`` object is modified in-place because in
        some cases it will be replaced (i.e. an entirely different object).

    :param xml_doc: XML document to clean
    :param set_missing_xmlns: whether to set the DTE XML namespace on the
        root element if it is missing
    :param remove_doc_personalizado: whether to remove the non-standard but
        popular element 'DocPersonalizado', if it exists
    :returns: new ``xml_doc`` and whether it was modified or not

    """
    modified = False

    # note: each helper returns the (possibly replaced) document, so it must be rebound
    #   before the next step runs.
    if set_missing_xmlns:
        xml_doc, _modified = _set_dte_xml_missing_xmlns(xml_doc)
        modified = modified or _modified

    if remove_doc_personalizado:
        xml_doc, _modified = _remove_dte_xml_doc_personalizado(xml_doc)
        modified = modified or _modified

    return xml_doc, modified


def validate_dte_xml(xml_doc: lxml.etree.ElementBase) -> None:
Expand Down Expand Up @@ -125,6 +144,58 @@ def parse_dte_xml(xml_doc: lxml.etree.ElementBase) -> data_models.DteDataL2:
# helpers
###############################################################################

def _set_dte_xml_missing_xmlns(
    xml_doc: lxml.etree.ElementBase,
) -> Tuple[lxml.etree.ElementBase, bool]:
    """
    Set the DTE XML namespace on the root element of ``xml_doc`` if it is missing.

    If the root element's tag is already the namespaced one, ``xml_doc`` is
    returned untouched. If it is the simple (non-namespaced) one, an 'xmlns'
    attribute is set on the root element, the document is serialized and then
    parsed again, and the newly parsed document is returned (presumably so that
    the attribute is interpreted as a namespace declaration).

    :returns: (possibly new) ``xml_doc`` and whether it was modified or not
    :raises Exception: if the root element's tag is neither the simple nor
        the namespaced expected name

    """
    # source: name of the XML element without namespace.
    #   cl_sii/data/ref/factura_electronica/schemas-xml/DTE_v10.xsd#L22 (f57a326)
    #   cl_sii/data/ref/factura_electronica/schemas-xml/EnvioDTE_v10.xsd#L92 (f57a326)
    simple_tag = 'DTE'
    namespaced_tag = '{%s}%s' % (DTE_XMLNS, simple_tag)

    # Tag of 'DTE' should be ...
    assert namespaced_tag == '{http://www.sii.cl/SiiDte}DTE'

    root_em = xml_doc.getroottree().getroot()
    actual_tag = root_em.tag

    # Nothing to do: the namespace is already there.
    if actual_tag == namespaced_tag:
        return xml_doc, False

    if actual_tag != simple_tag:
        exc_msg = "XML root element tag does not match the expected simple or namespaced name."
        raise Exception(exc_msg, simple_tag, namespaced_tag, actual_tag)

    # Set the attribute and round-trip the document through bytes; the caller gets
    # the re-parsed document, not the object that was passed in.
    root_em.set('xmlns', DTE_XMLNS)
    buffer = io.BytesIO()
    xml_utils.write_xml_doc(xml_doc, buffer)
    reparsed_xml_doc = xml_utils.parse_untrusted_xml(buffer.getvalue())

    return reparsed_xml_doc, True


def _remove_dte_xml_doc_personalizado(
    xml_doc: lxml.etree.ElementBase,
) -> Tuple[lxml.etree.ElementBase, bool]:
    """
    Remove non-standard but popular element 'DocPersonalizado', if it exists.

    The element is searched for from the root of the tree ``xml_doc`` belongs to
    (first match only).

    :returns: ``xml_doc`` (same object) and whether it was modified or not

    """
    modified = False
    em_path = 'sii-dte:DocPersonalizado'

    xml_em = xml_doc.getroottree().find(em_path, namespaces=DTE_XMLNS_MAP)
    if xml_em is not None:
        modified = True
        # note: remove the element from its actual parent. The lookup starts at the tree's
        #   root, so ``xml_doc.remove()`` would only work when ``xml_doc`` is that root.
        xml_em.getparent().remove(xml_em)

    return xml_doc, modified


def _get_tipo_dte(xml_etree: lxml.etree.ElementTree) -> constants.TipoDteEnum:
em_path = 'sii-dte:Documento/sii-dte:Encabezado/sii-dte:IdDoc/sii-dte:TipoDTE'

Expand Down
43 changes: 43 additions & 0 deletions cl_sii/libs/xml_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import os
from typing import IO

import defusedxml
import defusedxml.lxml
Expand Down Expand Up @@ -237,3 +238,45 @@ def validate_xml_doc(xml_schema: lxml.etree.XMLSchema, xml_doc: lxml.etree.Eleme
validation_error_msg = str(exc)

raise XmlSchemaDocValidationError(validation_error_msg) from exc


def write_xml_doc(xml_doc: lxml.etree.ElementBase, output: IO[bytes]) -> None:
    """
    Serialize ``xml_doc`` into the bytes stream ``output``.

    What to expect from the serialization:

    * The encoding declared by the document is the one used for the output.
    * The XML declaration (``<?xml ... ?>``) is included.
    * The quoting of each XML declaration attribute's value may change
      i.e. from ``"`` to ``'`` or vice versa.
    * In self-closing tags, the whitespace between the last attribute
      and the closing (``/>``) may be removed e.g.
      ``<DigestMethod Algorithm="http://www.w3.org/2000/09/xmldsig#sha1" />`` to
      ``<DigestMethod Algorithm="http://www.w3.org/2000/09/xmldsig#sha1"/>``
    * The output is not pretty-printed.

    If all you need is a temporary in-memory bytes stream, create a
    :class:`io.BytesIO` object and pass it as ``output``.

    """
    # note: use `IO[X]` for arguments and `TextIO`/`BinaryIO` for return types (says GVR).
    #   https://github.com/python/typing/issues/518#issuecomment-350903120

    tree: lxml.etree.ElementTree = xml_doc.getroottree()
    document_encoding = tree.docinfo.encoding

    # See:
    #   https://lxml.de/api/lxml.etree._ElementTree-class.html#write
    tree.write(
        output,
        encoding=document_encoding,
        # alternatives: 'xml', 'html', 'text' or 'c14n'
        method='xml',
        # note: include XML declaration (`<?xml ... ?>`).
        xml_declaration=True,
        pretty_print=False,
        # note: we are not sure what this does.
        #   default: True.
        with_tail=True,
    )
110 changes: 110 additions & 0 deletions scripts/clean_dte_xml_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env python
"""
Clean DTE XML files.


Example for a single file::

./scripts/clean_dte_xml_file.py file \
'tests/test_data/sii-dte/DTE--76354771-K--33--170.xml' \
'tests/test_data/sii-dte/DTE--76354771-K--33--170-clean.xml'


Example for all files in a directory::

./scripts/clean_dte_xml_file.py dir 'tests/test_data/sii-dte/'


"""
import difflib
import os
import pathlib
import sys
from typing import Iterable

try:
    import cl_sii  # noqa: F401
except ImportError:
    # If package 'cl-sii' is not installed, try appending the project repo directory to the
    # Python path, assuming that we are in the project repo. If not, it will fail nonetheless.
    # note: when this file is run as a script `__name__` is '__main__', so the expression
    #   below resolves to the current working directory (not to this file's directory).
    sys.path.append(os.path.dirname(os.path.abspath(__name__)))
    import cl_sii  # noqa: F401

import cl_sii.dte.parse
from cl_sii.libs import xml_utils


# TODO: log messages instead of print.


def clean_dte_xml_file(input_file_path: str, output_file_path: str) -> Iterable[bytes]:
    """
    Clean the DTE XML file at ``input_file_path`` and save it to ``output_file_path``.

    The document is parsed, cleaned (missing XML namespace set, 'DocPersonalizado'
    removed), validated and then written to the output file.

    :returns: lines (bytes) of the unified diff between the input file's
        content and the output file's content

    """
    with open(input_file_path, mode='rb') as input_file:
        original_bytes = input_file.read()

    parsed_xml_doc = xml_utils.parse_untrusted_xml(original_bytes)

    # note: the "modified" flag is not needed here; the diff below tells the story.
    cleaned_xml_doc, _ = cl_sii.dte.parse.clean_dte_xml(
        parsed_xml_doc,
        set_missing_xmlns=True,
        remove_doc_personalizado=True,
    )
    cl_sii.dte.parse.validate_dte_xml(cleaned_xml_doc)

    with open(output_file_path, 'w+b') as output_file:
        xml_utils.write_xml_doc(cleaned_xml_doc, output_file)

    # Read back what was actually written to disk, to compare it against the input.
    with open(output_file_path, mode='rb') as output_file:
        rewritten_bytes = output_file.read()

    # note: another way to compute the difference in a similar format is
    #   `diff -Naur $input_file_path $output_file_path`
    original_lines = original_bytes.splitlines()
    rewritten_lines = rewritten_bytes.splitlines()
    return difflib.diff_bytes(difflib.unified_diff, original_lines, rewritten_lines)


def main_single_file(input_file_path: str, output_file_path: str) -> None:
    """
    Clean a single DTE XML file and print the diff between input and output files.
    """
    diff_lines = clean_dte_xml_file(
        input_file_path=input_file_path,
        output_file_path=output_file_path,
    )
    for line in diff_lines:
        print(line)


def main_dir_files(input_files_dir_path: str) -> None:
    """
    Clean every file in directory ``input_files_dir_path`` (non-recursive).

    For each file, the cleaned document is saved next to it with '.clean'
    inserted before the file's suffix, and the diff between the input and
    the output files is printed.
    """
    # Snapshot the directory listing *before* any output file is written: the outputs are
    # created in this same directory, and whether a lazily-consumed `iterdir()` yields
    # entries added during iteration is unspecified (outputs could get cleaned again).
    input_paths = sorted(
        p for p in pathlib.Path(input_files_dir_path).iterdir() if p.is_file()
    )

    for p in input_paths:
        # e.g. 'an example.xml' -> 'an example.clean.xml'
        input_file_path = str(p)
        output_file_path = str(p.with_suffix(f'.clean{p.suffix}'))

        print(f"\n\nWill clean file '{input_file_path}' and save it to '{output_file_path}'.")
        file_bytes_diff_gen = clean_dte_xml_file(
            input_file_path=input_file_path,
            output_file_path=output_file_path)

        print("Difference between input and output files:")
        has_difference = False
        for diff_line in file_bytes_diff_gen:
            has_difference = True
            print(diff_line)
        if not has_difference:
            print("No difference.")


if __name__ == '__main__':
    # Expected invocations (see the module docstring for examples):
    #   clean_dte_xml_file.py file INPUT_FILE_PATH OUTPUT_FILE_PATH
    #   clean_dte_xml_file.py dir INPUT_FILES_DIR_PATH
    _usage = (
        "usage: clean_dte_xml_file.py file INPUT_FILE_PATH OUTPUT_FILE_PATH\n"
        "       clean_dte_xml_file.py dir INPUT_FILES_DIR_PATH"
    )

    # Exit with a usage message instead of an `IndexError` traceback when arguments are missing.
    if len(sys.argv) < 2:
        sys.exit(_usage)

    if sys.argv[1] == 'file':
        if len(sys.argv) < 4:
            sys.exit(_usage)
        main_single_file(
            input_file_path=sys.argv[2],
            output_file_path=sys.argv[3])
    elif sys.argv[1] == 'dir':
        if len(sys.argv) < 3:
            sys.exit(_usage)
        main_dir_files(
            input_files_dir_path=sys.argv[2])
    else:
        raise ValueError(f"Invalid option: '{sys.argv[1]}'")
Loading