Merge pull request #53 from fyndata/develop

Release v0.6.3
cordada · May 25, 2019 · 7aad935 · 7aad935
2 parents dba26db + 4bea003
commit 7aad935
Show file tree

Hide file tree

Showing 14 changed files with 795 additions and 7 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.6.2
+current_version = 0.6.3
 commit = True
 tag = True
 

diff --git a/HISTORY.rst b/HISTORY.rst
@@ -3,6 +3,15 @@
 History
 -------
 
+0.6.3 (2019-05-24)
++++++++++++++++++++++++
+
+* (PR #52, 2019-05-24) rcv: add module ``parse_csv``
+* (PR #51, 2019-05-24) libs: add module ``rows_processing``
+* (PR #50, 2019-05-24) libs: add module ``csv_utils``
+* (PR #49, 2019-05-24) libs.mm_utils: add ``validate_no_unexpected_input_fields``
+* (PR #48, 2019-05-24) dte.data_models: add ``DteDataL2.as_dte_data_l1``
+
 0.6.2 (2019-05-15)
 +++++++++++++++++++++++
 

diff --git a/cl_sii/__init__.py b/cl_sii/__init__.py
@@ -5,4 +5,4 @@
 """
 
 
-__version__ = '0.6.2'
+__version__ = '0.6.3'
diff --git a/cl_sii/dte/data_models.py b/cl_sii/dte/data_models.py
@@ -439,3 +439,12 @@ def __post_init__(self) -> None:
                 raise TypeError("Inappropriate type of 'receptor_email'.")
             validate_clean_str(self.receptor_email)
             validate_non_empty_str(self.receptor_email)
+
+    def as_dte_data_l1(self) -> DteDataL1:
+        """
+        Return a ``DteDataL1`` instance built from the corresponding
+        attributes of this object.
+
+        The attributes copied are ``emisor_rut``, ``tipo_dte``, ``folio``,
+        ``fecha_emision_date``, ``receptor_rut`` and ``monto_total``.
+
+        """
+        dte_data_l1 = DteDataL1(
+            emisor_rut=self.emisor_rut,
+            tipo_dte=self.tipo_dte,
+            folio=self.folio,
+            fecha_emision_date=self.fecha_emision_date,
+            receptor_rut=self.receptor_rut,
+            monto_total=self.monto_total,
+        )
+        return dte_data_l1
diff --git a/cl_sii/libs/csv_utils.py b/cl_sii/libs/csv_utils.py
@@ -0,0 +1,49 @@
+import csv
+from typing import IO, Sequence, Type, Union
+
+
+def create_csv_dict_reader(
+    text_stream: IO[str],
+    csv_dialect: Type[csv.Dialect],
+    row_dict_extra_fields_key: Union[str, None] = None,
+    expected_fields_strict: bool = True,
+    expected_field_names: Union[Sequence[str], None] = None,
+) -> csv.DictReader:
+    """
+    Create a CSV dict reader with custom options.
+
+    The values of the first row of ``text_stream`` are used as the field
+    names of the reader.
+
+    :param text_stream:
+        text stream from which the CSV data is read
+    :param csv_dialect:
+        CSV dialect class passed to ``csv.DictReader``
+    :param row_dict_extra_fields_key:
+        CSV row dict key under which the extra data in the row will be saved
+    :param expected_fields_strict:
+        whether the CSV field names must be equal to
+        ``expected_field_names`` (same names, same order)
+    :param expected_field_names:
+        (required if ``expected_fields_strict`` is True) any sequence of
+        field names (e.g. a list or a tuple)
+    :return: a CSV DictReader
+    :raises ValueError:
+        if ``expected_fields_strict`` is True and either
+        ``expected_field_names`` is missing/empty or the CSV field names
+        (none at all, for an empty stream) do not match it
+
+    """
+    # note: mypy wrongly complains: it does not accept 'fieldnames' to be None but that value
+    #   is completely acceptable, and it even is the default!
+    #   > error: Argument "fieldnames" to "DictReader" has incompatible type "None"; expected
+    #   > "Sequence[str]"
+    # note: mypy wrongly complains:
+    #   > Argument "dialect" to "DictReader" has incompatible type "Type[Dialect]";
+    #   > expected "Union[str, Dialect]"
+    csv_reader = csv.DictReader(  # type: ignore
+        text_stream,
+        fieldnames=None,  # the values of the first row will be used as the fieldnames
+        restkey=row_dict_extra_fields_key,
+        dialect=csv_dialect,
+    )
+
+    if not expected_fields_strict:
+        return csv_reader
+
+    if not expected_field_names:
+        raise ValueError(
+            "Param 'expected_field_names' is required if 'expected_fields_strict' is True.")
+
+    # note: both sides are normalized to tuples so that any kind of sequence (not only a
+    #   tuple) can match, and 'fieldnames' is None for an empty stream, which must be
+    #   reported as a mismatch instead of failing with an unrelated 'TypeError'.
+    actual_field_names = tuple(csv_reader.fieldnames or ())
+    if actual_field_names != tuple(expected_field_names):
+        raise ValueError(
+            "CSV file field names do not match those expected, or their order.",
+            csv_reader.fieldnames)
+
+    return csv_reader
diff --git a/cl_sii/libs/mm_utils.py b/cl_sii/libs/mm_utils.py
@@ -6,6 +6,48 @@
 import marshmallow.utils
 
 
+###############################################################################
+# validators
+###############################################################################
+
+def validate_no_unexpected_input_fields(
+    schema: marshmallow.Schema,
+    data: dict,
+    original_data: dict,
+) -> None:
+    """
+    Raise a validation error if the original input has an unexpected field.
+
+    A key of ``original_data`` is expected if it is equal to the
+    ``load_from`` of a field of ``schema`` or, for fields whose ``load_from``
+    is None, to the field's ``name``. Param ``data`` is accepted but not
+    used by this function.
+
+    Usage::
+
+        class MySchema(marshmallow.Schema):
+
+            class Meta:
+                strict = True
+
+            folio = marshmallow.fields.Integer()
+
+            @marshmallow.validates_schema(pass_original=True)
+            def validate_schema(self, data: dict, original_data: dict) -> None:
+                validate_no_unexpected_input_fields(self, data, original_data)
+
+    :raises marshmallow.ValidationError:
+        if ``original_data`` has at least one unexpected key; the unexpected
+        keys are reported as ``field_names``
+
+    """
+    # Original inspiration from
+    #   https://marshmallow.readthedocs.io/en/2.x-line/extending.html#validating-original-input-data
+    expected_input_names = set()
+    for schema_field in schema.fields.values():
+        if schema_field.load_from is None:
+            expected_input_names.add(schema_field.name)
+        else:
+            expected_input_names.add(schema_field.load_from)
+
+    unexpected_input_fields = set(original_data) - expected_input_names
+    if not unexpected_input_fields:
+        return
+
+    raise marshmallow.ValidationError(
+        "Unexpected input field.", field_names=list(unexpected_input_fields))
+
+
+###############################################################################
+# fields
+###############################################################################
+
 class CustomMarshmallowDateField(marshmallow.fields.Field):
     """
     A formatted date string.

diff --git a/cl_sii/libs/rows_processing.py b/cl_sii/libs/rows_processing.py
@@ -0,0 +1,153 @@
+import csv
+import logging
+
+from typing import Dict, Iterable, Sequence, Tuple
+
+import marshmallow
+
+
+logger = logging.getLogger(__name__)
+
+
+class MaxRowsExceeded(RuntimeError):
+    """
+    Error: the maximum number of rows has been exceeded.
+    """
+
+
+###############################################################################
+# iterators
+###############################################################################
+
+def csv_rows_mm_deserialization_iterator(
+    csv_reader: csv.DictReader,
+    row_schema: marshmallow.Schema,
+    n_rows_offset: int = 0,
+    max_n_rows: int = None,
+    fields_to_remove_names: Sequence[str] = None,
+) -> Iterable[Tuple[int, Dict[str, object], Dict[str, object], dict]]:
+    """
+    Deserialize the rows of a CSV dict reader using a Marshmallow schema.
+
+    Each row produced by ``csv_reader`` is handed to
+    ``rows_mm_deserialization_iterator`` and its results are yielded as they
+    are; the only addition is that a ``csv.Error`` raised while iterating is
+    converted to a ``RuntimeError`` that mentions the CSV line number.
+
+    .. note:: The CSV header row is not part of the rows yielded.
+
+    :param csv_reader:
+        CSV dict reader that provides the rows
+    :param row_schema:
+        Marshmallow schema for deserializing each CSV row
+    :param n_rows_offset:
+        (optional) number of rows to skip (and not deserialize)
+    :param max_n_rows:
+        (optional) max number of rows to deserialize (raise exception
+        if exceeded); ``None`` means no limit
+    :param fields_to_remove_names:
+        (optional) the name of each field that must be removed (if it exists)
+        from the row
+    :returns:
+        yields a tuple of (``row_ix`` (1-based), ``row_data``,
+        ``deserialized_row_data``, ``validation_errors``)
+    :raises MaxRowsExceeded:
+        number of data rows processed exceeded ``max_n_rows``
+    :raises RuntimeError:
+        on CSV error when iterating over ``csv_reader``
+
+    """
+    # note: mypy complaint is wrong because a 'csv.DictReader' object can be iterated over
+    #   and yields instances of 'Dict[str, object]'.
+    #   > Incompatible types in assignment (expression has type "DictReader", variable has type
+    #   > "Iterable[Dict[str, object]]")
+    csv_rows: Iterable[Dict[str, object]] = csv_reader  # type: ignore
+    row_results = rows_mm_deserialization_iterator(
+        rows_iterator=csv_rows,
+        row_schema=row_schema,
+        n_rows_offset=n_rows_offset,
+        max_n_rows=max_n_rows,
+        fields_to_remove_names=fields_to_remove_names,
+    )
+
+    try:
+        # note: 'yield from' is deliberately not used, to be explicit about what is yielded.
+        for row_ix, raw_row, deserialized_row, row_errors in row_results:
+            yield row_ix, raw_row, deserialized_row, row_errors
+    except csv.Error as exc:
+        raise RuntimeError(
+            f"CSV error for line {csv_reader.line_num} of CSV file."
+        ) from exc
+
+
+def rows_mm_deserialization_iterator(
+    rows_iterator: Iterable[Dict[str, object]],
+    row_schema: marshmallow.Schema,
+    n_rows_offset: int = 0,
+    max_n_rows: int = None,
+    fields_to_remove_names: Sequence[str] = None,
+) -> Iterable[Tuple[int, Dict[str, object], Dict[str, object], dict]]:
+    """
+    Deserialize rows (dicts) using a Marshmallow schema, one at a time.
+
+    For each row of ``rows_iterator`` that is not skipped, load it with
+    ``row_schema`` and yield the row before and after deserialization,
+    together with the validation/deserialization errors, regardless of
+    whether the schema raised them or returned them.
+
+    Rows are numbered starting from 1 and the skipped rows are counted too.
+    The fields named in ``fields_to_remove_names`` are removed from the row
+    dict itself, thus the ``row_data`` yielded does not include them.
+
+    :param rows_iterator:
+        iterable that provides the rows
+    :param row_schema:
+        Marshmallow schema for deserializing each row
+    :param n_rows_offset:
+        (optional) number of rows to skip (and not deserialize)
+    :param max_n_rows:
+        (optional) max number of rows to deserialize (raise exception
+        if exceeded); ``None`` means no limit
+    :param fields_to_remove_names:
+        (optional) the name of each field that must be removed (if it exists)
+        from the row
+    :returns:
+        yields a tuple of (``row_ix`` (1-based), ``row_data``,
+        ``deserialized_row_data``, ``validation_errors``)
+    :raises ValueError:
+        ``n_rows_offset`` is negative (raised when the iteration starts)
+    :raises MaxRowsExceeded:
+        number of data rows processed exceeded ``max_n_rows``
+
+    """
+    if not (n_rows_offset >= 0):
+        raise ValueError("Param 'n_rows_offset' must be an integer >= 0.")
+
+    names_to_drop = fields_to_remove_names if fields_to_remove_names else ()
+
+    for row_ix, row_data in enumerate(rows_iterator, start=1):
+        # the limit applies to the rows after the offset.
+        limit_exceeded = max_n_rows is not None and row_ix > max_n_rows + n_rows_offset
+        if limit_exceeded:
+            raise MaxRowsExceeded(f"Exceeded 'max_n_rows' limit: {max_n_rows}.")
+
+        if row_ix <= n_rows_offset:
+            # still within the offset: skip the row without deserializing it.
+            continue
+
+        for name in names_to_drop:
+            row_data.pop(name, None)
+
+        # Errors may be either raised by 'load' or returned in its result; both
+        #   cases are tracked separately.
+        raised_errors: dict
+        returned_errors: dict
+        deserialized_row: dict
+        try:
+            load_result = row_schema.load(row_data)
+            deserialized_row, returned_errors = load_result.data, load_result.errors
+            raised_errors = {}
+        except marshmallow.ValidationError as exc:
+            deserialized_row, returned_errors = {}, {}
+            raised_errors = dict(exc.normalized_messages())
+
+        validation_errors = raised_errors
+        if returned_errors:
+            if row_schema.strict:
+                # 'marshmallow.schema.BaseSchema':
+                # > :param bool strict: If `True`, raise errors if invalid data are passed in
+                # > instead of failing silently and storing the errors.
+                logger.error(
+                    "Marshmallow schema is 'strict' but validation errors were returned by "
+                    "method 'load' ('UnmarshalResult.errors') instead of being raised. "
+                    "Errors: %s",
+                    repr(returned_errors))
+            if raised_errors:
+                logger.fatal(
+                    "Programming error: either returned or raised validation errors "
+                    "(depending on 'strict') but never both. "
+                    "Returned errors: %s. Raised errors: %s",
+                    repr(returned_errors), repr(raised_errors))
+
+            validation_errors.update(returned_errors)
+
+        yield row_ix, row_data, deserialized_row, validation_errors