feat(xmlupload): check if the encoding in the text-prop is consistent (…

…DEV-3296) (#818)
dasch-swiss · Feb 22, 2024 · 4ae8469 · 4ae8469
1 parent 9947607
commit 4ae8469
Show file tree

Hide file tree

Showing 6 changed files with 517 additions and 8 deletions.
diff --git a/src/dsp_tools/utils/xml_utils.py b/src/dsp_tools/utils/xml_utils.py
@@ -133,8 +133,8 @@ def _parse_xml_file(input_file: Union[str, Path]) -> etree._ElementTree[etree._E
 
 
 def remove_namespaces_from_xml(
-    data_xml: Union[etree._ElementTree[etree._Element], etree._Element],
-) -> Union[etree._ElementTree[etree._Element], etree._Element]:
+    data_xml: etree._Element,
+) -> etree._Element:
     """
     This function removes all the namespaces from an XML file.
 

diff --git a/src/dsp_tools/utils/xml_validation.py b/src/dsp_tools/utils/xml_validation.py
@@ -11,6 +11,10 @@
 from dsp_tools.models.exceptions import InputError
 from dsp_tools.utils.create_logger import get_logger
 from dsp_tools.utils.xml_utils import parse_and_remove_comments_from_xml_file, remove_namespaces_from_xml
+from dsp_tools.utils.xml_validation_models import (
+    InconsistentTextValueEncodings,
+    TextValueData,
+)
 
 logger = get_logger(__name__)
 
@@ -47,6 +51,10 @@ def validate_xml(input_file: Union[str, Path, etree._ElementTree[Any]]) -> bool:
     if not all_good:
         problems.append(msg)
 
+    all_good, msg = _find_mixed_encodings_in_one_text_prop(xml_no_namespace)
+    if not all_good:
+        problems.append(msg)
+
     if len(problems) > 0:
         err_msg = grand_separator.join(problems)
         logger.error(err_msg, exc_info=True)
@@ -59,7 +67,7 @@ def validate_xml(input_file: Union[str, Path, etree._ElementTree[Any]]) -> bool:
 
 def _parse_schema_and_data_files(
     input_file: Union[str, Path, etree._ElementTree[Any]],
-) -> tuple[Union[etree._ElementTree[etree._Element], etree._Element], etree.XMLSchema]:
+) -> tuple[etree._Element, etree.XMLSchema]:
     with importlib.resources.files("dsp_tools").joinpath("resources/schema/data.xsd").open(
         encoding="utf-8"
     ) as schema_file:
@@ -68,9 +76,7 @@ def _parse_schema_and_data_files(
     return data_xml, xmlschema
 
 
-def _validate_xml_against_schema(
-    xmlschema: etree.XMLSchema, data_xml: Union[etree._ElementTree[etree._Element], etree._Element]
-) -> tuple[bool, str]:
+def _validate_xml_against_schema(xmlschema: etree.XMLSchema, data_xml: etree._Element) -> tuple[bool, str]:
     if not xmlschema.validate(data_xml):
         error_msg = "The XML file cannot be uploaded due to the following validation error(s):"
         for error in xmlschema.error_log:
@@ -81,7 +87,7 @@ def _validate_xml_against_schema(
 
 
 def _find_xml_tags_in_simple_text_elements(
-    xml_no_namespace: Union[etree._ElementTree[etree._Element], etree._Element],
+    xml_no_namespace: etree._Element,
 ) -> tuple[bool, str]:
     """
     Makes sure that there are no XML tags in simple texts.
@@ -119,3 +125,73 @@ def _find_xml_tags_in_simple_text_elements(
         err_msg += list_separator + list_separator.join(resources_with_illegal_xml_tags)
         return False, err_msg
     return True, ""
+
+
+def _find_mixed_encodings_in_one_text_prop(
+    xml_no_namespace: etree._Element,
+) -> tuple[bool, str]:
+    """
+    Report every <text-prop> whose <text> children do not all share one encoding.
+
+    If the problem protocol hands back a dataframe, it is saved as a time-stamped CSV file
+    (relative path, hence in the current working directory),
+    and the path of that file is put in front of the message.
+
+    Args:
+        xml_no_namespace: root of the data XML document
+
+    Returns:
+        True and an empty string if nothing was found, otherwise False and the message for the user
+    """
+    inconsistent_props = check_if_only_one_encoding_is_used_per_prop_in_root(xml_no_namespace)
+    if len(inconsistent_props) == 0:
+        return True, ""
+    user_msg, problems_df = InconsistentTextValueEncodings(inconsistent_props).execute_problem_protocol()
+    if problems_df is None:
+        return False, user_msg
+    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
+    csv_path = Path(f"XML_syntax_errors_{timestamp}.csv")
+    user_msg = f"\nAll the problems are listed in the file: '{csv_path.absolute()}'" + user_msg
+    problems_df.to_csv(csv_path)
+    return False, user_msg
+
+
+def check_if_only_one_encoding_is_used_per_prop_in_root(
+    root: etree._Element,
+) -> list[TextValueData]:
+    """
+    Find the <text-prop> elements whose <text> children do not all use the same encoding.
+
+    Valid, because both <text> elements are encoded the same way:
+    ```
+    <text-prop name=":hasSimpleText">
+        <text encoding="utf8">Text 1</text>
+        <text encoding="utf8">Text 2</text>
+    </text-prop>
+    ```
+
+    Invalid, because the two <text> elements are encoded differently:
+    ```
+    <text-prop name=":hasSimpleText">
+        <text encoding="utf8">Text 1</text>
+        <text encoding="xml">Text 2</text>
+    </text-prop>
+    ```
+
+    Args:
+        root: root of the data xml document
+
+    Returns:
+        one entry per inconsistent <text-prop>
+    """
+    return _find_all_text_props_with_multiple_encodings(_get_all_ids_and_encodings_from_root(root))
+
+
+def _get_all_ids_and_encodings_from_root(
+    root: etree._Element,
+) -> list[TextValueData]:
+    """Gather the encoding data of every <text-prop> in every <resource> that is a direct child of the root."""
+    return [
+        text_prop_data
+        for resource in root.iterchildren(tag="resource")
+        for text_prop_data in _get_encodings_from_one_resource(resource)
+    ]
+
+
+def _get_encodings_from_one_resource(resource: etree._Element) -> list[TextValueData]:
+    """Gather the encoding data of every <text-prop> that is a direct child of the given <resource>."""
+    resource_id = resource.attrib["id"]
+    return [
+        _get_encodings_from_one_property(resource_id, text_prop)
+        for text_prop in resource.iterchildren(tag="text-prop")
+    ]
+
+
+def _get_encodings_from_one_property(res_id: str, prop: etree._Element) -> TextValueData:
+    """
+    Collect the distinct values of the `encoding` attribute among the direct children of one <text-prop>.
+
+    A child that carries no `encoding` attribute is skipped instead of raising a KeyError,
+    so that this check does not crash on a structurally invalid file.
+    NOTE(review): reporting the missing attribute is presumably the job of the schema validation - confirm.
+
+    Args:
+        res_id: ID of the resource that the property belongs to
+        prop: the <text-prop> element
+
+    Returns:
+        the resource ID, the property name, and the set of encodings that were found
+    """
+    prop_name = prop.attrib["name"]
+    # .get() returns None for a missing attribute, where attrib[...] would raise
+    encodings = {enc for child in prop.iterchildren() if (enc := child.get("encoding")) is not None}
+    return TextValueData(res_id, prop_name, encodings)
+
+
+def _find_all_text_props_with_multiple_encodings(text_props: list[TextValueData]) -> list[TextValueData]:
+    """
+    Keep only the properties that use more than one distinct encoding.
+
+    A property with an empty set of encodings is not a case of mixed encodings,
+    so it is not reported here.
+
+    Args:
+        text_props: encoding data of all the properties to inspect
+
+    Returns:
+        the properties with at least two different encodings, in their original order
+    """
+    return [x for x in text_props if len(x.encoding) > 1]
diff --git a/src/dsp_tools/utils/xml_validation_models.py b/src/dsp_tools/utils/xml_validation_models.py
@@ -0,0 +1,68 @@
+from dataclasses import dataclass
+
+import pandas as pd
+
+# Starts each bullet point of the indented list in a message.
+list_separator = "\n    - "
+# Divider between the message sections of two different resources.
+medium_separator = "\n----------------------------\n"
+# Divider between the general explanation and the detailed list of problems.
+grand_separator = "\n\n---------------------------------------\n\n"
+
+# Up to this number of problems, they are listed in the message itself;
+# if there are more, they are returned as a dataframe instead.
+maximum_prints = 50
+
+
+@dataclass
+class TextValueData:
+    """Encodings that were found in one <text-prop> element of one resource."""
+
+    # ID of the resource that contains the property
+    resource_id: str
+    # name of the property
+    property_name: str
+    # distinct encodings used by the values of the property
+    encoding: set[str]
+
+
+@dataclass
+class InconsistentTextValueEncodings:
+    """
+    Implementation of the `Problem` protocol
+    for <text-prop> elements whose <text> children are not all encoded the same way.
+
+    Such a <text-prop> element contains both
+    <text encoding="utf8">
+    and
+    <text encoding="xml">
+    """
+
+    problematic_resources: list[TextValueData]
+
+    def execute_problem_protocol(self) -> tuple[str, pd.DataFrame | None]:
+        """
+        Build the error message that is shown to the user.
+
+        Up to `maximum_prints` problems are listed in the message itself.
+        If there are more, only the general explanation is returned as message,
+        together with a dataframe of all the problems, which can be saved as a CSV file.
+
+        Returns:
+            the error message, and the dataframe if the problems are too numerous to be listed, else None
+        """
+        explanation = (
+            "\nSome <text-prop> elements contain <text> elements that use both 'xml' and 'utf8' encoding.\n"
+            "Only one encoding type can be used within one <text-prop> element."
+        )
+        problems_df = self._get_problems_as_df()
+        if len(problems_df) <= maximum_prints:
+            details = _make_msg_from_df(problems_df)
+            return explanation + grand_separator + details, None
+        return explanation, problems_df
+
+    def _get_problems_as_df(self) -> pd.DataFrame:
+        """Tabulate the problems, one row per property, sorted by resource ID and then by property name."""
+        resource_ids = [problem.resource_id for problem in self.problematic_resources]
+        property_names = [problem.property_name for problem in self.problematic_resources]
+        unsorted_df = pd.DataFrame({"Resource ID": resource_ids, "Property Name": property_names})
+        return unsorted_df.sort_values(by=["Resource ID", "Property Name"], ignore_index=True)
+
+
+def _make_msg_from_df(df: pd.DataFrame) -> str:
+    """Compose the detailed message: one section per resource ID, the sections divided by `medium_separator`."""
+    sections = []
+    for resource_id, resource_rows in df.groupby(by="Resource ID"):
+        sections.append(_make_msg_for_one_resource(str(resource_id), resource_rows))
+    return medium_separator.join(sections)
+
+
+def _make_msg_for_one_resource(res_id: str, res_df: pd.DataFrame) -> str:
+    """Compose the section of one resource: a header line with its ID, followed by one bullet point per property."""
+    header = f"Resource ID: '{res_id}'"
+    bullet_points = list_separator.join(f"Property Name: '{name}'" for name in res_df["Property Name"].tolist())
+    return header + list_separator + bullet_points
diff --git a/test/unittests/commands/xmlupload/test_ontology_lookup_models.py b/test/unittests/commands/xmlupload/test_ontology_lookup_models.py
@@ -1,3 +1,4 @@
+import pytest
 from pytest_unordered import unordered
 
 from dsp_tools.commands.xmlupload.models.ontology_lookup_models import (
@@ -163,3 +164,7 @@ def test_remove_prefixes_knora_properties() -> None:
     test_elements = ["knora-api:attachedToUser", "knora-api:deletedBy"]
     res = _remove_prefixes(test_elements)
     assert unordered(res) == ["attachedToUser", "deletedBy"]
+
+
+# Run the tests of this file with pytest when the file is executed as a script.
+if __name__ == "__main__":
+    pytest.main([__file__])