From 4ae8469262805d5983cf370a8842a65c7ae265ae Mon Sep 17 00:00:00 2001 From: Nora-Olivia-Ammann <103038637+Nora-Olivia-Ammann@users.noreply.github.com> Date: Thu, 22 Feb 2024 13:42:25 +0100 Subject: [PATCH] feat(xmlupload): check if the encoding in the text-prop is consistent (DEV-3296) (#818) --- src/dsp_tools/utils/xml_utils.py | 4 +- src/dsp_tools/utils/xml_validation.py | 86 ++++- src/dsp_tools/utils/xml_validation_models.py | 68 ++++ .../xmlupload/test_ontology_lookup_models.py | 5 + .../utils/test_xml_validation_low_level.py | 297 +++++++++++++++++- .../utils/test_xml_validation_models.py | 65 ++++ 6 files changed, 517 insertions(+), 8 deletions(-) create mode 100644 src/dsp_tools/utils/xml_validation_models.py create mode 100644 test/unittests/utils/test_xml_validation_models.py diff --git a/src/dsp_tools/utils/xml_utils.py b/src/dsp_tools/utils/xml_utils.py index 79098e707..ee7e91afb 100644 --- a/src/dsp_tools/utils/xml_utils.py +++ b/src/dsp_tools/utils/xml_utils.py @@ -133,8 +133,8 @@ def _parse_xml_file(input_file: Union[str, Path]) -> etree._ElementTree[etree._E def remove_namespaces_from_xml( - data_xml: Union[etree._ElementTree[etree._Element], etree._Element], -) -> Union[etree._ElementTree[etree._Element], etree._Element]: + data_xml: etree._Element, +) -> etree._Element: """ This function removes all the namespaces from an XML file. diff --git a/src/dsp_tools/utils/xml_validation.py b/src/dsp_tools/utils/xml_validation.py index 3fad3093e..c3db34db0 100644 --- a/src/dsp_tools/utils/xml_validation.py +++ b/src/dsp_tools/utils/xml_validation.py @@ -11,6 +11,10 @@ from dsp_tools.models.exceptions import InputError from dsp_tools.utils.create_logger import get_logger from dsp_tools.utils.xml_utils import parse_and_remove_comments_from_xml_file, remove_namespaces_from_xml +from dsp_tools.utils.xml_validation_models import ( + InconsistentTextValueEncodings, + TextValueData, +) logger = get_logger(__name__) @@ -47,6 +51,10 @@ def validate_xml(input_file: Union[str, Path, etree._ElementTree[Any]]) -> bool: if not all_good: problems.append(msg) + all_good, msg = _find_mixed_encodings_in_one_text_prop(xml_no_namespace) + if not all_good: + problems.append(msg) + if len(problems) > 0: err_msg = grand_separator.join(problems) logger.error(err_msg, exc_info=True) @@ -59,7 +67,7 @@ def validate_xml(input_file: Union[str, Path, etree._ElementTree[Any]]) -> bool: def _parse_schema_and_data_files( input_file: Union[str, Path, etree._ElementTree[Any]], -) -> tuple[Union[etree._ElementTree[etree._Element], etree._Element], etree.XMLSchema]: +) -> tuple[etree._Element, etree.XMLSchema]: with importlib.resources.files("dsp_tools").joinpath("resources/schema/data.xsd").open( encoding="utf-8" ) as schema_file: @@ -68,9 +76,7 @@ def _parse_schema_and_data_files( return data_xml, xmlschema -def _validate_xml_against_schema( - xmlschema: etree.XMLSchema, data_xml: Union[etree._ElementTree[etree._Element], etree._Element] -) -> tuple[bool, str]: +def _validate_xml_against_schema(xmlschema: etree.XMLSchema, data_xml: etree._Element) -> tuple[bool, str]: if not xmlschema.validate(data_xml): error_msg = "The XML file cannot be uploaded due to the following validation error(s):" for error in xmlschema.error_log: @@ -81,7 +87,7 @@ def _validate_xml_against_schema( def _find_xml_tags_in_simple_text_elements( - xml_no_namespace: Union[etree._ElementTree[etree._Element], etree._Element], + xml_no_namespace: etree._Element, ) -> tuple[bool, str]: """ Makes sure that there are no XML tags in simple texts. @@ -119,3 +125,73 @@ def _find_xml_tags_in_simple_text_elements( err_msg += list_separator + list_separator.join(resources_with_illegal_xml_tags) return False, err_msg return True, "" + + +def _find_mixed_encodings_in_one_text_prop( + xml_no_namespace: etree._Element, +) -> tuple[bool, str]: + problems = check_if_only_one_encoding_is_used_per_prop_in_root(xml_no_namespace) + if not problems: + return True, "" + msg, df = InconsistentTextValueEncodings(problems).execute_problem_protocol() + if df is not None: + csv_path = Path(f"XML_syntax_errors_{datetime.now().strftime('%Y-%m-%d_%H%M%S')}.csv") + msg = f"\nAll the problems are listed in the file: '{csv_path.absolute()}'" + msg + df.to_csv(csv_path) + return False, msg + + +def check_if_only_one_encoding_is_used_per_prop_in_root( + root: etree._Element, +) -> list[TextValueData]: + """ + Check if all the encodings in the elements are consistent within one + + This is correct: + ``` + + Text 1 + Text 2 + + ``` + + This is wrong: + ``` + + Text 1 + Text 2 + + ``` + + Args: + root: root of the data xml document + + Returns: + A list of all the inconsistent + """ + text_props = _get_all_ids_and_encodings_from_root(root) + return _find_all_text_props_with_multiple_encodings(text_props) + + +def _get_all_ids_and_encodings_from_root( + root: etree._Element, +) -> list[TextValueData]: + res_list = [] + for res_input in root.iterchildren(tag="resource"): + res_list.extend(_get_encodings_from_one_resource(res_input)) + return res_list + + +def _get_encodings_from_one_resource(resource: etree._Element) -> list[TextValueData]: + res_id = resource.attrib["id"] + return [_get_encodings_from_one_property(res_id, child) for child in list(resource.iterchildren(tag="text-prop"))] + + +def _get_encodings_from_one_property(res_id: str, prop: etree._Element) -> TextValueData: + prop_name = prop.attrib["name"] + encodings = {x.attrib["encoding"] for x in prop.iterchildren()} + return TextValueData(res_id, prop_name, encodings) + + +def _find_all_text_props_with_multiple_encodings(text_props: list[TextValueData]) -> list[TextValueData]: + return [x for x in text_props if not len(x.encoding) == 1] diff --git a/src/dsp_tools/utils/xml_validation_models.py b/src/dsp_tools/utils/xml_validation_models.py new file mode 100644 index 000000000..44f9faf70 --- /dev/null +++ b/src/dsp_tools/utils/xml_validation_models.py @@ -0,0 +1,68 @@ +from dataclasses import dataclass + +import pandas as pd + +list_separator = "\n - " +medium_separator = "\n----------------------------\n" +grand_separator = "\n\n---------------------------------------\n\n" + +maximum_prints = 50 + + +@dataclass +class TextValueData: + resource_id: str + property_name: str + encoding: set[str] + + +@dataclass +class InconsistentTextValueEncodings: + """ + This class implements the `Problem` protocol + for resources and properties that contain invalid encodings. + + An invalid encoding would be a element, that contains + + and + + """ + + problematic_resources: list[TextValueData] + + def execute_problem_protocol(self) -> tuple[str, pd.DataFrame | None]: + """ + This method composes an error message for the user. + If the number of errors exceeds `maximum_prints`, + the errors are additionally returned as a dataframe that can be saved as a CSV file. + + Returns: + the error message, and optionally a dataframe with the errors + """ + base_msg = ( + "\nSome elements contain elements that use both 'xml' and 'utf8' encoding.\n" + "Only one encoding type can be used within one element." + ) + df = self._get_problems_as_df() + if len(df) > maximum_prints: + return base_msg, df + return base_msg + grand_separator + _make_msg_from_df(df), None + + def _get_problems_as_df(self) -> pd.DataFrame: + df = pd.DataFrame( + { + "Resource ID": [x.resource_id for x in self.problematic_resources], + "Property Name": [x.property_name for x in self.problematic_resources], + } + ) + return df.sort_values(by=["Resource ID", "Property Name"], ignore_index=True) + + +def _make_msg_from_df(df: pd.DataFrame) -> str: + groups = df.groupby(by="Resource ID") + return medium_separator.join([_make_msg_for_one_resource(str(_id), res_df) for _id, res_df in groups]) + + +def _make_msg_for_one_resource(res_id: str, res_df: pd.DataFrame) -> str: + problems = [f"Property Name: '{p}'" for p in res_df["Property Name"].tolist()] + return f"Resource ID: '{res_id}'{list_separator}{list_separator.join(problems)}" diff --git a/test/unittests/commands/xmlupload/test_ontology_lookup_models.py b/test/unittests/commands/xmlupload/test_ontology_lookup_models.py index d2bdfafff..1e91f6c3e 100644 --- a/test/unittests/commands/xmlupload/test_ontology_lookup_models.py +++ b/test/unittests/commands/xmlupload/test_ontology_lookup_models.py @@ -1,3 +1,4 @@ +import pytest from pytest_unordered import unordered from dsp_tools.commands.xmlupload.models.ontology_lookup_models import ( @@ -163,3 +164,7 @@ def test_remove_prefixes_knora_properties() -> None: test_elements = ["knora-api:attachedToUser", "knora-api:deletedBy"] res = _remove_prefixes(test_elements) assert unordered(res) == ["attachedToUser", "deletedBy"] + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/test/unittests/utils/test_xml_validation_low_level.py b/test/unittests/utils/test_xml_validation_low_level.py index d24d44f84..6417387ad 100644 --- a/test/unittests/utils/test_xml_validation_low_level.py +++ b/test/unittests/utils/test_xml_validation_low_level.py @@ -1,7 +1,15 @@ import pytest from lxml import etree -from dsp_tools.utils.xml_validation import _find_xml_tags_in_simple_text_elements +from dsp_tools.utils.xml_validation import ( + _find_all_text_props_with_multiple_encodings, + _find_xml_tags_in_simple_text_elements, + _get_all_ids_and_encodings_from_root, + _get_encodings_from_one_property, + _get_encodings_from_one_resource, + check_if_only_one_encoding_is_used_per_prop_in_root, +) +from dsp_tools.utils.xml_validation_models import TextValueData class TestFindXMLTagsInUTF8: @@ -76,3 +84,290 @@ def test_find_xml_tags_in_simple_text_elements_forbidden_escapes_two(self) -> No if __name__ == "__main__": pytest.main([__file__]) + + +def test_find_all_text_props_with_multiple_encodings_problems() -> None: + test_props = [TextValueData("problem_id", "problem_prop", {"xml", "utf8"}), TextValueData("", "", {"utf8"})] + problem = _find_all_text_props_with_multiple_encodings(test_props)[0] + assert problem.resource_id == "problem_id" + assert problem.property_name == "problem_prop" + assert problem.encoding == {"xml", "utf8"} + + +def test_find_all_text_props_with_multiple_encodings_good() -> None: + test_props = [TextValueData("", "", {"xml"}), TextValueData("", "", {"utf8"})] + problem = _find_all_text_props_with_multiple_encodings(test_props) + assert not problem + + +def test_get_all_ids_prop_encoding_from_root_no_text() -> None: + test_ele = etree.fromstring( + """ + + + resB + + + + + + + resB + + + resA + + + """ + ) + res = _get_all_ids_and_encodings_from_root(test_ele) + assert not res + + +def test_get_all_ids_prop_encoding_from_root_with_text() -> None: + test_ele = etree.fromstring( + """ + + + https://dasch.swiss + + + Text + + + + + Text + + + + + resB + + + Text + + + + + resB + + + """ + ) + res = _get_all_ids_and_encodings_from_root(test_ele) + assert res[0].resource_id == "test_thing_1" + assert res[0].property_name == ":hasRichtext" + assert res[0].encoding == {"xml"} + assert res[1].resource_id == "resB" + assert res[1].property_name == ":hasSimpleText" + assert res[1].encoding == {"utf8"} + assert res[2].resource_id == "resC" + assert res[2].property_name == ":hasSimpleText" + assert res[2].encoding == {"utf8"} + + +class TestGetEncodingsOneResource: + def test_no_text(self) -> None: + test_props = etree.fromstring( + """ + + + https://dasch.swiss + + + true + + + """ + ) + res = _get_encodings_from_one_resource(test_props) + assert not res + + def test_one_text_prop(self) -> None: + test_props = etree.fromstring( + """ + + + https://dasch.swiss + + + true + + + Text + + + """ + ) + res = _get_encodings_from_one_resource(test_props)[0] + assert res.resource_id == "test_thing_1" + assert res.property_name == ":hasRichtext" + assert res.encoding == {"utf8"} + + def test_two_text_prop(self) -> None: + test_props = etree.fromstring( + """ + + + https://dasch.swiss + + + true + + + Text + + + Text + Text + + + """ + ) + res = _get_encodings_from_one_resource(test_props) + assert res[0].resource_id == "test_thing_1" + assert res[0].property_name == ":hasRichtext" + assert res[0].encoding == {"xml"} + assert res[1].resource_id == "test_thing_1" + assert res[1].property_name == ":hasSimpleText" + assert res[1].encoding == {"utf8"} + + +class TestGetEncodingOneProperty: + def test_richtext_several_text_ele(self) -> None: + test_prop = etree.fromstring( + """ + + < + + This text contains links to all resources: + test_thing_0 + + Text with an external link: Google + + """ + ) + res_info = _get_encodings_from_one_property("id", test_prop) + assert res_info.resource_id == "id" + assert res_info.property_name == ":hasRichtext" + assert res_info.encoding == {"xml"} + + def test_simple_several_text_ele(self) -> None: + test_prop = etree.fromstring( + """ + + Text + Text + + """ + ) + res_info = _get_encodings_from_one_property("id", test_prop) + assert res_info.resource_id == "id" + assert res_info.property_name == ":hasRichtext" + assert res_info.encoding == {"utf8"} + + def test_simple_one_text_ele(self) -> None: + test_prop = etree.fromstring( + """ + + Text + + """ + ) + res_info = _get_encodings_from_one_property("id", test_prop) + assert res_info.resource_id == "id" + assert res_info.property_name == ":hasRichtext" + assert res_info.encoding == {"utf8"} + + +def test_check_if_only_one_encoding_is_used_in_xml_good() -> None: + test_ele = etree.fromstring( + """ + + + https://dasch.swiss + + + Text + + + + + Text + + + + + resB + + + Text 1 + Text 2 + + + + + resB + + + """ + ) + res = check_if_only_one_encoding_is_used_per_prop_in_root(test_ele) + assert not res + + +def test_check_if_only_one_encoding_is_used_in_xml_problem() -> None: + test_ele = etree.fromstring( + """ + + + https://dasch.swiss + + + Text + + + + + Text + + + + + resB + + + Text 1 + Text 2 + + + + + resB + + + """ + ) + res = check_if_only_one_encoding_is_used_per_prop_in_root(test_ele)[0] + assert res.resource_id == "resC" + assert res.property_name == ":hasSimpleText" + assert res.encoding == {"xml", "utf8"} diff --git a/test/unittests/utils/test_xml_validation_models.py b/test/unittests/utils/test_xml_validation_models.py new file mode 100644 index 000000000..0a5b1d908 --- /dev/null +++ b/test/unittests/utils/test_xml_validation_models.py @@ -0,0 +1,65 @@ +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from dsp_tools.utils.xml_validation_models import ( + InconsistentTextValueEncodings, + TextValueData, + _make_msg_for_one_resource, + _make_msg_from_df, +) + + +class TestInvalidTextValueEncodings: + def test_get_problems_as_df(self) -> None: + problems = InconsistentTextValueEncodings( + [ + TextValueData("id1", ":simple", {"utf8", "xml"}), + TextValueData("id1", ":rich", {"utf8", "xml"}), + TextValueData("id2", ":rich", {"utf8", "xml"}), + ] + ) + expected_df = pd.DataFrame( + { + "Resource ID": ["id1", "id1", "id2"], + "Property Name": [":rich", ":simple", ":rich"], + } + ) + res_df = problems._get_problems_as_df() + assert_frame_equal(res_df, expected_df) + + def test_make_msg_for_one_resource(self) -> None: + test_df = pd.DataFrame( + { + "Resource ID": ["id1", "id1"], + "Property Name": [":rich", ":simple"], + } + ) + res = _make_msg_for_one_resource("id1", test_df) + expected = "Resource ID: 'id1'\n" " - Property Name: ':rich'\n" " - Property Name: ':simple'" + assert res == expected + + def test_make_msg_from_df(self) -> None: + test_df = pd.DataFrame( + { + "Resource ID": ["id1", "id1", "id2", "id3"], + "Property Name": [":rich", ":simple", ":rich", ":mixed"], + } + ) + res = _make_msg_from_df(test_df) + expected = ( + "Resource ID: 'id1'\n" + " - Property Name: ':rich'\n" + " - Property Name: ':simple'" + "\n----------------------------\n" + "Resource ID: 'id2'\n" + " - Property Name: ':rich'" + "\n----------------------------\n" + "Resource ID: 'id3'\n" + " - Property Name: ':mixed'" + ) + assert res == expected + + +if __name__ == "__main__": + pytest.main([__file__])