dasch-swiss · Nora-Olivia-Ammann · Feb 16, 2024 · Feb 16, 2024 · Feb 16, 2024 · Feb 16, 2024
diff --git a/src/dsp_tools/cli/call_action.py b/src/dsp_tools/cli/call_action.py
@@ -21,7 +21,7 @@
 from dsp_tools.commands.xmlupload.upload_config import DiagnosticsConfig, UploadConfig
 from dsp_tools.commands.xmlupload.xmlupload import xmlupload
 from dsp_tools.utils.create_logger import get_logger
-from dsp_tools.utils.shared import validate_xml_against_schema
+from dsp_tools.utils.validate_data_xml import validate_xml
 
 logger = get_logger(__name__)
 
@@ -192,7 +192,7 @@ def _call_process_files(args: argparse.Namespace) -> bool:
 
 def _call_xmlupload(args: argparse.Namespace) -> bool:
     if args.validate_only:
-        return validate_xml_against_schema(args.xmlfile)
+        return validate_xml(args.xmlfile)
     else:
         return xmlupload(
             input_file=args.xmlfile,

diff --git a/src/dsp_tools/commands/excel2xml/excel2xml_lib.py b/src/dsp_tools/commands/excel2xml/excel2xml_lib.py
@@ -17,8 +17,9 @@
 from dsp_tools.models.datetimestamp import DateTimeStamp
 from dsp_tools.models.exceptions import BaseError
 from dsp_tools.utils.date_util import is_full_date
-from dsp_tools.utils.shared import check_notna, simplify_name, validate_xml_against_schema
+from dsp_tools.utils.shared import check_notna, simplify_name
 from dsp_tools.utils.uri_util import is_uri
+from dsp_tools.utils.validate_data_xml import validate_xml
 
 # ruff: noqa: E501 (line-too-long)
 
@@ -1939,7 +1940,7 @@ def write_xml(
     with open(filepath, "w", encoding="utf-8") as f:
         f.write(xml_string)
     try:
-        validate_xml_against_schema(input_file=filepath)
+        validate_xml(input_file=filepath)
         print(f"The XML file was successfully saved to {filepath}")
     except BaseError as err:
         warnings.warn(

diff --git a/src/dsp_tools/commands/xmlupload/read_validate_xml_file.py b/src/dsp_tools/commands/xmlupload/read_validate_xml_file.py
@@ -9,7 +9,7 @@
 from dsp_tools.models.exceptions import UserError
 from dsp_tools.utils.create_logger import get_logger
 from dsp_tools.utils.iri_util import is_resource_iri
-from dsp_tools.utils.shared import validate_xml_against_schema
+from dsp_tools.utils.validate_data_xml import validate_xml
 from dsp_tools.utils.xml_utils import parse_and_clean_xml_file
 
 logger = get_logger(__name__)
@@ -34,7 +34,7 @@ def validate_and_parse_xml_file(
     Returns:
         The ontology name, the parsed XML file and the shortcode of the project
     """
-    validate_xml_against_schema(input_file=input_file)
+    validate_xml(input_file=input_file)
     root = parse_and_clean_xml_file(input_file=input_file)
     _check_if_link_targets_exist(root)
     if not preprocessing_done:

diff --git a/src/dsp_tools/utils/shared.py b/src/dsp_tools/utils/shared.py
@@ -1,117 +1,20 @@
 from __future__ import annotations
 
-import copy
-import importlib.resources
 import json
 import unicodedata
-from datetime import datetime
 from pathlib import Path
 from typing import Any, Optional, TypeGuard, Union
 
 import pandas as pd
 import regex
-from lxml import etree
 
 from dsp_tools.commands.excel2xml.propertyelement import PropertyElement
-from dsp_tools.models.exceptions import BaseError, UserError
+from dsp_tools.models.exceptions import BaseError
 from dsp_tools.utils.create_logger import get_logger
 
 logger = get_logger(__name__)
 
 
-def validate_xml_against_schema(input_file: Union[str, Path, etree._ElementTree[Any]]) -> bool:
-    """
-    Validates an XML file against the DSP XSD schema.
-
-    Args:
-        input_file: path to the XML file to be validated, or parsed ElementTree
-
-    Raises:
-        UserError: if the XML file is invalid
-
-    Returns:
-        True if the XML file is valid
-    """
-    with importlib.resources.files("dsp_tools").joinpath("resources/schema/data.xsd").open(
-        encoding="utf-8"
-    ) as schema_file:
-        xmlschema = etree.XMLSchema(etree.parse(schema_file))
-    if isinstance(input_file, (str, Path)):
-        try:
-            doc = etree.parse(source=input_file)
-        except etree.XMLSyntaxError as err:
-            logger.error(f"The XML file contains the following syntax error: {err.msg}", exc_info=True)
-            raise UserError(f"The XML file contains the following syntax error: {err.msg}") from None
-    else:
-        doc = input_file
-
-    if not xmlschema.validate(doc):
-        error_msg = "The XML file cannot be uploaded due to the following validation error(s):"
-        for error in xmlschema.error_log:
-            error_msg = error_msg + f"\n  Line {error.line}: {error.message}"
-        error_msg = error_msg.replace("{https://dasch.swiss/schema}", "")
-        logger.error(error_msg)
-        raise UserError(error_msg)
-
-    # make sure there are no XML tags in simple texts
-    _validate_xml_tags_in_text_properties(doc)
-
-    logger.info("The XML file is syntactically correct and passed validation.")
-    print(f"{datetime.now()}: The XML file is syntactically correct and passed validation.")
-    return True
-
-
-def _validate_xml_tags_in_text_properties(doc: Union[etree._ElementTree[etree._Element], etree._Element]) -> bool:
-    """
-    Makes sure that there are no XML tags in simple texts.
-    This can only be done with a regex,
-    because even if the simple text contains some XML tags,
-    the simple text itself is not valid XML that could be parsed.
-    The extra challenge is that lxml transforms
-    "pebble (&lt;2cm) and boulder (&gt;20cm)" into
-    "pebble (<2cm) and boulder (>20cm)"
-    (but only if &gt; follows &lt;).
-    This forces us to write a regex that carefully distinguishes
-    between a real tag (which is not allowed) and a false-positive-tag.
-
-    Args:
-        doc: parsed XML file
-
-    Raises:
-        UserError: if there is an XML tag in one of the simple texts
-
-    Returns:
-        True if there are no XML tags in the simple texts
-    """
-    # first: remove namespaces
-    doc_without_namespace = copy.deepcopy(doc)
-    for elem in doc_without_namespace.iter():
-        if not isinstance(elem, (etree._Comment, etree._ProcessingInstruction)):
-            elem.tag = etree.QName(elem).localname
-
-    # then: make the test
-    resources_with_illegal_xml_tags = []
-    for text in doc_without_namespace.findall(path="resource/text-prop/text"):
-        regex_finds_tags = bool(regex.search(r'<([a-zA-Z/"]+|[^\s0-9].*[^\s0-9])>', str(text.text)))
-        etree_finds_tags = bool(list(text.iterchildren()))
-        has_tags = regex_finds_tags or etree_finds_tags
-        if text.attrib["encoding"] == "utf8" and has_tags:
-            sourceline = f" line {text.sourceline}: " if text.sourceline else " "
-            propname = text.getparent().attrib["name"]  # type: ignore[union-attr]
-            resname = text.getparent().getparent().attrib["id"]  # type: ignore[union-attr]
-            resources_with_illegal_xml_tags.append(f" -{sourceline}resource '{resname}', property '{propname}'")
-    if resources_with_illegal_xml_tags:
-        err_msg = (
-            "XML-tags are not allowed in text properties with encoding=utf8. "
-            "The following resources of your XML file violate this rule:\n"
-        )
-        err_msg += "\n".join(resources_with_illegal_xml_tags)
-        logger.error(err_msg, exc_info=True)
-        raise UserError(err_msg)
-
-    return True
-
-
 def prepare_dataframe(
     df: pd.DataFrame,
     required_columns: list[str],

diff --git a/src/dsp_tools/utils/validate_data_xml.py b/src/dsp_tools/utils/validate_data_xml.py
@@ -0,0 +1,128 @@
+from __future__ import annotations
+
+import importlib.resources
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Union
+
+import regex
+from lxml import etree
+
+from dsp_tools.models.exceptions import InputError
+from dsp_tools.utils.create_logger import get_logger
+from dsp_tools.utils.xml_utils import remove_namespaces_from_xml
+
+logger = get_logger(__name__)
+
+separator = "\n    "  # precedes each schema validation error line
+list_separator = "\n    - "  # bullet prefix for each resource that has XML tags in a simple text
+medium_separator = "\n----------------------------\n"  # NOTE(review): not referenced in the visible code
+grand_separator = "\n\n---------------------------------------\n\n"  # joins the messages of the individual checks
+
+
+def validate_xml(input_file: Union[str, Path, etree._ElementTree[Any]]) -> bool:
+    """
+    Validates an XML file against the DSP XSD schema and checks that simple texts contain no XML tags.
+
+    Args:
+        input_file: path to the XML file to be validated, or parsed ElementTree
+
+    Raises:
+        InputError: if the XML file cannot be parsed, or if it fails at least one of the checks
+
+    Returns:
+        True if the XML file is valid
+    """
+    data_xml, xmlschema = _parse_schema_and_data_files(input_file)
+
+    # run every check before raising, so that all problems end up in one error message
+    problems = []
+    all_good, msg = _validate_xml_against_schema(xmlschema, data_xml)
+    if not all_good:
+        problems.append(msg)
+
+    xml_no_namespace = remove_namespaces_from_xml(data_xml)
+    all_good, msg = _find_xml_tags_in_simple_text_elements(xml_no_namespace)
+    if not all_good:
+        problems.append(msg)
+
+    if problems:
+        err_msg = grand_separator.join(problems)
+        # no exc_info here: no exception is being handled, so it would only log "NoneType: None"
+        logger.error(err_msg)
+        raise InputError(err_msg)
+
+    logger.info("The XML file is syntactically correct and passed validation.")
+    print(f"{datetime.now()}: The XML file is syntactically correct and passed validation.")
+    return True
+
+
+def _parse_schema_and_data_files(
+    input_file: Union[str, Path, etree._ElementTree[Any]],
+) -> tuple[Union[etree._ElementTree[etree._Element], etree._Element], etree.XMLSchema]:
+    """Load the bundled data.xsd schema and parse input_file if it is a path; return (data tree, schema)."""
+    xsd_path = importlib.resources.files("dsp_tools").joinpath("resources/schema/data.xsd")
+    with xsd_path.open(encoding="utf-8") as xsd:
+        schema = etree.XMLSchema(etree.parse(xsd))
+    if not isinstance(input_file, (str, Path)):
+        # not a path: hand the given object back unchanged
+        return input_file, schema
+    try:
+        return etree.parse(source=input_file), schema
+    except etree.XMLSyntaxError as err:
+        syntax_msg = f"The XML file contains the following syntax error: {err.msg}"
+        logger.error(syntax_msg, exc_info=True)
+        raise InputError(syntax_msg) from None
+
+
+def _validate_xml_against_schema(
+    xmlschema: etree.XMLSchema, data_xml: Union[etree._ElementTree[etree._Element], etree._Element]
+) -> tuple[bool, str]:
+    """Return (True, "") if data_xml conforms to xmlschema, else (False, message with one line per error)."""
+    if xmlschema.validate(data_xml):
+        return True, ""
+    header = "The XML file cannot be uploaded due to the following validation error(s):"
+    details = [f"{separator}Line {err.line}: {err.message}" for err in xmlschema.error_log]
+    # drop the "{namespace}" prefix that precedes the element names in the messages
+    return False, (header + "".join(details)).replace("{https://dasch.swiss/schema}", "")
+
+
+def _find_xml_tags_in_simple_text_elements(
+    xml_no_namespace: Union[etree._ElementTree[etree._Element], etree._Element],
+) -> tuple[bool, str]:
+    """
+    Makes sure that there are no XML tags in simple texts.
+    This can only be done with a regex,
+    because even if the simple text contains some XML tags,
+    the simple text itself is not valid XML that could be parsed.
+    The extra challenge is that lxml transforms
+    "pebble (&lt;2cm) and boulder (&gt;20cm)" into
+    "pebble (<2cm) and boulder (>20cm)"
+    (but only if &gt; follows &lt;).
+    This forces us to write a regex that carefully distinguishes
+    between a real tag (which is not allowed) and a false-positive-tag.
+
+    Args:
+        xml_no_namespace: parsed XML file with the namespaces removed
+
+    Returns:
+        (True, "") if no simple text contains XML tags, else (False, message listing the offending resources)
+    """
+    resources_with_illegal_xml_tags = []
+    for text in xml_no_namespace.findall(path="resource/text-prop/text"):
+        regex_finds_tags = bool(regex.search(r'<([a-zA-Z/"]+|[^\s0-9].*[^\s0-9])>', str(text.text)))
+        etree_finds_tags = bool(list(text.iterchildren()))
+        # use .get(): this check also runs on files that failed the schema validation, so attributes may be missing
+        if text.attrib.get("encoding") == "utf8" and (regex_finds_tags or etree_finds_tags):
+            sourceline = f"line {text.sourceline}: " if text.sourceline else ""
+            propname = text.getparent().attrib.get("name", "<no name>")  # type: ignore[union-attr]
+            resname = text.getparent().getparent().attrib.get("id", "<no id>")  # type: ignore[union-attr]
+            resources_with_illegal_xml_tags.append(f"{sourceline}resource '{resname}', property '{propname}'")
+    if resources_with_illegal_xml_tags:
+        err_msg = (
+            "XML-tags are not allowed in text properties with encoding=utf8.\n"
+            "The following resources of your XML file violate this rule:"
+        )
+        err_msg += list_separator + list_separator.join(resources_with_illegal_xml_tags)
+        return False, err_msg
+    return True, ""
diff --git a/src/dsp_tools/utils/xml_utils.py b/src/dsp_tools/utils/xml_utils.py
@@ -103,3 +103,22 @@ def _parse_xml_file(input_file: Union[str, Path]) -> etree._ElementTree[etree._E
     """
     parser = etree.XMLParser(remove_comments=True, remove_pis=True)
     return etree.parse(source=input_file, parser=parser)
+
+
+def remove_namespaces_from_xml(
+    data_xml: Union[etree._ElementTree[etree._Element], etree._Element],
+) -> Union[etree._ElementTree[etree._Element], etree._Element]:
+    """
+    Creates a deep copy of the XML and reduces every element tag to its local name (the input is not modified).
+
+    Args:
+        data_xml: parsed XML with namespaces
+
+    Returns:
+        the copy of the XML without the namespaces in its tags
+    """
+    non_elements = (etree._Comment, etree._ProcessingInstruction)
+    stripped = copy.deepcopy(data_xml)
+    for node in (n for n in stripped.iter() if not isinstance(n, non_elements)):
+        node.tag = etree.QName(node).localname
+    return stripped
diff --git a/test/unittests/cli/test_cli_with_mock.py b/test/unittests/cli/test_cli_with_mock.py
@@ -79,7 +79,7 @@ def test_project_get(get_project: Mock) -> None:
     )
 
 
-@patch("dsp_tools.cli.call_action.validate_xml_against_schema")
+@patch("dsp_tools.cli.call_action.validate_xml")
 def test_xmlupload_validate(validate_xml: Mock) -> None:
     """Test the 'dsp-tools xmlupload --validate-only' command"""
     file = "filename.xml"