Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: refactor validate xml in preparation for extension #814

4 changes: 2 additions & 2 deletions src/dsp_tools/cli/call_action.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from dsp_tools.commands.xmlupload.upload_config import DiagnosticsConfig, UploadConfig
from dsp_tools.commands.xmlupload.xmlupload import xmlupload
from dsp_tools.utils.create_logger import get_logger
from dsp_tools.utils.shared import validate_xml_against_schema
from dsp_tools.utils.validate_data_xml import validate_xml

logger = get_logger(__name__)

Expand Down Expand Up @@ -192,7 +192,7 @@ def _call_process_files(args: argparse.Namespace) -> bool:

def _call_xmlupload(args: argparse.Namespace) -> bool:
if args.validate_only:
return validate_xml_against_schema(args.xmlfile)
return validate_xml(args.xmlfile)
else:
return xmlupload(
input_file=args.xmlfile,
Expand Down
5 changes: 3 additions & 2 deletions src/dsp_tools/commands/excel2xml/excel2xml_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@
from dsp_tools.models.datetimestamp import DateTimeStamp
from dsp_tools.models.exceptions import BaseError
from dsp_tools.utils.date_util import is_full_date
from dsp_tools.utils.shared import check_notna, simplify_name, validate_xml_against_schema
from dsp_tools.utils.shared import check_notna, simplify_name
from dsp_tools.utils.uri_util import is_uri
from dsp_tools.utils.validate_data_xml import validate_xml

# ruff: noqa: E501 (line-too-long)

Expand Down Expand Up @@ -1939,7 +1940,7 @@ def write_xml(
with open(filepath, "w", encoding="utf-8") as f:
f.write(xml_string)
try:
validate_xml_against_schema(input_file=filepath)
validate_xml(input_file=filepath)
print(f"The XML file was successfully saved to {filepath}")
except BaseError as err:
warnings.warn(
Expand Down
4 changes: 2 additions & 2 deletions src/dsp_tools/commands/xmlupload/read_validate_xml_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from dsp_tools.models.exceptions import UserError
from dsp_tools.utils.create_logger import get_logger
from dsp_tools.utils.iri_util import is_resource_iri
from dsp_tools.utils.shared import validate_xml_against_schema
from dsp_tools.utils.validate_data_xml import validate_xml
from dsp_tools.utils.xml_utils import parse_and_clean_xml_file

logger = get_logger(__name__)
Expand All @@ -34,7 +34,7 @@ def validate_and_parse_xml_file(
Returns:
The ontology name, the parsed XML file and the shortcode of the project
"""
validate_xml_against_schema(input_file=input_file)
validate_xml(input_file=input_file)
root = parse_and_clean_xml_file(input_file=input_file)
_check_if_link_targets_exist(root)
if not preprocessing_done:
Expand Down
99 changes: 1 addition & 98 deletions src/dsp_tools/utils/shared.py
Original file line number Diff line number Diff line change
@@ -1,117 +1,20 @@
from __future__ import annotations

import copy
import importlib.resources
import json
import unicodedata
from datetime import datetime
from pathlib import Path
from typing import Any, Optional, TypeGuard, Union

import pandas as pd
import regex
from lxml import etree

from dsp_tools.commands.excel2xml.propertyelement import PropertyElement
from dsp_tools.models.exceptions import BaseError, UserError
from dsp_tools.models.exceptions import BaseError
from dsp_tools.utils.create_logger import get_logger

logger = get_logger(__name__)


def validate_xml_against_schema(input_file: Union[str, Path, etree._ElementTree[Any]]) -> bool:
    """
    Check an XML document against the DSP XSD schema bundled with dsp_tools,
    then make sure that simple texts do not contain XML tags.

    Args:
        input_file: path to the XML file to be validated, or an already parsed ElementTree

    Raises:
        UserError: if the file has a syntax error, violates the schema,
            or contains XML tags in a simple text

    Returns:
        True if the XML file is valid
    """
    xsd_resource = importlib.resources.files("dsp_tools").joinpath("resources/schema/data.xsd")
    with xsd_resource.open(encoding="utf-8") as xsd_handle:
        schema = etree.XMLSchema(etree.parse(xsd_handle))

    if isinstance(input_file, (str, Path)):
        # a path was given: parse it, turning syntax errors into a user-facing error
        try:
            parsed = etree.parse(source=input_file)
        except etree.XMLSyntaxError as err:
            syntax_msg = f"The XML file contains the following syntax error: {err.msg}"
            logger.error(syntax_msg, exc_info=True)
            raise UserError(syntax_msg) from None
    else:
        parsed = input_file

    if not schema.validate(parsed):
        header = "The XML file cannot be uploaded due to the following validation error(s):"
        details = "".join(f"\n Line {entry.line}: {entry.message}" for entry in schema.error_log)
        # strip the schema namespace so that the element names in the report stay readable
        report = (header + details).replace("{https://dasch.swiss/schema}", "")
        logger.error(report)
        raise UserError(report)

    # make sure there are no XML tags in simple texts
    _validate_xml_tags_in_text_properties(parsed)

    success_msg = "The XML file is syntactically correct and passed validation."
    logger.info(success_msg)
    print(f"{datetime.now()}: {success_msg}")
    return True


def _validate_xml_tags_in_text_properties(doc: Union[etree._ElementTree[etree._Element], etree._Element]) -> bool:
    """
    Makes sure that there are no XML tags in simple texts.
    This can only be done with a regex,
    because even if the simple text contains some XML tags,
    the simple text itself is not valid XML that could be parsed.
    The extra challenge is that lxml transforms
    "pebble (&lt;2cm) and boulder (&gt;20cm)" into
    "pebble (<2cm) and boulder (>20cm)"
    (but only if &gt; follows &lt;).
    This forces us to write a regex that carefully distinguishes
    between a real tag (which is not allowed) and a false-positive-tag.

    Args:
        doc: parsed XML file

    Raises:
        UserError: if there is an XML tag in one of the simple texts

    Returns:
        True if there are no XML tags in the simple texts
    """
    # first: remove namespaces (on a copy, so that the caller's document stays untouched)
    doc_without_namespace = copy.deepcopy(doc)
    for elem in doc_without_namespace.iter():
        if not isinstance(elem, (etree._Comment, etree._ProcessingInstruction)):
            elem.tag = etree.QName(elem).localname

    # then: make the test
    resources_with_illegal_xml_tags = []
    for text in doc_without_namespace.findall(path="resource/text-prop/text"):
        regex_finds_tags = bool(regex.search(r'<([a-zA-Z/"]+|[^\s0-9].*[^\s0-9])>', str(text.text)))
        # tags that lxml was able to parse show up as child elements of <text>
        etree_finds_tags = bool(list(text.iterchildren()))
        has_tags = regex_finds_tags or etree_finds_tags
        if text.attrib["encoding"] == "utf8" and has_tags:
            sourceline = f" line {text.sourceline}: " if text.sourceline else " "
            propname = text.getparent().attrib["name"]  # type: ignore[union-attr]
            resname = text.getparent().getparent().attrib["id"]  # type: ignore[union-attr]
            resources_with_illegal_xml_tags.append(f" -{sourceline}resource '{resname}', property '{propname}'")
    if resources_with_illegal_xml_tags:
        err_msg = (
            "XML-tags are not allowed in text properties with encoding=utf8. "
            "The following resources of your XML file violate this rule:\n"
        )
        err_msg += "\n".join(resources_with_illegal_xml_tags)
        # No exception is being handled at this point,
        # so exc_info=True would only append a meaningless "NoneType: None" to the log entry.
        logger.error(err_msg)
        raise UserError(err_msg)

    return True


def prepare_dataframe(
df: pd.DataFrame,
required_columns: list[str],
Expand Down
128 changes: 128 additions & 0 deletions src/dsp_tools/utils/validate_data_xml.py
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
from __future__ import annotations

import importlib.resources
from datetime import datetime
from pathlib import Path
from typing import Any, Union

import regex
from lxml import etree

from dsp_tools.models.exceptions import InputError
from dsp_tools.utils.create_logger import get_logger
from dsp_tools.utils.xml_utils import remove_namespaces_from_xml

logger = get_logger(__name__)

# Prefix put in front of every schema error line that _validate_xml_against_schema appends to its message.
separator = "\n "
# Prefix put in front of every offending resource listed by _find_xml_tags_in_simple_text_elements.
list_separator = "\n - "
# NOTE(review): not referenced by the code visible here; presumably reserved for a later extension.
medium_separator = "\n----------------------------\n"
# Delimiter between the independent problem reports that validate_xml joins into one error message.
grand_separator = "\n\n---------------------------------------\n\n"


def validate_xml(input_file: Union[str, Path, etree._ElementTree[Any]]) -> bool:
    """
    Validates an XML file against the DSP XSD schema
    and checks that simple texts do not contain XML tags.

    Both checks are always executed, and their findings are reported together,
    so that a single run shows every problem that was detected.

    Args:
        input_file: path to the XML file to be validated, or parsed ElementTree

    Raises:
        InputError: if the XML file cannot be parsed, or if at least one of the checks fails

    Returns:
        True if the XML file is valid
    """
    data_xml, xmlschema = _parse_schema_and_data_files(input_file)

    problems = []

    all_good, msg = _validate_xml_against_schema(xmlschema, data_xml)
    if not all_good:
        problems.append(msg)

    # the tag check addresses elements by their local name, hence the namespace-free copy
    xml_no_namespace = remove_namespaces_from_xml(data_xml)

    all_good, msg = _find_xml_tags_in_simple_text_elements(xml_no_namespace)
    if not all_good:
        problems.append(msg)

    if problems:
        err_msg = grand_separator.join(problems)
        # No exception is being handled at this point,
        # so exc_info=True would only append a meaningless "NoneType: None" to the log entry.
        logger.error(err_msg)
        raise InputError(err_msg)

    logger.info("The XML file is syntactically correct and passed validation.")
    print(f"{datetime.now()}: The XML file is syntactically correct and passed validation.")
    return True


def _parse_schema_and_data_files(
    input_file: Union[str, Path, etree._ElementTree[Any]],
) -> tuple[Union[etree._ElementTree[etree._Element], etree._Element], etree.XMLSchema]:
    """
    Load the DSP XSD schema bundled with dsp_tools and, if necessary, parse the data XML.

    Args:
        input_file: path to the XML file, or an already parsed ElementTree (which is passed through unchanged)

    Raises:
        InputError: if the file at the given path contains an XML syntax error

    Returns:
        The parsed data XML and the compiled XML schema
    """
    xsd_resource = importlib.resources.files("dsp_tools").joinpath("resources/schema/data.xsd")
    with xsd_resource.open(encoding="utf-8") as xsd_handle:
        schema = etree.XMLSchema(etree.parse(xsd_handle))

    # an already parsed tree needs no further treatment
    if not isinstance(input_file, (str, Path)):
        return input_file, schema

    try:
        parsed = etree.parse(source=input_file)
    except etree.XMLSyntaxError as err:
        syntax_msg = f"The XML file contains the following syntax error: {err.msg}"
        logger.error(syntax_msg, exc_info=True)
        raise InputError(syntax_msg) from None
    return parsed, schema


def _validate_xml_against_schema(
    xmlschema: etree.XMLSchema, data_xml: Union[etree._ElementTree[etree._Element], etree._Element]
) -> tuple[bool, str]:
    """
    Validate the parsed data XML against the compiled XML schema.

    Args:
        xmlschema: compiled XML schema
        data_xml: parsed XML file

    Returns:
        (True, "") if the document conforms to the schema,
        otherwise (False, message), where the message lists every entry of the schema's error log
    """
    if xmlschema.validate(data_xml):
        return True, ""

    header = "The XML file cannot be uploaded due to the following validation error(s):"
    details = [f"{separator}Line {entry.line}: {entry.message}" for entry in xmlschema.error_log]
    report = header + "".join(details)
    # strip the schema namespace so that the element names in the report stay readable
    return False, report.replace("{https://dasch.swiss/schema}", "")


def _find_xml_tags_in_simple_text_elements(
    xml_no_namespace: Union[etree._ElementTree[etree._Element], etree._Element],
) -> tuple[bool, str]:
    """
    Makes sure that there are no XML tags in simple texts.
    This can only be done with a regex,
    because even if the simple text contains some XML tags,
    the simple text itself is not valid XML that could be parsed.
    The extra challenge is that lxml transforms
    "pebble (&lt;2cm) and boulder (&gt;20cm)" into
    "pebble (<2cm) and boulder (>20cm)"
    (but only if &gt; follows &lt;).
    This forces us to write a regex that carefully distinguishes
    between a real tag (which is not allowed) and a false-positive-tag.

    Args:
        xml_no_namespace: parsed XML file with the namespaces removed

    Returns:
        (True, "") if there are no XML tags in the simple texts,
        otherwise (False, message), where the message lists the offending resources
    """
    resources_with_illegal_xml_tags = []
    for text in xml_no_namespace.findall(path="resource/text-prop/text"):
        # This check also runs on documents that failed the schema validation,
        # so attributes that the schema requires may be missing: look them up with .get()
        # instead of crashing with a KeyError (the schema report already covers the missing attribute).
        if text.attrib.get("encoding") != "utf8":
            continue
        regex_finds_tags = bool(regex.search(r'<([a-zA-Z/"]+|[^\s0-9].*[^\s0-9])>', str(text.text)))
        # tags that lxml was able to parse show up as child elements of <text>
        etree_finds_tags = bool(list(text.iterchildren()))
        if not (regex_finds_tags or etree_finds_tags):
            continue
        sourceline = f"line {text.sourceline}: " if text.sourceline else " "
        propname = text.getparent().attrib.get("name", "unknown")  # type: ignore[union-attr]
        resname = text.getparent().getparent().attrib.get("id", "unknown")  # type: ignore[union-attr]
        resources_with_illegal_xml_tags.append(f"{sourceline}resource '{resname}', property '{propname}'")
    if resources_with_illegal_xml_tags:
        err_msg = (
            "XML-tags are not allowed in text properties with encoding=utf8.\n"
            "The following resources of your XML file violate this rule:"
        )
        err_msg += list_separator + list_separator.join(resources_with_illegal_xml_tags)
        return False, err_msg
    return True, ""
19 changes: 19 additions & 0 deletions src/dsp_tools/utils/xml_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,22 @@ def _parse_xml_file(input_file: Union[str, Path]) -> etree._ElementTree[etree._E
"""
parser = etree.XMLParser(remove_comments=True, remove_pis=True)
return etree.parse(source=input_file, parser=parser)


def remove_namespaces_from_xml(
    data_xml: Union[etree._ElementTree[etree._Element], etree._Element],
) -> Union[etree._ElementTree[etree._Element], etree._Element]:
    """
    Return a deep copy of the XML in which every element tag is reduced to its local name.
    The XML that was passed in is not modified.

    Args:
        data_xml: parsed XML with namespaces

    Returns:
        a copy of the XML without the namespaces
    """
    stripped = copy.deepcopy(data_xml)
    # comments and processing instructions have no tag name that could be rewritten
    untagged_node_types = (etree._Comment, etree._ProcessingInstruction)
    for node in stripped.iter():
        if isinstance(node, untagged_node_types):
            continue
        node.tag = etree.QName(node).localname
    return stripped
2 changes: 1 addition & 1 deletion test/unittests/cli/test_cli_with_mock.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def test_project_get(get_project: Mock) -> None:
)


@patch("dsp_tools.cli.call_action.validate_xml_against_schema")
@patch("dsp_tools.cli.call_action.validate_xml")
def test_xmlupload_validate(validate_xml: Mock) -> None:
"""Test the 'dsp-tools xmlupload --validate-only' command"""
file = "filename.xml"
Expand Down