Skip to content

Commit

Permalink
feat(xmlupload): add check if links point to valid resource ID (DEV-2902
Browse files Browse the repository at this point in the history
) (#639)
  • Loading branch information
jnussbaum committed Nov 15, 2023
1 parent a49fa59 commit 803c9f6
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 2 deletions.
53 changes: 51 additions & 2 deletions src/dsp_tools/commands/xmlupload/read_validate_xml_file.py
Expand Up @@ -3,10 +3,12 @@
from pathlib import Path
from typing import Any, Union

import regex
from lxml import etree

from dsp_tools.models.exceptions import UserError
from dsp_tools.utils.create_logger import get_logger
from dsp_tools.utils.iri_util import is_resource_iri
from dsp_tools.utils.shared import validate_xml_against_schema
from dsp_tools.utils.xml_utils import parse_and_clean_xml_file

Expand Down Expand Up @@ -34,15 +36,62 @@ def validate_and_parse_xml_file(
"""
validate_xml_against_schema(input_file=input_file)
root = parse_and_clean_xml_file(input_file=input_file)
_check_if_link_targets_exist(root)
if not preprocessing_done:
check_if_bitstreams_exist(root=root, imgdir=imgdir)
_check_if_bitstreams_exist(root=root, imgdir=imgdir)
shortcode = root.attrib["shortcode"]
default_ontology = root.attrib["default-ontology"]
logger.info(f"Validated and parsed the XML file. {shortcode=:} and {default_ontology=:}")
return default_ontology, root, shortcode


def check_if_bitstreams_exist(
def _check_if_link_targets_exist(root: etree._Element) -> None:
    """
    Verify that every link in the XML document has a valid target.

    Two kinds of links are inspected: resptr links and salsah-links.
    A target is accepted if it is the ID of a resource in the present XML file,
    or if it is a resource IRI.

    Args:
        root: parsed XML file

    Raises:
        UserError: if at least one link target is invalid
            (the message lists all offending links, one per line)
    """
    # resptr problems are listed first, followed by the salsah-link problems
    problems = [*_check_if_resptr_targets_exist(root), *_check_if_salsah_targets_exist(root)]
    if not problems:
        return
    bullet = "\n - "
    raise UserError(
        f"It is not possible to upload the XML file, because it contains invalid links:{bullet}"
        + bullet.join(problems)
    )


def _check_if_resptr_targets_exist(root: etree._Element) -> list[str]:
    """
    Collect all <resptr> links whose target is neither the ID of a <resource>
    in this XML file nor a resource IRI.

    Args:
        root: parsed XML file

    Returns:
        one error message per invalid link, in document order
        (an empty list if all links are valid)
    """
    # A set gives constant-time membership tests; the former list was scanned once per link.
    resource_ids = {res.attrib["id"] for res in root.iter("resource")}
    errors = []
    for link in root.iter("resptr"):
        # The (more expensive) IRI check is only needed if the target is not a known ID.
        if link.text in resource_ids or is_resource_iri(str(link.text)):
            continue
        # The nearest enclosing property and resource identify the link in the error message.
        prop_name = next(link.iterancestors(tag="resptr-prop")).attrib["name"]
        res_id = next(link.iterancestors(tag="resource")).attrib["id"]
        errors.append(f"Resource '{res_id}', property '{prop_name}' has an invalid link target '{link.text}'")
    return errors


def _check_if_salsah_targets_exist(root: etree._Element) -> list[str]:
    """
    Collect all salsah-links (<a class="salsah-link" href="...">) whose target is neither
    the ID of a <resource> in this XML file nor a resource IRI.

    IDs are written as "IRI:<id>:IRI" in the href attribute;
    the "IRI:" / ":IRI" markers are removed before the ID is looked up.

    Args:
        root: parsed XML file

    Returns:
        one error message per invalid link, in document order
        (an empty list if all links are valid)
    """
    # A set gives constant-time membership tests; the former list was scanned once per link.
    resource_ids = {res.attrib["id"] for res in root.iter("resource")}
    errors = []
    for link in root.iter("a"):
        # Only <a> elements marked as salsah-links are references to resources.
        # Other <a> elements (e.g. hyperlinks to external websites) must not be reported as invalid resource links.
        if link.attrib.get("class") != "salsah-link":
            continue
        # A salsah-link without href is reported as invalid (empty target) instead of raising a KeyError.
        href = str(link.attrib.get("href", ""))
        if regex.sub(r"IRI:|:IRI", "", href) in resource_ids or is_resource_iri(href):
            continue
        # The nearest enclosing property and resource identify the link in the error message.
        prop_name = next(link.iterancestors(tag="text-prop")).attrib["name"]
        res_id = next(link.iterancestors(tag="resource")).attrib["id"]
        errors.append(f"Resource '{res_id}', property '{prop_name}' has an invalid link target '{href}'")
    return errors


def _check_if_bitstreams_exist(
root: etree._Element,
imgdir: str,
) -> None:
Expand Down
98 changes: 98 additions & 0 deletions test/unittests/commands/xmlupload/test_read_validate_xml_file.py
@@ -0,0 +1,98 @@
from lxml import etree

from dsp_tools.commands.xmlupload.read_validate_xml_file import (
_check_if_resptr_targets_exist,
_check_if_salsah_targets_exist,
)


def test_check_if_resptr_targets_exist() -> None:
    """Two resources that point to each other via resptr: no error must be reported."""
    document = """
<knora>
<resource id="resource1">
<resptr-prop name="resptr1"><resptr>resource2</resptr></resptr-prop>
</resource>
<resource id="resource2">
<resptr-prop name="resptr2"><resptr>resource1</resptr></resptr-prop>
</resource>
</knora>
"""
    parsed = etree.fromstring(document)
    assert not _check_if_resptr_targets_exist(parsed)


def test_check_if_resptr_targets_exist_invalid() -> None:
    """Both resptr links point to IDs that are not defined in the document: one error per link."""
    document = """
<knora>
<resource id="resource1">
<resptr-prop name="resptr1"><resptr>resource3</resptr></resptr-prop>
</resource>
<resource id="resource2">
<resptr-prop name="resptr2"><resptr>resource4</resptr></resptr-prop>
</resource>
</knora>
"""
    expected = [
        "Resource 'resource1', property 'resptr1' has an invalid link target 'resource3'",
        "Resource 'resource2', property 'resptr2' has an invalid link target 'resource4'",
    ]
    actual = _check_if_resptr_targets_exist(etree.fromstring(document))
    assert actual == expected


def test_check_if_salsah_targets_exist() -> None:
    """Two resources that point to each other via salsah-links: no error must be reported."""
    document = """
<knora>
<resource id="resource1">
<text-prop name="text1">
<text encoding="xml">
<a class="salsah-link" href="IRI:resource2:IRI">resource2</a>
</text>
</text-prop>
</resource>
<resource id="resource2">
<text-prop name="text2">
<text encoding="xml">
<a class="salsah-link" href="IRI:resource1:IRI">resource1</a>
</text>
</text-prop>
</resource>
</knora>
"""
    parsed = etree.fromstring(document)
    assert not _check_if_salsah_targets_exist(parsed)


def test_check_if_salsah_targets_exist_invalid() -> None:
    """Both salsah-links point to IDs that are not defined in the document: one error per link."""
    document = """
<knora>
<resource id="resource1">
<text-prop name="text1">
<text encoding="xml">
<a class="salsah-link" href="IRI:resource3:IRI">resource3</a>
</text>
</text-prop>
</resource>
<resource id="resource2">
<text-prop name="text2">
<text encoding="xml">
<a class="salsah-link" href="IRI:resource4:IRI">resource4</a>
</text>
</text-prop>
</resource>
</knora>
"""
    expected = [
        "Resource 'resource1', property 'text1' has an invalid link target 'IRI:resource3:IRI'",
        "Resource 'resource2', property 'text2' has an invalid link target 'IRI:resource4:IRI'",
    ]
    actual = _check_if_salsah_targets_exist(etree.fromstring(document))
    assert actual == expected

0 comments on commit 803c9f6

Please sign in to comment.