Skip to content

Commit

Permalink
refactor(xmlupload): modularise individual functions in file read_val…
Browse files Browse the repository at this point in the history
…idate_xml_file.py (DEV-2767) (#541)
  • Loading branch information
Nora-Olivia-Ammann committed Oct 4, 2023
1 parent 6bad8d0 commit dc5c5a7
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 52 deletions.
4 changes: 2 additions & 2 deletions src/dsp_tools/utils/id2iri.py
Expand Up @@ -7,7 +7,7 @@

from dsp_tools.models.exceptions import UserError
from dsp_tools.utils.create_logger import get_logger
from dsp_tools.utils.xmlupload.read_validate_xml_file import parse_xml_file
from dsp_tools.utils.xml_utils import parse_and_clean_xml_file

logger = get_logger(__name__)

Expand Down Expand Up @@ -231,7 +231,7 @@ def id2iri(
"""
xml_file_as_path, json_file_as_path = _check_input_parameters(xml_file=xml_file, json_file=json_file)
mapping = _parse_json_file(json_file_as_path)
tree = parse_xml_file(xml_file_as_path)
tree = parse_and_clean_xml_file(xml_file_as_path)
tree = _replace_ids_by_iris(
tree=tree,
mapping=mapping,
Expand Down
105 changes: 105 additions & 0 deletions src/dsp_tools/utils/xml_utils.py
@@ -0,0 +1,105 @@
from __future__ import annotations

import copy
from pathlib import Path
from typing import Any, Union

from lxml import etree

from dsp_tools.utils.create_logger import get_logger

logger = get_logger(__name__)


def parse_and_clean_xml_file(input_file: Union[str, Path, etree._ElementTree[Any]]) -> etree._Element:
    """
    Turn DSP-conform XML data into a cleaned root element.

    Depending on the type of the input, the data is either parsed from disk
    or taken over from an already parsed tree.
    In both cases, comments and processing instructions are removed
    (commented out properties break the XMLProperty constructor).
    Afterwards, the namespace URIs are stripped from the elements' names,
    and the special tags <annotation>, <region>, and <link> are rewritten
    to their technically correct form
    <resource restype="Annotation">, <resource restype="Region">, and <resource restype="LinkObj">.

    Args:
        input_file: path to the XML file (str or Path), or parsed ElementTree.
            Anything that is neither a str nor a Path is handled as a parsed ElementTree.

    Returns:
        the root element of the cleaned XML tree
    """
    if not isinstance(input_file, (str, Path)):
        # already parsed: strip comments and processing instructions from the tree
        parsed = _remove_comments_from_element_tree(input_file)
    else:
        # path on disk: parse the file into a tree
        parsed = _parse_xml_file(input_file)

    # modifies the elements of the tree in place
    _remove_qnames_and_transform_special_tags(parsed)

    root = parsed.getroot()
    return root


def _remove_qnames_and_transform_special_tags(
    input_tree: etree._ElementTree[etree._Element],
) -> etree._ElementTree[etree._Element]:
    """
    Strip the namespace URI from every element name of the tree
    and rewrite the special tags <annotation>, <link>, and <region>
    to their technically correct form
    <resource restype="Annotation">, <resource restype="LinkObj">, and <resource restype="Region">.
    The elements are modified in place, and the very same tree object is handed back.

    Args:
        input_tree: unclean tree

    Returns:
        cleaned tree (the same object as input_tree)
    """
    # local name of a special tag -> value of the "restype" attribute it stands for
    restype_of_special_tag = {
        "annotation": "Annotation",
        "link": "LinkObj",
        "region": "Region",
    }
    for node in input_tree.iter():
        local_name = etree.QName(node).localname  # element name without namespace URI
        restype = restype_of_special_tag.get(local_name)
        if restype is None:
            # ordinary element: only the namespace URI is dropped
            node.tag = local_name
            continue
        node.attrib["restype"] = restype
        node.tag = "resource"
    return input_tree


def _remove_comments_from_element_tree(
    input_tree: etree._ElementTree[etree._Element],
) -> etree._ElementTree[etree._Element]:
    """
    This function removes comments and processing instructions.
    Commented out properties break the XMLProperty constructor.

    The input tree is left untouched: the cleaning is done on a deep copy.
    Text that directly follows a removed node (its tail) is kept
    and attached to the preceding sibling or, if there is none, to the parent's text.
    Comments and processing instructions without a parent
    (i.e. located outside of the root element) cannot be detached and are left where they are.

    Args:
        input_tree: etree that will be cleaned

    Returns:
        clean etree (a copy of the input)
    """
    tree = copy.deepcopy(input_tree)
    # same order as before: first all comments, then all processing instructions
    for query in ("//comment()", "//processing-instruction()"):
        for node in tree.xpath(query):
            parent = node.getparent()
            if parent is None:
                # node outside of the root element: there is no parent to remove it from
                continue
            if node.tail:
                # lxml's remove() drops the tail together with the node,
                # so the text after the node must be rescued beforehand
                previous = node.getprevious()
                if previous is None:
                    parent.text = (parent.text or "") + node.tail
                else:
                    previous.tail = (previous.tail or "") + node.tail
            parent.remove(node)
    return tree


def _parse_xml_file(input_file: Union[str, Path]) -> etree._ElementTree[etree._Element]:
    """
    Read an XML file from disk into an element tree.
    The parser is configured to drop comments and processing instructions while reading.

    Args:
        input_file: path to the input file

    Returns:
        element tree
    """
    return etree.parse(
        source=input_file,
        parser=etree.XMLParser(remove_comments=True, remove_pis=True),
    )
58 changes: 23 additions & 35 deletions src/dsp_tools/utils/xmlupload/read_validate_xml_file.py
@@ -1,6 +1,5 @@
from __future__ import annotations

import copy
from pathlib import Path
from typing import Any, Union

Expand All @@ -9,50 +8,39 @@
from dsp_tools.models.exceptions import UserError
from dsp_tools.models.xmlresource import XMLResource
from dsp_tools.utils.create_logger import get_logger
from dsp_tools.utils.shared import validate_xml_against_schema
from dsp_tools.utils.xml_utils import parse_and_clean_xml_file

logger = get_logger(__name__)


def parse_xml_file(input_file: Union[str, Path, etree._ElementTree[Any]]) -> etree._Element:
def validate_and_parse_xml_file(
imgdir: str,
input_file: Union[str, Path, etree._ElementTree[Any]],
preprocessing_done: bool,
) -> tuple[str, etree._Element, str]:
"""
Parse an XML file with DSP-conform data,
remove namespace URI from the elements' names,
and transform the special tags <annotation>, <region>, and <link>
to their technically correct form
<resource restype="Annotation">, <resource restype="Region">, and <resource restype="LinkObj">.
This function takes an element tree or a path to an XML file.
It validates the file against the XML schema.
It checks if all the mentioned bitstream files are in the specified location.
It retrieves the shortcode and default ontology from the XML file.
Args:
input_file: path to the XML file, or parsed ElementTree
imgdir: directory to the bitstream files
input_file: file or etree that will be processed
preprocessing_done: True if the bitstream files have already been processed
Returns:
the root element of the parsed XML file
The ontology name, the parsed XML file and the shortcode of the project
"""

# remove comments and processing instructions (commented out properties break the XMLProperty constructor)
if isinstance(input_file, (str, Path)):
parser = etree.XMLParser(remove_comments=True, remove_pis=True)
tree = etree.parse(source=input_file, parser=parser)
else:
tree = copy.deepcopy(input_file)
for c in tree.xpath("//comment()"):
c.getparent().remove(c)
for c in tree.xpath("//processing-instruction()"):
c.getparent().remove(c)

# remove namespace URI from the elements' names and transform the special tags to their technically correct form
for elem in tree.iter():
elem.tag = etree.QName(elem).localname # remove namespace URI in the element's name
if elem.tag == "annotation":
elem.attrib["restype"] = "Annotation"
elem.tag = "resource"
elif elem.tag == "link":
elem.attrib["restype"] = "LinkObj"
elem.tag = "resource"
elif elem.tag == "region":
elem.attrib["restype"] = "Region"
elem.tag = "resource"

return tree.getroot()
validate_xml_against_schema(input_file=input_file)
root = parse_and_clean_xml_file(input_file=input_file)
if not preprocessing_done:
check_if_bitstreams_exist(root=root, imgdir=imgdir)
shortcode = root.attrib["shortcode"]
default_ontology = root.attrib["default-ontology"]
logger.info(f"Validated and parsed the XML file. Shortcode='{shortcode}' and default_ontology='{default_ontology}'")
return default_ontology, root, shortcode


def _check_if_onto_name_exists(
Expand Down
18 changes: 7 additions & 11 deletions src/dsp_tools/utils/xmlupload/xmlupload.py
Expand Up @@ -22,12 +22,11 @@
from dsp_tools.models.xmlproperty import XMLProperty
from dsp_tools.models.xmlresource import XMLResource
from dsp_tools.utils.create_logger import get_logger
from dsp_tools.utils.shared import login, try_network_action, validate_xml_against_schema
from dsp_tools.utils.shared import login, try_network_action
from dsp_tools.utils.xmlupload.ark2iri import convert_ark_v0_to_resource_iri
from dsp_tools.utils.xmlupload.read_validate_xml_file import (
check_consistency_with_ontology,
check_if_bitstreams_exist,
parse_xml_file,
validate_and_parse_xml_file,
)
from dsp_tools.utils.xmlupload.stash_circular_references import remove_circular_references
from dsp_tools.utils.xmlupload.upload_stashed_resptr_props import (
Expand Down Expand Up @@ -79,14 +78,11 @@ def xmlupload(
True if all resources could be uploaded without errors; False if one of the resources could not be
uploaded because there is an error in it
"""
# parse the XML file
validate_xml_against_schema(input_file=input_file)
root = parse_xml_file(input_file=input_file)
if not preprocessing_done:
check_if_bitstreams_exist(root=root, imgdir=imgdir)
shortcode = root.attrib["shortcode"]
default_ontology = root.attrib["default-ontology"]
logger.info(f"Validated and parsed the XML file. Shortcode='{shortcode}' and default_ontology='{default_ontology}'")
default_ontology, root, shortcode = validate_and_parse_xml_file(
input_file=input_file,
imgdir=imgdir,
preprocessing_done=preprocessing_done,
)

# determine save location that will be used for diagnostic info if the xmlupload is interrupted
save_location, server_as_foldername, timestamp_str = determine_save_location_of_diagnostic_info(
Expand Down
8 changes: 4 additions & 4 deletions test/unittests/test_xmlupload/test_xmlupload.py
Expand Up @@ -10,8 +10,8 @@

from dsp_tools.models.exceptions import BaseError
from dsp_tools.models.xmlresource import XMLResource
from dsp_tools.utils.xml_utils import parse_and_clean_xml_file
from dsp_tools.utils.xmlupload.ark2iri import convert_ark_v0_to_resource_iri
from dsp_tools.utils.xmlupload.read_validate_xml_file import parse_xml_file
from dsp_tools.utils.xmlupload.stash_circular_references import remove_circular_references
from dsp_tools.utils.xmlupload.write_diagnostic_info import (
_transform_server_url_to_foldername,
Expand Down Expand Up @@ -62,8 +62,8 @@ def test_determine_save_location_of_logs(self) -> None:

def test_parse_xml_file(self) -> None:
test_data_systematic_tree = etree.parse("testdata/xml-data/test-data-systematic.xml")
output1 = parse_xml_file("testdata/xml-data/test-data-systematic.xml")
output2 = parse_xml_file(test_data_systematic_tree)
output1 = parse_and_clean_xml_file("testdata/xml-data/test-data-systematic.xml")
output2 = parse_and_clean_xml_file(test_data_systematic_tree)
result1 = regex.sub("\n", "", etree.tostring(output1, encoding=str))
result1 = regex.sub(" +", " ", result1)
result2 = regex.sub("\n", "", etree.tostring(output2, encoding=str))
Expand Down Expand Up @@ -119,7 +119,7 @@ def test_convert_ark_v0_to_resource_iri(self) -> None:

def test_remove_circular_references(self) -> None:
# create a list of XMLResources from the test data file
root = parse_xml_file("testdata/xml-data/test-data-systematic.xml")
root = parse_and_clean_xml_file("testdata/xml-data/test-data-systematic.xml")
resources = [XMLResource(x, "testonto") for x in root if x.tag == "resource"]

# get the purged resources and the stashes from the function to be tested
Expand Down

0 comments on commit dc5c5a7

Please sign in to comment.