Skip to content

Commit

Permalink
feat(xmlupload): reduce stash size of circular references (DEV-2848) (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
jnussbaum committed Oct 27, 2023
1 parent 096f47e commit 228b1f7
Show file tree
Hide file tree
Showing 8 changed files with 126 additions and 116 deletions.
4 changes: 2 additions & 2 deletions src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py
Expand Up @@ -57,7 +57,7 @@ def _create_resptr_link_objects(subject_id: str, resptr_prop: etree._Element) ->
if not regex.search(r"https?://rdfh.ch/[a-fA-F0-9]{4}/[\w-]{22}", resptr.text):
link_object = ResptrLink(subject_id, resptr.text)
# this UUID is so that the links that were stashed can be identified in the XML data file
resptr.attrib["stashUUID"] = link_object.link_uuid
resptr.attrib["linkUUID"] = link_object.link_uuid
resptr_links.append(link_object)
return resptr_links

Expand All @@ -71,7 +71,7 @@ def _create_text_link_objects(subject_id: str, text_prop: etree._Element) -> lis
xml_link = XMLLink(subject_id, links)
xml_props.append(xml_link)
# this UUID is so that the links that were stashed can be identified in the XML data file
text.attrib["stashUUID"] = xml_link.link_uuid
text.attrib["linkUUID"] = xml_link.link_uuid
return xml_props


Expand Down
2 changes: 2 additions & 0 deletions src/dsp_tools/models/xmlvalue.py
Expand Up @@ -13,6 +13,7 @@ class XMLValue: # pylint: disable=too-few-public-methods
resrefs: Optional[list[str]]
comment: Optional[str]
permissions: Optional[str]
link_uuid: Optional[str]

def __init__(
self,
Expand All @@ -37,6 +38,7 @@ def __init__(
self.value = listname + ":" + "".join(node.itertext())
else:
self.value = "".join(node.itertext())
self.link_uuid = node.attrib.get("linkUUID") # not all richtexts have a link, so this attribute is optional

def _cleanup_formatted_text(self, xmlstr_orig: str) -> str:
"""
Expand Down
184 changes: 93 additions & 91 deletions src/dsp_tools/utils/xmlupload/stash_circular_references.py
Expand Up @@ -3,8 +3,15 @@
from typing import cast
from uuid import uuid4

from dsp_tools.models.exceptions import BaseError
from lxml import etree

from dsp_tools.analyse_xml_data.construct_and_analyze_graph import (
create_info_from_xml_for_graph,
generate_upload_order,
make_graph,
)
from dsp_tools.models.value import KnoraStandoffXml
from dsp_tools.models.xmlproperty import XMLProperty
from dsp_tools.models.xmlresource import XMLResource
from dsp_tools.utils.create_logger import get_logger
from dsp_tools.utils.xmlupload.stash.stash_models import (
Expand All @@ -18,122 +25,117 @@
logger = get_logger(__name__)


def _stash_circular_references(
nok_resources: list[XMLResource],
ok_res_ids: set[str],
) -> tuple[list[XMLResource], set[str], list[XMLResource], Stash | None]:
def _stash_standoff(
    res_id: str,
    restype: str,
    link_prop: XMLProperty,
    stash_lookup: dict[str, list[str]],
) -> list[StandoffStashItem]:
    """
    Stash the standoff (rich text) values of one property that are listed in the stash lookup.

    Every value of link_prop whose link_uuid is listed under res_id in stash_lookup
    is recorded in a StandoffStashItem, and its text is replaced in-place
    by a KnoraStandoffXml that contains only a freshly generated UUID.
    Values whose link_uuid is not listed are left untouched.

    Args:
        res_id: ID of the resource that owns the property
        restype: type of the resource that owns the property
        link_prop: the text property whose values are inspected (modified in-place)
        stash_lookup: maps resource IDs to the link UUIDs that must be stashed

    Returns:
        the stash items that were created, in the order of the property's values
    """
    collected: list[StandoffStashItem] = []
    for xml_value in link_prop.values:
        if xml_value.link_uuid in stash_lookup[res_id]:
            # xml_value.value is a KnoraStandoffXml text with problematic links:
            # remember the original text, then put a UUID placeholder in its place
            original_text = cast(KnoraStandoffXml, xml_value.value)
            placeholder = str(uuid4())
            stash_item = StandoffStashItem(
                res_id=res_id,
                res_type=restype,
                uuid=placeholder,
                prop_name=link_prop.name,
                value=original_text,
            )
            xml_value.value = KnoraStandoffXml(placeholder)
            collected.append(stash_item)
    return collected


def _stash_resptr(
    res_id: str,
    restype: str,
    link_prop: XMLProperty,
    stash_lookup: dict[str, list[str]],
) -> list[LinkValueStashItem]:
    """
    Stash the resptr values of one property that are listed in the stash lookup.

    Every value of link_prop whose link_uuid is listed under res_id in stash_lookup
    is recorded in a LinkValueStashItem and then removed from link_prop.values.
    Values whose link_uuid is not listed are left untouched.

    Args:
        res_id: ID of the resource that owns the property
        restype: type of the resource that owns the property
        link_prop: the resptr property whose values are inspected (modified in-place)
        stash_lookup: maps resource IDs to the link UUIDs that must be stashed

    Returns:
        the stash items that were created, in the order of the property's values
    """
    collected: list[LinkValueStashItem] = []
    # iterate over a snapshot, because matching values are removed from the live list
    for xml_value in link_prop.values.copy():
        if xml_value.link_uuid in stash_lookup[res_id]:
            # xml_value.value is the ID of the target resource: remember it, then drop the value
            stash_item = LinkValueStashItem(
                res_id=res_id,
                res_type=restype,
                prop_name=link_prop.name,
                target_id=str(xml_value.value),
            )
            link_prop.values.remove(xml_value)
            collected.append(stash_item)
    return collected


def stash_circular_references(
resources: list[XMLResource],
stash_lookup: dict[str, list[str]],
) -> Stash | None:
"""
Raises:
BaseError
Stashes problematic resource-references from a list of resources.
The resources are modified in-place.
Args:
resources: all resources of the XML file
stash_lookup: A dictionary which maps the resources that have stashes to the UUIDs of the stashed links
Returns:
stash: an object that contains the stashed references
"""
stashed_standoff_values: list[StandoffStashItem] = []
stashed_link_values: list[LinkValueStashItem] = []
ok_resources: list[XMLResource] = []

for res in nok_resources.copy():
for res in resources:
if not res.id in stash_lookup:
continue
for link_prop in res.get_props_with_links():
assert link_prop.valtype in ["text", "resptr"]
if link_prop.valtype == "text":
for value in link_prop.values:
if value.resrefs and not all(_id in ok_res_ids for _id in value.resrefs):
# replace the problematic XML with a UUID
# and remove the problematic resrefs from the XMLValue's resrefs list
standoff_xml = cast(KnoraStandoffXml, value.value)
uuid = str(uuid4())
standoff_stash_item = StandoffStashItem(
res_id=res.id,
res_type=res.restype,
uuid=uuid,
prop_name=link_prop.name,
value=standoff_xml,
)
stashed_standoff_values.append(standoff_stash_item)
value.value = KnoraStandoffXml(uuid)
value.resrefs = [_id for _id in value.resrefs if _id in ok_res_ids]
standoff_stash_item = _stash_standoff(res.id, res.restype, link_prop, stash_lookup)
stashed_standoff_values.extend(standoff_stash_item)
elif link_prop.valtype == "resptr":
for value in link_prop.values.copy():
if value.value not in ok_res_ids:
# value.value is the id of the target resource. stash it, then delete it
link_stash_item = LinkValueStashItem(
res_id=res.id,
res_type=res.restype,
prop_name=link_prop.name,
target_id=str(value.value),
)
stashed_link_values.append(link_stash_item)
link_prop.values.remove(value)
else:
logger.error("ERROR in remove_circular_references(): link_prop.valtype is neither text nor resptr.")
raise BaseError("ERROR in remove_circular_references(): link_prop.valtype is neither text nor resptr.")
link_stash_item = _stash_resptr(res.id, res.restype, link_prop, stash_lookup)
stashed_link_values.extend(link_stash_item)

if len(link_prop.values) == 0:
# if all values of a link property have been stashed, the property needs to be removed
res.properties.remove(link_prop)

ok_resources.append(res)
ok_res_ids.add(res.id)
nok_resources.remove(res)

standoff_stash = StandoffStash.make(stashed_standoff_values)
link_value_stash = LinkValueStash.make(stashed_link_values)
stash = Stash.make(standoff_stash, link_value_stash)

return nok_resources, ok_res_ids, ok_resources, stash
return stash


def remove_circular_references(
resources: list[XMLResource],
def identify_circular_references(
root: etree._Element,
verbose: bool,
) -> tuple[list[XMLResource], Stash | None]:
) -> tuple[dict[str, list[str]], list[str]]:
"""
Temporarily removes problematic resource-references from a list of resources.
Identifies problematic resource-references inside an XML tree.
A reference is problematic if it creates a circle (circular references).
The XML tree is modified in-place:
A reference UUID is added to each XML element that contains a link (<resptr> or <text>).
Args:
resources: list of resources that possibly contain circular references
root: the root element of the parsed XML document
verbose: verbose output if True
Raises:
BaseError
Returns:
list: list of cleaned resources
stash: an object that contains the problematic references
stash_lookup: A dictionary which maps the resources that have stashes to the UUIDs of the stashed links
upload_order: A list of resource IDs in the order in which they should be uploaded
"""

logger.info("Checking resources for circular references...")
if verbose:
print("Checking resources for unresolvable references...")
logger.info("Checking resources for unresolvable references...")

stash: Stash | None = None
# sort the resources according to outgoing resptrs
ok_resources: list[XMLResource] = []
# resources with circular references
nok_resources: list[XMLResource] = []
# internal ids for the resources that do not have circular references
ok_res_ids: set[str] = set()
cnt = 0
nok_len = 9999999
while len(resources) > 0 and cnt < 10000:
for resource in resources:
resptrs = resource.get_internal_resptrs()
# if there are no resptrs references
# or all of them are in the ok resources,
# append the resource to the ok resources
if len(resptrs) == 0 or resptrs.issubset(ok_res_ids):
ok_resources.append(resource)
ok_res_ids.add(resource.id)
else:
nok_resources.append(resource)
resources = nok_resources
if len(nok_resources) == nok_len:
# there are circular references. go through all problematic resources, and stash the problematic references.
nok_resources, ok_res_ids, ok_res, stash = _stash_circular_references(nok_resources, ok_res_ids)
ok_resources.extend(ok_res)
nok_len = len(nok_resources)
nok_resources = []
cnt += 1
if verbose:
print(f"{cnt}. ordering pass finished.")
logger.debug(f"{cnt}. ordering pass finished.")

return ok_resources, stash
print("Checking resources for circular references...")
resptr_links, xml_links, all_resource_ids = create_info_from_xml_for_graph(root)
graph, node_to_id, edges = make_graph(resptr_links, xml_links, all_resource_ids)
stash_lookup, upload_order, _ = generate_upload_order(graph, node_to_id, edges)
return stash_lookup, upload_order
8 changes: 5 additions & 3 deletions src/dsp_tools/utils/xmlupload/xmlupload.py
Expand Up @@ -27,7 +27,7 @@
)
from dsp_tools.utils.xmlupload.resource_multimedia import handle_bitstream
from dsp_tools.utils.xmlupload.stash.stash_models import Stash
from dsp_tools.utils.xmlupload.stash_circular_references import remove_circular_references
from dsp_tools.utils.xmlupload.stash_circular_references import identify_circular_references, stash_circular_references
from dsp_tools.utils.xmlupload.upload_config import UploadConfig
from dsp_tools.utils.xmlupload.upload_stashed_resptr_props import upload_stashed_resptr_props
from dsp_tools.utils.xmlupload.upload_stashed_xml_texts import upload_stashed_xml_texts
Expand Down Expand Up @@ -118,15 +118,17 @@ def _prepare_upload(
shortcode: str,
verbose: bool,
) -> tuple[list[XMLResource], dict[str, Permissions], dict[str, type], Stash | None]:
stash_lookup, upload_order = identify_circular_references(root, verbose=verbose)
resources, permissions_lookup, resclass_name_2_type = _get_data_from_xml(
con=con,
root=root,
default_ontology=default_ontology,
shortcode=shortcode,
verbose=verbose,
)
# temporarily remove circular references
resources, stash = remove_circular_references(resources, verbose=verbose)
sorting_lookup = {res.id: res for res in resources}
resources = [sorting_lookup[res_id] for res_id in upload_order]
stash = stash_circular_references(resources, stash_lookup)
return resources, permissions_lookup, resclass_name_2_type, stash


Expand Down
5 changes: 3 additions & 2 deletions test/benchmarking/test_stash_circular_references.py
Expand Up @@ -4,14 +4,15 @@
from termcolor import cprint

from dsp_tools.utils.xml_utils import parse_and_clean_xml_file
from dsp_tools.utils.xmlupload.stash_circular_references import remove_circular_references
from dsp_tools.utils.xmlupload.stash_circular_references import identify_circular_references, stash_circular_references
from dsp_tools.utils.xmlupload.xmlupload import _extract_resources_from_xml


def test_get_length_ok_resources() -> None:
test_root = parse_and_clean_xml_file("testdata/xml-data/circular-references/test_circular_references_1.xml")
stash_lookup, _ = identify_circular_references(test_root, False)
resources = _extract_resources_from_xml(test_root, "simcir")
_, stash = remove_circular_references(resources, False)
stash = stash_circular_references(resources, stash_lookup)
len_standoff = len(stash.standoff_stash.res_2_stash_items) # type: ignore[union-attr]
len_resptr = len(stash.link_value_stash.res_2_stash_items) # type: ignore[union-attr]
stashed_links = len_standoff + len_resptr
Expand Down
Expand Up @@ -156,8 +156,8 @@ def test_extract_ids_from_text_prop_with_iris_and_ids() -> None:
assert len(res) == 1
assert res[0].target_ids == {"res_B_18"}
children = list(test_ele.iterchildren())
assert not children[0].attrib.get("stashUUID")
assert children[1].attrib.get("stashUUID")
assert not children[0].attrib.get("linkUUID")
assert children[1].attrib.get("linkUUID")


def test_create_class_instance_resptr_link_one_link() -> None:
Expand Down Expand Up @@ -199,9 +199,9 @@ def test_create_class_instance_resptr_link_with_iris() -> None:
assert res[0].target_id == "res_A_13"
assert res[1].target_id == "res_B_13"
children = list(test_ele.iterchildren())
assert children[0].attrib.get("stashUUID")
assert children[1].attrib.get("stashUUID")
assert not children[2].attrib.get("stashUUID")
assert children[0].attrib.get("linkUUID")
assert children[1].attrib.get("linkUUID")
assert not children[2].attrib.get("linkUUID")


def test_create_info_from_xml_for_graph_check_UUID_in_root() -> None:
Expand Down Expand Up @@ -229,9 +229,9 @@ def test_create_info_from_xml_for_graph_check_UUID_in_root() -> None:
assert isinstance(res_xml, XMLLink)
assert unordered(res_all_ids) == ["res_A_11", "res_B_11", "res_C_11"]
xml_res_resptr = root.find(".//resptr")
assert xml_res_resptr.attrib["stashUUID"] == res_resptr.link_uuid # type: ignore[union-attr]
assert xml_res_resptr.attrib["linkUUID"] == res_resptr.link_uuid # type: ignore[union-attr]
xml_res_text = root.find(".//text")
assert xml_res_text.attrib["stashUUID"] == res_xml.link_uuid # type: ignore[union-attr]
assert xml_res_text.attrib["linkUUID"] == res_xml.link_uuid # type: ignore[union-attr]


def test_make_graph() -> None:
Expand Down
4 changes: 2 additions & 2 deletions test/unittests/test_xmlupload/test_xmlvalue.py
Expand Up @@ -11,7 +11,7 @@ class TestXmlValue(unittest.TestCase):

def test_cleanup_unformatted_text(self) -> None:
"""Test the removal of whitespaces and line breaks in utf8-encoded text values"""
unformatted_text_orig = """<text permissions="prop-default" encoding="utf8">
unformatted_text_orig = """<text permissions="prop-default" encoding="utf8" linkUUID="foo">
Poem
with 1 line break:
Expand Down Expand Up @@ -41,7 +41,7 @@ def test_cleanup_unformatted_text(self) -> None:

def test_cleanup_formatted_text(self) -> None:
"""Test the removal of whitespaces and line breaks in xml-formatted text values"""
formatted_text_orig = """<text permissions="prop-default" encoding="xml">
formatted_text_orig = """<text permissions="prop-default" encoding="xml" linkUUID="foo">
This is <em>italicized and <strong>bold</strong></em> text!
It contains <code>monospace text that preserves whitespaces and &amp; HTML-escapes</code>.
Expand Down

0 comments on commit 228b1f7

Please sign in to comment.