Skip to content

Commit

Permalink
fix(xmlupload): fix performance slowdown during stash applying (DEV-2717
Browse files Browse the repository at this point in the history
) (#533)
  • Loading branch information
Nora-Olivia-Ammann committed Oct 2, 2023
1 parent bf93bc8 commit add40b2
Show file tree
Hide file tree
Showing 3 changed files with 155 additions and 30 deletions.
11 changes: 9 additions & 2 deletions src/dsp_tools/models/value.py
Expand Up @@ -26,11 +26,18 @@ def __str__(self) -> str:
def get_all_iris(self) -> Optional[list[str]]:
return self.__iriregexp.findall(self.__xmlstr)

def find_ids_referenced_in_salsah_links(self) -> set[str]:
return set(regex.findall(pattern='href="IRI:(.*?):IRI"', string=self.__xmlstr))

def replace(self, fromStr: str, toStr: str) -> None:
self.__xmlstr = self.__xmlstr.replace(fromStr, toStr)

def regex_replace(self, pattern: str, repl: str) -> None:
self.__xmlstr = regex.sub(pattern=repr(pattern)[1:-1], repl=repl, string=self.__xmlstr)
def replace_one_id_with_iri_in_salsah_link(self, internal_id: str, iri: str) -> None:
self.__xmlstr = regex.sub(
pattern=f'href="IRI:{internal_id}:IRI"',
repl=f'href="{iri}"',
string=self.__xmlstr,
)


class Value:
Expand Down
88 changes: 60 additions & 28 deletions src/dsp_tools/utils/xml_upload_stash.py
Expand Up @@ -52,9 +52,6 @@ def _log_unable_to_upload_xml_resource(
received_error: Error received
stashed_resource: resource that is stashed
all_link_props: all the link properties from that resource
Returns:
"""
# print the message to keep track of the cause for the failure
# apart from that; no action is necessary:
Expand All @@ -65,12 +62,34 @@ def _log_unable_to_upload_xml_resource(
logger.warning(err_msg, exc_info=True)


def _log_iri_does_not_exist_error(
received_error: KeyError,
stashed_resource: XMLResource,
all_link_props: XMLProperty,
) -> None:
"""
This function logs if it is not possible to upload an XML resource
if a linked resource does not have an IRI in the triplestore.
Args:
received_error: Error received
stashed_resource: resource that is stashed
all_link_props: all the link properties from that resource
"""
err_msg = (
f"Unable to upload the xml text of '{all_link_props.name}' of resource '{stashed_resource.id}'"
f"because the resource with the internal id '{received_error.args[0]}' does not exist in the triplestore."
)
print(f" WARNING: {err_msg}")
logger.warning(err_msg, exc_info=True)


def _get_text_hash_value(old_xmltext: str) -> str:
"""
This function extracts the hash values in the text
Args:
old_xmltext: Text with has values.
old_xmltext: Text with hash values.
Returns:
hash values
Expand All @@ -79,31 +98,33 @@ def _get_text_hash_value(old_xmltext: str) -> str:


def _replace_internal_ids_with_iris(
pure_text: str,
id2iri_mapping: dict[str, str],
hash_to_value: dict[str, KnoraStandoffXml],
xml_with_id: KnoraStandoffXml,
id_set: set[str],
) -> KnoraStandoffXml:
"""
This function replaces the internal ids with the new IRIs from the triplestore.
This function takes an XML string and a set with internal ids that are referenced in salsah-links in that string.
It replaces all internal ids of that set with the corresponding iri according to the mapping dictionary.
Args:
pure_text: the text with the ids
id2iri_mapping: the dictionary that contains the mapping information
hash_to_value: the dictionary that contains the hash of the string and the xml value
id2iri_mapping: dictionary with id to iri mapping
xml_with_id: KnoraStandoffXml with the string that should have replacements
id_set: set of ids that are in the string
Returns:
the xml value with the old ids replaced
"""
new_xmltext = hash_to_value[pure_text]
# replace the outdated internal ids by their IRI
for _id, _iri in id2iri_mapping.items():
new_xmltext.regex_replace(f'href="IRI:{_id}:IRI"', f'href="{_iri}"')
return new_xmltext
for internal_id in id_set:
xml_with_id.replace_one_id_with_iri_in_salsah_link(
internal_id=internal_id,
iri=id2iri_mapping[internal_id],
)
return xml_with_id


def _create_XMLResource_json_object_to_update(
res_iri: str,
resource_in_triplestore: Any,
resource_in_triplestore: dict[str, Any],
stashed_resource: XMLResource,
link_prop_in_triplestore: dict[str, Any],
new_xmltext: KnoraStandoffXml,
Expand All @@ -117,7 +138,7 @@ def _create_XMLResource_json_object_to_update(
resource_in_triplestore: the resource existing in the triplestore
stashed_resource: the same resource from the stash
link_prop_in_triplestore: the link property in the triplestore
new_xmltext: the new xml text with the IRIs
new_xmltext: The KnoraStandOffXml with replaced ids
link_prop_name: the name of the link property
Returns:
Expand All @@ -137,11 +158,11 @@ def _create_XMLResource_json_object_to_update(
return json.dumps(jsonobj, indent=4, separators=(",", ": "), cls=KnoraStandoffXmlEncoder)


def upload_single_link_xml_property(
link_prop_in_triplestore: Any,
def _upload_single_link_xml_property(
link_prop_in_triplestore: dict[str, Any],
res_iri: str,
stashed_resource: XMLResource,
resource_in_triplestore: Any,
resource_in_triplestore: dict[str, Any],
link_prop: XMLProperty,
hash_to_value: dict[str, KnoraStandoffXml],
id2iri_mapping: dict[str, str],
Expand All @@ -158,15 +179,15 @@ def upload_single_link_xml_property(
stashed_resource: the stashed resource
resource_in_triplestore: the resource retrieved from the triplestore
link_prop: the name of the link property
hash_to_value: the has value of the xml text
hash_to_value: the hash value of the xml text
id2iri_mapping: the dictionary with the internal ids and the new IRIs
nonapplied_xml_texts: the dictionary with the stashes
verbose: what is printed out
con: the connection to the triplestore
Returns:
The stash dictionary with the newly uploaded resource removed.
If the upload was not sucessfull it returns the dictionary as it was before.
If the upload was not sucessfull, it returns the dictionary as it was before.
"""
xmltext_in_triplestore = link_prop_in_triplestore.get("knora-api:textValueAsXml")
if not xmltext_in_triplestore:
Expand All @@ -179,13 +200,24 @@ def upload_single_link_xml_property(

# if the pure text is a hash, the replacement must be made
# this hash originates from _stash_circular_references(), and identifies the XML texts
if text_hash_value not in hash_to_value:
try:
xml_from_stash = hash_to_value[text_hash_value]
except KeyError as err:
_log_iri_does_not_exist_error(
received_error=err,
stashed_resource=stashed_resource,
all_link_props=link_prop,
)
# no action necessary: this property will remain in nonapplied_xml_texts,
# which will be handled by the caller
return nonapplied_xml_texts

new_xmltext = _replace_internal_ids_with_iris(
pure_text=text_hash_value, id2iri_mapping=id2iri_mapping, hash_to_value=hash_to_value
id_set = xml_from_stash.find_ids_referenced_in_salsah_links()

xml_from_stash = _replace_internal_ids_with_iris(
id2iri_mapping=id2iri_mapping,
xml_with_id=xml_from_stash,
id_set=id_set,
)

# prepare API call
Expand All @@ -194,7 +226,7 @@ def upload_single_link_xml_property(
resource_in_triplestore=resource_in_triplestore,
stashed_resource=stashed_resource,
link_prop_in_triplestore=link_prop_in_triplestore,
new_xmltext=new_xmltext,
new_xmltext=xml_from_stash,
link_prop_name=link_prop.name,
)

Expand Down Expand Up @@ -233,7 +265,7 @@ def upload_all_link_props_of_single_resource(
stashed_resource: the resource from the stash
resource_in_triplestore: the resource from the triplestore
link_prop: the link property
hash_to_value: the dictionary which stored the hashes and their corresponding text
hash_to_value: the dictionary which stored the hashes and the KnoraStandoffXml with the corresponding texts
id2iri_mapping: the dictionary that has the internal ids and IRIs to map
nonapplied_xml_texts: the dictionary which contains the unprocessed resources
verbose: how much information should be printed
Expand All @@ -248,7 +280,7 @@ def upload_all_link_props_of_single_resource(
all_link_props_in_triplestore = [all_link_props_in_triplestore]

for link_prop_in_triplestore in all_link_props_in_triplestore:
nonapplied_xml_texts = upload_single_link_xml_property(
nonapplied_xml_texts = _upload_single_link_xml_property(
link_prop_in_triplestore=link_prop_in_triplestore,
res_iri=res_iri,
stashed_resource=stashed_resource,
Expand Down
86 changes: 86 additions & 0 deletions test/unittests/test_xml_upload_stash.py
@@ -0,0 +1,86 @@
# pylint: disable=missing-class-docstring,missing-function-docstring,protected-access

from unittest import TestCase

import pytest

from dsp_tools.models.value import KnoraStandoffXml
from dsp_tools.utils import xml_upload_stash


class TestXMLUploadStash(TestCase):
def test_find_ids_referenced_in_salsah_links_one_link(self) -> None:
one_link_KnoraStandoffXml = KnoraStandoffXml(
xmlstr=(
'<resource label="r1_label" restype="r1_restype" id="r1_id" permissions="res-default">'
'<text-prop name=":hasRichtext"><text permissions="res-default" encoding="xml">'
'<a class="salsah-link" href="IRI:r2_id:IRI">r2_id</a>'
"</text></text-prop></resource>"
)
)
returned_set = one_link_KnoraStandoffXml.find_ids_referenced_in_salsah_links()
self.assertEqual({"r2_id"}, returned_set)

def test_find_ids_referenced_in_salsah_links_three_links(self) -> None:
three_link_KnoraStandoffXml = KnoraStandoffXml(
xmlstr=(
'<resource label="r1_label" restype="r1_restype" id="r1_id" permissions="res-default">'
'<text-prop name=":hasRichtext"><text permissions="res-default" encoding="xml">'
'<a class="salsah-link" href="IRI:r2_id:IRI">r2_id</a>This is normal text'
'<a class="salsah-link" href="IRI:r3_id:IRI">r3_id</a>'
'<a class="salsah-link" href="IRI:r2_id:IRI">r2_id</a>'
"</text></text-prop></resource>"
)
)
returned_set = three_link_KnoraStandoffXml.find_ids_referenced_in_salsah_links()
self.assertEqual({"r2_id", "r3_id"}, returned_set)

def test__replace_internal_ids_with_iris_one_link(self) -> None:
test_id2iri = {"r1_id": "r1_iri", "r2_id": "r2_iri", "r3_id": "r3_iri"}
one_link_KnoraStandoffXml = KnoraStandoffXml(
xmlstr=(
'<resource label="r1_label" restype="r1_restype" id="r1_id" permissions="res-default">'
'<text-prop name=":hasRichtext"><text permissions="res-default" encoding="xml">'
'<a class="salsah-link" href="IRI:r2_id:IRI">r2_id</a>'
"</text></text-prop></resource>"
)
)
returned_instance = xml_upload_stash._replace_internal_ids_with_iris(
id2iri_mapping=test_id2iri, xml_with_id=one_link_KnoraStandoffXml, id_set={"r2_id"}
)
expected_str = (
'<resource label="r1_label" restype="r1_restype" id="r1_id" permissions="res-default">'
'<text-prop name=":hasRichtext"><text permissions="res-default" encoding="xml">'
'<a class="salsah-link" href="r2_iri">r2_id</a>'
"</text></text-prop></resource>"
)
self.assertEqual(expected_str, str(returned_instance))

def test__replace_internal_ids_with_iris_three_links(self) -> None:
test_id2iri = {"r1_id": "r1_iri", "r2_id": "r2_iri", "r3_id": "r3_iri"}
three_link_KnoraStandoffXml = KnoraStandoffXml(
xmlstr=(
'<resource label="r1_label" restype="r1_restype" id="r1_id" permissions="res-default">'
'<text-prop name=":hasRichtext"><text permissions="res-default" encoding="xml">'
'<a class="salsah-link" href="IRI:r2_id:IRI">r2_id</a>This is normal text'
'<a class="salsah-link" href="IRI:r3_id:IRI">r3_id</a>'
'<a class="salsah-link" href="IRI:r2_id:IRI">r2_id</a>'
"</text></text-prop></resource>"
)
)
returned_instance = xml_upload_stash._replace_internal_ids_with_iris(
id2iri_mapping=test_id2iri, xml_with_id=three_link_KnoraStandoffXml, id_set={"r2_id", "r3_id"}
)
expected_str = (
'<resource label="r1_label" restype="r1_restype" id="r1_id" permissions="res-default">'
'<text-prop name=":hasRichtext"><text permissions="res-default" encoding="xml">'
'<a class="salsah-link" href="r2_iri">r2_id</a>This is normal text'
'<a class="salsah-link" href="r3_iri">r3_id</a>'
'<a class="salsah-link" href="r2_iri">r2_id</a>'
"</text></text-prop></resource>"
)
self.assertEqual(expected_str, str(returned_instance))


if __name__ == "__main__":
pytest.main([__file__])

0 comments on commit add40b2

Please sign in to comment.