From 08624b2bdf0acd4f1cd2f126e2f5511f4b633d15 Mon Sep 17 00:00:00 2001 From: Balduin Landolt <33053745+BalduinLandolt@users.noreply.github.com> Date: Mon, 9 Oct 2023 15:17:39 +0200 Subject: [PATCH] refactor: identify temporary text values with UUID instead of text hash (DEV-2790) (#558) --- src/dsp_tools/models/value.py | 16 +- .../utils/xmlupload/stash/__init__.py | 0 .../utils/xmlupload/stash/stash_models.py | 51 +++ .../xmlupload/stash_circular_references.py | 38 +-- .../xmlupload/upload_stashed_xml_texts.py | 317 ++++++------------ src/dsp_tools/utils/xmlupload/xmlupload.py | 21 +- .../test_upload_stashed_xml_texts.py | 9 +- .../test_xmlupload/test_xmlupload.py | 87 ----- 8 files changed, 192 insertions(+), 347 deletions(-) create mode 100644 src/dsp_tools/utils/xmlupload/stash/__init__.py create mode 100644 src/dsp_tools/utils/xmlupload/stash/stash_models.py diff --git a/src/dsp_tools/models/value.py b/src/dsp_tools/models/value.py index a494e5610..96fe1acfa 100644 --- a/src/dsp_tools/models/value.py +++ b/src/dsp_tools/models/value.py @@ -1,4 +1,5 @@ # pylint: disable=missing-class-docstring,missing-function-docstring +from __future__ import annotations from typing import Any, Optional, Union @@ -32,12 +33,15 @@ def find_ids_referenced_in_salsah_links(self) -> set[str]: def replace(self, fromStr: str, toStr: str) -> None: self.__xmlstr = self.__xmlstr.replace(fromStr, toStr) - def replace_one_id_with_iri_in_salsah_link(self, internal_id: str, iri: str) -> None: - self.__xmlstr = regex.sub( - pattern=f'href="IRI:{internal_id}:IRI"', - repl=f'href="{iri}"', - string=self.__xmlstr, - ) + def with_iris(self, id_2_iri: dict[str, str]) -> KnoraStandoffXml: + """ + Returns a copy of this object, where all internal ids are replaced with iris according to the provided mapping. + """ + s = self.__xmlstr + for internal_id in self.find_ids_referenced_in_salsah_links(): + iri = id_2_iri[internal_id] + s = s.replace(f'href="IRI:{internal_id}:IRI"', f'href="{iri}"') + return KnoraStandoffXml(s) class Value: diff --git a/src/dsp_tools/utils/xmlupload/stash/__init__.py b/src/dsp_tools/utils/xmlupload/stash/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/dsp_tools/utils/xmlupload/stash/stash_models.py b/src/dsp_tools/utils/xmlupload/stash/stash_models.py new file mode 100644 index 000000000..c0d070163 --- /dev/null +++ b/src/dsp_tools/utils/xmlupload/stash/stash_models.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from dsp_tools.models.value import KnoraStandoffXml +from dsp_tools.models.xmlresource import XMLResource + + +@dataclass(frozen=True) +class StandoffStashItem: + """ + Holds information about a single stashed XML text value. + """ + + uuid: str + prop_name: str + value: KnoraStandoffXml + # Permissions missing still + + +@dataclass(frozen=True) +class StandoffStash: + """ + Holds information about a number of stashed XML text values, organized by resource instance. + """ + + res_2_stash_items: dict[str, list[StandoffStashItem]] + res_2_xmlres: dict[str, XMLResource] + + @staticmethod + def make(tups: list[tuple[XMLResource, StandoffStashItem]]) -> StandoffStash | None: + """ + Factory method for StandoffStash. + + Args: + tups: A list of tuples of XMLResource and StandoffStashItem. + + Returns: + StandoffStash | None: A StandoffStash object or None, if an empty list was passed. + """ + if not tups: + return None + res_2_stash_items = {} + res_2_xmlres = {} + for xmlres, stash_item in tups: + if xmlres.id not in res_2_stash_items: + res_2_stash_items[xmlres.id] = [stash_item] + res_2_xmlres[xmlres.id] = xmlres + else: + res_2_stash_items[xmlres.id].append(stash_item) + return StandoffStash(res_2_stash_items, res_2_xmlres) diff --git a/src/dsp_tools/utils/xmlupload/stash_circular_references.py b/src/dsp_tools/utils/xmlupload/stash_circular_references.py index ff5aff6fd..5bfd6b1ac 100644 --- a/src/dsp_tools/utils/xmlupload/stash_circular_references.py +++ b/src/dsp_tools/utils/xmlupload/stash_circular_references.py @@ -1,13 +1,14 @@ from __future__ import annotations -from datetime import datetime from typing import cast +from uuid import uuid4 from dsp_tools.models.exceptions import BaseError from dsp_tools.models.value import KnoraStandoffXml from dsp_tools.models.xmlproperty import XMLProperty from dsp_tools.models.xmlresource import XMLResource from dsp_tools.utils.create_logger import get_logger +from dsp_tools.utils.xmlupload.stash.stash_models import StandoffStash, StandoffStashItem logger = get_logger(__name__) @@ -16,34 +17,31 @@ def _stash_circular_references( nok_resources: list[XMLResource], ok_res_ids: set[str], ok_resources: list[XMLResource], - stashed_xml_texts: dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]], stashed_resptr_props: dict[XMLResource, dict[XMLProperty, list[str]]], ) -> tuple[ list[XMLResource], set[str], list[XMLResource], - dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]], + StandoffStash | None, dict[XMLResource, dict[XMLProperty, list[str]]], ]: """ Raises: BaseError """ + stashed_standoff_values: list[tuple[XMLResource, StandoffStashItem]] = [] for res in nok_resources.copy(): for link_prop in res.get_props_with_links(): if link_prop.valtype == "text": for value in link_prop.values: if value.resrefs and not all(_id in ok_res_ids for _id in value.resrefs): - # stash this XML text, replace it by its hash, and remove the - # problematic resrefs from the XMLValue's resrefs list - value_hash = str(hash(f"{value.value}{datetime.now()}")) - if res not in stashed_xml_texts: - stashed_xml_texts[res] = {link_prop: {value_hash: cast(KnoraStandoffXml, value.value)}} - elif link_prop not in stashed_xml_texts[res]: - stashed_xml_texts[res][link_prop] = {value_hash: cast(KnoraStandoffXml, value.value)} - else: - stashed_xml_texts[res][link_prop][value_hash] = cast(KnoraStandoffXml, value.value) - value.value = KnoraStandoffXml(value_hash) + # replace the problematic XML with a UUID + # and remove the problematic resrefs from the XMLValue's resrefs list + standoff_xml = cast(KnoraStandoffXml, value.value) + uuid = str(uuid4()) + stash_item = StandoffStashItem(uuid=uuid, prop_name=link_prop.name, value=standoff_xml) + stashed_standoff_values.append((res, stash_item)) + value.value = KnoraStandoffXml(uuid) value.resrefs = [_id for _id in value.resrefs if _id in ok_res_ids] elif link_prop.valtype == "resptr": for value in link_prop.values.copy(): @@ -70,17 +68,15 @@ def _stash_circular_references( ok_res_ids.add(res.id) nok_resources.remove(res) - return nok_resources, ok_res_ids, ok_resources, stashed_xml_texts, stashed_resptr_props + standoff_stash = StandoffStash.make(stashed_standoff_values) + + return nok_resources, ok_res_ids, ok_resources, standoff_stash, stashed_resptr_props def remove_circular_references( resources: list[XMLResource], verbose: bool, -) -> tuple[ - list[XMLResource], - dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]], - dict[XMLResource, dict[XMLProperty, list[str]]], -]: +) -> tuple[list[XMLResource], StandoffStash | None, dict[XMLResource, dict[XMLProperty, list[str]]],]: """ Temporarily removes problematic resource-references from a list of resources. A reference is problematic if it creates a circle (circular references). @@ -102,7 +98,7 @@ def remove_circular_references( print("Checking resources for unresolvable references...") logger.info("Checking resources for unresolvable references...") - stashed_xml_texts: dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]] = {} + stashed_xml_texts: StandoffStash | None = None stashed_resptr_props: dict[XMLResource, dict[XMLProperty, list[str]]] = {} # sort the resources according to outgoing resptrs @@ -137,7 +133,6 @@ def remove_circular_references( nok_resources=nok_resources, ok_res_ids=ok_res_ids, ok_resources=ok_resources, - stashed_xml_texts=stashed_xml_texts, stashed_resptr_props=stashed_resptr_props, ) nok_len = len(nok_resources) @@ -146,4 +141,5 @@ def remove_circular_references( if verbose: print(f"{cnt}. ordering pass finished.") logger.info(f"{cnt}. ordering pass finished.") + return ok_resources, stashed_xml_texts, stashed_resptr_props diff --git a/src/dsp_tools/utils/xmlupload/upload_stashed_xml_texts.py b/src/dsp_tools/utils/xmlupload/upload_stashed_xml_texts.py index 96ddd0b5d..933850d24 100644 --- a/src/dsp_tools/utils/xmlupload/upload_stashed_xml_texts.py +++ b/src/dsp_tools/utils/xmlupload/upload_stashed_xml_texts.py @@ -4,8 +4,6 @@ from typing import Any from urllib.parse import quote_plus -import regex - from dsp_tools.models.connection import Connection from dsp_tools.models.exceptions import BaseError from dsp_tools.models.resource import KnoraStandoffXmlEncoder @@ -14,11 +12,12 @@ from dsp_tools.models.xmlresource import XMLResource from dsp_tools.utils.create_logger import get_logger from dsp_tools.utils.shared import try_network_action +from dsp_tools.utils.xmlupload.stash.stash_models import StandoffStash, StandoffStashItem logger = get_logger(__name__) -def log_unable_to_retrieve_resource( +def _log_unable_to_retrieve_resource( resource: XMLResource, received_error: BaseError, ) -> None: @@ -43,238 +42,68 @@ def log_unable_to_retrieve_resource( def _log_unable_to_upload_xml_resource( received_error: BaseError, - stashed_resource: XMLResource, - all_link_props: XMLProperty, + stashed_resource_id: str, + prop_name: str, ) -> None: """ This function logs if it is not possible to upload a xml resource. Args: received_error: Error received - stashed_resource: resource that is stashed - all_link_props: all the link properties from that resource + stashed_resource_id: id of the resource + prop_name: name of the property """ # print the message to keep track of the cause for the failure # apart from that; no action is necessary: # this resource will remain in nonapplied_xml_texts, which will be handled by the caller orig_err_msg = received_error.orig_err_msg_from_api or received_error.message - err_msg = f"Unable to upload the xml text of '{all_link_props.name}' of resource '{stashed_resource.id}'." + err_msg = f"Unable to upload the xml text of '{prop_name}' of resource '{stashed_resource_id}'." print(f" WARNING: {err_msg} Original error message: {orig_err_msg}") logger.warning(err_msg, exc_info=True) -def _get_text_hash_value(old_xmltext: str) -> str: - """ - This function extracts the hash values in the text - - Args: - old_xmltext: Text with hash values. - - Returns: - hash values - """ - return regex.sub(r"(<\?xml.+>\s*)?\s*(.+)\s*<\/text>", r"\2", old_xmltext) - - -def _replace_internal_ids_with_iris( - id2iri_mapping: dict[str, str], - xml_with_id: KnoraStandoffXml, - id_set: set[str], -) -> KnoraStandoffXml: - """ - This function takes an XML string and a set with internal ids that are referenced in salsah-links in that string. - It replaces all internal ids of that set with the corresponding iri according to the mapping dictionary. - - Args: - id2iri_mapping: dictionary with id to iri mapping - xml_with_id: KnoraStandoffXml with the string that should have replacements - id_set: set of ids that are in the string - - Returns: - the xml value with the old ids replaced - """ - for internal_id in id_set: - xml_with_id.replace_one_id_with_iri_in_salsah_link( - internal_id=internal_id, - iri=id2iri_mapping[internal_id], - ) - return xml_with_id - - def _create_XMLResource_json_object_to_update( res_iri: str, - resource_in_triplestore: dict[str, Any], - stashed_resource: XMLResource, - link_prop_in_triplestore: dict[str, Any], - new_xmltext: KnoraStandoffXml, + res_type: str, link_prop_name: str, + value_iri: str, + new_xmltext: KnoraStandoffXml, + context: dict[str, str], ) -> str: """ This function creates a JSON object that can be sent as update request to DSP-API. Args: res_iri: the iri of the resource - resource_in_triplestore: the resource existing in the triplestore - stashed_resource: the same resource from the stash - link_prop_in_triplestore: the link property in the triplestore - new_xmltext: The KnoraStandOffXml with replaced ids + res_type: the type of the resource link_prop_name: the name of the link property + value_iri: the iri of the value + new_xmltext: the new xml text to be uploaded + context: the JSON-LD context of the resource Returns: json string """ jsonobj = { "@id": res_iri, - "@type": stashed_resource.restype, + "@type": res_type, link_prop_name: { - "@id": link_prop_in_triplestore["@id"], + "@id": value_iri, "@type": "knora-api:TextValue", "knora-api:textValueAsXml": new_xmltext, "knora-api:textValueHasMapping": {"@id": "http://rdfh.ch/standoff/mappings/StandardMapping"}, }, - "@context": resource_in_triplestore["@context"], + "@context": context, } return json.dumps(jsonobj, indent=4, separators=(",", ": "), cls=KnoraStandoffXmlEncoder) -def _upload_single_link_xml_property( - link_prop_in_triplestore: dict[str, Any], - res_iri: str, - stashed_resource: XMLResource, - resource_in_triplestore: dict[str, Any], - link_prop: XMLProperty, - hash_to_value: dict[str, KnoraStandoffXml], - id2iri_mapping: dict[str, str], - nonapplied_xml_texts: dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]], - verbose: bool, - con: Connection, -) -> dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]]: - """ - This function uploads a single xml link property, which was previously stashed. - - Args: - link_prop_in_triplestore: the link property from the triplestore - res_iri: the iri of the resource - stashed_resource: the stashed resource - resource_in_triplestore: the resource retrieved from the triplestore - link_prop: the name of the link property - hash_to_value: the hash value of the xml text - id2iri_mapping: the dictionary with the internal ids and the new IRIs - nonapplied_xml_texts: the dictionary with the stashes - verbose: what is printed out - con: the connection to the triplestore - - Returns: - The stash dictionary with the newly uploaded resource removed. - If the upload was not sucessfull, it returns the dictionary as it was before. - """ - xmltext_in_triplestore = link_prop_in_triplestore.get("knora-api:textValueAsXml") - if not xmltext_in_triplestore: - # no action necessary: this property will remain in nonapplied_xml_texts, - # which will be handled by the caller - return nonapplied_xml_texts - - # strip all xml tags from the old xmltext, so that the pure text itself remains - text_hash_value = _get_text_hash_value(xmltext_in_triplestore) - - # if the pure text is a hash, the replacement must be made - # this hash originates from _stash_circular_references(), and identifies the XML texts - try: - xml_from_stash = hash_to_value[text_hash_value] - except KeyError: - # no action necessary: this property will remain in nonapplied_xml_texts, - # which will be handled by the caller - return nonapplied_xml_texts - - id_set = xml_from_stash.find_ids_referenced_in_salsah_links() - - xml_from_stash = _replace_internal_ids_with_iris( - id2iri_mapping=id2iri_mapping, - xml_with_id=xml_from_stash, - id_set=id_set, - ) - - # prepare API call - jsondata = _create_XMLResource_json_object_to_update( - res_iri=res_iri, - resource_in_triplestore=resource_in_triplestore, - stashed_resource=stashed_resource, - link_prop_in_triplestore=link_prop_in_triplestore, - new_xmltext=xml_from_stash, - link_prop_name=link_prop.name, - ) - - # execute API call - try: - try_network_action(con.put, route="/v2/values", jsondata=jsondata) - except BaseError as err: - _log_unable_to_upload_xml_resource( - received_error=err, stashed_resource=stashed_resource, all_link_props=link_prop - ) - return nonapplied_xml_texts - if verbose: - print(f' Successfully uploaded xml text of "{link_prop.name}"\n') - logger.info(f' Successfully uploaded xml text of "{link_prop.name}"\n') - nonapplied_xml_texts[stashed_resource][link_prop].pop(text_hash_value) - return nonapplied_xml_texts - - -def _upload_all_xml_texts_of_single_resource( - res_iri: str, - stashed_resource: XMLResource, - resource_in_triplestore: dict[str, Any], - link_prop: XMLProperty, - hash_to_value: dict[str, KnoraStandoffXml], - id2iri_mapping: dict[str, str], - nonapplied_xml_texts: dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]], - verbose: bool, - con: Connection, -) -> dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]]: - """ - This function takes one resource and extracts all the link properties of that resource. - It sends all the link props to the DSP-API. - - Args: - res_iri: resource IRI - stashed_resource: the resource from the stash - resource_in_triplestore: the resource from the triplestore - link_prop: the link property - hash_to_value: the dictionary which stored the hashes and the KnoraStandoffXml with the corresponding texts - id2iri_mapping: the dictionary that has the internal ids and IRIs to map - nonapplied_xml_texts: the dictionary which contains the unprocessed resources - verbose: how much information should be printed - con: connection to the api - - Returns: - the dictionary which contains the unprocessed resources - """ - all_link_props_in_triplestore = resource_in_triplestore[link_prop.name] - - if not isinstance(all_link_props_in_triplestore, list): - all_link_props_in_triplestore = [all_link_props_in_triplestore] - - for link_prop_in_triplestore in all_link_props_in_triplestore: - nonapplied_xml_texts = _upload_single_link_xml_property( - link_prop_in_triplestore=link_prop_in_triplestore, - res_iri=res_iri, - stashed_resource=stashed_resource, - resource_in_triplestore=resource_in_triplestore, - link_prop=link_prop, - hash_to_value=hash_to_value, - id2iri_mapping=id2iri_mapping, - nonapplied_xml_texts=nonapplied_xml_texts, - verbose=verbose, - con=con, - ) - return nonapplied_xml_texts - - def upload_stashed_xml_texts( verbose: bool, id2iri_mapping: dict[str, str], con: Connection, - stashed_xml_texts: dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]], -) -> dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]]: + stashed_xml_texts: StandoffStash, +) -> StandoffStash | None: """ After all resources are uploaded, the stashed xml texts must be applied to their resources in DSP. @@ -290,40 +119,106 @@ def upload_stashed_xml_texts( print("Upload the stashed XML texts...") logger.info("Upload the stashed XML texts...") - nonapplied_xml_texts = stashed_xml_texts.copy() - for stashed_resource, all_link_props in stashed_xml_texts.items(): - if stashed_resource.id not in id2iri_mapping: + not_uploaded: list[tuple[XMLResource, StandoffStashItem]] = [] + for res_id, stash_items in stashed_xml_texts.res_2_stash_items.items(): + res_iri = id2iri_mapping.get(res_id) + if not res_iri: # resource could not be uploaded to DSP, so the stash cannot be uploaded either # no action necessary: this resource will remain in nonapplied_xml_texts, # which will be handled by the caller continue - res_iri = id2iri_mapping[stashed_resource.id] + xmlres: XMLResource = stashed_xml_texts.res_2_xmlres[res_id] try: resource_in_triplestore = try_network_action(con.get, route=f"/v2/resources/{quote_plus(res_iri)}") except BaseError as err: - log_unable_to_retrieve_resource(resource=stashed_resource, received_error=err) + _log_unable_to_retrieve_resource(resource=xmlres, received_error=err) continue - print(f' Upload XML text(s) of resource "{stashed_resource.id}"...') - logger.info(f' Upload XML text(s) of resource "{stashed_resource.id}"...') - for link_prop, hash_to_value in all_link_props.items(): - nonapplied_xml_texts = _upload_all_xml_texts_of_single_resource( + if verbose: + print(f' Upload XML text(s) of resource "{res_id}"...') + logger.debug(f' Upload XML text(s) of resource "{res_id}"...') + context = resource_in_triplestore["@context"] + for stash_item in stash_items: + value_iri = _get_value_iri(stash_item.prop_name, resource_in_triplestore, stash_item.uuid) + if not value_iri: + not_uploaded.append((xmlres, stash_item)) # does that even make sense to hold on to that one? + continue + success = _upload_stash_item( + stash_item=stash_item, res_iri=res_iri, - stashed_resource=stashed_resource, - resource_in_triplestore=resource_in_triplestore, - link_prop=link_prop, - hash_to_value=hash_to_value, + res_type=xmlres.restype, + res_id=res_id, + value_iri=value_iri, id2iri_mapping=id2iri_mapping, - nonapplied_xml_texts=nonapplied_xml_texts, - verbose=verbose, con=con, + context=context, ) + if not success: + not_uploaded.append((xmlres, stash_item)) + return StandoffStash.make(not_uploaded) + + +def _get_value_iri( + property_name: str, + resource: dict[str, Any], + uuid: str, +) -> str | None: + values_on_server = resource.get(property_name) + if not isinstance(values_on_server, list): + values_on_server = [values_on_server] + + # get the IRI of the value that contains the UUID in its text + text_and_iris = ((v["knora-api:textValueAsXml"], v["@id"]) for v in values_on_server) + value_iri: str | None = next((iri for text, iri in text_and_iris if uuid in text), None) + if not value_iri: + # the value that contains the UUID in its text does not exist in DSP + # no action necessary: this resource will remain in nonapplied_xml_texts, + # which will be handled by the caller + return None + return value_iri - # make a purged version of nonapplied_xml_texts, without empty entries - nonapplied_xml_texts = purge_stashed_xml_texts( - stashed_xml_texts=nonapplied_xml_texts, - id2iri_mapping=id2iri_mapping, + +def _upload_stash_item( + stash_item: StandoffStashItem, + res_iri: str, + res_type: str, + res_id: str, + value_iri: str, + id2iri_mapping: dict[str, str], + con: Connection, + context: dict[str, str], +) -> bool: + """ + Upload a single stashed xml text to DSP. + + Args: + stash_item: the stashed text value to upload + res_iri: the iri of the resource + res_type: the type of the resource + res_id: the internal id of the resource + value_iri: the iri of the value + id2iri_mapping: mapping of ids from the XML file to IRIs in DSP + con: connection to DSP + context: the JSON-LD context of the resource + + Returns: + True, if the upload was successful, False otherwise + """ + adjusted_text_value = stash_item.value.with_iris(id2iri_mapping) + jsondata = _create_XMLResource_json_object_to_update( + res_iri, + res_type, + stash_item.prop_name, + value_iri, + adjusted_text_value, + context, ) - return nonapplied_xml_texts + try: + try_network_action(con.put, route="/v2/values", jsondata=jsondata) + except BaseError as err: + _log_unable_to_upload_xml_resource(err, res_id, stash_item.prop_name) + return False + logger.debug(f' Successfully uploaded xml text of "{stash_item.prop_name}"\n') + return True def purge_stashed_xml_texts( diff --git a/src/dsp_tools/utils/xmlupload/xmlupload.py b/src/dsp_tools/utils/xmlupload/xmlupload.py index 70279e883..1c4bfee37 100644 --- a/src/dsp_tools/utils/xmlupload/xmlupload.py +++ b/src/dsp_tools/utils/xmlupload/xmlupload.py @@ -17,7 +17,6 @@ from dsp_tools.models.projectContext import ProjectContext from dsp_tools.models.resource import KnoraStandoffXmlEncoder, ResourceInstance, ResourceInstanceFactory from dsp_tools.models.sipi import Sipi -from dsp_tools.models.value import KnoraStandoffXml from dsp_tools.models.xmlpermission import XmlPermission from dsp_tools.models.xmlproperty import XMLProperty from dsp_tools.models.xmlresource import XMLResource @@ -32,12 +31,13 @@ calculate_multimedia_file_size, get_sipi_multimedia_information, ) +from dsp_tools.utils.xmlupload.stash.stash_models import StandoffStash from dsp_tools.utils.xmlupload.stash_circular_references import remove_circular_references from dsp_tools.utils.xmlupload.upload_stashed_resptr_props import ( purge_stashed_resptr_props, upload_stashed_resptr_props, ) -from dsp_tools.utils.xmlupload.upload_stashed_xml_texts import purge_stashed_xml_texts, upload_stashed_xml_texts +from dsp_tools.utils.xmlupload.upload_stashed_xml_texts import upload_stashed_xml_texts from dsp_tools.utils.xmlupload.write_diagnostic_info import ( MetricRecord, determine_save_location_of_diagnostic_info, @@ -138,7 +138,7 @@ def xmlupload( id2iri_mapping: dict[str, str] = {} failed_uploads: list[str] = [] nonapplied_resptr_props = {} - nonapplied_xml_texts = {} + nonapplied_xml_texts: StandoffStash | None = None try: id2iri_mapping, failed_uploads, metrics = _upload_resources( resources=resources, @@ -424,7 +424,7 @@ def _handle_upload_error( err: BaseException, id2iri_mapping: dict[str, str], failed_uploads: list[str], - stashed_xml_texts: dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]], + stashed_xml_texts: StandoffStash | None, stashed_resptr_props: dict[XMLResource, dict[XMLProperty, list[str]]], save_location: Path, timestamp_str: str, @@ -455,11 +455,6 @@ def _handle_upload_error( ) logger.error("xmlupload must be aborted because of an error", exc_info=err) - # only stashed properties of resources that already exist in DSP are of interest - stashed_xml_texts = purge_stashed_xml_texts( - stashed_xml_texts=stashed_xml_texts, - id2iri_mapping=id2iri_mapping, - ) stashed_resptr_props = purge_stashed_resptr_props( stashed_resptr_props=stashed_resptr_props, id2iri_mapping=id2iri_mapping, @@ -540,7 +535,7 @@ def save_json_stashed_resptr_properties( def save_json_stashed_text_properties( - stashed_xml_texts: dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]], + stashed_xml_texts: StandoffStash, save_location: Path, timestamp_str: str, ) -> str: @@ -556,14 +551,10 @@ def save_json_stashed_text_properties( Returns: name of the JSON file """ - stashed_xml_texts_serializable = { - resource.id: {_property.name: xml for _property, xml in res_dict.items()} - for resource, res_dict in stashed_xml_texts.items() - } xml_filename = f"{save_location}/{timestamp_str}_stashed_text_properties.json" with open(xml_filename, "x", encoding="utf-8") as file: json.dump( - obj=stashed_xml_texts_serializable, + obj=stashed_xml_texts, fp=file, ensure_ascii=False, indent=4, diff --git a/test/unittests/test_xmlupload/test_upload_stashed_xml_texts.py b/test/unittests/test_xmlupload/test_upload_stashed_xml_texts.py index f288e636c..809c54055 100644 --- a/test/unittests/test_xmlupload/test_upload_stashed_xml_texts.py +++ b/test/unittests/test_xmlupload/test_upload_stashed_xml_texts.py @@ -5,7 +5,6 @@ import pytest from dsp_tools.models.value import KnoraStandoffXml -from dsp_tools.utils.xmlupload import upload_stashed_xml_texts class TestXMLUploadStash(TestCase): @@ -45,9 +44,7 @@ def test__replace_internal_ids_with_iris_one_link(self) -> None: "" ) ) - returned_instance = upload_stashed_xml_texts._replace_internal_ids_with_iris( - id2iri_mapping=test_id2iri, xml_with_id=one_link_KnoraStandoffXml, id_set={"r2_id"} - ) + returned_instance = one_link_KnoraStandoffXml.with_iris(test_id2iri) expected_str = ( '' '' @@ -68,9 +65,7 @@ def test__replace_internal_ids_with_iris_three_links(self) -> None: "" ) ) - returned_instance = upload_stashed_xml_texts._replace_internal_ids_with_iris( - id2iri_mapping=test_id2iri, xml_with_id=three_link_KnoraStandoffXml, id_set={"r2_id", "r3_id"} - ) + returned_instance = three_link_KnoraStandoffXml.with_iris(test_id2iri) expected_str = ( '' '' diff --git a/test/unittests/test_xmlupload/test_xmlupload.py b/test/unittests/test_xmlupload/test_xmlupload.py index e7b72dd4e..753b22b40 100644 --- a/test/unittests/test_xmlupload/test_xmlupload.py +++ b/test/unittests/test_xmlupload/test_xmlupload.py @@ -9,10 +9,8 @@ from lxml import etree from dsp_tools.models.exceptions import BaseError -from dsp_tools.models.xmlresource import XMLResource from dsp_tools.utils.xml_utils import parse_and_clean_xml_file from dsp_tools.utils.xmlupload.ark2iri import convert_ark_v0_to_resource_iri -from dsp_tools.utils.xmlupload.stash_circular_references import remove_circular_references from dsp_tools.utils.xmlupload.write_diagnostic_info import ( _transform_server_url_to_foldername, determine_save_location_of_diagnostic_info, @@ -117,91 +115,6 @@ def test_convert_ark_v0_to_resource_iri(self) -> None: ): convert_ark_v0_to_resource_iri("ark:/72163/080c-779b99+90a0c3f-6e") - def test_remove_circular_references(self) -> None: - # create a list of XMLResources from the test data file - root = parse_and_clean_xml_file("testdata/xml-data/test-data-systematic.xml") - resources = [XMLResource(x, "testonto") for x in root if x.tag == "resource"] - - # get the purged resources and the stashes from the function to be tested - resources, stashed_xml_texts_original, stashed_resptr_props_original = remove_circular_references( - resources=resources, - verbose=False, - ) - - # make a list of all hashes from the stashed xml texts - stashed_xml_texts_hashes = list() - for res, propdict in stashed_xml_texts_original.items(): - for elem in propdict.values(): - for _hash in elem: - stashed_xml_texts_hashes.append(_hash) - - # make a version of the stashes with the IDs from the XML file instead of the Python objects - stashed_xml_texts = { - res.id: {prop.name: [str(x) for x in d.values()] for prop, d in _dict.items()} - for res, _dict in stashed_xml_texts_original.items() - } - stashed_resptr_props = { - res.id: {prop.name: l for prop, l in _dict.items()} for res, _dict in stashed_resptr_props_original.items() - } - - # hardcode the expected values - stashed_xml_texts_expected = { - "test_thing_1": { - "testonto:hasRichtext": [ - "This text contains links to all resources: " - 'test_thing_0 ' - 'test_thing_1 ' - 'image_thing_0 ' - 'compound_thing_0 ' - 'partof_thing_1 ' - 'partof_thing_2 ' - 'partof_thing_3 ' - 'document_thing_1 ' - 'text_thing_1 ' - 'zip_thing_1 ' - 'audio_thing_1 ' - 'test_thing_2 ' - 'test_thing_with_iri_1' - ] - }, - "test_thing_2": { - "testonto:hasRichtext": [ - "This text contains links to all resources: " - 'test_thing_0 ' - 'test_thing_1 ' - 'image_thing_0 ' - 'compound_thing_0 ' - 'partof_thing_1 ' - 'partof_thing_2 ' - 'partof_thing_3 ' - 'document_thing_1 ' - 'text_thing_1 ' - 'zip_thing_1 ' - 'audio_thing_1 ' - 'test_thing_2' - ] - }, - } - stashed_resptr_props_expected = { - "test_thing_0": {"testonto:hasTestThing": ["test_thing_1"]}, - "test_thing_1": {"testonto:hasResource": ["test_thing_2", "link_obj_1"]}, - } - - # check if the stashes are equal to the expected stashes - self.assertDictEqual(stashed_resptr_props, stashed_resptr_props_expected) - self.assertDictEqual(stashed_xml_texts, stashed_xml_texts_expected) - - # check if the stashed hashes can also be found at the correct position in the purged resources - for res, propdict in stashed_xml_texts_original.items(): - for prop, hashdict in propdict.items(): - stashed_hashes = list(hashdict.keys()) - purged_res = resources[resources.index(res)] - purged_prop = purged_res.properties[purged_res.properties.index(prop)] - purged_hashes = [ - str(val.value) for val in purged_prop.values if str(val.value) in stashed_xml_texts_hashes - ] - self.assertListEqual(stashed_hashes, purged_hashes) - if __name__ == "__main__": pytest.main([__file__])