diff --git a/src/dsp_tools/models/value.py b/src/dsp_tools/models/value.py index bb8ef776a..a494e5610 100644 --- a/src/dsp_tools/models/value.py +++ b/src/dsp_tools/models/value.py @@ -26,11 +26,18 @@ def __str__(self) -> str: def get_all_iris(self) -> Optional[list[str]]: return self.__iriregexp.findall(self.__xmlstr) + def find_ids_referenced_in_salsah_links(self) -> set[str]: + return set(regex.findall(pattern='href="IRI:(.*?):IRI"', string=self.__xmlstr)) + def replace(self, fromStr: str, toStr: str) -> None: self.__xmlstr = self.__xmlstr.replace(fromStr, toStr) - def regex_replace(self, pattern: str, repl: str) -> None: - self.__xmlstr = regex.sub(pattern=repr(pattern)[1:-1], repl=repl, string=self.__xmlstr) + def replace_one_id_with_iri_in_salsah_link(self, internal_id: str, iri: str) -> None: + self.__xmlstr = regex.sub( + pattern=f'href="IRI:{internal_id}:IRI"', + repl=f'href="{iri}"', + string=self.__xmlstr, + ) class Value: diff --git a/src/dsp_tools/utils/xml_upload_stash.py b/src/dsp_tools/utils/xml_upload_stash.py index 5b43246eb..97c626882 100644 --- a/src/dsp_tools/utils/xml_upload_stash.py +++ b/src/dsp_tools/utils/xml_upload_stash.py @@ -52,9 +52,6 @@ def _log_unable_to_upload_xml_resource( received_error: Error received stashed_resource: resource that is stashed all_link_props: all the link properties from that resource - - Returns: - """ # print the message to keep track of the cause for the failure # apart from that; no action is necessary: @@ -65,12 +62,34 @@ def _log_unable_to_upload_xml_resource( logger.warning(err_msg, exc_info=True) +def _log_iri_does_not_exist_error( + received_error: KeyError, + stashed_resource: XMLResource, + all_link_props: XMLProperty, +) -> None: + """ + This function logs if it is not possible to upload an XML resource + if a linked resource does not have an IRI in the triplestore. + + Args: + received_error: Error received + stashed_resource: resource that is stashed + all_link_props: all the link properties from that resource + """ + err_msg = ( + f"Unable to upload the xml text of '{all_link_props.name}' of resource '{stashed_resource.id}'" + f"because the resource with the internal id '{received_error.args[0]}' does not exist in the triplestore." + ) + print(f" WARNING: {err_msg}") + logger.warning(err_msg, exc_info=True) + + def _get_text_hash_value(old_xmltext: str) -> str: """ This function extracts the hash values in the text Args: - old_xmltext: Text with has values. + old_xmltext: Text with hash values. Returns: hash values @@ -79,31 +98,33 @@ def _get_text_hash_value(old_xmltext: str) -> str: def _replace_internal_ids_with_iris( - pure_text: str, id2iri_mapping: dict[str, str], - hash_to_value: dict[str, KnoraStandoffXml], + xml_with_id: KnoraStandoffXml, + id_set: set[str], ) -> KnoraStandoffXml: """ - This function replaces the internal ids with the new IRIs from the triplestore. + This function takes an XML string and a set with internal ids that are referenced in salsah-links in that string. + It replaces all internal ids of that set with the corresponding iri according to the mapping dictionary. Args: - pure_text: the text with the ids - id2iri_mapping: the dictionary that contains the mapping information - hash_to_value: the dictionary that contains the hash of the string and the xml value + id2iri_mapping: dictionary with id to iri mapping + xml_with_id: KnoraStandoffXml with the string that should have replacements + id_set: set of ids that are in the string Returns: the xml value with the old ids replaced """ - new_xmltext = hash_to_value[pure_text] - # replace the outdated internal ids by their IRI - for _id, _iri in id2iri_mapping.items(): - new_xmltext.regex_replace(f'href="IRI:{_id}:IRI"', f'href="{_iri}"') - return new_xmltext + for internal_id in id_set: + xml_with_id.replace_one_id_with_iri_in_salsah_link( + internal_id=internal_id, + iri=id2iri_mapping[internal_id], + ) + return xml_with_id def _create_XMLResource_json_object_to_update( res_iri: str, - resource_in_triplestore: Any, + resource_in_triplestore: dict[str, Any], stashed_resource: XMLResource, link_prop_in_triplestore: dict[str, Any], new_xmltext: KnoraStandoffXml, @@ -117,7 +138,7 @@ def _create_XMLResource_json_object_to_update( resource_in_triplestore: the resource existing in the triplestore stashed_resource: the same resource from the stash link_prop_in_triplestore: the link property in the triplestore - new_xmltext: the new xml text with the IRIs + new_xmltext: The KnoraStandOffXml with replaced ids link_prop_name: the name of the link property Returns: @@ -137,11 +158,11 @@ def _create_XMLResource_json_object_to_update( return json.dumps(jsonobj, indent=4, separators=(",", ": "), cls=KnoraStandoffXmlEncoder) -def upload_single_link_xml_property( - link_prop_in_triplestore: Any, +def _upload_single_link_xml_property( + link_prop_in_triplestore: dict[str, Any], res_iri: str, stashed_resource: XMLResource, - resource_in_triplestore: Any, + resource_in_triplestore: dict[str, Any], link_prop: XMLProperty, hash_to_value: dict[str, KnoraStandoffXml], id2iri_mapping: dict[str, str], @@ -158,7 +179,7 @@ def upload_single_link_xml_property( stashed_resource: the stashed resource resource_in_triplestore: the resource retrieved from the triplestore link_prop: the name of the link property - hash_to_value: the has value of the xml text + hash_to_value: the hash value of the xml text id2iri_mapping: the dictionary with the internal ids and the new IRIs nonapplied_xml_texts: the dictionary with the stashes verbose: what is printed out @@ -166,7 +187,7 @@ def upload_single_link_xml_property( Returns: The stash dictionary with the newly uploaded resource removed. - If the upload was not sucessfull it returns the dictionary as it was before. + If the upload was not sucessfull, it returns the dictionary as it was before. """ xmltext_in_triplestore = link_prop_in_triplestore.get("knora-api:textValueAsXml") if not xmltext_in_triplestore: @@ -179,13 +200,24 @@ def upload_single_link_xml_property( # if the pure text is a hash, the replacement must be made # this hash originates from _stash_circular_references(), and identifies the XML texts - if text_hash_value not in hash_to_value: + try: + xml_from_stash = hash_to_value[text_hash_value] + except KeyError as err: + _log_iri_does_not_exist_error( + received_error=err, + stashed_resource=stashed_resource, + all_link_props=link_prop, + ) # no action necessary: this property will remain in nonapplied_xml_texts, # which will be handled by the caller return nonapplied_xml_texts - new_xmltext = _replace_internal_ids_with_iris( - pure_text=text_hash_value, id2iri_mapping=id2iri_mapping, hash_to_value=hash_to_value + id_set = xml_from_stash.find_ids_referenced_in_salsah_links() + + xml_from_stash = _replace_internal_ids_with_iris( + id2iri_mapping=id2iri_mapping, + xml_with_id=xml_from_stash, + id_set=id_set, ) # prepare API call @@ -194,7 +226,7 @@ def upload_single_link_xml_property( resource_in_triplestore=resource_in_triplestore, stashed_resource=stashed_resource, link_prop_in_triplestore=link_prop_in_triplestore, - new_xmltext=new_xmltext, + new_xmltext=xml_from_stash, link_prop_name=link_prop.name, ) @@ -233,7 +265,7 @@ def upload_all_link_props_of_single_resource( stashed_resource: the resource from the stash resource_in_triplestore: the resource from the triplestore link_prop: the link property - hash_to_value: the dictionary which stored the hashes and their corresponding text + hash_to_value: the dictionary which stored the hashes and the KnoraStandoffXml with the corresponding texts id2iri_mapping: the dictionary that has the internal ids and IRIs to map nonapplied_xml_texts: the dictionary which contains the unprocessed resources verbose: how much information should be printed @@ -248,7 +280,7 @@ def upload_all_link_props_of_single_resource( all_link_props_in_triplestore = [all_link_props_in_triplestore] for link_prop_in_triplestore in all_link_props_in_triplestore: - nonapplied_xml_texts = upload_single_link_xml_property( + nonapplied_xml_texts = _upload_single_link_xml_property( link_prop_in_triplestore=link_prop_in_triplestore, res_iri=res_iri, stashed_resource=stashed_resource, diff --git a/test/unittests/test_xml_upload_stash.py b/test/unittests/test_xml_upload_stash.py new file mode 100644 index 000000000..bbbd98ae7 --- /dev/null +++ b/test/unittests/test_xml_upload_stash.py @@ -0,0 +1,86 @@ +# pylint: disable=missing-class-docstring,missing-function-docstring,protected-access + +from unittest import TestCase + +import pytest + +from dsp_tools.models.value import KnoraStandoffXml +from dsp_tools.utils import xml_upload_stash + + +class TestXMLUploadStash(TestCase): + def test_find_ids_referenced_in_salsah_links_one_link(self) -> None: + one_link_KnoraStandoffXml = KnoraStandoffXml( + xmlstr=( + '' + '' + 'r2_id' + "" + ) + ) + returned_set = one_link_KnoraStandoffXml.find_ids_referenced_in_salsah_links() + self.assertEqual({"r2_id"}, returned_set) + + def test_find_ids_referenced_in_salsah_links_three_links(self) -> None: + three_link_KnoraStandoffXml = KnoraStandoffXml( + xmlstr=( + '' + '' + 'r2_idThis is normal text' + 'r3_id' + 'r2_id' + "" + ) + ) + returned_set = three_link_KnoraStandoffXml.find_ids_referenced_in_salsah_links() + self.assertEqual({"r2_id", "r3_id"}, returned_set) + + def test__replace_internal_ids_with_iris_one_link(self) -> None: + test_id2iri = {"r1_id": "r1_iri", "r2_id": "r2_iri", "r3_id": "r3_iri"} + one_link_KnoraStandoffXml = KnoraStandoffXml( + xmlstr=( + '' + '' + 'r2_id' + "" + ) + ) + returned_instance = xml_upload_stash._replace_internal_ids_with_iris( + id2iri_mapping=test_id2iri, xml_with_id=one_link_KnoraStandoffXml, id_set={"r2_id"} + ) + expected_str = ( + '' + '' + 'r2_id' + "" + ) + self.assertEqual(expected_str, str(returned_instance)) + + def test__replace_internal_ids_with_iris_three_links(self) -> None: + test_id2iri = {"r1_id": "r1_iri", "r2_id": "r2_iri", "r3_id": "r3_iri"} + three_link_KnoraStandoffXml = KnoraStandoffXml( + xmlstr=( + '' + '' + 'r2_idThis is normal text' + 'r3_id' + 'r2_id' + "" + ) + ) + returned_instance = xml_upload_stash._replace_internal_ids_with_iris( + id2iri_mapping=test_id2iri, xml_with_id=three_link_KnoraStandoffXml, id_set={"r2_id", "r3_id"} + ) + expected_str = ( + '' + '' + 'r2_idThis is normal text' + 'r3_id' + 'r2_id' + "" + ) + self.assertEqual(expected_str, str(returned_instance)) + + +if __name__ == "__main__": + pytest.main([__file__])