From 635bc5cf3ce5651b5ee0a80734dfc9d33548cd81 Mon Sep 17 00:00:00 2001 From: Johannes Nussbaum <39048939+jnussbaum@users.noreply.github.com> Date: Fri, 27 Oct 2023 10:53:05 +0200 Subject: [PATCH] fix: ignore IRIs in stashing algorithm (DEV-2885) (#603) --- .../construct_and_analyze_graph.py | 16 ++++---- .../test_construct_and_analyze_graph.py | 41 ++++++++++++++++++- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py b/src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py index 46214d36c..2da5ce0c2 100644 --- a/src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py +++ b/src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any +from typing import Any, cast import regex import rustworkx as rx @@ -53,11 +53,12 @@ def _create_info_from_xml_for_graph_from_one_resource( def _create_resptr_link_objects(subject_id: str, resptr_prop: etree._Element) -> list[ResptrLink]: resptr_links = [] for resptr in resptr_prop.getchildren(): - if r_text := resptr.text: - instance = ResptrLink(subject_id, r_text) + resptr.text = cast(str, resptr.text) + if not regex.search(r"https?://rdfh.ch/[a-fA-F0-9]{4}/\w{22}", resptr.text): + link_object = ResptrLink(subject_id, resptr.text) # this UUID is so that the links that were stashed can be identified in the XML data file - resptr.attrib["stashUUID"] = instance.link_uuid - resptr_links.append(instance) + resptr.attrib["stashUUID"] = link_object.link_uuid + resptr_links.append(link_object) return resptr_links @@ -79,9 +80,8 @@ def _extract_ids_from_one_text_value(text: etree._Element) -> set[str]: all_links = set() for ele in text.iterdescendants(): if href := ele.attrib.get("href"): - searched = regex.search(r"IRI:(.*):IRI", href) - if searched: - all_links.add(searched.group(1)) + if internal_id := regex.search(r"IRI:(.*):IRI", href): + all_links.add(internal_id.group(1)) return all_links diff --git a/test/unittests/test_analyse_xml_data/test_construct_and_analyze_graph.py b/test/unittests/test_analyse_xml_data/test_construct_and_analyze_graph.py index 511388f84..59024aecf 100644 --- a/test/unittests/test_analyse_xml_data/test_construct_and_analyze_graph.py +++ b/test/unittests/test_analyse_xml_data/test_construct_and_analyze_graph.py @@ -117,8 +117,8 @@ def test_extract_id_one_text_with_several_id() -> None: test_ele = etree.fromstring( """ res_A_11 - res_A_11 - res_A_11 + res_B_11 + res_B_11 """ ) res = _extract_ids_from_one_text_value(test_ele) @@ -141,6 +141,25 @@ def test_extract_ids_from_text_prop_with_several_text_links() -> None: assert unordered(res_ids) == [{"res_A_18"}, {"res_B_18"}] +def test_extract_ids_from_text_prop_with_iris_and_ids() -> None: + test_ele = etree.fromstring( + """ + + foo + + + res_B_18 + + """ + ) + res = _create_text_link_objects("foo", test_ele) + assert len(res) == 1 + assert res[0].target_ids == {"res_B_18"} + children = list(test_ele.iterchildren()) + assert not children[0].attrib.get("stashUUID") + assert children[1].attrib.get("stashUUID") + + def test_create_class_instance_resptr_link_one_link() -> None: test_ele = etree.fromstring( """ None: assert res[2].target_id == "res_C_13" +def test_create_class_instance_resptr_link_with_iris() -> None: + test_ele = etree.fromstring( + """ + res_A_13 + res_B_13 + http://rdfh.ch/4123/vEpjk7zAQBC2j3pvTGSxcw + """ + ) + res = _create_resptr_link_objects("foo", test_ele) + assert len(res) == 2 + assert res[0].target_id == "res_A_13" + assert res[1].target_id == "res_B_13" + children = list(test_ele.iterchildren()) + assert children[0].attrib.get("stashUUID") + assert children[1].attrib.get("stashUUID") + assert not children[2].attrib.get("stashUUID") + + def test_create_info_from_xml_for_graph_check_UUID_in_root() -> None: root = etree.fromstring( """