From 635bc5cf3ce5651b5ee0a80734dfc9d33548cd81 Mon Sep 17 00:00:00 2001
From: Johannes Nussbaum <39048939+jnussbaum@users.noreply.github.com>
Date: Fri, 27 Oct 2023 10:53:05 +0200
Subject: [PATCH] fix: ignore IRIs in stashing algorithm (DEV-2885) (#603)
---
.../construct_and_analyze_graph.py | 16 ++++----
.../test_construct_and_analyze_graph.py | 41 ++++++++++++++++++-
2 files changed, 47 insertions(+), 10 deletions(-)
diff --git a/src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py b/src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py
index 46214d36c..2da5ce0c2 100644
--- a/src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py
+++ b/src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py
@@ -1,6 +1,6 @@
from __future__ import annotations
-from typing import Any
+from typing import Any, cast
import regex
import rustworkx as rx
@@ -53,11 +53,12 @@ def _create_info_from_xml_for_graph_from_one_resource(
def _create_resptr_link_objects(subject_id: str, resptr_prop: etree._Element) -> list[ResptrLink]:
resptr_links = []
for resptr in resptr_prop.getchildren():
- if r_text := resptr.text:
- instance = ResptrLink(subject_id, r_text)
+ r_text = cast(str, resptr.text)  # cast() is a runtime no-op, so text may still be None or empty
+ if r_text and not regex.search(r"https?://rdfh\.ch/[a-fA-F0-9]{4}/\w{22}", r_text):
+ link_object = ResptrLink(subject_id, r_text)
# this UUID is so that the links that were stashed can be identified in the XML data file
- resptr.attrib["stashUUID"] = instance.link_uuid
- resptr_links.append(instance)
+ resptr.attrib["stashUUID"] = link_object.link_uuid
+ resptr_links.append(link_object)
return resptr_links
@@ -79,9 +80,8 @@ def _extract_ids_from_one_text_value(text: etree._Element) -> set[str]:
all_links = set()
for ele in text.iterdescendants():
if href := ele.attrib.get("href"):
- searched = regex.search(r"IRI:(.*):IRI", href)
- if searched:
- all_links.add(searched.group(1))
+ if internal_id := regex.search(r"IRI:(.*):IRI", href):
+ all_links.add(internal_id.group(1))
return all_links
diff --git a/test/unittests/test_analyse_xml_data/test_construct_and_analyze_graph.py b/test/unittests/test_analyse_xml_data/test_construct_and_analyze_graph.py
index 511388f84..59024aecf 100644
--- a/test/unittests/test_analyse_xml_data/test_construct_and_analyze_graph.py
+++ b/test/unittests/test_analyse_xml_data/test_construct_and_analyze_graph.py
@@ -117,8 +117,8 @@ def test_extract_id_one_text_with_several_id() -> None:
test_ele = etree.fromstring(
"""
res_A_11
- res_A_11
- res_A_11
+ res_B_11
+ res_B_11
"""
)
res = _extract_ids_from_one_text_value(test_ele)
@@ -141,6 +141,25 @@ def test_extract_ids_from_text_prop_with_several_text_links() -> None:
assert unordered(res_ids) == [{"res_A_18"}, {"res_B_18"}]
+def test_extract_ids_from_text_prop_with_iris_and_ids() -> None:
+ test_ele = etree.fromstring(
+ """
+
+ foo
+
+
+ res_B_18
+
+ """
+ )
+ res = _create_text_link_objects("foo", test_ele)
+ assert len(res) == 1
+ assert res[0].target_ids == {"res_B_18"}
+ children = list(test_ele.iterchildren())
+ assert not children[0].attrib.get("stashUUID")
+ assert children[1].attrib.get("stashUUID")
+
+
def test_create_class_instance_resptr_link_one_link() -> None:
test_ele = etree.fromstring(
""" None:
assert res[2].target_id == "res_C_13"
+def test_create_class_instance_resptr_link_with_iris() -> None:
+ test_ele = etree.fromstring(
+ """
+ res_A_13
+ res_B_13
+ http://rdfh.ch/4123/vEpjk7zAQBC2j3pvTGSxcw
+ """
+ )
+ res = _create_resptr_link_objects("foo", test_ele)
+ assert len(res) == 2
+ assert res[0].target_id == "res_A_13"
+ assert res[1].target_id == "res_B_13"
+ children = list(test_ele.iterchildren())
+ assert children[0].attrib.get("stashUUID")
+ assert children[1].attrib.get("stashUUID")
+ assert not children[2].attrib.get("stashUUID")
+
+
def test_create_info_from_xml_for_graph_check_UUID_in_root() -> None:
root = etree.fromstring(
"""