Skip to content

Commit

Permalink
fix: ignore IRIs in stashing algorithm (DEV-2885) (#603)
Browse files Browse the repository at this point in the history
  • Loading branch information
jnussbaum committed Oct 27, 2023
1 parent b693a94 commit 635bc5c
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 10 deletions.
16 changes: 8 additions & 8 deletions src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import Any
from typing import Any, cast

import regex
import rustworkx as rx
Expand Down Expand Up @@ -53,11 +53,12 @@ def _create_info_from_xml_for_graph_from_one_resource(
def _create_resptr_link_objects(subject_id: str, resptr_prop: etree._Element) -> list[ResptrLink]:
    """
    Create a ResptrLink for every <resptr> child whose text is an internal ID.

    Children whose text looks like an IRI (http(s)://rdfh.ch/<4 hex digits>/<22 word characters>)
    are ignored by the stashing algorithm: they get neither a link object nor a stashUUID attribute.
    Children without text are skipped as well.

    Args:
        subject_id: ID of the resource that contains the <resptr-prop>
        resptr_prop: the <resptr-prop> element; its <resptr> children are modified in place
            (a "stashUUID" attribute is added to every child that yields a link object)

    Returns:
        one ResptrLink per child that points to an internal ID, in document order
    """
    resptr_links = []
    for resptr in resptr_prop.getchildren():
        target = resptr.text
        # An empty <resptr/> has text None: there is no target to link to,
        # and passing None to regex.search() would raise a TypeError.
        if not target:
            continue
        # The dot is escaped so that only the literal host "rdfh.ch" is recognised as an IRI.
        if regex.search(r"https?://rdfh\.ch/[a-fA-F0-9]{4}/\w{22}", target):
            continue
        link_object = ResptrLink(subject_id, target)
        # this UUID is so that the links that were stashed can be identified in the XML data file
        resptr.attrib["stashUUID"] = link_object.link_uuid
        resptr_links.append(link_object)
    return resptr_links


Expand All @@ -79,9 +80,8 @@ def _extract_ids_from_one_text_value(text: etree._Element) -> set[str]:
all_links = set()
for ele in text.iterdescendants():
if href := ele.attrib.get("href"):
searched = regex.search(r"IRI:(.*):IRI", href)
if searched:
all_links.add(searched.group(1))
if internal_id := regex.search(r"IRI:(.*):IRI", href):
all_links.add(internal_id.group(1))
return all_links


Expand Down
Expand Up @@ -117,8 +117,8 @@ def test_extract_id_one_text_with_several_id() -> None:
test_ele = etree.fromstring(
"""<text permissions="prop-default" encoding="xml">
<a class="salsah-link" href="IRI:res_A_11:IRI">res_A_11</a>
<a class="salsah-link" href="IRI:res_B_11:IRI">res_A_11</a>
<a class="salsah-link" href="IRI:res_B_11:IRI">res_A_11</a>
<a class="salsah-link" href="IRI:res_B_11:IRI">res_B_11</a>
<a class="salsah-link" href="IRI:res_B_11:IRI">res_B_11</a>
</text>"""
)
res = _extract_ids_from_one_text_value(test_ele)
Expand All @@ -141,6 +141,25 @@ def test_extract_ids_from_text_prop_with_several_text_links() -> None:
assert unordered(res_ids) == [{"res_A_18"}, {"res_B_18"}]


def test_extract_ids_from_text_prop_with_iris_and_ids() -> None:
    """Only the <text> that links to an internal ID yields a link object and gets a stashUUID."""
    xml_snippet = """<text-prop name=":hasRichtext">
<text permissions="prop-default" encoding="xml">
<a class="salsah-link" href="http://rdfh.ch/4123/vEpjk7zAQBC2j3pvTGSxcw">foo</a>
</text>
<text permissions="prop-default" encoding="xml">
<a class="salsah-link" href="IRI:res_B_18:IRI">res_B_18</a>
</text>
</text-prop>"""
    text_prop = etree.fromstring(xml_snippet)

    link_objects = _create_text_link_objects("foo", text_prop)

    # the <text> containing only an IRI must not produce a link object
    assert len(link_objects) == 1
    assert link_objects[0].target_ids == {"res_B_18"}

    text_elements = tuple(text_prop.iterchildren())
    text_with_iri = text_elements[0]
    text_with_id = text_elements[1]
    assert not text_with_iri.attrib.get("stashUUID")
    assert text_with_id.attrib.get("stashUUID")


def test_create_class_instance_resptr_link_one_link() -> None:
test_ele = etree.fromstring(
"""<resptr-prop xmlns="https://dasch.swiss/schema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
Expand All @@ -167,6 +186,24 @@ def test_create_class_instance_resptr_link_several() -> None:
assert res[2].target_id == "res_C_13"


def test_create_class_instance_resptr_link_with_iris() -> None:
    """The two <resptr> with internal IDs are stashed; the one holding an IRI is left untouched."""
    xml_snippet = """<resptr-prop name=":hasResource1">
<resptr permissions="prop-default">res_A_13</resptr>
<resptr permissions="prop-default">res_B_13</resptr>
<resptr permissions="prop-default">http://rdfh.ch/4123/vEpjk7zAQBC2j3pvTGSxcw</resptr>
</resptr-prop>"""
    resptr_prop = etree.fromstring(xml_snippet)

    link_objects = _create_resptr_link_objects("foo", resptr_prop)

    # the IRI must not produce a link object
    assert len(link_objects) == 2
    assert link_objects[0].target_id == "res_A_13"
    assert link_objects[1].target_id == "res_B_13"

    resptr_elements = tuple(resptr_prop.iterchildren())
    assert resptr_elements[0].attrib.get("stashUUID")
    assert resptr_elements[1].attrib.get("stashUUID")
    assert not resptr_elements[2].attrib.get("stashUUID")


def test_create_info_from_xml_for_graph_check_UUID_in_root() -> None:
root = etree.fromstring(
"""<knora shortcode="0700" default-ontology="simcir">
Expand Down

0 comments on commit 635bc5c

Please sign in to comment.