Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(xmlupload): optimize stash links (DEV-2847) #573

Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
4d4aa56
Update stash_models.py
Nora-Olivia-Ammann Oct 17, 2023
1dff378
Update Test Data
Nora-Olivia-Ammann Oct 17, 2023
3f97c2c
Add text string to model and update test
Nora-Olivia-Ammann Oct 17, 2023
1fd8286
update cost models
Nora-Olivia-Ammann Oct 17, 2023
381cb4f
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 17, 2023
3b027aa
Update models.py
Nora-Olivia-Ammann Oct 17, 2023
fd84ac9
Restructure Classes
Nora-Olivia-Ammann Oct 18, 2023
6bfc860
Merge branch 'main' into wip/dev-2833-apply-optimized-stash-identific…
Nora-Olivia-Ammann Oct 18, 2023
4ddf9bf
Create create_upload_order_stash_circles.py
Nora-Olivia-Ammann Oct 18, 2023
ca112ca
change node indices
Nora-Olivia-Ammann Oct 18, 2023
3050cf7
Merge branch 'wip/dev-2833-apply-optimized-stash-identification-in-ac…
Nora-Olivia-Ammann Oct 18, 2023
2b12b41
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 18, 2023
005dfe3
Merge branch 'main' into wip/dev-2833-apply-optimized-stash-identific…
Nora-Olivia-Ammann Oct 18, 2023
4f39e80
Merge branch 'wip/dev-2833-apply-optimized-stash-identification-in-ac…
Nora-Olivia-Ammann Oct 18, 2023
eb36f1e
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 18, 2023
8dabfeb
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 18, 2023
7787679
phantom link removal
Nora-Olivia-Ammann Oct 18, 2023
9e29a20
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 18, 2023
66def93
Merge branch 'main' into wip/dev-2833-apply-optimized-stash-identific…
Nora-Olivia-Ammann Oct 18, 2023
a68fe27
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
23eb6d1
fix test
Nora-Olivia-Ammann Oct 19, 2023
bc91de0
add test
Nora-Olivia-Ammann Oct 19, 2023
004d8e6
Merge branch 'main' into wip/dev-2833-apply-optimized-stash-identific…
Nora-Olivia-Ammann Oct 19, 2023
8079619
remove print statements
Nora-Olivia-Ammann Oct 19, 2023
631221c
Merge branch 'wip/dev-2833-apply-optimized-stash-identification-in-ac…
Nora-Olivia-Ammann Oct 19, 2023
ede4218
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
f5ce7d4
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
8c27ac7
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
011e385
remove resource stash info
Nora-Olivia-Ammann Oct 19, 2023
bf1ff70
comments update
Nora-Olivia-Ammann Oct 19, 2023
9104e66
Update test_extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
506f30a
Update test_extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
a87c554
remove root
Nora-Olivia-Ammann Oct 19, 2023
1f13d11
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
fe7a875
Update models.py
Nora-Olivia-Ammann Oct 19, 2023
c71ef40
remove unnecessary function
Nora-Olivia-Ammann Oct 19, 2023
0f99f61
Update test_extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
6c9105b
renaming files
Nora-Olivia-Ammann Oct 20, 2023
bf3d577
docstrings
Nora-Olivia-Ammann Oct 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
316 changes: 219 additions & 97 deletions src/dsp_tools/analyse_xml_data/extract_links_from_XML.py

Large diffs are not rendered by default.

18 changes: 13 additions & 5 deletions src/dsp_tools/analyse_xml_data/models.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
from __future__ import annotations

import uuid
from dataclasses import dataclass, field


@dataclass
@dataclass(frozen=True)
class ResptrLink:
"""This class represents a link between two resources."""

subject_id: str
object_id: str
link_uuid: str = field(default_factory=lambda: str(uuid.uuid4()))
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved

@property
def cost_links(self) -> float:
"""The cost of this outgoing link is consistently 1"""
return 1

@dataclass

@dataclass(frozen=True)
class XMLLink:
"""
This class represents a link between a resource and an XML text
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -20,6 +27,7 @@ class XMLLink:

subject_id: str
object_link_ids: set[str]
link_uuid: str = field(default_factory=lambda: str(uuid.uuid4()))
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved

@property
def cost_links(self) -> float:
Expand All @@ -28,13 +36,13 @@ def cost_links(self) -> float:


@dataclass
class UploadResource:
class ResourceStashInfo:
"""
Holds information about a resource that can be uploaded to the DSP.

May hold information about the links that need to be stashed from this resource before it can be uploaded.
A ordered list of UploadResources can be used to determine the order in which resources need to be uploaded.
An ordered list of ResourceStashInfo objects can be used to determine the order in which resources need to be uploaded.
"""

res_id: str
stash_links_to: list[str] = field(default_factory=list)
stash_links_to: list[XMLLink | ResptrLink] = field(default_factory=list)
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
Empty file.
2 changes: 1 addition & 1 deletion src/dsp_tools/utils/xmlupload/stash/stash_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def make(standoff_stash: StandoffStash | None, link_value_stash: LinkValueStash
link_value_stash: A LinkValueStash object or None.

Returns:
Stash: A Stash object, or None if both iunputs are None.
Stash: A Stash object, or None if both inputs are None.
"""
if standoff_stash or link_value_stash:
return Stash(standoff_stash, link_value_stash)
Expand Down
65 changes: 46 additions & 19 deletions test/unittests/test_analyse_xml_data/test_extract_links_from_XML.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
from pytest_unordered import unordered

from dsp_tools.analyse_xml_data.extract_links_from_XML import (
_create_class_instance_resptr_link,
_create_class_instance_text_prop,
_create_info_from_xml_for_graph,
_create_info_from_xml_for_graph_from_one_resource,
_extract_ids_from_one_resptr_prop,
_extract_ids_from_one_text_value,
_extract_ids_from_text_prop,
_get_all_links_from_one_resource,
)

Expand All @@ -25,8 +26,7 @@ def test_create_info_from_xml_for_graph_from_one_resource() -> None:
res_resptr_links, res_xml_links, res_all_used_ids = _create_info_from_xml_for_graph_from_one_resource(test_ele)
res_B_19 = [obj.object_id for obj in res_resptr_links]
assert "res_B_19" in res_B_19
res_C_19 = [obj.object_id for obj in res_resptr_links]
assert "res_C_19" in res_C_19
assert "res_C_19" in res_B_19
assert "res_A_19" == res_all_used_ids
assert res_xml_links[0].subject_id == "res_A_19"
assert res_xml_links[0].object_link_ids == {"res_B_19", "res_C_19"}
Expand All @@ -40,17 +40,17 @@ def test_get_all_links_from_one_resource() -> None:
'href="IRI:res_B_11:IRI">res_B_11</a></text></text-prop><resptr-prop name=":hasResource1"><resptr '
'permissions="prop-default">res_B_11</resptr></resptr-prop></resource>'
)
res_resptr, res_xml = _get_all_links_from_one_resource(test_ele)
expected_resptr, expected_xml = ["res_B_11"], [{"res_B_11"}]
assert expected_resptr == res_resptr
assert unordered(res_xml) == expected_xml
res_resptr, res_xml = _get_all_links_from_one_resource("res_A_11", test_ele)
assert res_resptr[0].object_id == "res_B_11"
res_xml_ids = res_xml[0]
assert res_xml_ids.object_link_ids == {"res_B_11"}


def test_get_all_links_from_one_resource_no_links() -> None:
test_ele = etree.fromstring(
'<resource label="res_B_18" restype=":TestThing" id="res_B_18" permissions="res-default"/>'
)
res = _get_all_links_from_one_resource(test_ele)
res = _get_all_links_from_one_resource("res_B_18", test_ele)
assert res == ([], [])


Expand All @@ -62,9 +62,10 @@ def test_text_only_get_all_links_from_one_resource() -> None:
'href="IRI:res_A_18:IRI">res_A_18</a></text><text permissions="prop-default" encoding="xml"><a '
'class="salsah-link" href="IRI:res_B_18:IRI">res_B_18</a></text></text-prop></resource>'
)
res_resptr, res_xml = _get_all_links_from_one_resource(test_ele)
res_resptr, res_xml = _get_all_links_from_one_resource("res_C_18", test_ele)
assert not res_resptr
assert unordered(res_xml) == [{"res_A_18"}, {"res_B_18"}]
res_xml_ids = [x.object_link_ids for x in res_xml]
assert unordered(res_xml_ids) == [{"res_A_18"}, {"res_B_18"}]


def test_extract_id_one_text_with_one_id() -> None:
Expand Down Expand Up @@ -102,28 +103,54 @@ def test_extract_ids_from_text_prop_with_several_text_links() -> None:
'href="IRI:res_A_18:IRI">res_A_18</a></text><text permissions="prop-default" encoding="xml"><a '
'class="salsah-link" href="IRI:res_B_18:IRI">res_B_18</a></text></text-prop>'
)
res = _extract_ids_from_text_prop(test_ele)
assert unordered(res) == [{"res_A_18"}, {"res_B_18"}]
res = _create_class_instance_text_prop("res_C_18", test_ele)
res_ids = [x.object_link_ids for x in res]
assert unordered(res_ids) == [{"res_A_18"}, {"res_B_18"}]


def test_extract_one_id_resptr_prop() -> None:
def test_create_class_instance_resptr_link_one_link() -> None:
test_ele = etree.fromstring(
'<resptr-prop xmlns="https://dasch.swiss/schema" '
'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name=":hasResource1"><resptr '
'permissions="prop-default">res_C_15</resptr></resptr-prop>'
)
res = _extract_ids_from_one_resptr_prop(test_ele)
assert unordered(res) == ["res_C_15"]
res = _create_class_instance_resptr_link("res_A_15", test_ele)
assert res[0].object_id == "res_C_15"


def test_extract_several_id_resptr_prop() -> None:
def test_create_class_instance_resptr_link_several() -> None:
test_ele = etree.fromstring(
'<resptr-prop xmlns="https://dasch.swiss/schema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
'name=":hasResource1"><resptr permissions="prop-default">res_A_13</resptr><resptr '
'permissions="prop-default">res_B_13</resptr><resptr permissions="prop-default">res_C_13</resptr></resptr-prop>'
)
res = _extract_ids_from_one_resptr_prop(test_ele)
assert unordered(res) == ["res_A_13", "res_B_13", "res_C_13"]
res = _create_class_instance_resptr_link("res_D_13", test_ele)
assert res[0].object_id == "res_A_13"
assert res[1].object_id == "res_B_13"
assert res[2].object_id == "res_C_13"
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved


def test_create_info_from_xml_for_graph_check_UUID_in_root() -> None:
root = etree.fromstring(
b'<?xml version="1.0" encoding="UTF-8"?><knora xmlns="https://dasch.swiss/schema" '
b'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://dasch.swiss/schema '
b'https://raw.githubusercontent.com/dasch-swiss/dsp-tools/main/src/dsp_tools/resources/schema/data.xsd" '
b'shortcode="0700" default-ontology="simcir"><resource label="res_A_11" restype=":TestThing" id="res_A_11" '
b'permissions="res-default"><resptr-prop name=":hasResource1"><resptr '
b'permissions="prop-default">res_B_11</resptr></resptr-prop></resource><resource label="res_B_11" '
b'restype=":TestThing" id="res_B_11" permissions="res-default"><text-prop name=":hasRichtext"><text '
b'permissions="prop-default" encoding="xml">Start text<a class="salsah-link" '
b'href="IRI:res_C_11:IRI">res_C_11</a>end text.</text></text-prop></resource><resource label="res_C_11" '
b'restype=":TestThing" id="res_C_11" permissions="res-default"></resource></knora>'
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
)
res_root, res_resptr_li, res_xml_li, res_all_ids = _create_info_from_xml_for_graph(root)
res_resptr = res_resptr_li[0]
res_xml = res_xml_li[0]
assert unordered(res_all_ids) == ["res_A_11", "res_B_11", "res_C_11"]
xml_res_resptr = res_root.find(".//{https://dasch.swiss/schema}resptr")
assert xml_res_resptr.attrib["stashUUID"] == res_resptr.link_uuid # type: ignore[union-attr]
xml_res_text = res_root.find(".//{https://dasch.swiss/schema}text")
assert xml_res_text.attrib["stashUUID"] == res_xml.link_uuid # type: ignore[union-attr]


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,9 @@ def _make_xml_text_prop(target_res: etree._Element | list[etree._Element]) -> et
salsah_link = _make_salsah_link(target_res.attrib["id"])
# one resource with many targets
case list():
salsah_link = "".join([_make_salsah_link(x.attrib["id"]) for x in target_res])
return excel2xml.make_text_prop(name=":hasRichtext", value=excel2xml.PropertyElement(salsah_link, encoding="xml"))
salsah_link = "blabla".join([_make_salsah_link(x.attrib["id"]) for x in target_res])
text_value = "Start text" + salsah_link + "end text."
return excel2xml.make_text_prop(name=":hasRichtext", value=excel2xml.PropertyElement(text_value, encoding="xml"))


def _make_resptr_prop(target_id: list[str] | str, property_name: str = ":hasResource1") -> etree._Element:
Expand Down Expand Up @@ -257,7 +258,8 @@ def _make_two_resource_circle_plus_non_circle_link(replication_counter: str) ->

def _make_single_text_ele_for_text_prop(target_resource_id: list[str]) -> etree._Element:
salsa_links = [_make_salsah_link(x) for x in target_resource_id]
xml_props = [excel2xml.PropertyElement(salsah_link, encoding="xml") for salsah_link in salsa_links]
text_values = [f"Start text{link}end text." for link in salsa_links]
xml_props = [excel2xml.PropertyElement(text_value, encoding="xml") for text_value in text_values]
return excel2xml.make_text_prop(name=":hasRichtext", value=xml_props)


Expand Down Expand Up @@ -297,3 +299,7 @@ def _make_complex_circle_with_leaf_nodes(replication_counter: str) -> list[etree
resource_list[-1].append(xml_prop)
resource_list.extend(leaf_resources)
return resource_list


if __name__ == "__main__":
create_and_save_circular_references_test_graph()