Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: refactor graph analysing #589

Merged
merged 8 commits into from
Oct 24, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/check-pr-title.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
with:
regex:
"^[a-z]+(\\([0-9a-z\\-_, ]+\\))?!?: .+\\(DEV-\\d+(, DEV-\\d+)*\\)$|\
^chore.*"
^(chore|refactor|style)(\\([0-9a-z\\-_, ]+\\))?!?: .+"
# see here on how to cope with linebreaks in YAML: https://stackoverflow.com/a/21699210/14414188
allowed_prefixes: "fix,refactor,feat,docs,chore,style,test"
disallowed_prefixes: "feature,hotfix"
Expand Down
50 changes: 23 additions & 27 deletions src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
from __future__ import annotations

from typing import Any

import regex
import rustworkx as rx
from lxml import etree
Expand Down Expand Up @@ -84,12 +88,7 @@ def _extract_ids_from_one_text_value(text: etree._Element) -> set[str]:

def make_graph(
resptr_links: list[ResptrLink], xml_links: list[XMLLink], all_resource_ids: list[str]
) -> tuple[ # type: ignore[type-arg]
rx.PyDiGraph, # pylint: disable=no-member
dict[int, str],
list[tuple[int, int, ResptrLink | XMLLink]],
set[int],
]:
) -> tuple[rx.PyDiGraph[Any, Any], dict[int, str], list[tuple[int, int, ResptrLink | XMLLink]], set[int]]:
"""
This function takes information about the resources (nodes) and links between them (edges).
From that it constructs a rustworkx directed graph.
Expand All @@ -105,23 +104,22 @@ def make_graph(
Returns:
The rustworkx graph, a dictionary that maps the index number of each node to its original resource ID, the list of edges (source node index, target node index, link object), and the set of all node indices
"""
g: rx.PyDiGraph = rx.PyDiGraph() # type: ignore[type-arg] # pylint: disable=no-member
g: rx.PyDiGraph[Any, Any] = rx.PyDiGraph() # pylint: disable=no-member
nodes = [(id_, None, None) for id_ in all_resource_ids]
node_indices = g.add_nodes_from(nodes)
node_indices = list(node_indices) # type: ignore[assignment]
node_indices = list(g.add_nodes_from(nodes))
node_id_lookup = dict(zip(all_resource_ids, node_indices))
node_index_lookup = dict(zip(node_indices, all_resource_ids))
edges: list[tuple[int, int, ResptrLink | XMLLink]] = [
(node_id_lookup[x.subject_id], node_id_lookup[x.object_id], x) for x in resptr_links
(node_id_lookup[x.source_id], node_id_lookup[x.target_id], x) for x in resptr_links
]
for xml in xml_links:
edges.extend([(node_id_lookup[xml.subject_id], node_id_lookup[x], xml) for x in xml.object_link_ids])
edges.extend([(node_id_lookup[xml.source_id], node_id_lookup[x], xml) for x in xml.target_ids])
g.add_edges_from(edges)
return g, node_index_lookup, edges, set(node_indices)


def _remove_leaf_nodes(
g: rx.PyDiGraph, # type: ignore[type-arg] # pylint: disable=no-member
g: rx.PyDiGraph[Any, Any],
node_index_lookup: dict[int, str],
node_indices: set[int],
) -> tuple[list[str], set[int]]:
Expand All @@ -148,7 +146,7 @@ def _remove_leaf_nodes(


def _find_cheapest_outgoing_links(
g: rx.PyDiGraph, # type: ignore[type-arg] # pylint: disable=no-member
g: rx.PyDiGraph[Any, Any],
cycle: list[tuple[int, int]],
edge_list: list[tuple[int, int, XMLLink | ResptrLink]],
) -> list[tuple[int, int, XMLLink | ResptrLink]]:
Expand Down Expand Up @@ -179,7 +177,7 @@ def _find_cheapest_outgoing_links(


def _remove_edges_to_stash(
g: rx.PyDiGraph, # type: ignore[type-arg] # pylint: disable=no-member,
g: rx.PyDiGraph[Any, Any],
edges_to_remove: list[tuple[int, int, XMLLink | ResptrLink]],
edge_list: list[tuple[int, int, XMLLink | ResptrLink]],
remaining_nodes: set[int],
Expand Down Expand Up @@ -246,39 +244,37 @@ def check(x: tuple[int, int, XMLLink | ResptrLink]) -> bool:


def _add_stash_to_lookup_dict(
stash_dict: dict[str, list[str]], to_stash_links: list[XMLLink | ResptrLink]
stash_dict: dict[str, list[str]], links_to_stash: list[XMLLink | ResptrLink]
) -> dict[str, list[str]]:
stash_list = [stash_link.link_uuid for stash_link in to_stash_links]
stash_list = [stash_link.link_uuid for stash_link in links_to_stash]
# all stashed links have the same source ID, so we can just take the first one
subj_id = to_stash_links[0].subject_id
if subj_id in stash_dict.keys():
subj_id = links_to_stash[0].source_id
if subj_id in stash_dict:
stash_dict[subj_id].extend(stash_list)
else:
stash_dict[subj_id] = stash_list
return stash_dict


def generate_upload_order(
g: rx.PyDiGraph, # type: ignore[type-arg] # pylint: disable=no-member
g: rx.PyDiGraph[Any, Any],
node_index_lookup: dict[int, str],
edge_list: list[tuple[int, int, XMLLink | ResptrLink]],
node_indices: set[int],
) -> tuple[dict[str, list[str]], list[str], int]:
"""
This function takes a graph and a dictionary with the mapping between the graph indices and original ids.
It generates the order in which the resources should be uploaded to the DSP-API based on the dependencies.
Generate the order in which the resources should be uploaded to the DSP-API based on the dependencies.

Args:
g: graph
node_index_lookup: reference between graph indices and original id
edge_list: list of edges in the graph as tuple (source_node, target_node, Class Instance)
node_index_lookup: mapping between graph indices and original IDs
edge_list: list of edges in the graph as tuple (source node, target node, link info)
node_indices: index numbers of the nodes still in the graph

Returns:
Dictionary, which stores the information which resources have stashes
and which UUIDs of the elements should be stashed
A list that of resource IDs which gives the order in which the resources should be uploaded in the API
The number of links in the stash.
Dictionary that maps the resources that have stashes to the UUIDs of the stashed links
A list of resource IDs which gives the order in which the resources should be uploaded to DSP-API
The number of links in the stash
jnussbaum marked this conversation as resolved.
Show resolved Hide resolved
"""
upload_order: list[str] = []
stash_lookup: dict[str, list[str]] = {}
Expand Down
26 changes: 13 additions & 13 deletions src/dsp_tools/analyse_xml_data/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ class ResptrLink:
This class represents a direct link (resptr) between a starting resource and a target resource.

Args:
subject_id: resource ID that is in subject position of the triple
object_id: resource ID that is in object position of the triple
link_uuid: each link, which is represented in the graph gets a UUID
source_id: ID of the resource from which the link originates
target_id: ID of the resource where the link points to
link_uuid: identifier of this link
"""

subject_id: str
object_id: str
source_id: str
target_id: str
link_uuid: str = field(default_factory=lambda: str(uuid.uuid4()))

@property
Expand All @@ -28,20 +28,20 @@ def cost_links(self) -> float:
@dataclass(frozen=True)
class XMLLink:
"""
This class represents one or more links from a single starting resource to a set of target resources,
where all target resources are linked to from a single text value on the starting resource.
This class represents one or more links from a starting resource to a set of target resources,
where all target resources are linked to from a single text value of the starting resource.
jnussbaum marked this conversation as resolved.
Show resolved Hide resolved

Args:
subject_id: resource ID that is in subject position of the triple
object_link_ids: a set that contains the resource IDs which were embedded in the <text> element
link_uuid: each link, which is represented in the graph gets a UUID
source_id: ID of the resource from which the link(s) originate
target_ids: IDs of the resources that are referenced in the text value
link_uuid: identifier of this link
"""

subject_id: str
object_link_ids: set[str]
source_id: str
target_ids: set[str]
link_uuid: str = field(default_factory=lambda: str(uuid.uuid4()))

@property
def cost_links(self) -> float:
"""The cost of this outgoing link (1 / number of links in the XML text)"""
return 1 / len(self.object_link_ids)
return 1 / len(self.target_ids)
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ def test_create_info_from_xml_for_graph_from_one_resource() -> None:
</resource>"""
)
res_resptr_links, res_xml_links, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
res_B_19 = [obj.object_id for obj in res_resptr_links]
res_B_19 = [obj.target_id for obj in res_resptr_links]
assert "res_B_19" in res_B_19
assert "res_C_19" in res_B_19
assert "res_A_19" == subject_id
assert res_xml_links[0].subject_id == "res_A_19"
assert res_xml_links[0].object_link_ids == {"res_B_19", "res_C_19"}
assert res_xml_links[0].source_id == "res_A_19"
assert res_xml_links[0].target_ids == {"res_B_19", "res_C_19"}


def test_create_info_from_xml_for_graph_from_one_resource_one() -> None:
Expand All @@ -67,9 +67,9 @@ def test_create_info_from_xml_for_graph_from_one_resource_one() -> None:
)
res_resptr, res_xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
assert subject_id == "res_A_11"
assert res_resptr[0].object_id == "res_B_11"
assert res_resptr[0].target_id == "res_B_11"
assert isinstance(res_resptr[0], ResptrLink)
assert res_xml[0].object_link_ids == {"res_B_11"}
assert res_xml[0].target_ids == {"res_B_11"}
assert isinstance(res_xml[0], XMLLink)


Expand Down Expand Up @@ -101,7 +101,7 @@ def test_text_only_create_info_from_xml_for_graph_from_one_resource() -> None:
res_resptr, res_xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
assert subject_id == "res_C_18"
assert not res_resptr
res_xml_ids = [x.object_link_ids for x in res_xml]
res_xml_ids = [x.target_ids for x in res_xml]
assert unordered(res_xml_ids) == [{"res_A_18"}, {"res_B_18"}]


Expand Down Expand Up @@ -148,7 +148,7 @@ def test_extract_ids_from_text_prop_with_several_text_links() -> None:
'class="salsah-link" href="IRI:res_B_18:IRI">res_B_18</a></text></text-prop>'
)
res = _create_text_link_objects("res_C_18", test_ele)
res_ids = [x.object_link_ids for x in res]
res_ids = [x.target_ids for x in res]
assert unordered(res_ids) == [{"res_A_18"}, {"res_B_18"}]


Expand All @@ -162,7 +162,7 @@ def test_create_class_instance_resptr_link_one_link() -> None:
"""
)
res = _create_resptr_link_objects("res_A_15", test_ele)
assert res[0].object_id == "res_C_15"
assert res[0].target_id == "res_C_15"


def test_create_class_instance_resptr_link_several() -> None:
Expand All @@ -178,9 +178,9 @@ def test_create_class_instance_resptr_link_several() -> None:
)
res = _create_resptr_link_objects("res_D_13", test_ele)
assert all(isinstance(x, ResptrLink) for x in res)
assert res[0].object_id == "res_A_13"
assert res[1].object_id == "res_B_13"
assert res[2].object_id == "res_C_13"
assert res[0].target_id == "res_A_13"
assert res[1].target_id == "res_B_13"
assert res[2].target_id == "res_C_13"


def test_create_info_from_xml_for_graph_check_UUID_in_root() -> None:
Expand Down