Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(xmlupload): optimize stash links (DEV-2847) #573

Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
4d4aa56
Update stash_models.py
Nora-Olivia-Ammann Oct 17, 2023
1dff378
Update Test Data
Nora-Olivia-Ammann Oct 17, 2023
3f97c2c
Add text string to model and update test
Nora-Olivia-Ammann Oct 17, 2023
1fd8286
update cost models
Nora-Olivia-Ammann Oct 17, 2023
381cb4f
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 17, 2023
3b027aa
Update models.py
Nora-Olivia-Ammann Oct 17, 2023
fd84ac9
Restructure Classes
Nora-Olivia-Ammann Oct 18, 2023
6bfc860
Merge branch 'main' into wip/dev-2833-apply-optimized-stash-identific…
Nora-Olivia-Ammann Oct 18, 2023
4ddf9bf
Create create_upload_order_stash_circles.py
Nora-Olivia-Ammann Oct 18, 2023
ca112ca
change node indices
Nora-Olivia-Ammann Oct 18, 2023
3050cf7
Merge branch 'wip/dev-2833-apply-optimized-stash-identification-in-ac…
Nora-Olivia-Ammann Oct 18, 2023
2b12b41
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 18, 2023
005dfe3
Merge branch 'main' into wip/dev-2833-apply-optimized-stash-identific…
Nora-Olivia-Ammann Oct 18, 2023
4f39e80
Merge branch 'wip/dev-2833-apply-optimized-stash-identification-in-ac…
Nora-Olivia-Ammann Oct 18, 2023
eb36f1e
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 18, 2023
8dabfeb
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 18, 2023
7787679
phantom link removal
Nora-Olivia-Ammann Oct 18, 2023
9e29a20
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 18, 2023
66def93
Merge branch 'main' into wip/dev-2833-apply-optimized-stash-identific…
Nora-Olivia-Ammann Oct 18, 2023
a68fe27
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
23eb6d1
fix test
Nora-Olivia-Ammann Oct 19, 2023
bc91de0
add test
Nora-Olivia-Ammann Oct 19, 2023
004d8e6
Merge branch 'main' into wip/dev-2833-apply-optimized-stash-identific…
Nora-Olivia-Ammann Oct 19, 2023
8079619
remove print statements
Nora-Olivia-Ammann Oct 19, 2023
631221c
Merge branch 'wip/dev-2833-apply-optimized-stash-identification-in-ac…
Nora-Olivia-Ammann Oct 19, 2023
ede4218
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
f5ce7d4
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
8c27ac7
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
011e385
remove resource stash info
Nora-Olivia-Ammann Oct 19, 2023
bf1ff70
comments update
Nora-Olivia-Ammann Oct 19, 2023
9104e66
Update test_extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
506f30a
Update test_extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
a87c554
remove root
Nora-Olivia-Ammann Oct 19, 2023
1f13d11
Update extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
fe7a875
Update models.py
Nora-Olivia-Ammann Oct 19, 2023
c71ef40
remove unnecessary function
Nora-Olivia-Ammann Oct 19, 2023
0f99f61
Update test_extract_links_from_XML.py
Nora-Olivia-Ammann Oct 19, 2023
6c9105b
renaming files
Nora-Olivia-Ammann Oct 20, 2023
bf3d577
docstrings
Nora-Olivia-Ammann Oct 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
229 changes: 136 additions & 93 deletions src/dsp_tools/analyse_xml_data/extract_links_from_XML.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,62 +6,65 @@
from lxml import etree
from viztracer import VizTracer

from dsp_tools.analyse_xml_data.models import ResptrLink, UploadResource, XMLLink
from dsp_tools.analyse_xml_data.models import ResourceStashInfo, ResptrLink, XMLLink


def _create_info_from_xml_for_graph(root: etree._Element) -> tuple[list[ResptrLink], list[XMLLink], set[str]]:
def _create_info_from_xml_for_graph(
root: etree._Element,
) -> tuple[etree._Element, list[ResptrLink], list[XMLLink], list[str]]:
"""Create instances of the classes ResptrLink and XMLLink from the root of the XML file."""
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
resptr_instances = []
xml_instances = []
all_resource_ids = set()
all_resource_ids = []
for resource in root.iter(tag="{https://dasch.swiss/schema}resource"):
resptr, xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(resource)
if resptr:
resptr_instances.extend(resptr)
if xml:
xml_instances.extend(xml)
if subject_id:
all_resource_ids.add(subject_id)
return resptr_instances, xml_instances, set(all_resource_ids)
all_resource_ids.append(subject_id)
resptr_instances.extend(resptr)
xml_instances.extend(xml)
return root, resptr_instances, xml_instances, all_resource_ids
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved


def _create_info_from_xml_for_graph_from_one_resource(
resource: etree._Element,
) -> tuple[list[ResptrLink], list[XMLLink], str]:
subject_id = resource.attrib["id"]
resptr_links, xml_links = _get_all_links_from_one_resource(resource)
resptr_link_objects = []
xml_link_objects = []
if resptr_links:
resptr_link_objects = [ResptrLink(subject_id, object_id) for object_id in resptr_links]
if xml_links:
xml_link_objects = [XMLLink(subject_id, x) for x in xml_links]
return resptr_link_objects, xml_link_objects, subject_id


def _get_all_links_from_one_resource(resource: etree._Element) -> tuple[list[str], list[set[str]]]:
resptr_links: list[str] = []
xml_links: list[set[str]] = []
resptr_links, xml_links = _get_all_links_from_one_resource(subject_id, resource)
return resptr_links, xml_links, subject_id
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved


def _get_all_links_from_one_resource(
subject_id: str, resource: etree._Element
) -> tuple[list[ResptrLink], list[XMLLink]]:
resptr_links: list[ResptrLink] = []
xml_links: list[XMLLink] = []
for prop in resource.getchildren():
match prop.tag:
case "{https://dasch.swiss/schema}resptr-prop":
resptr_links.extend(_extract_ids_from_one_resptr_prop(prop))
resptr_links.extend(_create_class_instance_resptr_link(subject_id, prop))
case "{https://dasch.swiss/schema}text-prop":
xml_links.extend(_extract_ids_from_text_prop(prop))
xml_links.extend(_create_class_instance_text_prop(subject_id, prop))
return resptr_links, xml_links


def _extract_ids_from_one_resptr_prop(resptr_prop: etree._Element) -> list[str]:
return [x.text for x in resptr_prop.getchildren() if x.text]
def _create_class_instance_resptr_link(subject_id: str, resptr_prop: etree._Element) -> list[ResptrLink]:
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
resptr_links = []
for resptr in resptr_prop.getchildren():
if r_text := resptr.text:
instance = ResptrLink(subject_id, r_text)
resptr.attrib["stashUUID"] = instance.link_uuid
resptr_links.append(instance)
return resptr_links


def _extract_ids_from_text_prop(text_prop: etree._Element) -> list[set[str]]:
def _create_class_instance_text_prop(subject_id: str, text_prop: etree._Element) -> list[XMLLink]:
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
# if the same ID is in several separate <text> values of one <text-prop>, they are considered separate links
xml_props = []
for text in text_prop.getchildren():
links = _extract_ids_from_one_text_value(text)
if links:
xml_props.append(links)
xml_link = XMLLink(subject_id, links)
xml_props.append(xml_link)
text.attrib["stashUUID"] = xml_link.link_uuid
return xml_props


Expand All @@ -77,8 +80,13 @@ def _extract_ids_from_one_text_value(text: etree._Element) -> set[str]:


def _make_graph(
resptr_instances: list[ResptrLink], xml_instances: list[XMLLink], all_resource_ids: set[str]
) -> tuple[rx.PyDiGraph, dict[int, str]]: # type: ignore[type-arg] # pylint: disable=no-member
resptr_instances: list[ResptrLink], xml_instances: list[XMLLink], all_resource_ids: list[str]
) -> tuple[ # type: ignore[type-arg]
rx.PyDiGraph, # pylint: disable=no-member
dict[int, str],
list[tuple[int, int, ResptrLink | XMLLink]],
set[int],
]:
"""
This function takes information about the resources (nodes) and links between them (edges).
From that it constructs a rustworkx directed graph.
Expand All @@ -93,94 +101,128 @@ def _make_graph(
"""
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
g: rx.PyDiGraph = rx.PyDiGraph() # type: ignore[type-arg] # pylint: disable=no-member
nodes = [(id_, None, None) for id_ in all_resource_ids]
node_ids = [x[0] for x in nodes]
node_inidices = g.add_nodes_from(nodes)
node_id_lookup = dict(zip(node_ids, node_inidices))
node_index_lookup = dict(zip(node_inidices, node_ids))
print(f"number of nodes: {len(nodes)}")
resptr_edges = [(node_id_lookup[x.subject_id], node_id_lookup[x.object_id], 1) for x in resptr_instances]
g.add_edges_from(resptr_edges)
print(f"number of resptr edges: {len(resptr_edges)}")
xml_edges = []
node_indices = g.add_nodes_from(nodes)
node_indices = list(node_indices) # type: ignore[assignment]
node_id_lookup = dict(zip(all_resource_ids, node_indices))
node_index_lookup = dict(zip(node_indices, all_resource_ids))
edges: list[tuple[int, int, ResptrLink | XMLLink]] = [
(node_id_lookup[x.subject_id], node_id_lookup[x.object_id], x) for x in resptr_instances
]
for xml in xml_instances:
xml_edges.extend(
[(node_id_lookup[xml.subject_id], node_id_lookup[x], xml.cost_links) for x in xml.object_link_ids]
)
g.add_edges_from(xml_edges)
print(f"number of xml edges: {len(xml_edges)}")
return g, node_index_lookup
edges.extend([(node_id_lookup[xml.subject_id], node_id_lookup[x], xml) for x in xml.object_link_ids])
g.add_edges_from(edges)
return g, node_index_lookup, edges, set(node_indices)
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved


def _remove_leaf_nodes(
g: rx.PyDiGraph, # type: ignore[type-arg] # pylint: disable=no-member
node_index_lookup: dict[int, str],
) -> list[UploadResource]:
res: list[UploadResource] = []
while leaf_nodes := [x for x in g.node_indexes() if g.out_degree(x) == 0]:
print(f"number of leaf nodes removed: {len(leaf_nodes)}")
res.extend(UploadResource(node_index_lookup[n]) for n in leaf_nodes)
node_indices: set[int],
) -> tuple[list[ResourceStashInfo], set[int]]:
res: list[ResourceStashInfo] = []
while leaf_nodes := [x for x in node_indices if g.out_degree(x) == 0]:
res.extend(ResourceStashInfo(node_index_lookup[n]) for n in leaf_nodes)
g.remove_nodes_from(leaf_nodes)
return res
node_indices = node_indices - set(leaf_nodes)
return res, node_indices


def _find_cheapest_node(
def _find_cheapest_outgoing_links(
g: rx.PyDiGraph, # type: ignore[type-arg] # pylint: disable=no-member
cycle: rx.EdgeList, # pylint: disable=no-member
node_index_lookup: dict[int, str],
) -> tuple[int, list[str]]:
cycle: list[tuple[int, int]],
edge_list: list[tuple[int, int, XMLLink | ResptrLink]],
) -> list[tuple[int, int, XMLLink | ResptrLink]]:
costs = []
for source, _ in cycle:
for source, target in cycle:
edges_in = g.in_edges(source)
node_gain = len(edges_in)
edges_out = g.out_edges(source)
node_cost = sum(x[2] for x in edges_out)
node_cost = sum(x[2].cost_links for x in edges_out)
node_value = node_cost / node_gain
costs.append((source, node_value, edges_out))
sorted_nodes = sorted(costs, key=lambda x: x[1])
cheapest_node, _, edges_out = sorted_nodes[0]
print("cheapest", cheapest_node)
removed_target_ids: list[str] = [node_index_lookup[x[1]] for x in edges_out]
return cheapest_node, removed_target_ids
costs.append((source, target, node_value, edges_out))
cheapest_nodes = sorted(costs, key=lambda x: x[2])[0]
cheapest_links = [x for x in edge_list if x[0] == cheapest_nodes[0] and x[1] == cheapest_nodes[1]]
return cheapest_links


def _generate_upload_order(
def _remove_edges_get_removed_class_instances(
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
g: rx.PyDiGraph, # type: ignore[type-arg] # pylint: disable=no-member,
edges_to_remove: list[tuple[int, int, XMLLink | ResptrLink]],
node_index_lookup: dict[int, str],
edge_list: list[tuple[int, int, XMLLink | ResptrLink]],
remaining_nodes: set[int],
) -> ResourceStashInfo:
source, target = edges_to_remove[0][0], edges_to_remove[0][1]
links_to_stash = [x[2] for x in edges_to_remove]
# remove_edges_from() removes only one edge per (source, target) pair, so every parallel edge needs its own entry
to_remove_list = [(x[0], x[1]) for x in edges_to_remove]
phantom_links = []
for instance in links_to_stash:
if isinstance(instance, XMLLink):
phantom_links.extend(_find_remove_phantom_xml_edges(source, target, edge_list, instance, remaining_nodes))
to_remove_list.extend(phantom_links)
g.remove_edges_from(to_remove_list)
return ResourceStashInfo(node_index_lookup[source], links_to_stash)


def _find_remove_phantom_xml_edges(
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
source: int,
target: int,
edge_list: list[tuple[int, int, XMLLink | ResptrLink]],
xml_instance: XMLLink,
remaining_nodes: set[int],
) -> list[tuple[int, int]]:
def check(x: tuple[int, int, XMLLink | ResptrLink]) -> bool:
return x[0] == source and x[1] != target and x[2] == xml_instance and x[1] in remaining_nodes

return [(x[0], x[1]) for x in edge_list if check(x)]


def generate_upload_order(
g: rx.PyDiGraph, # type: ignore[type-arg] # pylint: disable=no-member
node_index_lookup: dict[int, str],
) -> list[UploadResource]:
edge_list: list[tuple[int, int, XMLLink | ResptrLink]],
node_indices: set[int],
) -> tuple[list[ResourceStashInfo], int]:
"""
This function takes a graph and a dictionary with the mapping between the graph indices and original ids.
It generates the order in which the resources should be uploaded to the DSP-API based on the dependencies.

Args:
g: graph
node_index_lookup: reference between graph indices and original id
edge_list: list of edges in the graph as tuple (source_node, target_node, Class Instance)
node_indices: index numbers of the nodes still in the graph

Returns:
List of instances that contain the information of the resource id and its links.
The number of links in the stash.
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
"""
removed_nodes = []
leaf_nodes = _remove_leaf_nodes(g, node_index_lookup)
leaf_nodes, node_indices = _remove_leaf_nodes(g, node_index_lookup, node_indices)
removed_nodes.extend(leaf_nodes)
removed_from_cycle = 0
while g.num_nodes():
print(f"total number of nodes remaining: {g.num_nodes()}")
cycle = rx.digraph_find_cycle(g) # type: ignore[attr-defined] # pylint: disable=no-member
print("-" * 10)
print(f"cycle: {cycle}")
node = _find_cheapest_node(g, cycle, node_index_lookup)
source, targets = node
removed_nodes.append(UploadResource(g[source][0], targets))
g.remove_node(source)
removed_from_cycle += 1
print(f"removed link: {node}")
leaf_nodes = _remove_leaf_nodes(g, node_index_lookup)
stash_counter = 0
while node_indices:
cycle = list(rx.digraph_find_cycle(g)) # type: ignore[attr-defined] # pylint: disable=no-member
links_to_remove = _find_cheapest_outgoing_links(g, cycle, edge_list)
stash_counter += len(links_to_remove)
removed_nodes.append(
_remove_edges_get_removed_class_instances(
g=g,
edges_to_remove=links_to_remove,
node_index_lookup=node_index_lookup,
edge_list=edge_list,
remaining_nodes=node_indices,
)
)
leaf_nodes, node_indices = _remove_leaf_nodes(g, node_index_lookup, node_indices)
removed_nodes.extend(leaf_nodes)
print("=" * 80)
print(f"removed links total: {removed_from_cycle}")
return removed_nodes
return removed_nodes, stash_counter


def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str) -> list[UploadResource]:
def analyse_circles_in_data(
xml_filepath: Path, tracer_output_file: str, save_tracer: bool = False
) -> list[ResourceStashInfo]:
"""
This function takes an XML filepath.
It analyzes how many and which links have to be removed
Expand All @@ -189,6 +231,7 @@ def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str) -> list
Args:
xml_filepath: path to the file
tracer_output_file: name of the file where the viztracer results should be saved
save_tracer: True if the output of the viztracer should be saved

Returns:
The order in which the resources should be uploaded.
Expand All @@ -197,32 +240,32 @@ def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str) -> list
print("=" * 80)
tracer = VizTracer(
minimize_memory=True,
ignore_c_function=True,
ignore_frozen=True,
include_files=["extract_links_from_XML.py", "models.py"],
max_stack_depth=3,
)
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
tracer.start()
tree = etree.parse(xml_filepath)
root = tree.getroot()
resptr_instances, xml_instances, all_resource_ids = _create_info_from_xml_for_graph(root)
root, resptr_instances, xml_instances, all_resource_ids = _create_info_from_xml_for_graph(root)
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
print(f"Total Number of Resources: {len(all_resource_ids)}")
print(f"Total Number of resptr Links: {len(resptr_instances)}")
print(f"Total Number of XML Texts with Links: {len(xml_instances)}")
print("=" * 80)
g, node_index_lookup = _make_graph(resptr_instances, xml_instances, all_resource_ids)
print("=" * 80)
resource_upload_order = _generate_upload_order(g, node_index_lookup)
print("=" * 80)
g, node_index_lookup, edges, node_indices = _make_graph(resptr_instances, xml_instances, all_resource_ids)
resource_upload_order, stash_size = generate_upload_order(g, node_index_lookup, edges, node_indices)
print("Number of Links Stashed:", stash_size)
tracer.stop()
tracer.save(output_file=tracer_output_file)
if save_tracer:
tracer.save(output_file=tracer_output_file)
print("=" * 80)
print("Start time:", start)
print("End time:", datetime.now())
print("=" * 80)
return resource_upload_order


if __name__ == "__main__":
analyse_circles_in_data(
xml_filepath=Path("testdata/xml-data/circular-references/test_circular_references_1.xml"),
tracer_output_file="circular_references_tracer.json",
save_tracer=False,
)
18 changes: 13 additions & 5 deletions src/dsp_tools/analyse_xml_data/models.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
from __future__ import annotations

import uuid
from dataclasses import dataclass, field


@dataclass
@dataclass(frozen=True)
class ResptrLink:
"""This class represents a link between two resources."""

subject_id: str
object_id: str
link_uuid: str = field(default_factory=lambda: str(uuid.uuid4()))
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved

@property
def cost_links(self) -> float:
"""The cost of this outgoing link is consistently 1"""
return 1

@dataclass

@dataclass(frozen=True)
class XMLLink:
"""
This class represents a link between a resource and an XML text
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -20,6 +27,7 @@ class XMLLink:

subject_id: str
object_link_ids: set[str]
link_uuid: str = field(default_factory=lambda: str(uuid.uuid4()))
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved

@property
def cost_links(self) -> float:
Expand All @@ -28,13 +36,13 @@ def cost_links(self) -> float:


@dataclass
class UploadResource:
class ResourceStashInfo:
"""
Holds information about a resource that can be uploaded to the DSP.

May hold information about the links that need to be stashed from this resource before it can be uploaded.
A ordered list of UploadResources can be used to determine the order in which resources need to be uploaded.
An ordered list of ResourceStashInfo instances can be used to determine the order in which resources need to be uploaded.
"""

res_id: str
stash_links_to: list[str] = field(default_factory=list)
stash_links_to: list[XMLLink | ResptrLink] = field(default_factory=list)
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved
Empty file.