dasch-swiss · Nora-Olivia-Ammann · Oct 20, 2023 · Oct 17, 2023 · Oct 17, 2023 · Oct 17, 2023
diff --git a/src/dsp_tools/analyse_xml_data/extract_links_from_XML.py b/src/dsp_tools/analyse_xml_data/extract_links_from_XML.py
@@ -6,62 +6,65 @@
 from lxml import etree
 from viztracer import VizTracer
 
-from dsp_tools.analyse_xml_data.models import ResptrLink, UploadResource, XMLLink
+from dsp_tools.analyse_xml_data.models import ResourceStashInfo, ResptrLink, XMLLink
 
 
-def _create_info_from_xml_for_graph(root: etree._Element) -> tuple[list[ResptrLink], list[XMLLink], set[str]]:
+def _create_info_from_xml_for_graph(
+    root: etree._Element,
+) -> tuple[etree._Element, list[ResptrLink], list[XMLLink], list[str]]:
     """Create instances of the classes ResptrLink and XMLLink from the root of the XML file."""
     resptr_instances = []
     xml_instances = []
-    all_resource_ids = set()
+    all_resource_ids = []
     for resource in root.iter(tag="{https://dasch.swiss/schema}resource"):
         resptr, xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(resource)
-        if resptr:
-            resptr_instances.extend(resptr)
-        if xml:
-            xml_instances.extend(xml)
-        if subject_id:
-            all_resource_ids.add(subject_id)
-    return resptr_instances, xml_instances, set(all_resource_ids)
+        all_resource_ids.append(subject_id)
+        resptr_instances.extend(resptr)
+        xml_instances.extend(xml)
+    return root, resptr_instances, xml_instances, all_resource_ids
 
 
 def _create_info_from_xml_for_graph_from_one_resource(
     resource: etree._Element,
 ) -> tuple[list[ResptrLink], list[XMLLink], str]:
     subject_id = resource.attrib["id"]
-    resptr_links, xml_links = _get_all_links_from_one_resource(resource)
-    resptr_link_objects = []
-    xml_link_objects = []
-    if resptr_links:
-        resptr_link_objects = [ResptrLink(subject_id, object_id) for object_id in resptr_links]
-    if xml_links:
-        xml_link_objects = [XMLLink(subject_id, x) for x in xml_links]
-    return resptr_link_objects, xml_link_objects, subject_id
-
-
-def _get_all_links_from_one_resource(resource: etree._Element) -> tuple[list[str], list[set[str]]]:
-    resptr_links: list[str] = []
-    xml_links: list[set[str]] = []
+    resptr_links, xml_links = _get_all_links_from_one_resource(subject_id, resource)
+    return resptr_links, xml_links, subject_id
+
+
+def _get_all_links_from_one_resource(
+    subject_id: str, resource: etree._Element
+) -> tuple[list[ResptrLink], list[XMLLink]]:
+    resptr_links: list[ResptrLink] = []
+    xml_links: list[XMLLink] = []
     for prop in resource.getchildren():
         match prop.tag:
             case "{https://dasch.swiss/schema}resptr-prop":
-                resptr_links.extend(_extract_ids_from_one_resptr_prop(prop))
+                resptr_links.extend(_create_class_instance_resptr_link(subject_id, prop))
             case "{https://dasch.swiss/schema}text-prop":
-                xml_links.extend(_extract_ids_from_text_prop(prop))
+                xml_links.extend(_create_class_instance_text_prop(subject_id, prop))
     return resptr_links, xml_links
 
 
-def _extract_ids_from_one_resptr_prop(resptr_prop: etree._Element) -> list[str]:
-    return [x.text for x in resptr_prop.getchildren() if x.text]
+def _create_class_instance_resptr_link(subject_id: str, resptr_prop: etree._Element) -> list[ResptrLink]:
+    resptr_links = []
+    for resptr in resptr_prop.getchildren():
+        if r_text := resptr.text:
+            instance = ResptrLink(subject_id, r_text)
+            resptr.attrib["stashUUID"] = instance.link_uuid
+            resptr_links.append(instance)
+    return resptr_links
 
 
-def _extract_ids_from_text_prop(text_prop: etree._Element) -> list[set[str]]:
+def _create_class_instance_text_prop(subject_id: str, text_prop: etree._Element) -> list[XMLLink]:
     # if the same ID is in several separate <text> values of one <text-prop>, they are considered separate links
     xml_props = []
     for text in text_prop.getchildren():
         links = _extract_ids_from_one_text_value(text)
         if links:
-            xml_props.append(links)
+            xml_link = XMLLink(subject_id, links)
+            xml_props.append(xml_link)
+            text.attrib["stashUUID"] = xml_link.link_uuid
     return xml_props
 
 
@@ -77,8 +80,13 @@ def _extract_ids_from_one_text_value(text: etree._Element) -> set[str]:
 
 
 def _make_graph(
-    resptr_instances: list[ResptrLink], xml_instances: list[XMLLink], all_resource_ids: set[str]
-) -> tuple[rx.PyDiGraph, dict[int, str]]:  # type: ignore[type-arg] # pylint: disable=no-member
+    resptr_instances: list[ResptrLink], xml_instances: list[XMLLink], all_resource_ids: list[str]
+) -> tuple[  # type: ignore[type-arg]
+    rx.PyDiGraph,  # pylint: disable=no-member
+    dict[int, str],
+    list[tuple[int, int, ResptrLink | XMLLink]],
+    set[int],
+]:
     """
     This function takes information about the resources (nodes) and links between them (edges).
     From that it constructs a rustworkx directed graph.
@@ -93,94 +101,128 @@ def _make_graph(
     """
     g: rx.PyDiGraph = rx.PyDiGraph()  # type: ignore[type-arg] # pylint: disable=no-member
     nodes = [(id_, None, None) for id_ in all_resource_ids]
-    node_ids = [x[0] for x in nodes]
-    node_inidices = g.add_nodes_from(nodes)
-    node_id_lookup = dict(zip(node_ids, node_inidices))
-    node_index_lookup = dict(zip(node_inidices, node_ids))
-    print(f"number of nodes: {len(nodes)}")
-    resptr_edges = [(node_id_lookup[x.subject_id], node_id_lookup[x.object_id], 1) for x in resptr_instances]
-    g.add_edges_from(resptr_edges)
-    print(f"number of resptr edges: {len(resptr_edges)}")
-    xml_edges = []
+    node_indices = g.add_nodes_from(nodes)
+    node_indices = list(node_indices)  # type: ignore[assignment]
+    node_id_lookup = dict(zip(all_resource_ids, node_indices))
+    node_index_lookup = dict(zip(node_indices, all_resource_ids))
+    edges: list[tuple[int, int, ResptrLink | XMLLink]] = [
+        (node_id_lookup[x.subject_id], node_id_lookup[x.object_id], x) for x in resptr_instances
+    ]
     for xml in xml_instances:
-        xml_edges.extend(
-            [(node_id_lookup[xml.subject_id], node_id_lookup[x], xml.cost_links) for x in xml.object_link_ids]
-        )
-    g.add_edges_from(xml_edges)
-    print(f"number of xml edges: {len(xml_edges)}")
-    return g, node_index_lookup
+        edges.extend([(node_id_lookup[xml.subject_id], node_id_lookup[x], xml) for x in xml.object_link_ids])
+    g.add_edges_from(edges)
+    return g, node_index_lookup, edges, set(node_indices)
 
 
 def _remove_leaf_nodes(
     g: rx.PyDiGraph,  # type: ignore[type-arg] # pylint: disable=no-member
     node_index_lookup: dict[int, str],
-) -> list[UploadResource]:
-    res: list[UploadResource] = []
-    while leaf_nodes := [x for x in g.node_indexes() if g.out_degree(x) == 0]:
-        print(f"number of leaf nodes removed: {len(leaf_nodes)}")
-        res.extend(UploadResource(node_index_lookup[n]) for n in leaf_nodes)
+    node_indices: set[int],
+) -> tuple[list[ResourceStashInfo], set[int]]:
+    res: list[ResourceStashInfo] = []
+    while leaf_nodes := [x for x in node_indices if g.out_degree(x) == 0]:
+        res.extend(ResourceStashInfo(node_index_lookup[n]) for n in leaf_nodes)
         g.remove_nodes_from(leaf_nodes)
-    return res
+        node_indices = node_indices - set(leaf_nodes)
+    return res, node_indices
 
 
-def _find_cheapest_node(
+def _find_cheapest_outgoing_links(
     g: rx.PyDiGraph,  # type: ignore[type-arg] # pylint: disable=no-member
-    cycle: rx.EdgeList,  # pylint: disable=no-member
-    node_index_lookup: dict[int, str],
-) -> tuple[int, list[str]]:
+    cycle: list[tuple[int, int]],
+    edge_list: list[tuple[int, int, XMLLink | ResptrLink]],
+) -> list[tuple[int, int, XMLLink | ResptrLink]]:
     costs = []
-    for source, _ in cycle:
+    for source, target in cycle:
         edges_in = g.in_edges(source)
         node_gain = len(edges_in)
         edges_out = g.out_edges(source)
-        node_cost = sum(x[2] for x in edges_out)
+        node_cost = sum(x[2].cost_links for x in edges_out)
         node_value = node_cost / node_gain
-        costs.append((source, node_value, edges_out))
-    sorted_nodes = sorted(costs, key=lambda x: x[1])
-    cheapest_node, _, edges_out = sorted_nodes[0]
-    print("cheapest", cheapest_node)
-    removed_target_ids: list[str] = [node_index_lookup[x[1]] for x in edges_out]
-    return cheapest_node, removed_target_ids
+        costs.append((source, target, node_value, edges_out))
+    cheapest_nodes = sorted(costs, key=lambda x: x[2])[0]
+    cheapest_links = [x for x in edge_list if x[0] == cheapest_nodes[0] and x[1] == cheapest_nodes[1]]
+    return cheapest_links
 
 
-def _generate_upload_order(
+def _remove_edges_get_removed_class_instances(
+    g: rx.PyDiGraph,  # type: ignore[type-arg] # pylint: disable=no-member,
+    edges_to_remove: list[tuple[int, int, XMLLink | ResptrLink]],
+    node_index_lookup: dict[int, str],
+    edge_list: list[tuple[int, int, XMLLink | ResptrLink]],
+    remaining_nodes: set[int],
+) -> ResourceStashInfo:
+    source, target = edges_to_remove[0][0], edges_to_remove[0][1]
+    links_to_stash = [x[2] for x in edges_to_remove]
+    # remove_edges_from() removes only one edge per (source, target) pair, so parallel edges need one entry each
+    to_remove_list = [(x[0], x[1]) for x in edges_to_remove]
+    phantom_links = []
+    for instance in links_to_stash:
+        if isinstance(instance, XMLLink):
+            phantom_links.extend(_find_remove_phantom_xml_edges(source, target, edge_list, instance, remaining_nodes))
+    to_remove_list.extend(phantom_links)
+    g.remove_edges_from(to_remove_list)
+    return ResourceStashInfo(node_index_lookup[source], links_to_stash)
+
+
+def _find_remove_phantom_xml_edges(
+    source: int,
+    target: int,
+    edge_list: list[tuple[int, int, XMLLink | ResptrLink]],
+    xml_instance: XMLLink,
+    remaining_nodes: set[int],
+) -> list[tuple[int, int]]:
+    def check(x: tuple[int, int, XMLLink | ResptrLink]) -> bool:
+        return x[0] == source and x[1] != target and x[2] == xml_instance and x[1] in remaining_nodes
+
+    return [(x[0], x[1]) for x in edge_list if check(x)]
+
+
+def generate_upload_order(
     g: rx.PyDiGraph,  # type: ignore[type-arg] # pylint: disable=no-member
     node_index_lookup: dict[int, str],
-) -> list[UploadResource]:
+    edge_list: list[tuple[int, int, XMLLink | ResptrLink]],
+    node_indices: set[int],
+) -> tuple[list[ResourceStashInfo], int]:
     """
     This function takes a graph and a dictionary with the mapping between the graph indices and original ids.
     It generates the order in which the resources should be uploaded to the DSP-API based on the dependencies.
 
     Args:
         g: graph
         node_index_lookup: reference between graph indices and original id
+        edge_list: list of edges in the graph as tuple (source_node, target_node, Class Instance)
+        node_indices: index numbers of the nodes still in the graph
 
     Returns:
         List of instances that contain the information of the resource id and its links.
+        The number of links in the stash.
     """
     removed_nodes = []
-    leaf_nodes = _remove_leaf_nodes(g, node_index_lookup)
+    leaf_nodes, node_indices = _remove_leaf_nodes(g, node_index_lookup, node_indices)
     removed_nodes.extend(leaf_nodes)
-    removed_from_cycle = 0
-    while g.num_nodes():
-        print(f"total number of nodes remaining: {g.num_nodes()}")
-        cycle = rx.digraph_find_cycle(g)  # type: ignore[attr-defined]  # pylint: disable=no-member
-        print("-" * 10)
-        print(f"cycle: {cycle}")
-        node = _find_cheapest_node(g, cycle, node_index_lookup)
-        source, targets = node
-        removed_nodes.append(UploadResource(g[source][0], targets))
-        g.remove_node(source)
-        removed_from_cycle += 1
-        print(f"removed link: {node}")
-        leaf_nodes = _remove_leaf_nodes(g, node_index_lookup)
+    stash_counter = 0
+    while node_indices:
+        cycle = list(rx.digraph_find_cycle(g))  # type: ignore[attr-defined]  # pylint: disable=no-member
+        links_to_remove = _find_cheapest_outgoing_links(g, cycle, edge_list)
+        stash_counter += len(links_to_remove)
+        removed_nodes.append(
+            _remove_edges_get_removed_class_instances(
+                g=g,
+                edges_to_remove=links_to_remove,
+                node_index_lookup=node_index_lookup,
+                edge_list=edge_list,
+                remaining_nodes=node_indices,
+            )
+        )
+        leaf_nodes, node_indices = _remove_leaf_nodes(g, node_index_lookup, node_indices)
         removed_nodes.extend(leaf_nodes)
-    print("=" * 80)
-    print(f"removed links total: {removed_from_cycle}")
-    return removed_nodes
+    return removed_nodes, stash_counter
 
 
-def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str) -> list[UploadResource]:
+def analyse_circles_in_data(
+    xml_filepath: Path, tracer_output_file: str, save_tracer: bool = False
+) -> list[ResourceStashInfo]:
     """
     This function takes an XML filepath
     It analyzes how many and which links have to be removed
@@ -189,6 +231,7 @@ def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str) -> list
     Args:
         xml_filepath: path to the file
         tracer_output_file: name of the file where the viztracer results should be saved
+        save_tracer: True if the output of the viztracer should be saved
 
     Returns:
         The order in which the resources should be uploaded.
@@ -197,32 +240,32 @@ def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str) -> list
     print("=" * 80)
     tracer = VizTracer(
         minimize_memory=True,
-        ignore_c_function=True,
-        ignore_frozen=True,
-        include_files=["extract_links_from_XML.py", "models.py"],
+        max_stack_depth=3,
     )
     tracer.start()
     tree = etree.parse(xml_filepath)
     root = tree.getroot()
-    resptr_instances, xml_instances, all_resource_ids = _create_info_from_xml_for_graph(root)
+    root, resptr_instances, xml_instances, all_resource_ids = _create_info_from_xml_for_graph(root)
     print(f"Total Number of Resources: {len(all_resource_ids)}")
     print(f"Total Number of resptr Links: {len(resptr_instances)}")
     print(f"Total Number of XML Texts with Links: {len(xml_instances)}")
     print("=" * 80)
-    g, node_index_lookup = _make_graph(resptr_instances, xml_instances, all_resource_ids)
-    print("=" * 80)
-    resource_upload_order = _generate_upload_order(g, node_index_lookup)
-    print("=" * 80)
+    g, node_index_lookup, edges, node_indices = _make_graph(resptr_instances, xml_instances, all_resource_ids)
+    resource_upload_order, stash_size = generate_upload_order(g, node_index_lookup, edges, node_indices)
+    print("Number of Links Stashed:", stash_size)
     tracer.stop()
-    tracer.save(output_file=tracer_output_file)
+    if save_tracer:
+        tracer.save(output_file=tracer_output_file)
     print("=" * 80)
     print("Start time:", start)
     print("End time:", datetime.now())
+    print("=" * 80)
     return resource_upload_order
 
 
 if __name__ == "__main__":
     analyse_circles_in_data(
         xml_filepath=Path("testdata/xml-data/circular-references/test_circular_references_1.xml"),
         tracer_output_file="circular_references_tracer.json",
+        save_tracer=False,
     )
diff --git a/src/dsp_tools/analyse_xml_data/models.py b/src/dsp_tools/analyse_xml_data/models.py
@@ -1,17 +1,24 @@
 from __future__ import annotations
 
+import uuid
 from dataclasses import dataclass, field
 
 
-@dataclass
+@dataclass(frozen=True)
 class ResptrLink:
     """This class represents a link between two resources."""
 
     subject_id: str
     object_id: str
+    link_uuid: str = field(default_factory=lambda: str(uuid.uuid4()))
 
+    @property
+    def cost_links(self) -> float:
+        """The cost of this outgoing link is consistently 1."""
+        return 1
 
-@dataclass
+
+@dataclass(frozen=True)
 class XMLLink:
     """
     This class represents a link between a resource and an XML text
@@ -20,6 +27,7 @@ class XMLLink:
 
     subject_id: str
     object_link_ids: set[str]
+    link_uuid: str = field(default_factory=lambda: str(uuid.uuid4()))
 
     @property
     def cost_links(self) -> float:
@@ -28,13 +36,13 @@ def cost_links(self) -> float:
 
 
 @dataclass
-class UploadResource:
+class ResourceStashInfo:
     """
     Holds information about a resource that can be uploaded to the DSP.
 
     May hold information about the links that need to be stashed from this resource before it can be uploaded.
-    A ordered list of UploadResources can be used to determine the order in which resources need to be uploaded.
+    An ordered list of ResourceStashInfo instances can be used to determine the upload order of the resources.
     """
 
     res_id: str
-    stash_links_to: list[str] = field(default_factory=list)
+    stash_links_to: list[XMLLink | ResptrLink] = field(default_factory=list)
diff --git a/src/dsp_tools/utils/xmlupload/create_upload_order_stash_circles.py b/src/dsp_tools/utils/xmlupload/create_upload_order_stash_circles.py