dasch-swiss · jnussbaum · Oct 26, 2023 · Oct 25, 2023 · Oct 25, 2023 · Oct 25, 2023
diff --git a/src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py b/src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py
@@ -29,26 +29,25 @@ def create_info_from_xml_for_graph(
     xml_links = []
     all_resource_ids = []
     for resource in root.iter(tag="{https://dasch.swiss/schema}resource"):
-        resptr, xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(resource)
-        all_resource_ids.append(subject_id)
+        resptr, xml = _create_info_from_xml_for_graph_from_one_resource(resource)
+        all_resource_ids.append(resource.attrib["id"])
         resptr_links.extend(resptr)
         xml_links.extend(xml)
     return resptr_links, xml_links, all_resource_ids
 
 
 def _create_info_from_xml_for_graph_from_one_resource(
     resource: etree._Element,
-) -> tuple[list[ResptrLink], list[XMLLink], str]:
-    subject_id = resource.attrib["id"]
+) -> tuple[list[ResptrLink], list[XMLLink]]:
     resptr_links: list[ResptrLink] = []
     xml_links: list[XMLLink] = []
     for prop in resource.getchildren():
         match prop.tag:
             case "{https://dasch.swiss/schema}resptr-prop":
-                resptr_links.extend(_create_resptr_link_objects(subject_id, prop))
+                resptr_links.extend(_create_resptr_link_objects(resource.attrib["id"], prop))
             case "{https://dasch.swiss/schema}text-prop":
-                xml_links.extend(_create_text_link_objects(subject_id, prop))
-    return resptr_links, xml_links, subject_id
+                xml_links.extend(_create_text_link_objects(resource.attrib["id"], prop))
+    return resptr_links, xml_links
 
 
 def _create_resptr_link_objects(subject_id: str, resptr_prop: etree._Element) -> list[ResptrLink]:

diff --git a/src/dsp_tools/models/xmlresource.py b/src/dsp_tools/models/xmlresource.py
@@ -42,8 +42,8 @@ def __init__(self, node: etree._Element, default_ontology: str) -> None:
         Constructor that parses a resource node from the XML DOM
 
         Args:
-            node: The DOM node to be processed representing a resource (which is a child of the DSP element)
-            default_ontology: The default ontology (given in the attribute default-ontology of the DSP element)
+            node: The DOM node to be processed representing a resource (which is a child of the <knora> element)
+            default_ontology: The default ontology (given in the attribute default-ontology of the <knora> element)
 
         Returns:
             None

diff --git a/src/dsp_tools/utils/xmlupload/create_upload_order_stash_circles.py b/src/dsp_tools/utils/xmlupload/create_upload_order_stash_circles.py
diff --git a/test/unittests/test_analyse_xml_data/test_construct_and_analyze_graph.py b/test/unittests/test_analyse_xml_data/test_construct_and_analyze_graph.py
@@ -40,11 +40,10 @@ def test_create_info_from_xml_for_graph_from_one_resource() -> None:
             </text-prop>
         </resource>"""
     )
-    res_resptr_links, res_xml_links, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
+    res_resptr_links, res_xml_links = _create_info_from_xml_for_graph_from_one_resource(test_ele)
     res_B_19 = [obj.target_id for obj in res_resptr_links]
     assert "res_B_19" in res_B_19
     assert "res_C_19" in res_B_19
-    assert "res_A_19" == subject_id
     assert res_xml_links[0].source_id == "res_A_19"
     assert res_xml_links[0].target_ids == {"res_B_19", "res_C_19"}
 
@@ -65,8 +64,7 @@ def test_create_info_from_xml_for_graph_from_one_resource_one() -> None:
         </resource>
         """
     )
-    res_resptr, res_xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
-    assert subject_id == "res_A_11"
+    res_resptr, res_xml = _create_info_from_xml_for_graph_from_one_resource(test_ele)
     assert res_resptr[0].target_id == "res_B_11"
     assert isinstance(res_resptr[0], ResptrLink)
     assert res_xml[0].target_ids == {"res_B_11"}
@@ -77,8 +75,7 @@ def test_create_info_from_xml_for_graph_from_one_resource_no_links() -> None:
     test_ele = etree.fromstring(
         '<resource label="res_B_18" restype=":TestThing" id="res_B_18" permissions="res-default"/>'
     )
-    res_resptr, res_xml, sub_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
-    assert sub_id == "res_B_18"
+    res_resptr, res_xml = _create_info_from_xml_for_graph_from_one_resource(test_ele)
     assert (res_resptr, res_xml) == ([], [])
 
 
@@ -98,8 +95,7 @@ def test_text_only_create_info_from_xml_for_graph_from_one_resource() -> None:
         </resource>
         """
     )
-    res_resptr, res_xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
-    assert subject_id == "res_C_18"
+    res_resptr, res_xml = _create_info_from_xml_for_graph_from_one_resource(test_ele)
     assert not res_resptr
     res_xml_ids = [x.target_ids for x in res_xml]
     assert unordered(res_xml_ids) == [{"res_A_18"}, {"res_B_18"}]

diff --git a/testdata/xml-data/circular-references/analyse_circles_in_xml.py b/testdata/xml-data/circular-references/analyse_circles_in_xml.py
@@ -1,14 +1,14 @@
 from datetime import datetime
 from pathlib import Path
 
-from lxml import etree
 from viztracer import VizTracer
 
 from dsp_tools.analyse_xml_data.construct_and_analyze_graph import (
     create_info_from_xml_for_graph,
     generate_upload_order,
     make_graph,
 )
+from dsp_tools.utils.xml_utils import parse_and_clean_xml_file
 
 
 def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str, save_tracer: bool = False) -> None:
@@ -30,19 +30,18 @@ def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str, save_tr
         max_stack_depth=3,
     )
     tracer.start()
-    tree = etree.parse(xml_filepath)
-    root = tree.getroot()
+    root = parse_and_clean_xml_file(xml_filepath)
     resptr_links, xml_links, all_resource_ids = create_info_from_xml_for_graph(root)
     print(f"Total Number of Resources: {len(all_resource_ids)}")
     print(f"Total Number of resptr Links: {len(resptr_links)}")
     print(f"Total Number of XML Texts with Links: {len(xml_links)}")
     print("=" * 80)
     graph, node_to_id, edges = make_graph(resptr_links, xml_links, all_resource_ids)
     _, _, stash_counter = generate_upload_order(graph, node_to_id, edges)
-    print("Number of Links Stashed:", stash_counter)
     tracer.stop()
     if save_tracer:
         tracer.save(output_file=tracer_output_file)
+    print("Number of Links Stashed:", stash_counter)
     print("=" * 80)
     print("Start time:", start)
     print("End time:", datetime.now())