From f2d4488a147ed5b9109cf06b2dca7d8232a696f8 Mon Sep 17 00:00:00 2001 From: Johannes Nussbaum <39048939+jnussbaum@users.noreply.github.com> Date: Thu, 26 Oct 2023 09:42:49 +0200 Subject: [PATCH] chore: tidy up graph analyzing (#595) --- .../analyse_xml_data/construct_and_analyze_graph.py | 13 ++++++------- src/dsp_tools/models/xmlresource.py | 4 ++-- .../xmlupload/create_upload_order_stash_circles.py | 0 .../test_construct_and_analyze_graph.py | 12 ++++-------- .../circular-references/analyse_circles_in_xml.py | 7 +++---- 5 files changed, 15 insertions(+), 21 deletions(-) delete mode 100644 src/dsp_tools/utils/xmlupload/create_upload_order_stash_circles.py diff --git a/src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py b/src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py index 821874e6e..11cbcacae 100644 --- a/src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py +++ b/src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py @@ -29,8 +29,8 @@ def create_info_from_xml_for_graph( xml_links = [] all_resource_ids = [] for resource in root.iter(tag="{https://dasch.swiss/schema}resource"): - resptr, xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(resource) - all_resource_ids.append(subject_id) + resptr, xml = _create_info_from_xml_for_graph_from_one_resource(resource) + all_resource_ids.append(resource.attrib["id"]) resptr_links.extend(resptr) xml_links.extend(xml) return resptr_links, xml_links, all_resource_ids @@ -38,17 +38,16 @@ def create_info_from_xml_for_graph( def _create_info_from_xml_for_graph_from_one_resource( resource: etree._Element, -) -> tuple[list[ResptrLink], list[XMLLink], str]: - subject_id = resource.attrib["id"] +) -> tuple[list[ResptrLink], list[XMLLink]]: resptr_links: list[ResptrLink] = [] xml_links: list[XMLLink] = [] for prop in resource.getchildren(): match prop.tag: case "{https://dasch.swiss/schema}resptr-prop": - resptr_links.extend(_create_resptr_link_objects(subject_id, prop)) + resptr_links.extend(_create_resptr_link_objects(resource.attrib["id"], prop)) case "{https://dasch.swiss/schema}text-prop": - xml_links.extend(_create_text_link_objects(subject_id, prop)) - return resptr_links, xml_links, subject_id + xml_links.extend(_create_text_link_objects(resource.attrib["id"], prop)) + return resptr_links, xml_links def _create_resptr_link_objects(subject_id: str, resptr_prop: etree._Element) -> list[ResptrLink]: diff --git a/src/dsp_tools/models/xmlresource.py b/src/dsp_tools/models/xmlresource.py index 314a50d3d..6582e9900 100644 --- a/src/dsp_tools/models/xmlresource.py +++ b/src/dsp_tools/models/xmlresource.py @@ -42,8 +42,8 @@ def __init__(self, node: etree._Element, default_ontology: str) -> None: Constructor that parses a resource node from the XML DOM Args: - node: The DOM node to be processed representing a resource (which is a child of the DSP element) - default_ontology: The default ontology (given in the attribute default-ontology of the DSP element) + node: The DOM node to be processed representing a resource (which is a child of the element) + default_ontology: The default ontology (given in the attribute default-ontology of the element) Returns: None diff --git a/src/dsp_tools/utils/xmlupload/create_upload_order_stash_circles.py b/src/dsp_tools/utils/xmlupload/create_upload_order_stash_circles.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/test/unittests/test_analyse_xml_data/test_construct_and_analyze_graph.py b/test/unittests/test_analyse_xml_data/test_construct_and_analyze_graph.py index d81ec930d..e3645dada 100644 --- a/test/unittests/test_analyse_xml_data/test_construct_and_analyze_graph.py +++ b/test/unittests/test_analyse_xml_data/test_construct_and_analyze_graph.py @@ -40,11 +40,10 @@ def test_create_info_from_xml_for_graph_from_one_resource() -> None: """ ) - res_resptr_links, res_xml_links, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele) + res_resptr_links, res_xml_links = _create_info_from_xml_for_graph_from_one_resource(test_ele) res_B_19 = [obj.target_id for obj in res_resptr_links] assert "res_B_19" in res_B_19 assert "res_C_19" in res_B_19 - assert "res_A_19" == subject_id assert res_xml_links[0].source_id == "res_A_19" assert res_xml_links[0].target_ids == {"res_B_19", "res_C_19"} @@ -65,8 +64,7 @@ def test_create_info_from_xml_for_graph_from_one_resource_one() -> None: """ ) - res_resptr, res_xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele) - assert subject_id == "res_A_11" + res_resptr, res_xml = _create_info_from_xml_for_graph_from_one_resource(test_ele) assert res_resptr[0].target_id == "res_B_11" assert isinstance(res_resptr[0], ResptrLink) assert res_xml[0].target_ids == {"res_B_11"} @@ -77,8 +75,7 @@ def test_create_info_from_xml_for_graph_from_one_resource_no_links() -> None: test_ele = etree.fromstring( '' ) - res_resptr, res_xml, sub_id = _create_info_from_xml_for_graph_from_one_resource(test_ele) - assert sub_id == "res_B_18" + res_resptr, res_xml = _create_info_from_xml_for_graph_from_one_resource(test_ele) assert (res_resptr, res_xml) == ([], []) @@ -98,8 +95,7 @@ def test_text_only_create_info_from_xml_for_graph_from_one_resource() -> None: """ ) - res_resptr, res_xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele) - assert subject_id == "res_C_18" + res_resptr, res_xml = _create_info_from_xml_for_graph_from_one_resource(test_ele) assert not res_resptr res_xml_ids = [x.target_ids for x in res_xml] assert unordered(res_xml_ids) == [{"res_A_18"}, {"res_B_18"}] diff --git a/testdata/xml-data/circular-references/analyse_circles_in_xml.py b/testdata/xml-data/circular-references/analyse_circles_in_xml.py index 6399a9b9a..8b4446b07 100644 --- a/testdata/xml-data/circular-references/analyse_circles_in_xml.py +++ b/testdata/xml-data/circular-references/analyse_circles_in_xml.py @@ -1,7 +1,6 @@ from datetime import datetime from pathlib import Path -from lxml import etree from viztracer import VizTracer from dsp_tools.analyse_xml_data.construct_and_analyze_graph import ( @@ -9,6 +8,7 @@ generate_upload_order, make_graph, ) +from dsp_tools.utils.xml_utils import parse_and_clean_xml_file def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str, save_tracer: bool = False) -> None: @@ -30,8 +30,7 @@ def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str, save_tr max_stack_depth=3, ) tracer.start() - tree = etree.parse(xml_filepath) - root = tree.getroot() + root = parse_and_clean_xml_file(xml_filepath) resptr_links, xml_links, all_resource_ids = create_info_from_xml_for_graph(root) print(f"Total Number of Resources: {len(all_resource_ids)}") print(f"Total Number of resptr Links: {len(resptr_links)}") @@ -39,10 +38,10 @@ def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str, save_tr print("=" * 80) graph, node_to_id, edges = make_graph(resptr_links, xml_links, all_resource_ids) _, _, stash_counter = generate_upload_order(graph, node_to_id, edges) - print("Number of Links Stashed:", stash_counter) tracer.stop() if save_tracer: tracer.save(output_file=tracer_output_file) + print("Number of Links Stashed:", stash_counter) print("=" * 80) print("Start time:", start) print("End time:", datetime.now())