Skip to content

Commit

Permalink
chore: tidy up graph analyzing (#595)
Browse files Browse the repository at this point in the history
  • Loading branch information
jnussbaum committed Oct 26, 2023
1 parent d0779ac commit f2d4488
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 21 deletions.
13 changes: 6 additions & 7 deletions src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py
Expand Up @@ -29,26 +29,25 @@ def create_info_from_xml_for_graph(
xml_links = []
all_resource_ids = []
for resource in root.iter(tag="{https://dasch.swiss/schema}resource"):
resptr, xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(resource)
all_resource_ids.append(subject_id)
resptr, xml = _create_info_from_xml_for_graph_from_one_resource(resource)
all_resource_ids.append(resource.attrib["id"])
resptr_links.extend(resptr)
xml_links.extend(xml)
return resptr_links, xml_links, all_resource_ids


def _create_info_from_xml_for_graph_from_one_resource(
resource: etree._Element,
) -> tuple[list[ResptrLink], list[XMLLink], str]:
subject_id = resource.attrib["id"]
) -> tuple[list[ResptrLink], list[XMLLink]]:
resptr_links: list[ResptrLink] = []
xml_links: list[XMLLink] = []
for prop in resource.getchildren():
match prop.tag:
case "{https://dasch.swiss/schema}resptr-prop":
resptr_links.extend(_create_resptr_link_objects(subject_id, prop))
resptr_links.extend(_create_resptr_link_objects(resource.attrib["id"], prop))
case "{https://dasch.swiss/schema}text-prop":
xml_links.extend(_create_text_link_objects(subject_id, prop))
return resptr_links, xml_links, subject_id
xml_links.extend(_create_text_link_objects(resource.attrib["id"], prop))
return resptr_links, xml_links


def _create_resptr_link_objects(subject_id: str, resptr_prop: etree._Element) -> list[ResptrLink]:
Expand Down
4 changes: 2 additions & 2 deletions src/dsp_tools/models/xmlresource.py
Expand Up @@ -42,8 +42,8 @@ def __init__(self, node: etree._Element, default_ontology: str) -> None:
Constructor that parses a resource node from the XML DOM
Args:
node: The DOM node to be processed representing a resource (which is a child of the DSP element)
default_ontology: The default ontology (given in the attribute default-ontology of the DSP element)
node: The DOM node to be processed representing a resource (which is a child of the <knora> element)
default_ontology: The default ontology (given in the attribute default-ontology of the <knora> element)
Returns:
None
Expand Down
Empty file.
Expand Up @@ -40,11 +40,10 @@ def test_create_info_from_xml_for_graph_from_one_resource() -> None:
</text-prop>
</resource>"""
)
res_resptr_links, res_xml_links, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
res_resptr_links, res_xml_links = _create_info_from_xml_for_graph_from_one_resource(test_ele)
res_B_19 = [obj.target_id for obj in res_resptr_links]
assert "res_B_19" in res_B_19
assert "res_C_19" in res_B_19
assert "res_A_19" == subject_id
assert res_xml_links[0].source_id == "res_A_19"
assert res_xml_links[0].target_ids == {"res_B_19", "res_C_19"}

Expand All @@ -65,8 +64,7 @@ def test_create_info_from_xml_for_graph_from_one_resource_one() -> None:
</resource>
"""
)
res_resptr, res_xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
assert subject_id == "res_A_11"
res_resptr, res_xml = _create_info_from_xml_for_graph_from_one_resource(test_ele)
assert res_resptr[0].target_id == "res_B_11"
assert isinstance(res_resptr[0], ResptrLink)
assert res_xml[0].target_ids == {"res_B_11"}
Expand All @@ -77,8 +75,7 @@ def test_create_info_from_xml_for_graph_from_one_resource_no_links() -> None:
test_ele = etree.fromstring(
'<resource label="res_B_18" restype=":TestThing" id="res_B_18" permissions="res-default"/>'
)
res_resptr, res_xml, sub_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
assert sub_id == "res_B_18"
res_resptr, res_xml = _create_info_from_xml_for_graph_from_one_resource(test_ele)
assert (res_resptr, res_xml) == ([], [])


Expand All @@ -98,8 +95,7 @@ def test_text_only_create_info_from_xml_for_graph_from_one_resource() -> None:
</resource>
"""
)
res_resptr, res_xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
assert subject_id == "res_C_18"
res_resptr, res_xml = _create_info_from_xml_for_graph_from_one_resource(test_ele)
assert not res_resptr
res_xml_ids = [x.target_ids for x in res_xml]
assert unordered(res_xml_ids) == [{"res_A_18"}, {"res_B_18"}]
Expand Down
@@ -1,14 +1,14 @@
from datetime import datetime
from pathlib import Path

from lxml import etree
from viztracer import VizTracer

from dsp_tools.analyse_xml_data.construct_and_analyze_graph import (
create_info_from_xml_for_graph,
generate_upload_order,
make_graph,
)
from dsp_tools.utils.xml_utils import parse_and_clean_xml_file


def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str, save_tracer: bool = False) -> None:
Expand All @@ -30,19 +30,18 @@ def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str, save_tr
max_stack_depth=3,
)
tracer.start()
tree = etree.parse(xml_filepath)
root = tree.getroot()
root = parse_and_clean_xml_file(xml_filepath)
resptr_links, xml_links, all_resource_ids = create_info_from_xml_for_graph(root)
print(f"Total Number of Resources: {len(all_resource_ids)}")
print(f"Total Number of resptr Links: {len(resptr_links)}")
print(f"Total Number of XML Texts with Links: {len(xml_links)}")
print("=" * 80)
graph, node_to_id, edges = make_graph(resptr_links, xml_links, all_resource_ids)
_, _, stash_counter = generate_upload_order(graph, node_to_id, edges)
print("Number of Links Stashed:", stash_counter)
tracer.stop()
if save_tracer:
tracer.save(output_file=tracer_output_file)
print("Number of Links Stashed:", stash_counter)
print("=" * 80)
print("Start time:", start)
print("End time:", datetime.now())
Expand Down

0 comments on commit f2d4488

Please sign in to comment.