Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: tidy up graph analyzing #595

Merged
merged 6 commits into from
Oct 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 6 additions & 7 deletions src/dsp_tools/analyse_xml_data/construct_and_analyze_graph.py
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I just improved the flow of information: It's not necessary for _create_info_from_xml_for_graph_from_one_resource() to return subject_id. This can be extracted on the caller side.

Original file line number Diff line number Diff line change
Expand Up @@ -29,26 +29,25 @@ def create_info_from_xml_for_graph(
xml_links = []
all_resource_ids = []
for resource in root.iter(tag="{https://dasch.swiss/schema}resource"):
resptr, xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(resource)
all_resource_ids.append(subject_id)
resptr, xml = _create_info_from_xml_for_graph_from_one_resource(resource)
all_resource_ids.append(resource.attrib["id"])
resptr_links.extend(resptr)
xml_links.extend(xml)
return resptr_links, xml_links, all_resource_ids


def _create_info_from_xml_for_graph_from_one_resource(
resource: etree._Element,
) -> tuple[list[ResptrLink], list[XMLLink], str]:
subject_id = resource.attrib["id"]
) -> tuple[list[ResptrLink], list[XMLLink]]:
resptr_links: list[ResptrLink] = []
xml_links: list[XMLLink] = []
for prop in resource.getchildren():
match prop.tag:
case "{https://dasch.swiss/schema}resptr-prop":
resptr_links.extend(_create_resptr_link_objects(subject_id, prop))
resptr_links.extend(_create_resptr_link_objects(resource.attrib["id"], prop))
case "{https://dasch.swiss/schema}text-prop":
xml_links.extend(_create_text_link_objects(subject_id, prop))
return resptr_links, xml_links, subject_id
xml_links.extend(_create_text_link_objects(resource.attrib["id"], prop))
return resptr_links, xml_links


def _create_resptr_link_objects(subject_id: str, resptr_prop: etree._Element) -> list[ResptrLink]:
Expand Down
4 changes: 2 additions & 2 deletions src/dsp_tools/models/xmlresource.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ def __init__(self, node: etree._Element, default_ontology: str) -> None:
Constructor that parses a resource node from the XML DOM

Args:
node: The DOM node to be processed representing a resource (which is a child of the DSP element)
default_ontology: The default ontology (given in the attribute default-ontology of the DSP element)
node: The DOM node to be processed representing a resource (which is a child of the <knora> element)
default_ontology: The default ontology (given in the attribute default-ontology of the <knora> element)

Returns:
None
Expand Down
Empty file.
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now, _create_info_from_xml_for_graph_from_one_resource() doesn't return the subject_id any more. So, the tests become easier, because they don't have to test this unnecessary thing any more.

Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,10 @@ def test_create_info_from_xml_for_graph_from_one_resource() -> None:
</text-prop>
</resource>"""
)
res_resptr_links, res_xml_links, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
res_resptr_links, res_xml_links = _create_info_from_xml_for_graph_from_one_resource(test_ele)
res_B_19 = [obj.target_id for obj in res_resptr_links]
assert "res_B_19" in res_B_19
assert "res_C_19" in res_B_19
assert "res_A_19" == subject_id
assert res_xml_links[0].source_id == "res_A_19"
assert res_xml_links[0].target_ids == {"res_B_19", "res_C_19"}

Expand All @@ -65,8 +64,7 @@ def test_create_info_from_xml_for_graph_from_one_resource_one() -> None:
</resource>
"""
)
res_resptr, res_xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
assert subject_id == "res_A_11"
res_resptr, res_xml = _create_info_from_xml_for_graph_from_one_resource(test_ele)
assert res_resptr[0].target_id == "res_B_11"
assert isinstance(res_resptr[0], ResptrLink)
assert res_xml[0].target_ids == {"res_B_11"}
Expand All @@ -77,8 +75,7 @@ def test_create_info_from_xml_for_graph_from_one_resource_no_links() -> None:
test_ele = etree.fromstring(
'<resource label="res_B_18" restype=":TestThing" id="res_B_18" permissions="res-default"/>'
)
res_resptr, res_xml, sub_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
assert sub_id == "res_B_18"
res_resptr, res_xml = _create_info_from_xml_for_graph_from_one_resource(test_ele)
assert (res_resptr, res_xml) == ([], [])


Expand All @@ -98,8 +95,7 @@ def test_text_only_create_info_from_xml_for_graph_from_one_resource() -> None:
</resource>
"""
)
res_resptr, res_xml, subject_id = _create_info_from_xml_for_graph_from_one_resource(test_ele)
assert subject_id == "res_C_18"
res_resptr, res_xml = _create_info_from_xml_for_graph_from_one_resource(test_ele)
assert not res_resptr
res_xml_ids = [x.target_ids for x in res_xml]
assert unordered(res_xml_ids) == [{"res_A_18"}, {"res_B_18"}]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from datetime import datetime
from pathlib import Path

from lxml import etree
from viztracer import VizTracer

from dsp_tools.analyse_xml_data.construct_and_analyze_graph import (
create_info_from_xml_for_graph,
generate_upload_order,
make_graph,
)
from dsp_tools.utils.xml_utils import parse_and_clean_xml_file


def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str, save_tracer: bool = False) -> None:
Expand All @@ -30,19 +30,18 @@ def analyse_circles_in_data(xml_filepath: Path, tracer_output_file: str, save_tr
max_stack_depth=3,
)
tracer.start()
tree = etree.parse(xml_filepath)
root = tree.getroot()
root = parse_and_clean_xml_file(xml_filepath)
Copy link
Collaborator Author

@jnussbaum jnussbaum Oct 26, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function transforms the special tags <annotation>, <region>, and <link> to their technically correct form <resource restype="Annotation">, <resource restype="Region">, and <resource restype="LinkObj">. This is necessary, otherwise these special tags aren't considered when analyzing the links

resptr_links, xml_links, all_resource_ids = create_info_from_xml_for_graph(root)
print(f"Total Number of Resources: {len(all_resource_ids)}")
print(f"Total Number of resptr Links: {len(resptr_links)}")
print(f"Total Number of XML Texts with Links: {len(xml_links)}")
print("=" * 80)
graph, node_to_id, edges = make_graph(resptr_links, xml_links, all_resource_ids)
_, _, stash_counter = generate_upload_order(graph, node_to_id, edges)
print("Number of Links Stashed:", stash_counter)
tracer.stop()
if save_tracer:
tracer.save(output_file=tracer_output_file)
print("Number of Links Stashed:", stash_counter)
print("=" * 80)
print("Start time:", start)
print("End time:", datetime.now())
Expand Down