Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: use sets instead of lists for stashing circular references (DEV-2771) #548

9 changes: 9 additions & 0 deletions src/dsp_tools/models/xmlresource.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Optional, Union

import regex
from lxml import etree

from dsp_tools.models.exceptions import BaseError
Expand Down Expand Up @@ -109,6 +110,14 @@ def get_resptrs(self) -> list[str]:
resptrs.extend(value.resrefs)
return resptrs

def get_internal_resptrs(self) -> set[str]:
    """
    Get a set of all resource IDs that are referenced by this resource by means of an internal ID.

    Every reference returned by get_resptrs() is checked against the IRI pattern
    http(s)://rdfh.ch/<4 hex digits>/<22 word characters>.
    References that contain such an IRI are left out; all others are kept as internal IDs.
    Duplicates collapse because the result is a set.

    Returns:
        Set of resources identified by their unique IDs (as given in the XML)
    """
    # The dot of the host name is escaped: an unescaped "." matches any character,
    # which would misclassify look-alike strings (e.g. "rdfhXch") as IRIs.
    return {x for x in self.get_resptrs() if not regex.search(r"https?://rdfh\.ch/[a-fA-F0-9]{4}/\w{22}", x)}

def get_propvals(
self,
resiri_lookup: dict[str, str],
Expand Down
36 changes: 11 additions & 25 deletions src/dsp_tools/utils/xmlupload/stash_circular_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
from datetime import datetime
from typing import cast

import regex

from dsp_tools.models.exceptions import BaseError
from dsp_tools.models.value import KnoraStandoffXml
from dsp_tools.models.xmlproperty import XMLProperty
Expand All @@ -16,13 +14,13 @@

def _stash_circular_references(
nok_resources: list[XMLResource],
ok_res_ids: list[str],
ok_res_ids: set[str],
ok_resources: list[XMLResource],
stashed_xml_texts: dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]],
stashed_resptr_props: dict[XMLResource, dict[XMLProperty, list[str]]],
) -> tuple[
list[XMLResource],
list[str],
set[str],
list[XMLResource],
dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]],
dict[XMLResource, dict[XMLProperty, list[str]]],
Expand Down Expand Up @@ -69,7 +67,7 @@ def _stash_circular_references(
res.properties.remove(link_prop)

ok_resources.append(res)
ok_res_ids.append(res.id)
ok_res_ids.add(res.id)
nok_resources.remove(res)

return nok_resources, ok_res_ids, ok_resources, stashed_xml_texts, stashed_resptr_props
Expand Down Expand Up @@ -112,32 +110,20 @@ def remove_circular_references(
# resources with circular references
nok_resources: list[XMLResource] = []
# internal ids for the resources that do not have circular references
ok_res_ids: list[str] = []
ok_res_ids: set[str] = set()
cnt = 0
nok_len = 9999999
while len(resources) > 0 and cnt < 10000:
for resource in resources:
resptrs = resource.get_resptrs()
# get all the resptrs which have an internal id, i.e. that do not exist in the triplestore
resptrs = [x for x in resptrs if not regex.search(r"https?://rdfh.ch/[a-fA-F0-9]{4}/\w{22}", x)]
# if there are no resptrs references, append to the ok resources
if len(resptrs) == 0:
resptrs = resource.get_internal_resptrs()
# if there are no resptrs references
# or all of them are in the ok resources,
# append the resource to the ok resources
if len(resptrs) == 0 or resptrs.issubset(ok_res_ids):
ok_resources.append(resource)
ok_res_ids.append(resource.id)
ok_res_ids.add(resource.id)
else:
ok = True
# iterate over the list with all the resptrs that have internal links
for resptr in resptrs:
# if that resptr is not in the ok list, set the flag to false
if resptr not in ok_res_ids:
ok = False
# if all the resptr are in the ok list, then there are no circular references
if ok:
ok_resources.append(resource)
ok_res_ids.append(resource.id)
# if any of the resptr are not in the ok list append the resource to the not ok list
else:
nok_resources.append(resource)
nok_resources.append(resource)
resources = nok_resources
if len(nok_resources) == nok_len:
# there are circular references. go through all problematic resources, and stash the problematic references.
Expand Down