Skip to content

Commit

Permalink
feat(id2iri): replace IDs also inside salsah-links, not only inside <…
Browse files Browse the repository at this point in the history
…resptr> tags (DEV-2578) (#490)
  • Loading branch information
jnussbaum committed Aug 28, 2023
1 parent f814667 commit 047ba15
Show file tree
Hide file tree
Showing 10 changed files with 186 additions and 56 deletions.
19 changes: 2 additions & 17 deletions .gitignore
Expand Up @@ -27,19 +27,7 @@ wheels/
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Environments
.env
.venv
env/
venv/
ENV/
Expand All @@ -62,16 +50,13 @@ venv.bak/
.vscode

# created files
lists.json
out.json
*id2iri_mapping*
stashed*
**/~$*.*
testdata/tmp/
cwd/
testdata/test-list.json
src/dsp_tools/docker/sipi.docker-config.lua
*_replaced_*.xml
metrics/

# for testing in development
tmp/
/out/
3 changes: 2 additions & 1 deletion docs/cli-commands.md
Expand Up @@ -257,7 +257,8 @@ which is described [here](./excel2xml-module.md).

## `id2iri`

This command replaces internal IDs contained in the `<resptr>` tags of an XML file
This command replaces the internal IDs in an XML file
(in `<resptr>` tags and in salsah-links inside `<text>` tags)
with the IRIs provided in a mapping file.

```bash
Expand Down
6 changes: 2 additions & 4 deletions docs/incremental-xmlupload.md
Expand Up @@ -52,7 +52,9 @@ dsp-tools xmlupload --incremental additional_data.xml

The third case, however, is a bit more complicated:
The file `additional_data.xml` contains references like `<resptr>book_1</resptr>`,
or `<text><a class="salsah-link" href="IRI:book_1:IRI">link to book_1</a></text>`,
where `book_1` was the internal ID of a resource that had previously been uploaded to DSP.

Before such an XML file can be uploaded,
its internal IDs must be replaced by their respective IRIs.
That's where the JSON mapping file comes in:
Expand All @@ -70,10 +72,6 @@ like this:
dsp-tools id2iri additional_data.xml id2iri_mapping_[timestamp].json
```

| <center>Important</center> |
|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Only internal IDs inside the `<resptr>` tag are replaced. Salsah-links inside text properties (e.g. `<a class="salsah-link" href="IRI:book_1:IRI">link to an ID</a>`) are NOT replaced. |



### incremental xmlupload
Expand Down
2 changes: 1 addition & 1 deletion src/dsp_tools/cli.py
Expand Up @@ -220,7 +220,7 @@ def _make_parser(
# id2iri
parser_id2iri = subparsers.add_parser(
name="id2iri",
help="Replace internal IDs contained in the <resptr> tags of an XML file by IRIs provided in a mapping file",
help="Replace internal IDs of an XML file (resptr tags or salsah-links) by IRIs provided in a mapping file.",
)
parser_id2iri.set_defaults(action="id2iri")
parser_id2iri.add_argument("xmlfile", help="path to the XML file containing the data to be replaced")
Expand Down
99 changes: 87 additions & 12 deletions src/dsp_tools/utils/id_to_iri.py
Expand Up @@ -3,6 +3,7 @@
from pathlib import Path

from lxml import etree
import regex

from dsp_tools.models.exceptions import UserError
from dsp_tools.utils.logging import get_logger
Expand Down Expand Up @@ -58,12 +59,78 @@ def _parse_json_file(json_file: Path) -> dict[str, str]:
return mapping


def _replace_resptrs(
    tree: etree._Element,
    mapping: dict[str, str],
    used_mapping_entries: set[str],
) -> tuple[etree._Element, set[str]]:
    """
    Swap the internal IDs found in <resptr> tags for their IRIs.

    A <resptr> whose text has no (non-empty) entry in the mapping is left untouched.
    A one-line summary of how many links were replaced is logged and printed.

    Args:
        tree: parsed XML file
        mapping: mapping of internal IDs to IRIs
        used_mapping_entries: IDs of the mapping that have been found in the XML and have been replaced
            (this set is extended in place)

    Returns:
        a tuple of the modified XML tree and the set of the IDs that have been replaced
    """
    all_resptrs = tree.xpath("/knora/resource/resptr-prop/resptr")
    replaced_count = 0
    for elem in all_resptrs:
        internal_id = elem.text
        iri = mapping.get(internal_id)
        if not iri:
            # unknown ID (or empty IRI in the mapping): keep the original text
            continue
        elem.text = iri
        used_mapping_entries.add(internal_id)
        replaced_count += 1

    summary = f"Replaced {replaced_count}/{len(all_resptrs)} resptr links in the XML file"
    logger.info(summary)
    print(summary)

    return tree, used_mapping_entries


def _replace_salsah_links(
    tree: etree._Element,
    mapping: dict[str, str],
    used_mapping_entries: set[str],
) -> tuple[etree._Element, set[str]]:
    """
    Replace the internal IDs in the salsah-links of the <text> tags by IRIs.

    A salsah-link is an <a class="salsah-link"> element below a <text> tag.
    Its href is expected to look like "IRI:<internal ID>:IRI".
    If the internal ID is found in the mapping, the entire href is replaced by the IRI.
    Otherwise, the href is left untouched.

    Args:
        tree: parsed XML file
        mapping: mapping of internal IDs to IRIs
        used_mapping_entries: IDs of the mapping that have been found in the XML and have been replaced
            (this set is extended in place)

    Returns:
        a tuple of the modified XML tree and the set of the IDs that have been replaced
    """
    salsah_links = [
        x for x in tree.xpath("/knora/resource/text-prop/text//a") if x.attrib.get("class") == "salsah-link"
    ]
    salsah_links_replaced = 0
    for salsah_link in salsah_links:
        # Strip the "IRI:" / ":IRI" wrapper only at the edges of the href.
        # An unanchored pattern would also delete these markers in the middle of the href,
        # which could turn an unrelated href into a string that accidentally matches a mapping entry.
        value_before = regex.sub(r"^IRI:|:IRI$", "", salsah_link.attrib.get("href", ""))
        if value_after := mapping.get(value_before):
            salsah_link.attrib["href"] = value_after
            salsah_links_replaced += 1
            used_mapping_entries.add(value_before)

    logger.info(f"Replaced {salsah_links_replaced}/{len(salsah_links)} salsah-links in the XML file")
    print(f"Replaced {salsah_links_replaced}/{len(salsah_links)} salsah-links in the XML file")

    return tree, used_mapping_entries


def _replace_ids_by_iris(
tree: etree._Element,
mapping: dict[str, str],
) -> tuple[etree._Element, bool]:
"""
Iterate over the `<resptr>` tags and replace the internal IDs by IRIs.
Iterate over the <resptr> tags and the salsah-links of the <text> tags,
and replace the internal IDs by IRIs.
If an internal ID cannot be found in the mapping, the original ID is kept.
Args:
tree: parsed XML file
Expand All @@ -73,16 +140,22 @@ def _replace_ids_by_iris(
modified XML tree
"""
success = True
resource_elements = tree.xpath("/knora/resource/resptr-prop/resptr")
for resptr_prop in resource_elements:
value_before = resptr_prop.text
value_after = mapping.get(resptr_prop.text)
if value_after:
resptr_prop.text = value_after
else:
logger.warning(f"Could not find internal ID '{value_before}' in mapping file. Skipping...")
print(f"WARNING: Could not find internal ID '{value_before}' in mapping file. Skipping...")
success = False
used_mapping_entries: set[str] = set()

tree, used_mapping_entries = _replace_resptrs(
tree=tree,
mapping=mapping,
used_mapping_entries=used_mapping_entries,
)

tree, used_mapping_entries = _replace_salsah_links(
tree=tree,
mapping=mapping,
used_mapping_entries=used_mapping_entries,
)

logger.info(f"Used {len(used_mapping_entries)}/{len(mapping)} entries from the mapping file")
print(f"Used {len(used_mapping_entries)}/{len(mapping)} entries from the mapping file")

return tree, success

Expand Down Expand Up @@ -111,8 +184,10 @@ def id_to_iri(
json_file: str,
) -> bool:
"""
Replace internal IDs contained in the `<resptr>` tags of an XML file
Replace internal IDs of an XML file
(<resptr> tags and salsah-links inside <text> tags)
by IRIs provided in a mapping file.
If an internal ID cannot be found in the mapping, the original ID is kept.
The output is written to a new XML file named "[original name]_replaced_[timestamp].xml".
Args:
Expand Down
16 changes: 16 additions & 0 deletions src/dsp_tools/utils/shared.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import copy
import glob
import importlib.resources
import json
import time
Expand Down Expand Up @@ -407,3 +408,18 @@ def parse_json_input(project_file_as_path_or_parsed: Union[str, Path, dict[str,
else:
raise BaseError("Invalid input: The input must be a path to a JSON file or a parsed JSON object.")
return project_definition


def get_most_recent_glob_match(glob_pattern: Union[str, Path]) -> Path:
    """
    Find the most recently created file that matches a glob pattern.

    "Most recent" is decided by the st_ctime of each match.
    Note that st_ctime is the creation time on Windows,
    but the time of the last metadata change on most Unix systems.

    Args:
        glob_pattern: glob pattern, either absolute or relative to the cwd of the caller

    Raises:
        ValueError: if no file matches the glob pattern

    Returns:
        the matching file with the highest st_ctime
    """
    candidates = [Path(x) for x in glob.glob(str(glob_pattern))]
    if not candidates:
        # max() would raise a ValueError here as well, but without telling which pattern had no match
        raise ValueError(f"No file matches the glob pattern '{glob_pattern}'")
    return max(candidates, key=lambda item: item.stat().st_ctime)
15 changes: 8 additions & 7 deletions test/e2e/test_cli.py
Expand Up @@ -20,6 +20,7 @@
import regex

from dsp_tools.utils.project_create_lists import create_lists
from dsp_tools.utils.shared import get_most_recent_glob_match


class TestCLI(unittest.TestCase):
Expand Down Expand Up @@ -47,7 +48,7 @@ def tearDownClass(cls) -> None:
"""Is executed after the methods of this class have all run through"""
shutil.rmtree(cls.testdata_tmp)
shutil.rmtree(cls.cwd)
for f in Path().glob("id2iri_*.json"):
for f in Path().glob("*id2iri_*.json"):
f.unlink()

def _make_cli_call(
Expand Down Expand Up @@ -205,15 +206,15 @@ def test_xml_upload_incremental(self) -> None:
working_directory=Path("."),
)

mapping_file = list(Path().glob("test-data-systematic_id2iri_mapping_*.json"))[0]
mapping_file = get_most_recent_glob_match("test-data-systematic_id2iri_mapping_*.json")
second_xml_file_orig = Path("testdata/id2iri/test-id2iri-data.xml")
self._make_cli_call(f"dsp-tools id2iri {second_xml_file_orig.absolute()} {mapping_file.absolute()}")
mapping_file.unlink()

second_xml_file_replaced = list(self.cwd.glob(f"{second_xml_file_orig.stem}_replaced_*.xml"))[0]
second_xml_file_replaced = get_most_recent_glob_match(self.cwd / f"{second_xml_file_orig.stem}_replaced_*.xml")
self._make_cli_call(f"dsp-tools xmlupload --incremental -v {second_xml_file_replaced.absolute()}")
self.assertListEqual(list(Path(self.cwd).glob("stashed_*_properties_*.txt")), [])
mapping_file.unlink()
second_xml_file_replaced.unlink()
self.assertListEqual(list(Path(self.cwd).glob("stashed_*_properties_*.txt")), [])

def test_excel_to_json_project(self) -> None:
excel_folder = Path("testdata/excel2json/excel2json_files")
Expand Down Expand Up @@ -263,13 +264,13 @@ def test_id_to_iri(self) -> None:
xml_file = Path("testdata/id2iri/test-id2iri-data.xml")
mapping_file = Path("testdata/id2iri/test-id2iri-mapping.json")
self._make_cli_call(f"dsp-tools id2iri {xml_file.absolute()} {mapping_file.absolute()}")
out_file = list(self.cwd.glob(f"{xml_file.stem}_replaced_*.xml"))[0]
out_file = get_most_recent_glob_match(self.cwd / "test-id2iri-data_replaced_*.xml")
with open(out_file, encoding="utf-8") as f:
output_actual = f.read()
out_file.unlink()
with open("testdata/id2iri/test-id2iri-output-expected.xml", encoding="utf-8") as f:
output_expected = f.read()
self.assertEqual(output_actual, output_expected)
out_file.unlink()

@pytest.mark.filterwarnings("ignore")
def test_excel2xml(self) -> None:
Expand Down
34 changes: 20 additions & 14 deletions test/unittests/test_id_to_iri.py
Expand Up @@ -5,10 +5,11 @@
import unittest

import pytest
import regex

from dsp_tools.models.exceptions import BaseError
from dsp_tools.utils.id_to_iri import id_to_iri
from dsp_tools.utils.xml_upload import parse_xml_file
from dsp_tools.utils.shared import get_most_recent_glob_match


class TestIdToIri(unittest.TestCase):
Expand Down Expand Up @@ -39,24 +40,29 @@ def test_invalid_json_file_name(self) -> None:
)

def test_replace_id_with_iri(self) -> None:
"""Check that the correct IRIs appear in the correct order in the output file"""
id_to_iri(
xml_file="testdata/id2iri/test-id2iri-data.xml",
json_file="testdata/id2iri/test-id2iri-mapping.json",
)
out_file = list(Path(".").glob("test-id2iri-data_replaced_*.xml"))[0]
out_file_parsed = parse_xml_file(out_file)
out_file = get_most_recent_glob_match("test-id2iri-data_replaced_*.xml")
with open(out_file, encoding="utf-8", mode="r") as file:
out_file_content = file.read()
out_file.unlink()
resptr_props = out_file_parsed.xpath("/knora/resource/resptr-prop/resptr")
resptr_props_contents = [r.text for r in resptr_props]
self.assertEqual(
resptr_props_contents,
[
"http://rdfh.ch/082E/ylRvrg7tQI6aVpcTJbVrwg",
"http://rdfh.ch/082E/qwasddoiu876flkjh67dss",
"http://rdfh.ch/082E/ylRvrg7tQI6aVpcTJbVrwg",
"http://rdfh.ch/082E/qwasddoiu876flkjh67dss",
],
)
iris = regex.findall(r"http://rdfh\.ch/082E/\w+", out_file_content)
iris_expected = [
"http://rdfh.ch/082E/ylRvrg7tQI6aVpcTJbVrwg",
"http://rdfh.ch/082E/qwasddoiu876flkjh67dss",
"http://rdfh.ch/082E/JK63OpYWTDWNYVOYFN7FdQ",
"http://rdfh.ch/082E/1l63Oasdfopiujlkmn78ak",
"http://rdfh.ch/082E/qwasddoiu876flkjh67dss",
"http://rdfh.ch/082E/JK63OpYWTDWNYVOYFN7FdQ",
"http://rdfh.ch/082E/1l63Oasdfopiujlkmn78ak",
"http://rdfh.ch/082E/qwasddoiu876flkjh67dss",
"http://rdfh.ch/082E/ylRvrg7tQI6aVpcTJbVrwg",
"http://rdfh.ch/082E/qwasddoiu876flkjh67dss",
]
self.assertListEqual(iris, iris_expected)


if __name__ == "__main__":
Expand Down
24 changes: 24 additions & 0 deletions testdata/id2iri/test-id2iri-data.xml
Expand Up @@ -38,6 +38,18 @@
<text-prop name=":hasSimpleText">
<text encoding="utf8">Text</text>
</text-prop>
<text-prop name=":hasRichtext">
<text encoding="xml">
Text with a <a class="salsah-link" href="IRI:test_thing_1:IRI">link to test_thing_1</a>
and <strong>a bold
<em>and italicized
<a class="salsah-link" href="IRI:test_thing_2:IRI">link to test_thing_2</a>
</em>
</strong>
and trailing text
and a <a class="salsah-link" href="IRI:test_thing_with_ark_1:IRI">link to test_thing_with_ark_1</a>
</text>
</text-prop>
<boolean-prop name=":hasBoolean">
<boolean>true</boolean>
</boolean-prop>
Expand All @@ -47,6 +59,18 @@
<text-prop name=":hasSimpleText">
<text encoding="utf8">Text</text>
</text-prop>
<text-prop name=":hasRichtext">
<text encoding="xml">
Text with a <a class="salsah-link" href="IRI:test_thing_1:IRI">link to test_thing_1</a>
and <strong>a bold
<em>and italicized
<a class="salsah-link" href="IRI:test_thing_2:IRI">link to test_thing_2</a>
</em>
</strong>
and trailing text
and a <a class="salsah-link" href="IRI:test_thing_with_ark_1:IRI">link to test_thing_with_ark_1</a>
</text>
</text-prop>
<resptr-prop name=":hasTestThing2">
<resptr>test_thing_0</resptr>
<resptr>test_thing_with_ark_1</resptr>
Expand Down

0 comments on commit 047ba15

Please sign in to comment.