From df1cf13eecf30bb95adbf6a81dd2e4b7a5a0c20a Mon Sep 17 00:00:00 2001 From: Johannes Nussbaum <39048939+jnussbaum@users.noreply.github.com> Date: Fri, 1 Sep 2023 09:20:03 +0200 Subject: [PATCH] feat(xmlupload)!: allow both IDs and IRIs, remove --incremental flag (DEV-1339) (#494) --- docs/cli-commands.md | 5 ++-- docs/file-formats/xml-data-file.md | 7 +++--- docs/incremental-xmlupload.md | 14 ++--------- src/dsp_tools/cli.py | 7 ------ src/dsp_tools/fast_xmlupload/upload_xml.py | 1 - src/dsp_tools/models/xmlresource.py | 2 +- src/dsp_tools/utils/rosetta.py | 1 - src/dsp_tools/utils/xml_upload.py | 11 +++------ test/e2e/test_00A1_import_scripts.py | 1 - test/e2e/test_cli.py | 2 +- test/e2e/test_xmlupload.py | 2 -- testdata/id2iri/test-id2iri-data.xml | 24 +++++++++++++++++++ .../id2iri/test-id2iri-output-expected.xml | 24 +++++++++++++++++++ 13 files changed, 60 insertions(+), 41 deletions(-) diff --git a/docs/cli-commands.md b/docs/cli-commands.md index 8f2f9c73e..e90287e63 100644 --- a/docs/cli-commands.md +++ b/docs/cli-commands.md @@ -135,8 +135,6 @@ The following options are available: - `-u` | `--user` (optional, default: `root@example.com`): username (e-mail) used for authentication with the DSP-API - `-p` | `--password` (optional, default: `test`): password used for authentication with the DSP-API - `-i` | `--imgdir` (optional, default: `.`): folder from where the paths in the `` tags are evaluated -- `-I` | `--incremental` (optional) : The links in the XML file point to IRIs (on the server) - instead of IDs (in the same XML file). - `-V` | `--validate` (optional): validate the XML file without uploading it - `-v` | `--verbose` (optional): print more information about the progress to the console - `-m` | `--metrics` (optional): write metrics into a 'metrics' folder @@ -144,7 +142,8 @@ The following options are available: Output: - A file named `id2iri_mapping_[timestamp].json` is written to the current working directory. 
- This file should be kept if data is later added with the [`--incremental` option](./incremental-xmlupload.md) + This file should be kept if a second data delivery is added at a later point in time + (see [incremental xmlupload](./incremental-xmlupload.md)). The defaults are intended for local testing: diff --git a/docs/file-formats/xml-data-file.md b/docs/file-formats/xml-data-file.md index 807574696..7c95cd097 100644 --- a/docs/file-formats/xml-data-file.md +++ b/docs/file-formats/xml-data-file.md @@ -9,8 +9,8 @@ After a successful upload of the data, an output file is written (called `id2iri_mapping_[timestamp].json`) with the mapping from the internal IDs used inside the XML to their corresponding IRIs which uniquely identify them inside DSP. -This file should be kept if data is later added with the -`--incremental` [option](../incremental-xmlupload.md). +This file should be kept if a second data delivery is added at a later point in time +(see [incremental xmlupload](../incremental-xmlupload.md)). The import file must start with the standard XML header: @@ -627,8 +627,7 @@ Attributes: #### `` The `` element contains either the internal ID of another resource inside the XML or the IRI of an already -existing resource on DSP. Inside the same XML file, a mixture of the two is not possible. If referencing existing -resources, `xmlupload --incremental` has to be used. +existing resource on DSP. Attributes: diff --git a/docs/incremental-xmlupload.md b/docs/incremental-xmlupload.md index e6d425b67..110073eba 100644 --- a/docs/incremental-xmlupload.md +++ b/docs/incremental-xmlupload.md @@ -44,7 +44,7 @@ The file `additional_data.xml` contains references like `http://rdfh.ch/ Such a file can be uploaded with ```bash -dsp-tools xmlupload --incremental additional_data.xml +dsp-tools xmlupload additional_data.xml ``` @@ -61,9 +61,6 @@ its internal IDs must be replaced by their respective IRIs. 
That's where the JSON mapping file comes in: It contains a mapping from `book_1` to `http://rdfh.ch/4123/nyOODvYySV2nJ5RWRdmOdQ`. - -### id2iri - As a first step, a new file must be generated with the [`id2iri` command](./cli-commands.md#id2iri), @@ -74,19 +71,12 @@ dsp-tools id2iri additional_data.xml id2iri_mapping_[timestamp].json ``` - -### incremental xmlupload - As second step, the newly generated XML file can be uploaded to DSP: ```bash -dsp-tools xmlupload --incremental additional_data_replaced_[timestamp].xml +dsp-tools xmlupload additional_data_replaced_[timestamp].xml ``` -|
Important
| -|-------------------------------------------------------------------------------------------------------------------------------------------------| -| Internal IDs and IRIs cannot be mixed within the same file. An XML file uploaded with the incremental option must not contain any internal IDs. | - ## 4. Continue an interruped xmlupload diff --git a/src/dsp_tools/cli.py b/src/dsp_tools/cli.py index 20f138369..7647a9838 100644 --- a/src/dsp_tools/cli.py +++ b/src/dsp_tools/cli.py @@ -112,12 +112,6 @@ def _make_parser( parser_upload.add_argument( "-i", "--imgdir", default=".", help="folder from where the paths in the tags are evaluated" ) - parser_upload.add_argument( - "-I", - "--incremental", - action="store_true", - help="The links in the XML file point to IRIs (on the server) instead of IDs (in the same XML file).", - ) parser_upload.add_argument( "-V", "--validate-only", action="store_true", help="validate the XML file without uploading it" ) @@ -466,7 +460,6 @@ def _call_requested_action(args: argparse.Namespace) -> bool: imgdir=args.imgdir, sipi=args.sipi_url, verbose=args.verbose, - incremental=args.incremental, save_metrics=args.metrics, preprocessing_done=False, ) diff --git a/src/dsp_tools/fast_xmlupload/upload_xml.py b/src/dsp_tools/fast_xmlupload/upload_xml.py index bc770f392..3f927bca0 100644 --- a/src/dsp_tools/fast_xmlupload/upload_xml.py +++ b/src/dsp_tools/fast_xmlupload/upload_xml.py @@ -118,7 +118,6 @@ def fast_xmlupload( imgdir=".", sipi=sipi_url, verbose=False, - incremental=False, save_metrics=False, preprocessing_done=True, ) diff --git a/src/dsp_tools/models/xmlresource.py b/src/dsp_tools/models/xmlresource.py index c9ba049f1..0e4b4a343 100644 --- a/src/dsp_tools/models/xmlresource.py +++ b/src/dsp_tools/models/xmlresource.py @@ -93,7 +93,7 @@ def get_props_with_links(self) -> list[XMLProperty]: def get_resptrs(self) -> list[str]: """ - Get a list of all resource id's that are referenced by this resource + Get a list of all resource 
IDs/IRIs that are referenced by this resource. Returns: List of resources identified by their unique id's (as given in the XML) diff --git a/src/dsp_tools/utils/rosetta.py b/src/dsp_tools/utils/rosetta.py index 45ed6cc95..dd24fed3b 100644 --- a/src/dsp_tools/utils/rosetta.py +++ b/src/dsp_tools/utils/rosetta.py @@ -97,7 +97,6 @@ def _upload_xml(rosetta_folder: Path) -> bool: imgdir=str(rosetta_folder), sipi="http://0.0.0.0:1024", verbose=False, - incremental=False, save_metrics=False, preprocessing_done=False, ) diff --git a/src/dsp_tools/utils/xml_upload.py b/src/dsp_tools/utils/xml_upload.py index b6d8cd669..88c682e7c 100644 --- a/src/dsp_tools/utils/xml_upload.py +++ b/src/dsp_tools/utils/xml_upload.py @@ -178,6 +178,7 @@ def _remove_circular_references( while len(resources) > 0 and cnt < 10000: for resource in resources: resptrs = resource.get_resptrs() + resptrs = [x for x in resptrs if not regex.search(r"https?://rdfh.ch/[a-fA-F0-9]{4}/\w{22}", x)] if len(resptrs) == 0: ok_resources.append(resource) ok_res_ids.append(resource.id) @@ -518,7 +519,6 @@ def xml_upload( imgdir: str, sipi: str, verbose: bool = False, - incremental: bool = False, save_metrics: bool = False, preprocessing_done: bool = False, ) -> bool: @@ -533,7 +533,6 @@ def xml_upload( imgdir: the image directory sipi: the sipi instance to be used verbose: verbose option for the command, if used more output is given to the user - incremental: if set, IRIs instead of internal IDs are expected as resource pointers save_metrics: if true, saves time measurements into a "metrics" folder in the current working directory preprocessing_done: if set, all multimedia files referenced in the XML file must already be on the server @@ -607,12 +606,8 @@ def xml_upload( verbose=verbose, ) - # temporarily remove circular references, but only if not an incremental upload - if not incremental: - resources, stashed_xml_texts, stashed_resptr_props = _remove_circular_references(resources, verbose) - else: - 
stashed_xml_texts = dict() - stashed_resptr_props = dict() + # temporarily remove circular references + resources, stashed_xml_texts, stashed_resptr_props = _remove_circular_references(resources, verbose) preparation_duration = datetime.now() - preparation_start preparation_duration_ms = preparation_duration.seconds * 1000 + int(preparation_duration.microseconds / 1000) diff --git a/test/e2e/test_00A1_import_scripts.py b/test/e2e/test_00A1_import_scripts.py index 8040402b1..69bd87d80 100644 --- a/test/e2e/test_00A1_import_scripts.py +++ b/test/e2e/test_00A1_import_scripts.py @@ -66,7 +66,6 @@ def test_import_scripts(self) -> None: imgdir="src/dsp_tools/import_scripts/", sipi="http://0.0.0.0:1024", verbose=False, - incremental=False, save_metrics=False, preprocessing_done=False, ) diff --git a/test/e2e/test_cli.py b/test/e2e/test_cli.py index 39f9652fd..79dc5af5e 100644 --- a/test/e2e/test_cli.py +++ b/test/e2e/test_cli.py @@ -212,7 +212,7 @@ def test_xml_upload_incremental(self) -> None: mapping_file.unlink() second_xml_file_replaced = get_most_recent_glob_match(self.cwd / f"{second_xml_file_orig.stem}_replaced_*.xml") - self._make_cli_call(f"dsp-tools xmlupload --incremental -v {second_xml_file_replaced.absolute()}") + self._make_cli_call(f"dsp-tools xmlupload -v {second_xml_file_replaced.absolute()}") second_xml_file_replaced.unlink() self.assertListEqual(list(Path(self.cwd).glob("stashed_*_properties_*.txt")), []) diff --git a/test/e2e/test_xmlupload.py b/test/e2e/test_xmlupload.py index 7cc40921b..bb907a1b5 100644 --- a/test/e2e/test_xmlupload.py +++ b/test/e2e/test_xmlupload.py @@ -25,7 +25,6 @@ def test_xml_upload(self) -> None: imgdir=self.imgdir, sipi=self.sipi, verbose=False, - incremental=False, save_metrics=False, preprocessing_done=False, ) @@ -43,7 +42,6 @@ def test_xml_upload(self) -> None: imgdir=self.imgdir, sipi=self.sipi, verbose=False, - incremental=False, save_metrics=False, preprocessing_done=False, ) diff --git 
a/testdata/id2iri/test-id2iri-data.xml b/testdata/id2iri/test-id2iri-data.xml index 11c5c46d2..f3f018c62 100644 --- a/testdata/id2iri/test-id2iri-data.xml +++ b/testdata/id2iri/test-id2iri-data.xml @@ -80,4 +80,28 @@ + + + Text + + + + + + Text + + + + Text with a link to no_replacements + and a link to resptr_only + + + + test_thing_2_in_same_file + + + true + + + diff --git a/testdata/id2iri/test-id2iri-output-expected.xml b/testdata/id2iri/test-id2iri-output-expected.xml index 4735e8cba..f91a69c20 100644 --- a/testdata/id2iri/test-id2iri-output-expected.xml +++ b/testdata/id2iri/test-id2iri-output-expected.xml @@ -75,4 +75,28 @@ + + + Text + + + + + + Text + + + + Text with a link to no_replacements + and a link to resptr_only + + + + test_thing_2_in_same_file + + + true + + +