Commit

Merge pull request #287 from mih/docs
`add_sibling_dataverse` docstring
mih committed Mar 17, 2023
2 parents bc6c059 + e861580 commit a48ca22
Showing 6 changed files with 155 additions and 211 deletions.
135 changes: 73 additions & 62 deletions datalad_dataverse/add_sibling_dataverse.py
@@ -33,53 +33,75 @@

@build_doc
class AddSiblingDataverse(ValidatedInterface):
"""Add a dataset sibling(-tandem) connecting to a Dataverse dataset.
"""Add a Dataverse dataset as a sibling(-tandem)
Dataverse is a web application to share and cite research data.
Research data published in Dataverse receives an academic citation, which
allows full credit to be given and increases the visibility of your work.
This command registers an existing Dataverse dataset as a sibling of a
DataLad dataset. Both dataset version history and file content can then be
deposited at a Dataverse site via the standard ``push`` command.
Dataverse imposes strict limits on directory names (and to some degree also
file names). Therefore, file and directory names that conflict with these
rules (e.g., a directory name with any character not found in the English
alphabet) are mangled on push. This mangling does not impact file names in
the DataLad dataset (not even for clones from Dataverse). See the package
documentation for details.
If a DataLad dataset's version history was deposited on Dataverse, a
dataset can also be cloned from Dataverse again, via the standard ``clone``
command.
In order to be able to use this command, a personal access token has to be
generated on the Dataverse platform. You can find it by clicking on your
name at the top right corner, and then clicking on Api Token>Create Token.
Furthermore, a dataset on such a dataverse instance has to already exist in
order to add it as a sibling to a DataLad dataset.
name at the top right corner, and then clicking on API Token>Create Token.
"""

_examples_ = [
dict(text="add a dataverse dataset sibling for sharing and citing",
code_py="""\
> ds = Dataset('.')
> ds.add_sibling_dataverse(url='https://demo.dataverse.org', name='dataverse', ds_pid='doi:10.5072/FK2/PMPMZM')
""",
code_cmd="datalad add-sibling-dataverse demo.dataverse.org doi:10.5072/FK2/PMPMZM -s dataverse",
dict(
text="Add a dataverse dataset sibling for sharing and citing",
code_py="""\
>>> ds = Dataset('.')
>>> ds.add_sibling_dataverse(
... url='https://demo.dataverse.org',
... name='dataverse',
... ds_pid='doi:10.5072/FK2/PMPMZM')
""",
code_cmd="""\
datalad add-sibling-dataverse \\
-s dataverse \\
https://demo.dataverse.org doi:10.5072/FK2/PMPMZM
""",
),
]

_validator_ = EnsureCommandParameterization(dict(
dv_url=EnsureURL(required=['scheme']),
ds_pid=EnsureStr(),
dataset=EnsureDataset(installed=True, purpose="add dataverse sibling"),
name=EnsureStr(),
storage_name=EnsureStr(),
existing=EnsureChoice('skip', 'error', 'reconfigure'),
mode=EnsureChoice(
_validator_ = EnsureCommandParameterization(
param_constraints=dict(
dv_url=EnsureURL(required=['scheme']),
ds_pid=EnsureStr(),
dataset=EnsureDataset(
installed=True, purpose="add dataverse sibling"),
name=EnsureStr(),
storage_name=EnsureStr(),
existing=EnsureChoice('skip', 'error', 'reconfigure'),
mode=EnsureChoice(
'annex', 'filetree', 'annex-only', 'filetree-only',
'git-only')),
validate_defaults=("dataset",)
'git-only')
),
validate_defaults=("dataset",),
)

_params_ = dict(
dv_url=Parameter(
args=("dv_url",),
metavar='URL',
doc="URL identifying the dataverse instance to connect to",),
doc="""URL identifying the dataverse instance to connect to
(e.g., https://demo.dataverse.org)""",),
ds_pid=Parameter(
args=("ds_pid",),
doc="""Persistent identifier of the dataverse dataset to connect to.
This can be found on the dataset's page. Either right at the top
args=("PID",),
doc="""Persistent identifier of the dataverse dataset to
use as a sibling. This PID can be found on the dataset's
landing page on Dataverse. Either right at the top
underneath the title of the dataset as an URL or in the dataset's
metadata. Both formats (doi:10.5072/FK2/PMPMZM and
https://doi.org/10.5072/FK2/PMPMZM) are supported for this
@@ -90,10 +112,11 @@ class AddSiblingDataverse(ValidatedInterface):
metavar='PATH',
doc="""optional alternative root path for the sibling inside the
Dataverse dataset. This can be used to represent multiple DataLad
datasets within a single Dataverse dataset without conflict."""),
datasets within a single Dataverse dataset without conflict.
Must be given in POSIX notation."""),
dataset=Parameter(
args=("-d", "--dataset"),
doc="""specify the dataset to process. If
doc="""specify the dataset to add the sibling to. If
no dataset is given, an attempt is made to identify the dataset
based on the current working directory""",),
name=Parameter(
@@ -115,49 +138,37 @@ class AddSiblingDataverse(ValidatedInterface):
doc="""
name of the credential providing an API token for the dataverse
installation of your choice, to be used for authorization.
The credential can be supplied via
configuration setting 'datalad.credential.<name>.token', or
environment variable DATALAD_CREDENTIAL_<NAME>_TOKEN, or will
be queried from the active credential store using the provided
name. If none is provided, the last-used credential for the
dataverse url will be used. Only if a credential name was given, it
will be encoded in the URL of the created dataverse Git remote,
credential auto-discovery will be performed on each remote access.""",
If no credential is given or known, credential discovery will be
attempted based on the Dataverse URL. If no credential can be
found, a token is prompted for.""",
),
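For illustration, a sketch of supplying the token via the environment-variable convention mentioned above; the credential name 'dvdemo' and the token value are hypothetical placeholders:

import os

from datalad.api import Dataset

# DATALAD_CREDENTIAL_<NAME>_TOKEN with the hypothetical name 'dvdemo'
os.environ['DATALAD_CREDENTIAL_DVDEMO_TOKEN'] = '<api-token>'
Dataset('.').add_sibling_dataverse(
    url='https://demo.dataverse.org',
    name='dataverse',
    credential='dvdemo',
    ds_pid='doi:10.5072/FK2/PMPMZM')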
existing=Parameter(
args=("--existing",),
choices=('skip', 'reconfigure', 'error'),
doc="""action to perform, if a (storage) sibling is already
configured under the given name.
In this case, sibling creation can be skipped ('skip') or the
sibling (re-)configured ('reconfigure') in the dataset, or the
command be instructed to fail ('error').""", ),
mode=Parameter(
args=("--mode",),
choices=('annex', 'filetree', 'annex-only', 'filetree-only',
'git-only'),
doc="""
TODO: Not sure yet, what modes we can/want support here.
Siblings can be created in various modes:
full-featured sibling tandem, one for a dataset's Git history
and one storage sibling to host any number of file versions
('annex').
A single sibling for the Git history only ('git-only').
A single annex sibling for multi-version file storage only
('annex-only').
As an alternative to the standard (annex) storage sibling setup
that is capable of storing any number of historical file versions
using a content hash layout ('annex'|'annex-only'), the 'filetree'
mode can used.
This mode offers a human-readable data organization on the dataverse
remote that matches the file tree of a dataset (branch).
Note, however, that dataverse comes with restrictions on what file
and directory names are possible.
This mode is useful for depositing a single dataset
snapshot for consumption without DataLad. The 'filetree' mode
nevertheless allows for cloning such a single-version dataset,
because the full dataset history can still be pushed to the WebDAV
server.
Git history hosting can also be turned off for this setup
('filetree-only').
Different sibling setups with varying ability to accept file
content and dataset versions are supported:
'annex' for a sibling tandem, one for a dataset's Git history
and one storage sibling to host any number of file versions;
'git-only' for a single sibling for the Git history only;
'annex-only' for a single annex sibling for multi-version file
storage, but no dataset Git history;
'filetree' for a human-readable data organization on the dataverse
end that matches the file tree of a dataset branch. This mode
is useful for depositing a single dataset snapshot for consumption
without DataLad. A dataset's Git history is included in the export
and enables cloning from Dataverse.
'filetree-only' disables the Git history export, and removes the
ability to clone from Dataverse.
When both a storage sibling and a regular sibling are created
together, a publication dependency on the storage sibling is
configured for the regular sibling in the local dataset clone.
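For illustration, a sketch of a 'filetree'-mode deposit as described above (demo URL and hypothetical PID as in the earlier example):

from datalad.api import Dataset

ds = Dataset('.')
ds.add_sibling_dataverse(
    url='https://demo.dataverse.org',
    name='dataverse',
    mode='filetree',
    ds_pid='doi:10.5072/FK2/PMPMZM')
# exports a human-readable file tree of the current branch to Dataverse
ds.push(to='dataverse')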
@@ -171,13 +182,13 @@ def __call__(
dv_url: str,
ds_pid: str,
*,
root_path: PurePosixPath | None = None,
dataset: DatasetParameter | None = None,
name: str = 'dataverse',
storage_name: str | None = None,
mode: str = 'annex',
credential: str | None = None,
existing: str = 'error',
root_path: PurePosixPath | None = None,
):
# dataset is a `next' DatasetParameter
ds = dataset.ds
98 changes: 33 additions & 65 deletions datalad_dataverse/baseremote.py
@@ -1,3 +1,5 @@
"""git-annex special remote"""

from __future__ import annotations

from pathlib import (
@@ -26,76 +28,42 @@


class DataverseRemote(SpecialRemote):
"""Special remote to interface dataverse datasets.
"""Special remote for IO with Dataverse datasets.
This remote provides the standard set of operations: CHECKPRESENT,
STORE, RETRIEVE, and REMOVE.
It uses the pyDataverse package internally, which presently imposes some
limitations, such as poor handling of large-file downloads.
The following sections contain notes on dataverse and this particular
implementation.
Dataverse
---------
Dataverse datasets come with their own versioning. A version is created upon
publishing a draft version. When a change is pushed, it is altering an
already existing draft version or - if none existed - the push (implicitly)
creates a new draft version. Publishing is not part of this special remotes
operations as it has no means to "discover" that this should happen (it only
communicates with git-annex on a per-file basis and does not even know what
annex command ran).
Files put on dataverse have a database ID associated with them, while their
"path" in the dataverse dataset is treated as metadata to that file. The ID
is persistent, but not technically a content identifier as it is not created
from the content like hash. However, once files are published (by being part
of a published dataset version) those IDs can serve as a content identifier
for practical purposes, since they are not going to change anymore. There's
no "real" guarantee for that, but in reality changing it would require some
strange DB migration to be performed on the side of the respective dataverse
instance. Note, however, that a file can be pushed into a draft version and
replaced/removed before it was ever published. In that case the ID of an
annex key could be changed. Hence, to some extent the special remote needs
to be aware of whether an annex key and its ID was part of a released
version of the dataverse dataset in order to make use of those IDs.
Recording the IDs allows accessing older versions of a file even in export
mode, as well as faster accessing keys for download. The latter is because
the API requires the ID, and a path based approach would therefore require
looking up the ID first (adding a request). Therefore, the special remote
records the IDs of annex keys and tries to rely on them if possible.
There is one more trap to mention with dataverse and that is its limitations
to directory and file names.
See https://github.com/IQSS/dataverse/issues/8807#issuecomment-1164434278
Dataverse datasets come with their own versioning. A version is created
upon publishing a draft version. When a change is pushed, it is altering an
already existing draft version or, if none existed, the push (implicitly)
creates a new draft version. Publishing is not part of this special
remote's operations.
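For illustration, a hedged sketch of publishing a draft out-of-band, assuming pyDataverse's NativeApi; URL, token, and PID are placeholders:

from pyDataverse.api import NativeApi

api = NativeApi('https://demo.dataverse.org', '<api-token>')
# promote the current draft to a new published (major) version
api.publish_dataset('doi:10.5072/FK2/PMPMZM', release_type='major')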
Regular special remote
----------------------
In principle the regular special remote simply maintains a flat list of
annex keys in the dataverse dataset, where the presented file names are the
annex keys. Therefore, it is feasible to simply rely on the remote path of a
key when checking for its presence. However, as laid out above, it is faster
to utilize knowledge about the database ID, so the idea is to use path
matching only as a fallback.
Implementation note
-------------------
The special remote at first only retrieves a record of what is in the latest
version (draft or not) of the dataverse dataset including an annotation of
content on whether it is released. This annotation is crucial, since it has
implications on what to record should changes be pushed to it.
For example:
It is not possible to actually remove content from a released version. That
means, if annex asks the special remote to remove content, it can only make
sure that the respective key is not part of the current draft anymore. Its
ID, however, remains on record. If the content was not released yet, it is
actually gone and the ID is taken off the record.
This record is retrieved lazily when first required, but only once (avoiding
an additional per-key request) and then updated locally when changes are
pushed. (Note, that we know that we only ever push into a draft version)
In case of checking the presence of a key that does not appear to be part of
the latest version, a request for such a record on all known dataverse
dataset versions is made. Again, this is lazy and only one request. This may
potentially be a relatively expensive request, but the introduced latency by
having smaller but possibly much more requests is likely a lot more
expensive.
Files uploaded to Dataverse have an associated database file ID. Their
"path" inside a dataset is a combination of a ``label`` and a
``directoryLabel`` that jointly must be unique in a Dataverse dataset.
However, they are only metadata associated with the file ID.
A file ID is persistent, but not technically a content identifier, as it is
not created from the content like a hash.
Recording the IDs with git-annex enables faster access for download,
because a dataset content listing request can be avoided. Therefore, the
special remote records the IDs of annex keys and tries to rely on them if
possible.
Dataverse imposes strict naming limitations for directories and files.
See https://github.com/IQSS/dataverse/issues/8807#issuecomment-1164434278
Therefore, remote paths are mangled to match these limitations.
"""

def __init__(self, *args):
14 changes: 14 additions & 0 deletions datalad_dataverse/dataset.py
@@ -1,3 +1,5 @@
"""Dataverse IO abstraction"""

from __future__ import annotations

from dataclasses import dataclass
@@ -54,6 +56,18 @@ class OnlineDataverseDataset:
root path for all dataset operations. It will not be possible to upload,
download, rename (etc) files from outside this prefix scope, or across
scopes.
On initialization only a record of what is in the latest version (draft or
not) of the dataverse dataset is retrieved, including an annotation of
content on whether it is released. This annotation is crucial, since it has
implications on what to record should changes be uploaded. For
example: It is not possible to actually remove content from a released
version.
This record is later maintained locally when changes are made without ever
requesting a full update again. In case of checking the presence of a file
that does not appear to be part of the latest version, a request for such a
record on all known dataverse dataset versions is made.
"""
def __init__(self, api, dsid: str, root_path: str | None = None):
# dataverse native API handle
