Commit

Merge pull request #287 from mih/docs
`add_sibling_dataverse` docstring
mih committed Mar 17, 2023
2 parents bc6c059 + e861580 commit a48ca22
Showing 6 changed files with 155 additions and 211 deletions.
135 changes: 73 additions & 62 deletions datalad_dataverse/add_sibling_dataverse.py
@@ -33,53 +33,75 @@

@build_doc
class AddSiblingDataverse(ValidatedInterface):
"""Add a dataset sibling(-tandem) connecting to a Dataverse dataset.
"""Add a Dataverse dataset as a sibling(-tandem)
Dataverse is a web application to share and cite research data.
Research data published in Dataverse receives an academic citation, which
allows full credit to be given and increases the visibility of your work.
This command registers an existing Dataverse dataset as a sibling of a
DataLad dataset. Both dataset version history and file content can then be
deposited at a Dataverse site via the standard ``push`` command.
Dataverse imposes strict limits on directory names (and to some degree also
file names). Therefore, file and directory names that conflict with these
rules (e.g., a directory name with any character not found in the English
alphabet) are mangled on push. This mangling does not impact file names in
the DataLad dataset (not even for clones from Dataverse). See the package
documentation for details.
If a DataLad dataset's version history was deposited on Dataverse, a
dataset can also be cloned from Dataverse again, via the standard ``clone``
command.
In order to be able to use this command, a personal access token has to be
generated on the Dataverse platform. You can find it by clicking on your
name at the top right corner, and then clicking on Api Token>Create Token.
Furthermore, a dataset on such a dataverse instance has to already exist in
order to add it as a sibling to a DataLad dataset.
name at the top right corner, and then clicking on API Token>Create Token.
"""

_examples_ = [
dict(text="add a dataverse dataset sibling for sharing and citing",
code_py="""\
> ds = Dataset('.')
> ds.add_sibling_dataverse(url='https://demo.dataverse.org', name='dataverse', ds_pid='doi:10.5072/FK2/PMPMZM')
""",
code_cmd="datalad add-sibling-dataverse demo.dataverse.org doi:10.5072/FK2/PMPMZM -s dataverse",
dict(
text="Add a dataverse dataset sibling for sharing and citing",
code_py="""\
>>> ds = Dataset('.')
>>> ds.add_sibling_dataverse(
... url='https://demo.dataverse.org',
... name='dataverse',
... ds_pid='doi:10.5072/FK2/PMPMZM')
""",
code_cmd="""\
datalad add-sibling-dataverse \\
-s dataverse \\
https://demo.dataverse.org doi:10.5072/FK2/PMPMZM
""",
),
]

_validator_ = EnsureCommandParameterization(dict(
dv_url=EnsureURL(required=['scheme']),
ds_pid=EnsureStr(),
dataset=EnsureDataset(installed=True, purpose="add dataverse sibling"),
name=EnsureStr(),
storage_name=EnsureStr(),
existing=EnsureChoice('skip', 'error', 'reconfigure'),
mode=EnsureChoice(
_validator_ = EnsureCommandParameterization(
param_constraints=dict(
dv_url=EnsureURL(required=['scheme']),
ds_pid=EnsureStr(),
dataset=EnsureDataset(
installed=True, purpose="add dataverse sibling"),
name=EnsureStr(),
storage_name=EnsureStr(),
existing=EnsureChoice('skip', 'error', 'reconfigure'),
mode=EnsureChoice(
'annex', 'filetree', 'annex-only', 'filetree-only',
'git-only')),
validate_defaults=("dataset",)
'git-only')
),
validate_defaults=("dataset",),
)

_params_ = dict(
dv_url=Parameter(
args=("dv_url",),
metavar='URL',
doc="URL identifying the dataverse instance to connect to",),
doc="""URL identifying the dataverse instance to connect to
(e.g., https://demo.dataverse.org)""",),
ds_pid=Parameter(
args=("ds_pid",),
doc="""Persistent identifier of the dataverse dataset to connect to.
This can be found on the dataset's page. Either right at the top
args=("PID",),
doc="""Persistent identifier of the dataverse dataset to
use as a sibling. This PID can be found on the dataset's
landing page on Dataverse. Either right at the top
underneath the title of the dataset as an URL or in the dataset's
metadata. Both formats (doi:10.5072/FK2/PMPMZM and
https://doi.org/10.5072/FK2/PMPMZM) are supported for this
@@ -90,10 +112,11 @@ class AddSiblingDataverse(ValidatedInterface):
metavar='PATH',
doc="""optional alternative root path for the sibling inside the
Dataverse dataset. This can be used to represent multiple DataLad
datasets within a single Dataverse dataset without conflict."""),
datasets within a single Dataverse dataset without conflict.
Must be given in POSIX notation."""),
dataset=Parameter(
args=("-d", "--dataset"),
doc="""specify the dataset to process. If
doc="""specify the dataset to add the sibling to. If
no dataset is given, an attempt is made to identify the dataset
based on the current working directory""",),
name=Parameter(
@@ -115,49 +138,37 @@ class AddSiblingDataverse(ValidatedInterface):
doc="""
name of the credential providing an API token for the dataverse
installation of your choice, to be used for authorization.
The credential can be supplied via
configuration setting 'datalad.credential.<name>.token', or
environment variable DATALAD_CREDENTIAL_<NAME>_TOKEN, or will
be queried from the active credential store using the provided
name. If none is provided, the last-used credential for the
dataverse url will be used. Only if a credential name was given, it
will be encoded in the URL of the created dataverse Git remote,
credential auto-discovery will be performed on each remote access.""",
If no credential is given or known, credential discovery will be
attempted based on the Dataverse URL. If no credential can be
found, a token is prompted for.""",
),
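For illustration, a sketch of supplying the token via the environment-variable convention mentioned above; the credential name 'dvdemo' and the token value are hypothetical placeholders:

import os

from datalad.api import Dataset

# DATALAD_CREDENTIAL_<NAME>_TOKEN with the hypothetical name 'dvdemo'
os.environ['DATALAD_CREDENTIAL_DVDEMO_TOKEN'] = '<api-token>'
Dataset('.').add_sibling_dataverse(
    url='https://demo.dataverse.org',
    name='dataverse',
    credential='dvdemo',
    ds_pid='doi:10.5072/FK2/PMPMZM')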
existing=Parameter(
args=("--existing",),
choices=('skip', 'reconfigure', 'error'),
doc="""action to perform, if a (storage) sibling is already
configured under the given name.
In this case, sibling creation can be skipped ('skip') or the
sibling (re-)configured ('reconfigure') in the dataset, or the
command be instructed to fail ('error').""", ),
mode=Parameter(
args=("--mode",),
choices=('annex', 'filetree', 'annex-only', 'filetree-only',
'git-only'),
doc="""
TODO: Not sure yet, what modes we can/want support here.
Siblings can be created in various modes:
full-featured sibling tandem, one for a dataset's Git history
and one storage sibling to host any number of file versions
('annex').
A single sibling for the Git history only ('git-only').
A single annex sibling for multi-version file storage only
('annex-only').
As an alternative to the standard (annex) storage sibling setup
that is capable of storing any number of historical file versions
using a content hash layout ('annex'|'annex-only'), the 'filetree'
mode can used.
This mode offers a human-readable data organization on the dataverse
remote that matches the file tree of a dataset (branch).
Note, however, that dataverse comes with restrictions on what file
and directory names are possible.
This mode is useful for depositing a single dataset
snapshot for consumption without DataLad. The 'filetree' mode
nevertheless allows for cloning such a single-version dataset,
because the full dataset history can still be pushed to the WebDAV
server.
Git history hosting can also be turned off for this setup
('filetree-only').
Different sibling setups with varying ability to accept file
content and dataset versions are supported:
'annex' for a sibling tandem, one for a dataset's Git history
and one storage sibling to host any number of file versions;
'git-only' for a single sibling for the Git history only;
'annex-only' for a single annex sibling for multi-version file
storage, but no dataset Git history;
'filetree' for a human-readable data organization on the dataverse
end that matches the file tree of a dataset branch. This mode
is useful for depositing a single dataset snapshot for consumption
without DataLad. A dataset's Git history is included in the export
and enables cloning from Dataverse.
'filetree-only' disables the Git history export, and removes the
ability to clone from Dataverse.
When both a storage sibling and a regular sibling are created
together, a publication dependency on the storage sibling is
configured for the regular sibling in the local dataset clone.
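For illustration, a sketch of a 'filetree'-mode deposit as described above (demo URL and hypothetical PID as in the earlier example):

from datalad.api import Dataset

ds = Dataset('.')
ds.add_sibling_dataverse(
    url='https://demo.dataverse.org',
    name='dataverse',
    mode='filetree',
    ds_pid='doi:10.5072/FK2/PMPMZM')
# exports a human-readable file tree of the current branch to Dataverse
ds.push(to='dataverse')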
@@ -171,13 +182,13 @@ def __call__(
dv_url: str,
ds_pid: str,
*,
root_path: PurePosixPath | None = None,
dataset: DatasetParameter | None = None,
name: str = 'dataverse',
storage_name: str | None = None,
mode: str = 'annex',
credential: str | None = None,
existing: str = 'error',
root_path: PurePosixPath | None = None,
):
# dataset is a `next' DatasetParameter
ds = dataset.ds
98 changes: 33 additions & 65 deletions datalad_dataverse/baseremote.py
@@ -1,3 +1,5 @@
"""git-annex special remote"""

from __future__ import annotations

from pathlib import (
@@ -26,76 +28,42 @@


class DataverseRemote(SpecialRemote):
"""Special remote to interface dataverse datasets.
"""Special remote for IO with Dataverse datasets.
This remote provides the standard set of operations: CHECKPRESENT,
STORE, RETRIEVE, and REMOVE.
It uses the pyDataverse package internally, which presently imposes some
limitations, such as poor handling of large-file downloads.
The following sections contain notes on dataverse and this particular
implementation.
Dataverse
---------
Dataverse datasets come with their own versioning. A version is created upon
publishing a draft version. When a change is pushed, it is altering an
already existing draft version or - if none existed - the push (implicitly)
creates a new draft version. Publishing is not part of this special remotes
operations as it has no means to "discover" that this should happen (it only
communicates with git-annex on a per-file basis and does not even know what
annex command ran).
Files put on dataverse have a database ID associated with them, while their
"path" in the dataverse dataset is treated as metadata to that file. The ID
is persistent, but not technically a content identifier as it is not created
from the content like hash. However, once files are published (by being part
of a published dataset version) those IDs can serve as a content identifier
for practical purposes, since they are not going to change anymore. There's
no "real" guarantee for that, but in reality changing it would require some
strange DB migration to be performed on the side of the respective dataverse
instance. Note, however, that a file can be pushed into a draft version and
replaced/removed before it was ever published. In that case the ID of an
annex key could be changed. Hence, to some extent the special remote needs
to be aware of whether an annex key and its ID was part of a released
version of the dataverse dataset in order to make use of those IDs.
Recording the IDs allows accessing older versions of a file even in export
mode, as well as faster accessing keys for download. The latter is because
the API requires the ID, and a path based approach would therefore require
looking up the ID first (adding a request). Therefore, the special remote
records the IDs of annex keys and tries to rely on them if possible.
There is one more trap to mention with dataverse and that is its limitations
to directory and file names.
See https://github.com/IQSS/dataverse/issues/8807#issuecomment-1164434278
Dataverse datasets come with their own versioning. A version is created
upon publishing a draft version. When a change is pushed, it is altering an
already existing draft version or, if none existed, the push (implicitly)
creates a new draft version. Publishing is not part of this special
remote's operations.
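For illustration, a hedged sketch of publishing a draft out-of-band, assuming pyDataverse's NativeApi; URL, token, and PID are placeholders:

from pyDataverse.api import NativeApi

api = NativeApi('https://demo.dataverse.org', '<api-token>')
# promote the current draft to a new published (major) version
api.publish_dataset('doi:10.5072/FK2/PMPMZM', release_type='major')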
Regular special remote
----------------------
In principle the regular special remote simply maintains a flat list of
annex keys in the dataverse dataset, where the presented file names are the
annex keys. Therefore, it is feasible to simply rely on the remote path of a
key when checking for its presence. However, as laid out above, it is faster
to utilize knowledge about the database ID, so the idea is to use path
matching only as a fallback.
Implementation note
-------------------
The special remote at first only retrieves a record of what is in the latest
version (draft or not) of the dataverse dataset including an annotation of
content on whether it is released. This annotation is crucial, since it has
implications on what to record should changes be pushed to it.
For example:
It is not possible to actually remove content from a released version. That
means, if annex asks the special remote to remove content, it can only make
sure that the respective key is not part of the current draft anymore. Its
ID, however, remains on record. If the content was not released yet, it is
actually gone and the ID is taken off the record.
This record is retrieved lazily when first required, but only once (avoiding
an additional per-key request) and then updated locally when changes are
pushed. (Note, that we know that we only ever push into a draft version)
In case of checking the presence of a key that does not appear to be part of
the latest version, a request for such a record on all known dataverse
dataset versions is made. Again, this is lazy and only one request. This may
potentially be a relatively expensive request, but the introduced latency by
having smaller but possibly much more requests is likely a lot more
expensive.
Files uploaded to Dataverse have an associated database file ID. Their
"path" inside a dataset is a combination of a ``label`` and a
``directoryLabel`` that jointly must be unique in a Dataverse dataset.
However, they are only metadata associated with the file ID.
A file ID is persistent, but not technically a content identifier, as it is
not created from the content like a hash.
Recording the IDs with git-annex enables faster access for download,
because a dataset content listing request can be avoided. Therefore, the
special remote records the IDs of annex keys and tries to rely on them if
possible.
Dataverse imposes strict naming limitations for directories and files.
See https://github.com/IQSS/dataverse/issues/8807#issuecomment-1164434278
Therefore, remote paths are mangled to match these limitations.
"""

def __init__(self, *args):
14 changes: 14 additions & 0 deletions datalad_dataverse/dataset.py
@@ -1,3 +1,5 @@
"""Dataverse IO abstraction"""

from __future__ import annotations

from dataclasses import dataclass
@@ -54,6 +56,18 @@ class OnlineDataverseDataset:
root path for all dataset operations. It will not be possible to upload,
download, rename (etc) files from outside this prefix scope, or across
scopes.
On initialization only a record of what is in the latest version (draft or
not) of the dataverse dataset is retrieved, including an annotation of
content on whether it is released. This annotation is crucial, since it has
implications on what to record should changes be uploaded. For
example: It is not possible to actually remove content from a released
version.
This record is later maintained locally when changes are made without ever
requesting a full update again. In case of checking the presence of a file
that does not appear to be part of the latest version, a request for such a
record on all known dataverse dataset versions is made.
"""
def __init__(self, api, dsid: str, root_path: str | None = None):
# dataverse native API handle
