Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix RIA/ORA publication dependencies #5415

Merged
merged 1 commit into from Mar 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
151 changes: 101 additions & 50 deletions datalad/core/distributed/clone.py
Expand Up @@ -723,48 +723,17 @@ def postclonecfg_ria(ds, props):
repo = ds.repo
RIA_REMOTE_NAME = 'origin' # don't hardcode everywhere

# chances are that if this dataset came from a RIA store, its subdatasets
# may live there too. Place a subdataset source candidate config that makes
# get probe this RIA store when obtaining subdatasets
ds.config.set(
# we use the label 'origin' for this candidate in order to not have to
# generate a complicated name from the actual source specification.
# we pick a cost of 200 to sort it before datalad's default candidates
# for non-RIA URLs, because they prioritize hierarchical layouts that
# cannot be found in a RIA store
'datalad.get.subdataset-source-candidate-200origin',
# use the entire original URL, up to the fragment, plus the dataset ID
# placeholder, this should make things work with any store setup we
# support (paths, ports, ...)
props['source'].split('#', maxsplit=1)[0] + '#{id}',
where='local')

# setup publication dependency, if a corresponding special remote exists
# and was enabled (there could be RIA stores that actually only have repos)
# make this function be a generator
ora_remotes = [s for s in ds.siblings('query', result_renderer='disabled')
if s.get('annex-externaltype') == 'ora']
if not ora_remotes and any(
r.get('externaltype') == 'ora'
for r in (repo.get_special_remotes().values()
if hasattr(repo, 'get_special_remotes')
else [])):
# no ORA remote autoenabled, but configuration known about at least one.
# Let's check origin's config for datalad.ora-remote.uuid as stored by
# create-sibling-ria and try enabling that one.
lgr.debug("Found no autoenabled ORA special remote. Trying to look it "
"up in source config ...")

def get_uuid_from_store(store_url):
# First figure whether we cloned via SSH, HTTP or local path and then
# get that config file the same way:
config_content = None
scheme = props['giturl'].split(':', 1)[0]
scheme = store_url.split(':', 1)[0]
if scheme in ['http', 'https']:
try:
config_content = download_url(
"{}{}config".format(
props['giturl'],
'/' if not props['giturl'].endswith('/') else ''))
store_url,
'/' if not store_url.endswith('/') else ''))
except DownloadError as e:
lgr.debug("Failed to get config file from source:\n%s",
exc_str(e))
Expand All @@ -773,8 +742,8 @@ def postclonecfg_ria(ds, props):
# SSHRemoteIO ignores the path part ATM. No remote CWD! (To be
# changed with command abstractions). So we need to get that part to
# have a valid path to origin's config file:
cfg_path = PurePosixPath(URL(props['giturl']).path) / 'config'
op = SSHRemoteIO(props['giturl'])
cfg_path = PurePosixPath(URL(store_url).path) / 'config'
op = SSHRemoteIO(store_url)
try:
config_content = op.read_file(cfg_path)
except RIARemoteError as e:
Expand All @@ -784,7 +753,7 @@ def postclonecfg_ria(ds, props):
elif scheme == 'file':
# TODO: switch the following to proper command abstraction:
op = LocalIO()
cfg_path = Path(URL(props['giturl']).localpath) / 'config'
cfg_path = Path(URL(store_url).localpath) / 'config'
try:
config_content = op.read_file(cfg_path)
except (RIARemoteError, OSError) as e:
Expand All @@ -794,8 +763,8 @@ def postclonecfg_ria(ds, props):
lgr.debug("Unknown URL-Scheme %s in %s. Can handle SSH, HTTP or "
"FILE scheme URLs.", scheme, props['source'])

# 3. And read it
org_uuid = None
# And read it
uuid = None
if config_content:
# TODO: We might be able to spare the saving to a file.
# "git config -f -" is not explicitly documented but happens
Expand All @@ -809,17 +778,60 @@ def postclonecfg_ria(ds, props):
'datalad.ora-remote.uuid'],
protocol=StdOutCapture
)
org_uuid = result['stdout'].strip()
uuid = result['stdout'].strip()
except CommandError as e:
# doesn't contain what we are looking for
lgr.debug("Found no UUID for ORA special remote at "
"'%s' (%s)", RIA_REMOTE_NAME, exc_str(e))

return uuid




# chances are that if this dataset came from a RIA store, its subdatasets
# may live there too. Place a subdataset source candidate config that makes
# get probe this RIA store when obtaining subdatasets
ria_store_url = props['source'].split('#', maxsplit=1)[0]
ds.config.set(
# we use the label 'origin' for this candidate in order to not have to
# generate a complicated name from the actual source specification.
# we pick a cost of 200 to sort it before datalad's default candidates
# for non-RIA URLs, because they prioritize hierarchical layouts that
# cannot be found in a RIA store
'datalad.get.subdataset-source-candidate-200origin',
# use the entire original URL, up to the fragment, plus the dataset ID
# placeholder, this should make things work with any store setup we
# support (paths, ports, ...)
ria_store_url + '#{id}',
where='local')

# setup publication dependency, if a corresponding special remote exists
# and was enabled (there could be RIA stores that actually only have repos)
# make this function be a generator
ora_remotes = [s for s in ds.siblings('query', result_renderer='disabled')
if s.get('annex-externaltype') == 'ora']
# get full special remotes' config for access to stored URL
srs = repo.get_special_remotes() \
if hasattr(repo, 'get_special_remotes') else dict()

if (not ora_remotes and any(
r.get('externaltype') == 'ora' for r in srs.values())) or \
all(not srs[r['annex-uuid']]['url'].startswith(ria_store_url)
for r in ora_remotes):
# No ORA remote autoenabled, but configuration known about at least one,
# or enabled ORA remotes seem to not match clone URL.
# Let's check origin's config for datalad.ora-remote.uuid as stored by
# create-sibling-ria and try enabling that one.
lgr.debug("Found no autoenabled ORA special remote. Trying to look it "
"up in source config ...")

org_uuid = get_uuid_from_store(props['giturl'])

# Now, enable it. If annex-init didn't fail to enable it as stored, we
# wouldn't end up here, so enable with store URL as suggested by the URL
# we cloned from.
if org_uuid:
srs = repo.get_special_remotes()
if org_uuid in srs.keys():
# TODO: - Double-check autoenable value and only do this when
# true?
Expand Down Expand Up @@ -847,20 +859,59 @@ def postclonecfg_ria(ds, props):
else:
lgr.debug("Unknown ORA special remote uuid at '%s': %s",
RIA_REMOTE_NAME, org_uuid)

# Set publication dependency for origin on the respective ORA remote:
if ora_remotes:
if len(ora_remotes) == 1:
url_matching_remotes = [r for r in ora_remotes
if srs[r['annex-uuid']]['url'] == ria_store_url]

if len(url_matching_remotes) == 1:
# We have exactly one ORA remote with the same store URL we used for
# cloning (includes previously reconfigured remote).
# Set publication dependency:
yield from ds.siblings('configure',
name=RIA_REMOTE_NAME,
publish_depends=ora_remotes[0]['name'],
publish_depends=url_matching_remotes[0]['name'],
result_filter=None,
result_renderer='disabled')

elif not url_matching_remotes:
# No matches but we have successfully autoenabled ORA remotes. Could
# be the same store accessed by a different method (cloning via HTTP
# but special remote access via SSH). We can confidently set
# publication dependency if the store knows the UUID.
org_uuid = get_uuid_from_store(props['giturl'])
uuid_matching_remotes = [r for r in ora_remotes
if r['annex-uuid'] == org_uuid]
if uuid_matching_remotes:
# Multiple uuid matches are actually possible via same-as.
# However, in that case we can't decide which one is supposed to
# be used with publishing to origin.
if len(uuid_matching_remotes) == 1:
yield from ds.siblings(
'configure',
name=RIA_REMOTE_NAME,
publish_depends=uuid_matching_remotes[0]['name'],
result_filter=None,
result_renderer='disabled')
else:
lgr.warning(
"Found multiple matching ORA remotes. Couldn't decide "
"which one publishing to 'origin' should depend on: %s."
" Consider running 'datalad siblings configure -s "
"origin --publish-depends ORAREMOTENAME' to set "
"publication dependency manually.",
[r['name'] for r in uuid_matching_remotes])

else:
lgr.warning("Found multiple ORA remotes. Couldn't decide which "
                        "one publishing to 'origin' should depend on: %s. "
                        "Consider "
"running 'datalad siblings configure -s origin "
"--publish-depends ORAREMOTENAME' to set publication "
"dependency manually.",
[r['name'] for r in ora_remotes])
# We have multiple ORA remotes with the same store URL we cloned
# from.
lgr.warning("Found multiple matching ORA remotes. Couldn't decide "
"which one publishing to 'origin' should depend on: %s."
" Consider running 'datalad siblings configure -s "
"origin --publish-depends ORAREMOTENAME' to set "
"publication dependency manually.",
[r['name'] for r in url_matching_remotes])


def postclonecfg_annexdataset(ds, reckless, description=None):
Expand Down
54 changes: 50 additions & 4 deletions datalad/core/distributed/tests/test_clone.py
Expand Up @@ -40,6 +40,7 @@
assert_false,
assert_in,
assert_in_results,
assert_not_is_instance,
assert_message,
assert_not_in,
assert_raises,
Expand Down Expand Up @@ -891,6 +892,21 @@ def _test_ria_postclonecfg(url, dsid, clone_path, superds):
if url.startswith('http')
else "store-storage"))

# Second ORA remote is enabled and not reconfigured:
untouched_remote = riaclone.siblings(name='anotherstore-storage',
return_type='item-or-list')
assert_not_is_instance(untouched_remote, list)
untouched_url = riaclone.repo.get_special_remotes()[
untouched_remote['annex-uuid']]['url']
ok_(untouched_url.startswith("ria+file://"))
ok_(not untouched_url.startswith("ria+{}".format(url)))

# publication dependency was set for store-storage but not for
# anotherstore-storage:
eq_(riaclone.config.get("remote.origin.datalad-publish-depends",
get_all=True),
"store-storage")

# same thing for the sub ds (we don't need a store-url and id - get should
# figure those itself):
with swallow_logs(new_level=logging.INFO) as cml:
Expand All @@ -909,6 +925,12 @@ def _test_ria_postclonecfg(url, dsid, clone_path, superds):
if url.startswith('http')
else "store-storage"))

# publication dependency was set for store-storage but not for
# anotherstore-storage:
eq_(riaclonesub.config.get("remote.origin.datalad-publish-depends",
get_all=True),
"store-storage")

# finally get the plain git subdataset.
# Clone should figure to also clone it from a ria+ URL
# (subdataset-source-candidate), notice that there wasn't an autoenabled ORA
Expand All @@ -917,6 +939,11 @@ def _test_ria_postclonecfg(url, dsid, clone_path, superds):
assert_result_count(res, 1, status='ok', type='dataset', action='install')
assert_result_count(res, 1, status='notneeded', type='file')
assert_result_count(res, 2)
# no ORA remote, no publication dependency:
riaclonesubgit = Dataset(riaclone.pathobj / 'subdir' / 'subgit')
eq_(riaclonesubgit.config.get("remote.origin.datalad-publish-depends",
get_all=True),
None)

# Now, test that if cloning into a dataset, ria-URL is preserved and
# post-clone configuration is triggered again, when we remove the subds and
Expand Down Expand Up @@ -958,7 +985,7 @@ def _test_ria_postclonecfg(url, dsid, clone_path, superds):


@with_tempfile
def _postclonetest_prepare(lcl, storepath, link):
def _postclonetest_prepare(lcl, storepath, storepath2, link):

from datalad.customremotes.ria_utils import (
create_store,
Expand All @@ -980,11 +1007,13 @@ def _postclonetest_prepare(lcl, storepath, link):
},
})

# create a local dataset with a subdataset
lcl = Path(lcl)
storepath = Path(storepath)
storepath2 = Path(storepath2)
link = Path(link)
link.symlink_to(storepath)

# create a local dataset with a subdataset
subds = Dataset(lcl / 'ds' / 'subdir' / 'subds').create(force=True)
subds.save()
# add a plain git dataset as well
Expand All @@ -996,6 +1025,22 @@ def _postclonetest_prepare(lcl, storepath, link):
assert_repo_status(ds.path)

io = LocalIO()

# Have a second store with valid ORA remote. This should not interfere with
# reconfiguration of the first one, when that second store is not the one we
# clone from. However, don't push data into it for easier get-based testing
# later on.
# Doing this first, so datasets in "first"/primary store know about this.
create_store(io, storepath2, '1')
url2 = "ria+{}".format(get_local_file_url(str(storepath2)))
for d in (ds, subds, subgit):
create_ds_in_store(io, storepath2, d.id, '2', '1')
d.create_sibling_ria(url2, "anotherstore")
d.push('.', to='anotherstore', data='nothing')
store2_loc, _, _ = get_layout_locations(1, storepath2, d.id)
Runner(cwd=str(store2_loc)).run(['git', 'update-server-info'])

# Now the store to clone from:
create_store(io, storepath, '1')

# URL to use for upload. Point is, that this should be invalid for the clone
Expand Down Expand Up @@ -1045,8 +1090,9 @@ def test_ria_postclonecfg():
from datalad.utils import make_tempfile
from datalad.tests.utils import HTTPPath

with make_tempfile(mkdir=True) as lcl, make_tempfile(mkdir=True) as store:
id = _postclonetest_prepare(lcl, store)
with make_tempfile(mkdir=True) as lcl, make_tempfile(mkdir=True) as store, \
make_tempfile(mkdir=True) as store2:
id = _postclonetest_prepare(lcl, store, store2)

# test cloning via ria+file://
yield _test_ria_postclonecfg, \
Expand Down