Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More flexible clone URL generation for submodules #3828

Merged
merged 14 commits into from
Oct 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 125 additions & 39 deletions datalad/distribution/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,36 +77,99 @@
lgr = logging.getLogger('datalad.distribution.get')


def _get_flexible_source_candidates_for_submodule(ds, sm_path, sm_url=None):
"""Retrieve candidates from where to install the submodule
def _get_remotes_having_commit(repo, commit_hexsha, with_urls_only=True):
"""Traverse all remote branches and check whether the commit is in any of their ancestries

Even if url for submodule is provided explicitly -- first tries urls under
parent's module tracking branch remote.
Returns a list of the names of those remotes
"""
clone_urls = []
remote_branches = [
b['refname:strip=2']
for b in repo.for_each_ref_(
fields='refname:strip=2',
pattern='refs/remotes',
contains=commit_hexsha)]
return [
remote
for remote in repo.get_remotes(with_urls_only=with_urls_only)
if any(rb.startswith(remote + '/') for rb in remote_branches)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I see the point of using get_remote(with_urls_only=...) here, especially given this is only used in one spot. Wouldn't `for_each_ref_`'s output (with an additional layer stripped) be sufficient?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just keeping what the code did before. I tried to get rid of it, but couldn't figure out why things would break. I'll leave it for another time.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just keeping what the code did before.

Yes...

I tried to get rid of it, but couldn't figure out why things would break.

ah, all right, so my guess that it isn't needed is wrong. Thanks.

]


def _get_flexible_source_candidates_for_submodule(ds, sm):
"""Assemble candidates from where to install a submodule

Even if a URL for submodule is provided explicitly -- first tries urls under
parent's module tracking branch remote.

Additional candidate URLs can be generated based on templates specified as
configuration variables with the pattern

`datalad.get.subdataset-source-candidate-<name>`

where `name` is an arbitrary identifier.

A template string assigned to such a variable can utilize the Python format
mini language and may reference a number of properties that are inferred
from the parent dataset's knowledge about the target subdataset. Properties
include any submodule property specified in the respective .gitmodules
record. For convenience, an existing `datalad-id` record is made available
under the shortened name `id`.

Additionally, the URL of any configured remote that contains the respective
submodule commit is available as `remote-<name>` properties, where `name`
is the configured remote name.

Parameters
----------
ds : Dataset
Parent dataset of to-be-installed subdataset.
sm : dict
Submodule record as produced by `subdatasets()`.

Returns
-------
list of tuples
Where each tuple consists of a name and a URL. Names are not unique
and either derived from the name of the respective remote, template
configuration variable, or 'origin' for the candidate URL that was
obtained from the .gitmodule record.
"""
# short cuts
ds_repo = ds.repo
sm_url = sm.get('gitmodule_url', None)
sm_path = op.relpath(sm['path'], start=sm['parentds'])

# should be our first candidate
clone_urls = []

# CANDIDATE: tracking remote of the current branch
tracking_remote, tracking_branch = ds_repo.get_tracking_branch()
candidate_remotes = [tracking_remote] if tracking_remote else []

# if we have a remote, let's check the location of that remote
# for the presence of the desired submodule
try:
last_commit = ds_repo.get_last_commit_hash(sm_path)
last_commit = ds_repo.get_last_commit_hexsha(sm_path)
if last_commit:
# CANDIDATE: any remote that has the commit when the submodule was
# last modified

# ideally should also give preference to the remotes which have
# the same branch checked out I guess
candidate_remotes += list(ds_repo._get_remotes_having_commit(last_commit))
except StopIteration:
# no commit for it known yet, ... oh well
pass
candidate_remotes += list(_get_remotes_having_commit(ds_repo, last_commit))

# prepare a dict to generate URL candidates from templates
sm_candidate_props = {
k[10:].replace('datalad-id', 'id'): v
for k, v in sm.items()
if k.startswith('gitmodule_')
}

for remote in unique(candidate_remotes):
remote_url = ds_repo.get_remote_url(remote, push=False)

# Directly on parent's ds url
if remote_url:
# make remotes and their URLs available to template rendering
sm_candidate_props['remoteurl-{}'.format(remote)] = remote_url
# attempt: submodule checkout at parent remote URL
# We might need to quote sm_path portion, e.g. for spaces etc
if isinstance(RI(remote_url), URL):
Expand All @@ -115,41 +178,67 @@ def _get_flexible_source_candidates_for_submodule(ds, sm_path, sm_url=None):
sm_path_url = sm_path

clone_urls.extend(
_get_flexible_source_candidates(
(remote, url)
for url in _get_flexible_source_candidates(
# alternate suffixes are tested by `clone` anyways
sm_path_url, remote_url, alternate_suffix=False))
sm_path_url, remote_url, alternate_suffix=False)
)

# attempt: provided (configured?) submodule URL
# TODO: consider supporting DataLadRI here? or would confuse
# git and we wouldn't want that (i.e. not allow pure git clone
# --recursive)
if sm_url:
clone_urls += _get_flexible_source_candidates(
sm_url,
remote_url,
alternate_suffix=False
clone_urls.extend(
(remote, url)
for url in _get_flexible_source_candidates(
sm_url,
remote_url,
alternate_suffix=False)
)

# Do based on the ds.path as the last resort
for name, tmpl in [(c[40:], ds_repo.config[c])
for c in ds_repo.config.keys()
if c.startswith(
'datalad.get.subdataset-source-candidate-')]:
url = tmpl.format(**sm_candidate_props)
# we don't want "flexible_source_candidates" here, this is
# configuration that can be made arbitrarily precise from the
# outside. Additional guesswork can only make it slower
clone_urls.append((name, url))

# CANDIDATE: the actual configured gitmodule URL
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find this new wording more confusing than what it replaced. In Git's eyes, relative paths have the following resolution: (1) relative to the URL of the tracking branch, (2) if there is not a tracking branch, relative to the URL of "origin", or (3) if there is not a configured remote, relative to the working directory. So the new description is only accurately describing what Git considers to be the configured URL in cases that fall through to no. 3.

(As a sidenote that's not directly related to these changes, it looks like _get_flexible_source_candidates_for_submodule doesn't handle case no. 2, so there's a scenario where git submodule can clone something while datalad install can't.)

if sm_url:
clone_urls += _get_flexible_source_candidates(
sm_url,
ds.path,
alternate_suffix=False)
clone_urls.extend(
('local', url)
for url in _get_flexible_source_candidates(
sm_url,
ds.path,
alternate_suffix=False)
)

return unique(clone_urls, lambda x: x[1])

return unique(clone_urls)

def _install_subds_from_flexible_source(ds, sm, **kwargs):
"""Tries to obtain a given subdataset from several meaningful locations

def _install_subds_from_flexible_source(
ds, sm_path, sm_url, reckless, description=None):
"""Tries to obtain a given subdataset from several meaningful locations"""
Parameters
----------
ds : Dataset
Parent dataset of to-be-installed subdataset.
sm : dict
Submodule record as produced by `subdatasets()`.
**kwargs
Passed onto clone()
"""
sm_path = op.relpath(sm['path'], start=sm['parentds'])
# compose a list of candidate clone URLs
clone_urls = _get_flexible_source_candidates_for_submodule(
ds, sm_path, sm_url)
clone_urls = _get_flexible_source_candidates_for_submodule(ds, sm)

# prevent inevitable exception from `clone`
dest_path = op.join(ds.path, sm_path)
clone_urls = [src for src in clone_urls if src != dest_path]
clone_urls = [src for name, src in clone_urls if src != dest_path]

if not clone_urls:
# yield error
Expand All @@ -171,15 +260,14 @@ def _install_subds_from_flexible_source(
# pretend no parent -- we don't want clone to add to ds
# because this is a submodule already!
dataset=None,
reckless=reckless,
# if we have more than one source, pass as alternatives
alt_sources=clone_urls[1:],
description=description,
result_xfm=None,
# we yield all and have the caller decide
on_failure='ignore',
result_renderer='disabled',
return_type='generator'):
return_type='generator',
**kwargs):
yield res

subds = Dataset(dest_path)
Expand Down Expand Up @@ -263,9 +351,8 @@ def _install_necessary_subdatasets(
# get the module from
for res in _install_subds_from_flexible_source(
Dataset(cur_subds['parentds']),
op.relpath(cur_subds['path'], start=cur_subds['parentds']),
cur_subds['gitmodule_url'],
reckless,
cur_subds,
reckless=reckless,
description=description):
if res.get('action', None) == 'install':
if res['status'] == 'ok':
Expand Down Expand Up @@ -324,9 +411,8 @@ def _recursive_install_subds_underneath(ds, recursion_limit, reckless, start=Non
# try to get this dataset
for res in _install_subds_from_flexible_source(
ds,
op.relpath(sub['path'], start=ds.path),
sub['gitmodule_url'],
reckless,
sub,
reckless=reckless,
description=description):
# yield everything to let the caller decide how to deal with
# errors
Expand Down
84 changes: 70 additions & 14 deletions datalad/distribution/tests/test_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import os.path as op
from os.path import join as opj, basename
from glob import glob
from mock import patch

from datalad.api import create
from datalad.api import get
Expand Down Expand Up @@ -64,34 +65,89 @@ def _make_dataset_hierarchy(path):

@with_tempfile
@with_tempfile
def test_get_flexible_source_candidates_for_submodule(t, t2):
@with_tempfile
def test_get_flexible_source_candidates_for_submodule(t, t2, t3):
f = _get_flexible_source_candidates_for_submodule
# for now without mocking -- let's just really build a dataset
ds = create(t)
sub = ds.create('sub')
clone = install(
t2, source=t,
result_xfm='datasets', return_type='item-or-list')

# first one could just know about itself or explicit url provided
sshurl = 'ssh://e.c'
httpurl = 'http://e.c'
# Expansion with '/.git' no longer done in this helper
#sm_httpurls = [httpurl, httpurl + '/.git']
sm_httpurls = [httpurl]
eq_(f(ds, 'sub'), [])
eq_(f(ds, 'sub', sshurl), [sshurl])
eq_(f(ds, 'sub', httpurl), sm_httpurls)
eq_(f(ds, 'sub', None), []) # otherwise really we have no clue where to get from
ds_subpath = str(ds.pathobj / 'sub')
eq_(f(ds, dict(path=ds_subpath, parentds=ds.path)), [])
eq_(f(ds, dict(path=ds_subpath, parentds=ds.path, gitmodule_url=sshurl)),
[('local', sshurl)])
eq_(f(ds, dict(path=ds_subpath, parentds=ds.path, gitmodule_url=httpurl)),
[('local', httpurl)])

# but if we work on dsclone then it should also add urls deduced from its
# own location default remote for current branch
subpath = op.sep.join((t, 'sub'))
eq_(f(clone, 'sub'), [subpath])
eq_(f(clone, 'sub', sshurl), [subpath, sshurl])
eq_(f(clone, 'sub', httpurl), [subpath] + sm_httpurls)
eq_(f(clone, 'sub'), [subpath]) # otherwise really we have no clue where to get from
# TODO: check that http:// urls for the dataset itself get resolved
eq_(f(clone, dict(path=ds_subpath, parentds=t)), [('origin', ds_subpath)])
eq_(f(clone, dict(path=ds_subpath, parentds=t, gitmodule_url=sshurl)),
[('origin', ds_subpath), ('origin', sshurl)])
eq_(f(clone, dict(path=ds_subpath, parentds=t, gitmodule_url=httpurl)),
[('origin', ds_subpath), ('origin', httpurl)])

# make sure it does meaningful things in an actual clone with an actual
# record of a subdataset
clone_subpath = str(clone.pathobj / 'sub')
eq_(f(clone, clone.subdatasets(return_type='item-or-list')),
[
('origin', ds_subpath),
('local', clone_subpath),
])

# check that a configured remote WITHOUT the desired submodule commit
# does not show up as a candidate
clone.siblings('add', name='myremote', url='http://example.com',
result_renderer='disabled')
eq_(f(clone, clone.subdatasets(return_type='item-or-list')),
[
('origin', ds_subpath),
('local', clone_subpath),
])
# inject a source URL config, should alter the result accordingly
with patch.dict(
'os.environ',
{'DATALAD_GET_SUBDATASET__SOURCE__CANDIDATE__BANG': 'youredead'}):
eq_(f(clone, clone.subdatasets(return_type='item-or-list')),
[
('origin', ds_subpath),
('bang', 'youredead'),
('local', clone_subpath),
])
# verify template instantiation works
with patch.dict(
'os.environ',
{'DATALAD_GET_SUBDATASET__SOURCE__CANDIDATE__BANG': 'pre-{id}-post'}):
eq_(f(clone, clone.subdatasets(return_type='item-or-list')),
[
('origin', ds_subpath),
('bang', 'pre-{}-post'.format(sub.id)),
('local', clone_subpath),
])
# now again, but have an additional remote besides origin that
# actually has the relevant commit
clone3 = install(
t3, source=t2,
result_xfm='datasets', return_type='item-or-list')
clone3.siblings('add', name='myremote', url=ds.path,
result_renderer='disabled')
clone3.update(sibling='myremote')
# we should end up with three pieces
eq_(f(clone3, clone3.subdatasets(return_type='item-or-list')),
[
('origin', clone_subpath),
('myremote', ds_subpath),
('local', str(clone3.pathobj / 'sub')),
])

# TODO: check that http:// urls for the dataset itself get resolved
# TODO: many more!!


Expand Down
6 changes: 0 additions & 6 deletions datalad/distribution/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,4 @@ def _get_flexible_source_candidates(src, base_url=None, alternate_suffix=True):
candidates.append(
'{0}/.git'.format(src.rstrip('/')))

# TODO:
# We need to provide some error msg with InstallFailedError, since now
# it just swallows everything.
# yoh: not sure if this comment applies here, but could be still applicable
# outside

return candidates
2 changes: 1 addition & 1 deletion datalad/metadata/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,7 @@ def _filterpaths(basepath, paths, exclude):
if not relevant_paths:
return None

return ds.repo.get_last_commit_hash(relevant_paths)
return ds.repo.get_last_commit_hexsha(relevant_paths)


def _get_obj_location(hash_str, ref_type, dumper):
Expand Down
2 changes: 1 addition & 1 deletion datalad/metadata/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ def _mk_search_index(self, force_reindex):
from .metadata import get_ds_aggregate_db_locations
dbloc, db_base_path = get_ds_aggregate_db_locations(self.ds)
# what is the latest state of aggregated metadata
metadata_state = self.ds.repo.get_last_commit_hash(relpath(dbloc, start=self.ds.path))
metadata_state = self.ds.repo.get_last_commit_hexsha(relpath(dbloc, start=self.ds.path))
# use location common to all index types, they would all invalidate
# simultaneously
stamp_fname = opj(self.index_dir, 'datalad_metadata_state')
Expand Down
Loading