Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More flexible clone URL generation for submodules #3828

Merged
merged 14 commits into from
Oct 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 125 additions & 39 deletions datalad/distribution/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,36 +77,99 @@
lgr = logging.getLogger('datalad.distribution.get')


def _get_flexible_source_candidates_for_submodule(ds, sm_path, sm_url=None):
"""Retrieve candidates from where to install the submodule
def _get_remotes_having_commit(repo, commit_hexsha, with_urls_only=True):
"""Traverse all remote branches and check whether the commit is in any of their ancestries

Even if url for submodule is provided explicitly -- first tries urls under
parent's module tracking branch remote.
Returns a list of the names of those remotes
"""
clone_urls = []
remote_branches = [
b['refname:strip=2']
for b in repo.for_each_ref_(
fields='refname:strip=2',
pattern='refs/remotes',
contains=commit_hexsha)]
return [
remote
for remote in repo.get_remotes(with_urls_only=with_urls_only)
if any(rb.startswith(remote + '/') for rb in remote_branches)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I see the point of using get_remote(with_urls_only=...) here, especially given this is only used in one spot. Wouldn't `for_each_ref_`'s output (with an additional layer stripped) be sufficient?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just keeping what the code did before. I tried to get rid of it, but couldn't figure out why things would break. I'll leave it for another time.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just keeping what the code did before.

Yes...

I tried to get rid of it, but couldn't figure out why things would break.

ah, all right, so my guess that it isn't needed is wrong. Thanks.

]


def _get_flexible_source_candidates_for_submodule(ds, sm):
"""Assemble candidates from where to install a submodule

Even if a URL for submodule is provided explicitly -- first tries urls under
parent's module tracking branch remote.

Additional candidate URLs can be generated based on templates specified as
configuration variables with the pattern

`datalad.get.subdataset-source-candidate-<name>`

where `name` is an arbitrary identifier.

A template string assigned to such a variable can utilize the Python format
mini language and may reference a number of properties that are inferred
from the parent dataset's knowledge about the target subdataset. Properties
include any submodule property specified in the respective .gitmodules
record. For convenience, an existing `datalad-id` record is made available
under the shortened name `id`.

Additionally, the URL of any configured remote that contains the respective
submodule commit is available as `remote-<name>` properties, where `name`
is the configured remote name.

Parameters
----------
ds : Dataset
Parent dataset of to-be-installed subdataset.
sm : dict
Submodule record as produced by `subdatasets()`.

Returns
-------
list of tuples
Where each tuple consists of a name and a URL. Names are not unique
and either derived from the name of the respective remote, template
configuration variable, or 'origin' for the candidate URL that was
obtained from the .gitmodule record.
"""
# short cuts
ds_repo = ds.repo
sm_url = sm.get('gitmodule_url', None)
sm_path = op.relpath(sm['path'], start=sm['parentds'])

# should be our first candidate
clone_urls = []

# CANDIDATE: tracking remote of the current branch
tracking_remote, tracking_branch = ds_repo.get_tracking_branch()
candidate_remotes = [tracking_remote] if tracking_remote else []

# if we have a remote, let's check the location of that remote
# for the presence of the desired submodule
try:
last_commit = ds_repo.get_last_commit_hash(sm_path)
last_commit = ds_repo.get_last_commit_hexsha(sm_path)
if last_commit:
# CANDIDATE: any remote that has the commit when the submodule was
# last modified

# ideally should also give preference to the remotes which have
# the same branch checked out I guess
candidate_remotes += list(ds_repo._get_remotes_having_commit(last_commit))
except StopIteration:
# no commit for it known yet, ... oh well
pass
candidate_remotes += list(_get_remotes_having_commit(ds_repo, last_commit))

# prepare a dict to generate URL candidates from templates
sm_candidate_props = {
k[10:].replace('datalad-id', 'id'): v
for k, v in sm.items()
if k.startswith('gitmodule_')
}

for remote in unique(candidate_remotes):
remote_url = ds_repo.get_remote_url(remote, push=False)

# Directly on parent's ds url
if remote_url:
# make remotes and their URLs available to template rendering
sm_candidate_props['remoteurl-{}'.format(remote)] = remote_url
# attempt: submodule checkout at parent remote URL
# We might need to quote sm_path portion, e.g. for spaces etc
if isinstance(RI(remote_url), URL):
Expand All @@ -115,41 +178,67 @@ def _get_flexible_source_candidates_for_submodule(ds, sm_path, sm_url=None):
sm_path_url = sm_path

clone_urls.extend(
_get_flexible_source_candidates(
(remote, url)
for url in _get_flexible_source_candidates(
# alternate suffixes are tested by `clone` anyways
sm_path_url, remote_url, alternate_suffix=False))
sm_path_url, remote_url, alternate_suffix=False)
)

# attempt: provided (configured?) submodule URL
# TODO: consider supporting DataLadRI here? or would confuse
# git and we wouldn't want that (i.e. not allow pure git clone
# --recursive)
if sm_url:
clone_urls += _get_flexible_source_candidates(
sm_url,
remote_url,
alternate_suffix=False
clone_urls.extend(
(remote, url)
for url in _get_flexible_source_candidates(
sm_url,
remote_url,
alternate_suffix=False)
)

# Do based on the ds.path as the last resort
for name, tmpl in [(c[40:], ds_repo.config[c])
for c in ds_repo.config.keys()
if c.startswith(
'datalad.get.subdataset-source-candidate-')]:
url = tmpl.format(**sm_candidate_props)
# we don't want "flexible_source_candidates" here, this is
# configuration that can be made arbitrarily precise from the
# outside. Additional guesswork can only make it slower
clone_urls.append((name, url))

# CANDIDATE: the actual configured gitmodule URL
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find this new wording more confusing than what it replaced. In Git's eyes, relative paths have the following resolution: (1) relative to the URL of the tracking branch, (2) if there is not a tracking branch, relative to the URL of "origin", or (3) if there is not a configured remote, relative to the working directory. So the new description is only accurately describing what Git considers to be the configured URL in cases that fall through to no. 3.

(As a sidenote that's not directly related to these changes, it looks like _get_flexible_source_candidates_for_submodule doesn't handle case no. 2, so there's a scenario where git submodule can clone something while datalad install can't.)

if sm_url:
clone_urls += _get_flexible_source_candidates(
sm_url,
ds.path,
alternate_suffix=False)
clone_urls.extend(
('local', url)
for url in _get_flexible_source_candidates(
sm_url,
ds.path,
alternate_suffix=False)
)

return unique(clone_urls, lambda x: x[1])

return unique(clone_urls)

def _install_subds_from_flexible_source(ds, sm, **kwargs):
"""Tries to obtain a given subdataset from several meaningful locations

def _install_subds_from_flexible_source(
ds, sm_path, sm_url, reckless, description=None):
"""Tries to obtain a given subdataset from several meaningful locations"""
Parameters
----------
ds : Dataset
Parent dataset of to-be-installed subdataset.
sm : dict
Submodule record as produced by `subdatasets()`.
**kwargs
Passed onto clone()
"""
sm_path = op.relpath(sm['path'], start=sm['parentds'])
# compose a list of candidate clone URLs
clone_urls = _get_flexible_source_candidates_for_submodule(
ds, sm_path, sm_url)
clone_urls = _get_flexible_source_candidates_for_submodule(ds, sm)

# prevent inevitable exception from `clone`
dest_path = op.join(ds.path, sm_path)
clone_urls = [src for src in clone_urls if src != dest_path]
clone_urls = [src for name, src in clone_urls if src != dest_path]

if not clone_urls:
# yield error
Expand All @@ -171,15 +260,14 @@ def _install_subds_from_flexible_source(
# pretend no parent -- we don't want clone to add to ds
# because this is a submodule already!
dataset=None,
reckless=reckless,
# if we have more than one source, pass as alternatives
alt_sources=clone_urls[1:],
description=description,
result_xfm=None,
# we yield all and have the caller decide
on_failure='ignore',
result_renderer='disabled',
return_type='generator'):
return_type='generator',
**kwargs):
yield res

subds = Dataset(dest_path)
Expand Down Expand Up @@ -263,9 +351,8 @@ def _install_necessary_subdatasets(
# get the module from
for res in _install_subds_from_flexible_source(
Dataset(cur_subds['parentds']),
op.relpath(cur_subds['path'], start=cur_subds['parentds']),
cur_subds['gitmodule_url'],
reckless,
cur_subds,
reckless=reckless,
description=description):
if res.get('action', None) == 'install':
if res['status'] == 'ok':
Expand Down Expand Up @@ -324,9 +411,8 @@ def _recursive_install_subds_underneath(ds, recursion_limit, reckless, start=Non
# try to get this dataset
for res in _install_subds_from_flexible_source(
ds,
op.relpath(sub['path'], start=ds.path),
sub['gitmodule_url'],
reckless,
sub,
reckless=reckless,
description=description):
# yield everything to let the caller decide how to deal with
# errors
Expand Down
84 changes: 70 additions & 14 deletions datalad/distribution/tests/test_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import os.path as op
from os.path import join as opj, basename
from glob import glob
from mock import patch

from datalad.api import create
from datalad.api import get
Expand Down Expand Up @@ -64,34 +65,89 @@ def _make_dataset_hierarchy(path):

@with_tempfile
@with_tempfile
def test_get_flexible_source_candidates_for_submodule(t, t2):
@with_tempfile
def test_get_flexible_source_candidates_for_submodule(t, t2, t3):
f = _get_flexible_source_candidates_for_submodule
# for now without mocking -- let's just really build a dataset
ds = create(t)
sub = ds.create('sub')
clone = install(
t2, source=t,
result_xfm='datasets', return_type='item-or-list')

# first one could just know about itself or explicit url provided
sshurl = 'ssh://e.c'
httpurl = 'http://e.c'
# Expansion with '/.git' no longer done in this helper
#sm_httpurls = [httpurl, httpurl + '/.git']
sm_httpurls = [httpurl]
eq_(f(ds, 'sub'), [])
eq_(f(ds, 'sub', sshurl), [sshurl])
eq_(f(ds, 'sub', httpurl), sm_httpurls)
eq_(f(ds, 'sub', None), []) # otherwise really we have no clue where to get from
ds_subpath = str(ds.pathobj / 'sub')
eq_(f(ds, dict(path=ds_subpath, parentds=ds.path)), [])
eq_(f(ds, dict(path=ds_subpath, parentds=ds.path, gitmodule_url=sshurl)),
[('local', sshurl)])
eq_(f(ds, dict(path=ds_subpath, parentds=ds.path, gitmodule_url=httpurl)),
[('local', httpurl)])

# but if we work on dsclone then it should also add urls deduced from its
# own location default remote for current branch
subpath = op.sep.join((t, 'sub'))
eq_(f(clone, 'sub'), [subpath])
eq_(f(clone, 'sub', sshurl), [subpath, sshurl])
eq_(f(clone, 'sub', httpurl), [subpath] + sm_httpurls)
eq_(f(clone, 'sub'), [subpath]) # otherwise really we have no clue where to get from
# TODO: check that http:// urls for the dataset itself get resolved
eq_(f(clone, dict(path=ds_subpath, parentds=t)), [('origin', ds_subpath)])
eq_(f(clone, dict(path=ds_subpath, parentds=t, gitmodule_url=sshurl)),
[('origin', ds_subpath), ('origin', sshurl)])
eq_(f(clone, dict(path=ds_subpath, parentds=t, gitmodule_url=httpurl)),
[('origin', ds_subpath), ('origin', httpurl)])

# make sure it does meaningful things in an actual clone with an actual
# record of a subdataset
clone_subpath = str(clone.pathobj / 'sub')
eq_(f(clone, clone.subdatasets(return_type='item-or-list')),
[
('origin', ds_subpath),
('local', clone_subpath),
])

# check that a configured remote WITHOUT the desired submodule commit
# does not show up as a candidate
clone.siblings('add', name='myremote', url='http://example.com',
result_renderer='disabled')
eq_(f(clone, clone.subdatasets(return_type='item-or-list')),
[
('origin', ds_subpath),
('local', clone_subpath),
])
# inject a source URL config, should alter the result accordingly
with patch.dict(
'os.environ',
{'DATALAD_GET_SUBDATASET__SOURCE__CANDIDATE__BANG': 'youredead'}):
eq_(f(clone, clone.subdatasets(return_type='item-or-list')),
[
('origin', ds_subpath),
('bang', 'youredead'),
('local', clone_subpath),
])
# verify template instantiation works
with patch.dict(
'os.environ',
{'DATALAD_GET_SUBDATASET__SOURCE__CANDIDATE__BANG': 'pre-{id}-post'}):
eq_(f(clone, clone.subdatasets(return_type='item-or-list')),
[
('origin', ds_subpath),
('bang', 'pre-{}-post'.format(sub.id)),
('local', clone_subpath),
])
# now again, but have an additional remote besides origin that
# actually has the relevant commit
clone3 = install(
t3, source=t2,
result_xfm='datasets', return_type='item-or-list')
clone3.siblings('add', name='myremote', url=ds.path,
result_renderer='disabled')
clone3.update(sibling='myremote')
# we should end up with three pieces
eq_(f(clone3, clone3.subdatasets(return_type='item-or-list')),
[
('origin', clone_subpath),
('myremote', ds_subpath),
('local', str(clone3.pathobj / 'sub')),
])

# TODO: check that http:// urls for the dataset itself get resolved
# TODO: many more!!


Expand Down
6 changes: 0 additions & 6 deletions datalad/distribution/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,4 @@ def _get_flexible_source_candidates(src, base_url=None, alternate_suffix=True):
candidates.append(
'{0}/.git'.format(src.rstrip('/')))

# TODO:
# We need to provide some error msg with InstallFailedError, since now
# it just swallows everything.
# yoh: not sure if this comment applies here, but could be still applicable
# outside

return candidates
2 changes: 1 addition & 1 deletion datalad/metadata/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,7 @@ def _filterpaths(basepath, paths, exclude):
if not relevant_paths:
return None

return ds.repo.get_last_commit_hash(relevant_paths)
return ds.repo.get_last_commit_hexsha(relevant_paths)


def _get_obj_location(hash_str, ref_type, dumper):
Expand Down
2 changes: 1 addition & 1 deletion datalad/metadata/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ def _mk_search_index(self, force_reindex):
from .metadata import get_ds_aggregate_db_locations
dbloc, db_base_path = get_ds_aggregate_db_locations(self.ds)
# what is the latest state of aggregated metadata
metadata_state = self.ds.repo.get_last_commit_hash(relpath(dbloc, start=self.ds.path))
metadata_state = self.ds.repo.get_last_commit_hexsha(relpath(dbloc, start=self.ds.path))
# use location common to all index types, they would all invalidate
# simultaneously
stamp_fname = opj(self.index_dir, 'datalad_metadata_state')
Expand Down
Loading