def rewrite_url(cfg, url):
    """Apply any matching 'url.<base>.insteadOf' configuration to a URL

    Any URL that starts with the value of such a configuration will be
    rewritten to start, instead, with <base>. When more than one insteadOf
    string matches a given URL, the longest match is used. This also holds
    when several values of one and the same 'url.<base>.insteadOf'
    configuration match.

    If the longest match is declared for more than one <base>, the
    configuration is ambiguous. In this case a warning is logged and the
    URL is returned unmodified.

    Parameters
    ----------
    cfg : ConfigManager or dict
      dict-like with configuration variable name/value-pairs. A value is
      either a single string, or a tuple of strings for a multi-value
      configuration.
    url : str
      URL to be rewritten, if matching configuration is found.

    Returns
    -------
    str
      Rewritten or unmodified URL.
    """
    prefix = 'url.'
    suffix = '.insteadof'
    # a usable variable name must hold prefix and suffix without overlap;
    # this rules out 'url.insteadof', which has no <base> at all
    min_len = len(prefix) + len(suffix)

    # all (base, insteadof-value) pairs that apply to this URL.
    # This must not be a dict keyed by base: one base may come with several
    # matching values, and all of them have to take part in the
    # longest-match selection below
    candidates = []
    for name, val in cfg.items():
        if len(name) < min_len \
                or not name.startswith(prefix) \
                or not name.endswith(suffix):
            continue
        # only leave the base url
        base = name[len(prefix):-len(suffix)]
        for v in (val if isinstance(val, tuple) else (val,)):
            if url.startswith(v):
                candidates.append((base, v))

    if not candidates:
        return url

    # find longest match, like Git does. All candidates are prefixes of
    # `url`, hence any two of the same (maximum) length are identical
    match = max((v for _, v in candidates), key=len)
    # distinct bases that claim the longest match, in order of appearance
    bases = []
    for base, v in candidates:
        if v == match and base not in bases:
            bases.append(base)

    if len(bases) > 1:
        lgr.warning(
            "Ignoring URL rewrite configuration for '%s', "
            "multiple conflicting definitions exists: %s",
            match,
            ['url.{}.insteadof'.format(b) for b in bases]
        )
        return url

    return '{}{}'.format(bases[0], url[len(match):])
'dataladri' props['giturl'] = source_ri.as_git_url() elif isinstance(source_ri, URL) and source_ri.scheme.startswith('ria+'): + # Git never gets to see these URLs, so let's manually apply any + # rewrite configuration Git might know about + source_ri = RI(cfg.rewrite_url(spec)) # parse a RIA URI dsid, version = source_ri.fragment.split('@', maxsplit=1) \ if '@' in source_ri.fragment else (source_ri.fragment, None) diff --git a/datalad/core/distributed/tests/test_clone.py b/datalad/core/distributed/tests/test_clone.py index cdd5c59986..4bd610cb5b 100644 --- a/datalad/core/distributed/tests/test_clone.py +++ b/datalad/core/distributed/tests/test_clone.py @@ -64,6 +64,7 @@ with_sameas_remote, known_failure, known_failure_appveyor, + patch_config, ) from datalad.core.distributed.clone import ( decode_source_spec, @@ -686,6 +687,29 @@ def test_ria_http(lcl, storepath, url): lcl / 'clone_failed') assert_in("not found in upstream", str(cme.exception)) + # lastly test if URL rewriting is in effect + # on the surface we clone from an SSH source identified by some custom + # label, no full URL, but URL rewriting setup maps it back to the + # HTTP URL used above + with patch_config({ + 'url.ria+{}#.insteadof'.format(url): 'ria+ssh://somelabel#'}): + cloned_by_label = clone( + 'ria+ssh://somelabel#{}'.format(origds.id), + lcl / 'cloned_by_label', + ) + # so we get the same setup as above, but.... + eq_(origds.id, cloned_by_label.id) + if not ds.repo.is_managed_branch(): + # test logic cannot handle adjusted branches + eq_(origds.repo.get_hexsha(), cloned_by_label.repo.get_hexsha()) + ok_(cloned_by_label.config.get('remote.origin.url').startswith(url)) + eq_(cloned_by_label.config.get('remote.origin.annex-ignore'), 'true') + # ... 
def test_rewrite_url():
    """Check rewrite_url() with a plain dict standing in for a ConfigManager"""
    # maps a rewrite base URL to its insteadof value(s)
    rewrites = {
        # label rewrite
        'git@example.com:': 'example:',
        # protocol change
        'https://example': 'git://example',
        # multi-value
        'ria+ssh://fully.qualified.com': ('mylabel', 'myotherlabel'),
        # conflicting definitions
        'http://host1': 'conflict',
        'http://host2': 'conflict',
        # hidden conflict
        'http://host3': 'conflict2',
        'http://host4': ('someokish', 'conflict2'),
    }
    # turn the bases into full configuration variable names
    config = {}
    for base, labels in rewrites.items():
        config['url.{}.insteadof'.format(base)] = labels

    # pairs of (URL to rewrite, expected result)
    expectations = (
        # no match
        ('unicorn', 'unicorn'),
        # custom label replacement
        ('example:datalad/datalad.git', 'git@example.com:datalad/datalad.git'),
        # protocol enforcement
        ('git://example.com/some', 'https://example.com/some'),
        # multi-match
        ('mylabel', 'ria+ssh://fully.qualified.com'),
        ('myotherlabel', 'ria+ssh://fully.qualified.com'),
        # conflicts, same label pointing to different URLs
        ('conflict', 'conflict'),
        # also conflicts, but hidden in a multi-value definition
        ('conflict2', 'conflict2'),
    )
    for given, expected in expectations:
        with swallow_logs(logging.WARNING) as cml:
            assert_equal(rewrite_url(config, given), expected)
            # a conflict leaves the URL untouched, but must be reported
            if given.startswith('conflict'):
                assert_in("Ignoring URL rewrite", cml.out)