diff --git a/datalad/config.py b/datalad/config.py
index 77c8ee07e9..746614c619 100644
--- a/datalad/config.py
+++ b/datalad/config.py
@@ -722,3 +722,61 @@ def unset(self, var, where='dataset', reload=True):
# use unset all as it is simpler for now
self._run(['--unset-all', var], where=where, reload=reload)
+
+
def rewrite_url(cfg, url):
    """Apply any matching 'url.<base>.insteadOf' configuration to a URL

    Any URL that starts with the value of such a configuration will be
    rewritten to start, instead, with <base>. When more than one insteadOf
    string matches a given URL, the longest match is used, like Git does.

    If the longest match is configured for more than one <base>, the
    configuration is ambiguous. In this case a warning is logged and the URL
    is returned unmodified.

    Parameters
    ----------
    cfg : ConfigManager or dict
      dict-like with configuration variable name/value-pairs. Multiple
      values of a single variable are expected as a tuple (or list).
      Values that are not strings (e.g. None) are ignored.
    url : str
      URL to be rewritten, if matching configuration is found.

    Returns
    -------
    str
      Rewritten or unmodified URL.
    """
    prefix = 'url.'
    suffix = '.insteadof'
    min_len = len(prefix) + len(suffix)

    # all (base, insteadof-value) pairs where the value is a prefix of `url`.
    # This must be a list of pairs, not a dict keyed by base: a single base
    # can have several matching values, and all of them have to take part in
    # the longest-match selection
    candidates = []
    for name, val in cfg.items():
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        if len(name) < min_len:
            # prefix and suffix overlap ('url.insteadof'), there is no base
            continue
        # only leave the base url
        base = name[len(prefix):-len(suffix)]
        values = val if isinstance(val, (tuple, list)) else (val,)
        candidates.extend(
            (base, v)
            for v in values
            # a valueless setting cannot be a prefix of anything
            if isinstance(v, str) and url.startswith(v)
        )

    if not candidates:
        return url

    # find longest match, like Git does. All candidates are prefixes of the
    # same URL, hence equal length implies an identical string
    match = max((v for _, v in candidates), key=len)
    # unique bases claiming the longest match, in order of appearance
    bases = []
    for base, v in candidates:
        if v == match and base not in bases:
            bases.append(base)

    if len(bases) > 1:
        lgr.warning(
            "Ignoring URL rewrite configuration for '%s', "
            "multiple conflicting definitions exists: %s",
            match,
            ['url.{}.insteadof'.format(b) for b in bases]
        )
        return url

    return '{}{}'.format(bases[0], url[len(match):])
+
+
+# for convenience, bind to class too: this makes the function available as a
+# method, i.e. `cfg.rewrite_url(url)` passes the ConfigManager instance itself
+# as the `cfg` argument
+ConfigManager.rewrite_url = rewrite_url
diff --git a/datalad/core/distributed/clone.py b/datalad/core/distributed/clone.py
index 4fec4961b6..5c786bd311 100644
--- a/datalad/core/distributed/clone.py
+++ b/datalad/core/distributed/clone.py
@@ -771,6 +771,9 @@ def decode_source_spec(spec, cfg=None):
props['type'] = 'dataladri'
props['giturl'] = source_ri.as_git_url()
elif isinstance(source_ri, URL) and source_ri.scheme.startswith('ria+'):
+ # Git never gets to see these URLs, so let's manually apply any
+ # rewrite configuration Git might know about
+ source_ri = RI(cfg.rewrite_url(spec))
# parse a RIA URI
dsid, version = source_ri.fragment.split('@', maxsplit=1) \
if '@' in source_ri.fragment else (source_ri.fragment, None)
diff --git a/datalad/core/distributed/tests/test_clone.py b/datalad/core/distributed/tests/test_clone.py
index cdd5c59986..4bd610cb5b 100644
--- a/datalad/core/distributed/tests/test_clone.py
+++ b/datalad/core/distributed/tests/test_clone.py
@@ -64,6 +64,7 @@
with_sameas_remote,
known_failure,
known_failure_appveyor,
+ patch_config,
)
from datalad.core.distributed.clone import (
decode_source_spec,
@@ -686,6 +687,29 @@ def test_ria_http(lcl, storepath, url):
lcl / 'clone_failed')
assert_in("not found in upstream", str(cme.exception))
+ # lastly test if URL rewriting is in effect
+ # on the surface we clone from an SSH source identified by some custom
+ # label, no full URL, but URL rewriting setup maps it back to the
+ # HTTP URL used above
+ with patch_config({
+ 'url.ria+{}#.insteadof'.format(url): 'ria+ssh://somelabel#'}):
+ cloned_by_label = clone(
+ 'ria+ssh://somelabel#{}'.format(origds.id),
+ lcl / 'cloned_by_label',
+ )
+ # so we get the same setup as above, but....
+ eq_(origds.id, cloned_by_label.id)
+ if not ds.repo.is_managed_branch():
+ # test logic cannot handle adjusted branches
+ eq_(origds.repo.get_hexsha(), cloned_by_label.repo.get_hexsha())
+ ok_(cloned_by_label.config.get('remote.origin.url').startswith(url))
+ eq_(cloned_by_label.config.get('remote.origin.annex-ignore'), 'true')
+ # ... the clone candidates go with the label-based URL such that
+ # future get() requests acknowledge a (system-wide) configuration
+ # update
+ eq_(cloned_by_label.config.get('datalad.get.subdataset-source-candidate-origin'),
+ 'ria+ssh://somelabel#{id}')
+
@skip_if_no_network
@with_tempfile()
diff --git a/datalad/tests/test_config.py b/datalad/tests/test_config.py
index f7a258474d..67cfdd00ff 100644
--- a/datalad/tests/test_config.py
+++ b/datalad/tests/test_config.py
@@ -10,6 +10,7 @@
"""
+import logging
import os
from os.path import exists
from os.path import join as opj
@@ -32,7 +33,10 @@
from datalad.distribution.dataset import Dataset
from datalad.api import create
-from datalad.config import ConfigManager
+from datalad.config import (
+ ConfigManager,
+ rewrite_url,
+)
from datalad.cmd import CommandError
from datalad.support.external_versions import external_versions
@@ -373,3 +377,44 @@ def test_overrides():
cfg._cfgfiles,
[Path(f).read_text() for f in cfg._cfgfiles if Path(f).exists()],
))
+
+
def test_rewrite_url():
    """Check rewrite_url() against a plain dict of 'url.<base>.insteadof' items

    Covers unmatched URLs, label and protocol rewrites, multi-value
    settings, longest-match selection, and conflicting definitions. The
    latter must leave the URL untouched and issue a warning.
    """
    # (URL given, URL expected after rewrite)
    test_cases = (
        # no match
        ('unicorn', 'unicorn'),
        # custom label replacement
        ('example:datalad/datalad.git', 'git@example.com:datalad/datalad.git'),
        # protocol enforcement
        ('git://example.com/some', 'https://example.com/some'),
        # longest match wins over the shorter 'git://example' match
        ('git://example.com/special/repo', 'https://special.example.com/repo'),
        # multi-match
        ('mylabel', 'ria+ssh://fully.qualified.com'),
        ('myotherlabel', 'ria+ssh://fully.qualified.com'),
        # conflicts, same label pointing to different URLs
        ('conflict', 'conflict'),
        # also conflicts, but hidden in a multi-value definition
        ('conflict2', 'conflict2'),
    )
    # rewrite base -> insteadof value(s)
    cfg_in = {
        # label rewrite
        'git@example.com:': 'example:',
        # protocol change
        'https://example': 'git://example',
        # more specific (longer) variant of the protocol change
        'https://special.example.com/': 'git://example.com/special/',
        # multi-value
        'ria+ssh://fully.qualified.com': ('mylabel', 'myotherlabel'),
        # conflicting definitions
        'http://host1': 'conflict',
        'http://host2': 'conflict',
        # hidden conflict
        'http://host3': 'conflict2',
        'http://host4': ('someokish', 'conflict2'),
    }
    cfg = {
        'url.{}.insteadof'.format(base): value
        for base, value in cfg_in.items()
    }
    for url_in, expected in test_cases:
        with swallow_logs(logging.WARNING) as msg:
            assert_equal(rewrite_url(cfg, url_in), expected)
            if url_in.startswith('conflict'):
                assert_in("Ignoring URL rewrite", msg.out)
            else:
                # an unambiguous configuration must not cause complaints
                assert "Ignoring URL rewrite" not in msg.out