Skip to content

Commit

Permalink
Merge 21a8b95 into 8bb550b
Browse files Browse the repository at this point in the history
  • Loading branch information
kyleam committed May 10, 2018
2 parents 8bb550b + 21a8b95 commit 0e536e0
Show file tree
Hide file tree
Showing 7 changed files with 299 additions and 12 deletions.
18 changes: 14 additions & 4 deletions datalad/interface/rerun.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import logging
from itertools import dropwhile
import json
import os
import re
import sys

Expand Down Expand Up @@ -70,6 +71,11 @@ class Rerun(Interface):
% # now on verify branch
% datalad diff --revision=master..
% git log --oneline --left-right --cherry-pick master...
.. note::
Currently the "onto" feature only sets the working tree of the current
dataset to a previous state. The working trees of any subdatasets remain
unchanged.
"""
_params_ = dict(
revision=Parameter(
Expand Down Expand Up @@ -313,12 +319,16 @@ def __call__(
# bring back the entire state of the tree with #1424, but
# we limit ourself to file addition/not-in-place-modification
# for now
for r in ds.unlock(new_or_modified(ds, hexsha),
return_type='generator', result_xfm=None):
yield r
auto_outputs = (os.path.relpath(ap["path"], ds.path)
for ap in new_or_modified(ds, hexsha))
outputs = run_info.get("outputs", [])
auto_outputs = [p for p in auto_outputs if p not in outputs]

for r in run_command(run_info['cmd'],
ds, message or rev["run_message"],
dataset=ds,
inputs=run_info.get("inputs", []),
outputs=outputs + auto_outputs,
message=message or rev["run_message"],
rerun_info=run_info):
yield r

Expand Down
87 changes: 85 additions & 2 deletions datalad/interface/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,16 @@
from datalad.support.param import Parameter

from datalad.distribution.add import Add
from datalad.distribution.get import Get
from datalad.distribution.remove import Remove
from datalad.distribution.dataset import require_dataset
from datalad.distribution.dataset import EnsureDataset
from datalad.distribution.dataset import datasetmethod
from datalad.interface.unlock import Unlock

from datalad.utils import get_dataset_root
from datalad.utils import getpwd
from datalad.utils import partition

lgr = logging.getLogger('datalad.interface.run')

Expand Down Expand Up @@ -78,6 +82,31 @@ class Run(Interface):
working directory. If a dataset is given, the command will be
executed in the root directory of this dataset.""",
constraints=EnsureDataset() | EnsureNone()),
inputs=Parameter(
args=("--input",),
dest="inputs",
metavar=("PATH"),
action='append',
doc="""A dependency for the run. Before running the command, the
content of this file will be retrieved. A value of "." means "run
:command:`datalad get .`", and any other values given are ignored.
If the value doesn't match any known annex file, it will be treated
as a glob and passed to :command:`git annex find --include=`. The
value should be specified relative to the top-level directory of
the current dataset. Using '*' for this value means "all current
annex files". Note: Globbing currently only considers the current
dataset, not any subdatasets. [CMD: This option can be given more
than once. CMD]"""),
outputs=Parameter(
args=("--output",),
dest="outputs",
metavar=("PATH"),
action='append',
doc="""Prepare this file to be an output file of the command. If
the content of this file is present, unlock the file. Otherwise,
remove it. Globs are treated in the same way described for [PY:
`inputs` PY][CMD: --input CMD]. [CMD: This option can be given more
than once. CMD]"""),
message=save_message_opt,
rerun=Parameter(
args=('--rerun',),
Expand All @@ -94,6 +123,8 @@ class Run(Interface):
def __call__(
cmd=None,
dataset=None,
inputs=None,
outputs=None,
message=None,
rerun=False):
if rerun:
Expand All @@ -106,14 +137,28 @@ def __call__(
yield r
else:
if cmd:
for r in run_command(cmd, dataset, message):
for r in run_command(cmd, dataset=dataset,
inputs=inputs, outputs=outputs,
message=message):
yield r
else:
lgr.warning("No command given")


def _resolve_files(dset, globs_or_files):
    """Expand --include globs in `globs_or_files` to file names.

    Entries that annex already knows about are returned as-is; every
    other entry is treated as a glob and expanded via
    `git annex find --include=`.
    """
    known, patterns = [], []
    for name in globs_or_files:
        # Anything already under annex is a literal file name; the rest
        # are treated as glob patterns.
        if dset.repo.is_under_annex([name], batch=True)[0]:
            known.append(name)
        else:
            patterns.append(name)
    expanded = dset.repo.get_annexed_files(patterns=patterns) if patterns else []
    return known + expanded


# This helper function is used to add the rerun_info argument.
def run_command(cmd, dataset=None, message=None, rerun_info=None):
def run_command(cmd, dataset=None, inputs=None, outputs=None,
message=None, rerun_info=None):
rel_pwd = rerun_info.get('pwd') if rerun_info else None
if rel_pwd and dataset:
# recording is relative to the dataset
Expand Down Expand Up @@ -153,6 +198,40 @@ def run_command(cmd, dataset=None, message=None, rerun_info=None):
'cannot detect changes by command'))
return

if inputs is None:
inputs = []
elif any(i.strip() == "." for i in inputs):
inputs = ["."]
for res in ds.get(inputs):
yield res
elif inputs:
if not rerun_info:
inputs = _resolve_files(ds, inputs)
if not inputs:
lgr.warning("No matching files found for --input")
else:
for res in ds.get(inputs):
yield res

if outputs is None:
outputs = []
elif outputs:
if not rerun_info:
outputs = _resolve_files(ds, outputs)
if not outputs:
lgr.warning("No matching files found for --output")
else:
for res in ds.unlock(outputs, on_failure="ignore"):
if res["status"] == "impossible":
if "no content" in res["message"]:
for rem_res in ds.remove(res["path"],
check=False, save=False):
yield rem_res
continue
elif "path does not exist" in res["message"]:
continue
yield res

# anticipate quoted compound shell commands
cmd = cmd[0] if isinstance(cmd, list) and len(cmd) == 1 else cmd

Expand Down Expand Up @@ -203,6 +282,10 @@ def run_command(cmd, dataset=None, message=None, rerun_info=None):
'cmd': cmd,
'exit': cmd_exitcode if cmd_exitcode is not None else 0,
'chain': rerun_info["chain"] if rerun_info else [],
'inputs': inputs,
# Get outputs from the rerun_info because rerun adds new/modified files
# to the outputs argument.
'outputs': rerun_info["outputs"] if rerun_info else outputs
}
if rel_pwd is not None:
# only when inside the dataset to not leak information
Expand Down
103 changes: 102 additions & 1 deletion datalad/interface/tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
from datalad.tests.utils import skip_if_on_windows
from datalad.tests.utils import ignore_nose_capturing_stdout
from datalad.tests.utils import slow
from datalad.tests.utils import with_testrepos


@with_tempfile(mkdir=True)
Expand Down Expand Up @@ -156,6 +157,10 @@ def test_rerun(path, nodspath):
# Or --since= to run all reachable commits.
ds.rerun(since="")
eq_('xxxxxxxxxx\n', open(probe_path).read())
# If a file is dropped, we remove it instead of unlocking it.
ds.drop(probe_path, check=False)
ds.rerun()
eq_('x\n', open(probe_path).read())
# If the history to rerun has a merge commit, we abort.
ds.repo.checkout("HEAD~3", options=["-b", "topic"])
with open(opj(path, "topic-file"), "w") as f:
Expand Down Expand Up @@ -404,7 +409,11 @@ def test_rerun_cherry_pick(path):
for onto, text in [("HEAD", "skipping"), ("prerun", "cherry picking")]:
results = ds.rerun(since="prerun", onto=onto)
assert_in_results(results, status='ok', path=ds.path)
assert any(r.get("message", "").endswith(text) for r in results)

messages = (r.get("message", "") for r in results)
# Message may be a tuple.
messages = (m for m in messages if hasattr(m, "endswith"))
assert any(m.endswith(text) for m in messages)


@ignore_nose_capturing_stdout
Expand Down Expand Up @@ -558,6 +567,98 @@ def test_rerun_script(path):
cmout.getvalue().splitlines())


@slow # ~10s
@ignore_nose_capturing_stdout
@skip_if_on_windows
@with_testrepos('basic_annex', flavors=['clone'])
@known_failure_direct_mode #FIXME
@known_failure_v6 #FIXME
def test_run_inputs_outputs(path):
    """Exercise the --input/--output handling of `datalad run`."""
    ds = Dataset(path)

    # The clone starts without the annexed content present.
    assert_false(ds.repo.file_has_content("test-annex.dat"))

    # If we specify test-annex.dat as an input, it will be retrieved before the
    # run.
    ds.run("cat test-annex.dat test-annex.dat >doubled.dat",
           inputs=["test-annex.dat"])

    ok_clean_git(ds.path)
    ok_(ds.repo.file_has_content("test-annex.dat"))
    ok_(ds.repo.file_has_content("doubled.dat"))

    # Rerunning the commit will also get the input file.
    ds.repo.drop("test-annex.dat", options=["--force"])
    assert_false(ds.repo.file_has_content("test-annex.dat"))
    ds.rerun()
    ok_(ds.repo.file_has_content("test-annex.dat"))

    # An --input glob matching nothing only warns; the run still happens.
    with swallow_logs(new_level=logging.WARN) as cml:
        ds.run("touch dummy", inputs=["*.not-an-extension"])
        assert_in("No matching files found for --input", cml.out)

    # Test different combinations of globs and explicit files.
    inputs = ["a.dat", "b.dat", "c.txt", "d.txt"]
    create_tree(ds.path, {i: i for i in inputs})

    # Push content to origin so it can be dropped locally and re-fetched.
    ds.add(".")
    ds.repo.copy_to(inputs, remote="origin")
    ds.repo.drop(inputs, options=["--force"])

    test_cases = [(["*.dat"], ["a.dat", "b.dat"]),
                  (["*.dat", "c.txt"], ["a.dat", "b.dat", "c.txt"]),
                  (["*"], inputs)]

    for idx, (inputs_arg, expected_present) in enumerate(test_cases):
        assert_false(any(ds.repo.file_has_content(i) for i in inputs))

        ds.run("touch dummy{}".format(idx), inputs=inputs_arg)
        ok_(all(ds.repo.file_has_content(f) for f in expected_present))

        ds.repo.drop(inputs, options=["--force"])

    # --input=. runs "datalad get ."
    ds.run("touch dot-dummy", inputs=["."])
    eq_(ds.repo.get_annexed_files(),
        ds.repo.get_annexed_files(with_content_only=True))
    # On rerun, we get all files, even those that weren't in the tree at the
    # time of the run.
    create_tree(ds.path, {"after-dot-run": "after-dot-run content"})
    ds.add(".")
    ds.repo.copy_to(["after-dot-run"], remote="origin")
    ds.repo.drop(["after-dot-run"], options=["--force"])
    ds.rerun("HEAD^")
    # NOTE(review): the return value below is unused; presumably this was
    # meant to be wrapped in ok_(...) — confirm and fix upstream.
    ds.repo.file_has_content("after-dot-run")

    # --output will unlock files that are present.
    ds.repo.get("a.dat")
    ds.run("echo ' appended' >>a.dat", outputs=["a.dat"])
    with open(opj(path, "a.dat")) as fh:
        eq_(fh.read(), "a.dat appended\n")

    # --output will remove files that are not present.
    ds.repo.drop("a.dat", options=["--force"])
    ds.run("echo ' appended' >>a.dat", outputs=["a.dat"])
    with open(opj(path, "a.dat")) as fh:
        eq_(fh.read(), " appended\n")

    # --input can be combined with --output.
    ds.repo.repo.git.reset("--hard", "HEAD~2")
    ds.run("echo ' appended' >>a.dat", inputs=["a.dat"], outputs=["a.dat"])
    with open(opj(path, "a.dat")) as fh:
        eq_(fh.read(), "a.dat appended\n")

    # As with --input, a non-matching --output glob only warns.
    with swallow_logs(new_level=logging.WARN) as cml:
        ds.run("echo blah", outputs=["*.not-an-extension"])
        assert_in("No matching files found for --output", cml.out)

    # --output paths inside a subdataset work for both the unlock (content
    # present) and remove (content dropped) paths.
    ds.create('sub')
    ds.run("echo sub_orig >sub/subfile")
    ds.run("echo sub_overwrite >sub/subfile", outputs=["sub/subfile"])
    ds.drop("sub/subfile", check=False)
    ds.run("echo sub_overwrite >sub/subfile", outputs=["sub/subfile"])


def test_rerun_commit_message_check():
assert_raises(ValueError,
get_run_info,
Expand Down
30 changes: 27 additions & 3 deletions datalad/support/annexrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -2565,11 +2565,35 @@ def repo_info(self, fast=False):
assert(info.pop('command') == 'info')
return info # just as is for now

def get_annexed_files(self, with_content_only=False, patterns=None):
    """Get a list of files in annex

    Parameters
    ----------
    with_content_only : bool, optional
        Only list files whose content is present.
    patterns : list, optional
        Globs to pass to annex's `--include=`.  Files that match any of
        these will be returned (i.e., they'll be separated by `--or`).

    Returns
    -------
    A list of file names
    """
    if not patterns:
        # No globs: match everything (--in below restricts to present
        # content when requested, so no --include is needed then).
        args = [] if with_content_only else ['--include', "*"]
    elif len(patterns) == 1:
        args = ['--include', patterns[0]]
    else:
        # OR the globs together inside an annex matching group:
        # -( --include p1 --or --include p2 ... -)
        args = ['-(']
        for pat in patterns[:-1]:
            args.extend(['--include', pat, "--or"])
        args.extend(['--include', patterns[-1], '-)'])

    if with_content_only:
        args.extend(['--in', 'here'])
    out, _ = self._run_annex_command('find', annex_options=args)
    # TODO: JSON
    return out.splitlines()
Expand Down
27 changes: 27 additions & 0 deletions datalad/support/tests/test_annexrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -1327,6 +1327,33 @@ def test_annex_drop(src, dst):
assert_raises(CommandError, ar.drop, ['.'], options=['--all'])


@with_tree({"a.txt": "a", "b.txt": "b", "c.py": "c", "d": "d"})
def test_annex_get_annexed_files(path):
    """Check glob and content filtering of AnnexRepo.get_annexed_files."""
    repo = AnnexRepo(path)
    repo.add(".", commit=True)

    def listed(**kwargs):
        # Small helper: query and compare as a set (order is irrelevant).
        return set(repo.get_annexed_files(**kwargs))

    everything = {"a.txt", "b.txt", "c.py", "d"}
    eq_(listed(), everything)

    # Dropped files are still annexed; they only vanish from the
    # with_content_only view.
    repo.drop("a.txt", options=["--force"])
    eq_(listed(), everything)
    eq_(listed(with_content_only=True), {"b.txt", "c.py", "d"})

    eq_(listed(patterns=["*.txt"]), {"a.txt", "b.txt"})
    eq_(listed(with_content_only=True, patterns=["*.txt"]), {"b.txt"})

    eq_(listed(patterns=["*.txt", "*.py"]), {"a.txt", "b.txt", "c.py"})

    # A bare "*" glob is equivalent to no pattern at all...
    eq_(listed(), listed(patterns=["*"]))

    # ... with or without the content restriction.
    eq_(listed(with_content_only=True),
        listed(with_content_only=True, patterns=["*"]))


@with_testrepos('basic_annex', flavors=['clone'])
def test_annex_remove(path):
repo = AnnexRepo(path, create=False)
Expand Down
Loading

0 comments on commit 0e536e0

Please sign in to comment.