Skip to content

Commit

Permalink
Merge 21a8b95 into 8bb550b
Browse files Browse the repository at this point in the history
  • Loading branch information
kyleam committed May 10, 2018
2 parents 8bb550b + 21a8b95 commit 0e536e0
Show file tree
Hide file tree
Showing 7 changed files with 299 additions and 12 deletions.
18 changes: 14 additions & 4 deletions datalad/interface/rerun.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import logging
from itertools import dropwhile
import json
import os
import re
import sys

Expand Down Expand Up @@ -70,6 +71,11 @@ class Rerun(Interface):
% # now on verify branch
% datalad diff --revision=master..
% git log --oneline --left-right --cherry-pick master...
.. note::
Currently the "onto" feature only sets the working tree of the current
dataset to a previous state. The working trees of any subdatasets remain
unchanged.
"""
_params_ = dict(
revision=Parameter(
Expand Down Expand Up @@ -313,12 +319,16 @@ def __call__(
# bring back the entire state of the tree with #1424, but
# we limit ourself to file addition/not-in-place-modification
# for now
for r in ds.unlock(new_or_modified(ds, hexsha),
return_type='generator', result_xfm=None):
yield r
auto_outputs = (os.path.relpath(ap["path"], ds.path)
for ap in new_or_modified(ds, hexsha))
outputs = run_info.get("outputs", [])
auto_outputs = [p for p in auto_outputs if p not in outputs]

for r in run_command(run_info['cmd'],
ds, message or rev["run_message"],
dataset=ds,
inputs=run_info.get("inputs", []),
outputs=outputs + auto_outputs,
message=message or rev["run_message"],
rerun_info=run_info):
yield r

Expand Down
87 changes: 85 additions & 2 deletions datalad/interface/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,16 @@
from datalad.support.param import Parameter

from datalad.distribution.add import Add
from datalad.distribution.get import Get
from datalad.distribution.remove import Remove
from datalad.distribution.dataset import require_dataset
from datalad.distribution.dataset import EnsureDataset
from datalad.distribution.dataset import datasetmethod
from datalad.interface.unlock import Unlock

from datalad.utils import get_dataset_root
from datalad.utils import getpwd
from datalad.utils import partition

lgr = logging.getLogger('datalad.interface.run')

Expand Down Expand Up @@ -78,6 +82,31 @@ class Run(Interface):
working directory. If a dataset is given, the command will be
executed in the root directory of this dataset.""",
constraints=EnsureDataset() | EnsureNone()),
inputs=Parameter(
args=("--input",),
dest="inputs",
metavar=("PATH"),
action='append',
doc="""A dependency for the run. Before running the command, the
content of this file will be retrieved. A value of "." means "run
:command:`datalad get .`", and any other values given are ignored.
If the value doesn't match any known annex file, it will be treated
as a glob and passed to :command:`git annex find --include=`. The
value should be specified relative to the top-level directory of
the current dataset. Using '*' for this value means "all current
annex files". Note: Globbing currently only considers the current
dataset, not any subdatasets. [CMD: This option can be given more
than once. CMD]"""),
outputs=Parameter(
args=("--output",),
dest="outputs",
metavar=("PATH"),
action='append',
doc="""Prepare this file to be an output file of the command. If
the content of this file is present, unlock the file. Otherwise,
remove it. Globs are treated in the same way described for [PY:
`inputs` PY][CMD: --input CMD]. [CMD: This option can be given more
than once. CMD]"""),
message=save_message_opt,
rerun=Parameter(
args=('--rerun',),
Expand All @@ -94,6 +123,8 @@ class Run(Interface):
def __call__(
cmd=None,
dataset=None,
inputs=None,
outputs=None,
message=None,
rerun=False):
if rerun:
Expand All @@ -106,14 +137,28 @@ def __call__(
yield r
else:
if cmd:
for r in run_command(cmd, dataset, message):
for r in run_command(cmd, dataset=dataset,
inputs=inputs, outputs=outputs,
message=message):
yield r
else:
lgr.warning("No command given")


def _resolve_files(dset, globs_or_files):
    """Expand --include globs in `globs_or_files` to file names.

    Entries that annex already knows about are returned as-is; every
    other entry is treated as a glob and expanded via
    `git annex find --include=`.
    """
    known, patterns = [], []
    for name in globs_or_files:
        # Anything already under annex is a literal file name; the rest
        # are treated as glob patterns.
        if dset.repo.is_under_annex([name], batch=True)[0]:
            known.append(name)
        else:
            patterns.append(name)
    expanded = dset.repo.get_annexed_files(patterns=patterns) if patterns else []
    return known + expanded


# This helper function is used to add the rerun_info argument.
def run_command(cmd, dataset=None, message=None, rerun_info=None):
def run_command(cmd, dataset=None, inputs=None, outputs=None,
message=None, rerun_info=None):
rel_pwd = rerun_info.get('pwd') if rerun_info else None
if rel_pwd and dataset:
# recording is relative to the dataset
Expand Down Expand Up @@ -153,6 +198,40 @@ def run_command(cmd, dataset=None, message=None, rerun_info=None):
'cannot detect changes by command'))
return

if inputs is None:
inputs = []
elif any(i.strip() == "." for i in inputs):
inputs = ["."]
for res in ds.get(inputs):
yield res
elif inputs:
if not rerun_info:
inputs = _resolve_files(ds, inputs)
if not inputs:
lgr.warning("No matching files found for --input")
else:
for res in ds.get(inputs):
yield res

if outputs is None:
outputs = []
elif outputs:
if not rerun_info:
outputs = _resolve_files(ds, outputs)
if not outputs:
lgr.warning("No matching files found for --output")
else:
for res in ds.unlock(outputs, on_failure="ignore"):
if res["status"] == "impossible":
if "no content" in res["message"]:
for rem_res in ds.remove(res["path"],
check=False, save=False):
yield rem_res
continue
elif "path does not exist" in res["message"]:
continue
yield res

# anticipate quoted compound shell commands
cmd = cmd[0] if isinstance(cmd, list) and len(cmd) == 1 else cmd

Expand Down Expand Up @@ -203,6 +282,10 @@ def run_command(cmd, dataset=None, message=None, rerun_info=None):
'cmd': cmd,
'exit': cmd_exitcode if cmd_exitcode is not None else 0,
'chain': rerun_info["chain"] if rerun_info else [],
'inputs': inputs,
# Get outputs from the rerun_info because rerun adds new/modified files
# to the outputs argument.
'outputs': rerun_info["outputs"] if rerun_info else outputs
}
if rel_pwd is not None:
# only when inside the dataset to not leak information
Expand Down
103 changes: 102 additions & 1 deletion datalad/interface/tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
from datalad.tests.utils import skip_if_on_windows
from datalad.tests.utils import ignore_nose_capturing_stdout
from datalad.tests.utils import slow
from datalad.tests.utils import with_testrepos


@with_tempfile(mkdir=True)
Expand Down Expand Up @@ -156,6 +157,10 @@ def test_rerun(path, nodspath):
# Or --since= to run all reachable commits.
ds.rerun(since="")
eq_('xxxxxxxxxx\n', open(probe_path).read())
# If a file is dropped, we remove it instead of unlocking it.
ds.drop(probe_path, check=False)
ds.rerun()
eq_('x\n', open(probe_path).read())
# If the history to rerun has a merge commit, we abort.
ds.repo.checkout("HEAD~3", options=["-b", "topic"])
with open(opj(path, "topic-file"), "w") as f:
Expand Down Expand Up @@ -404,7 +409,11 @@ def test_rerun_cherry_pick(path):
for onto, text in [("HEAD", "skipping"), ("prerun", "cherry picking")]:
results = ds.rerun(since="prerun", onto=onto)
assert_in_results(results, status='ok', path=ds.path)
assert any(r.get("message", "").endswith(text) for r in results)

messages = (r.get("message", "") for r in results)
# Message may be a tuple.
messages = (m for m in messages if hasattr(m, "endswith"))
assert any(m.endswith(text) for m in messages)


@ignore_nose_capturing_stdout
Expand Down Expand Up @@ -558,6 +567,98 @@ def test_rerun_script(path):
cmout.getvalue().splitlines())


@slow # ~10s
@ignore_nose_capturing_stdout
@skip_if_on_windows
@with_testrepos('basic_annex', flavors=['clone'])
@known_failure_direct_mode #FIXME
@known_failure_v6 #FIXME
def test_run_inputs_outputs(path):
    """Exercise the --input/--output handling of `datalad run`."""
    ds = Dataset(path)

    # The clone starts without the annexed content present.
    assert_false(ds.repo.file_has_content("test-annex.dat"))

    # If we specify test-annex.dat as an input, it will be retrieved before the
    # run.
    ds.run("cat test-annex.dat test-annex.dat >doubled.dat",
           inputs=["test-annex.dat"])

    ok_clean_git(ds.path)
    ok_(ds.repo.file_has_content("test-annex.dat"))
    ok_(ds.repo.file_has_content("doubled.dat"))

    # Rerunning the commit will also get the input file.
    ds.repo.drop("test-annex.dat", options=["--force"])
    assert_false(ds.repo.file_has_content("test-annex.dat"))
    ds.rerun()
    ok_(ds.repo.file_has_content("test-annex.dat"))

    # An --input glob matching nothing only warns; the run still happens.
    with swallow_logs(new_level=logging.WARN) as cml:
        ds.run("touch dummy", inputs=["*.not-an-extension"])
        assert_in("No matching files found for --input", cml.out)

    # Test different combinations of globs and explicit files.
    inputs = ["a.dat", "b.dat", "c.txt", "d.txt"]
    create_tree(ds.path, {i: i for i in inputs})

    # Push content to origin so it can be dropped locally and re-fetched.
    ds.add(".")
    ds.repo.copy_to(inputs, remote="origin")
    ds.repo.drop(inputs, options=["--force"])

    test_cases = [(["*.dat"], ["a.dat", "b.dat"]),
                  (["*.dat", "c.txt"], ["a.dat", "b.dat", "c.txt"]),
                  (["*"], inputs)]

    for idx, (inputs_arg, expected_present) in enumerate(test_cases):
        assert_false(any(ds.repo.file_has_content(i) for i in inputs))

        ds.run("touch dummy{}".format(idx), inputs=inputs_arg)
        ok_(all(ds.repo.file_has_content(f) for f in expected_present))

        ds.repo.drop(inputs, options=["--force"])

    # --input=. runs "datalad get ."
    ds.run("touch dot-dummy", inputs=["."])
    eq_(ds.repo.get_annexed_files(),
        ds.repo.get_annexed_files(with_content_only=True))
    # On rerun, we get all files, even those that weren't in the tree at the
    # time of the run.
    create_tree(ds.path, {"after-dot-run": "after-dot-run content"})
    ds.add(".")
    ds.repo.copy_to(["after-dot-run"], remote="origin")
    ds.repo.drop(["after-dot-run"], options=["--force"])
    ds.rerun("HEAD^")
    # NOTE(review): the return value below is unused; presumably this was
    # meant to be wrapped in ok_(...) — confirm and fix upstream.
    ds.repo.file_has_content("after-dot-run")

    # --output will unlock files that are present.
    ds.repo.get("a.dat")
    ds.run("echo ' appended' >>a.dat", outputs=["a.dat"])
    with open(opj(path, "a.dat")) as fh:
        eq_(fh.read(), "a.dat appended\n")

    # --output will remove files that are not present.
    ds.repo.drop("a.dat", options=["--force"])
    ds.run("echo ' appended' >>a.dat", outputs=["a.dat"])
    with open(opj(path, "a.dat")) as fh:
        eq_(fh.read(), " appended\n")

    # --input can be combined with --output.
    ds.repo.repo.git.reset("--hard", "HEAD~2")
    ds.run("echo ' appended' >>a.dat", inputs=["a.dat"], outputs=["a.dat"])
    with open(opj(path, "a.dat")) as fh:
        eq_(fh.read(), "a.dat appended\n")

    # As with --input, a non-matching --output glob only warns.
    with swallow_logs(new_level=logging.WARN) as cml:
        ds.run("echo blah", outputs=["*.not-an-extension"])
        assert_in("No matching files found for --output", cml.out)

    # --output paths inside a subdataset work for both the unlock (content
    # present) and remove (content dropped) paths.
    ds.create('sub')
    ds.run("echo sub_orig >sub/subfile")
    ds.run("echo sub_overwrite >sub/subfile", outputs=["sub/subfile"])
    ds.drop("sub/subfile", check=False)
    ds.run("echo sub_overwrite >sub/subfile", outputs=["sub/subfile"])


def test_rerun_commit_message_check():
assert_raises(ValueError,
get_run_info,
Expand Down
30 changes: 27 additions & 3 deletions datalad/support/annexrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -2565,11 +2565,35 @@ def repo_info(self, fast=False):
assert(info.pop('command') == 'info')
return info # just as is for now

def get_annexed_files(self, with_content_only=False, patterns=None):
    """Get a list of files in annex

    Parameters
    ----------
    with_content_only : bool, optional
        Only list files whose content is present.
    patterns : list, optional
        Globs to pass to annex's `--include=`.  Files that match any of
        these will be returned (i.e., they'll be separated by `--or`).

    Returns
    -------
    A list of file names
    """
    if not patterns:
        # No globs: match everything (--in below restricts to present
        # content when requested, so no --include is needed then).
        args = [] if with_content_only else ['--include', "*"]
    elif len(patterns) == 1:
        args = ['--include', patterns[0]]
    else:
        # OR the globs together inside an annex matching group:
        # -( --include p1 --or --include p2 ... -)
        args = ['-(']
        for pat in patterns[:-1]:
            args.extend(['--include', pat, "--or"])
        args.extend(['--include', patterns[-1], '-)'])

    if with_content_only:
        args.extend(['--in', 'here'])
    out, _ = self._run_annex_command('find', annex_options=args)
    # TODO: JSON
    return out.splitlines()
Expand Down
27 changes: 27 additions & 0 deletions datalad/support/tests/test_annexrepo.py
Original file line number Diff line number Diff line change
Expand Up @@ -1327,6 +1327,33 @@ def test_annex_drop(src, dst):
assert_raises(CommandError, ar.drop, ['.'], options=['--all'])


@with_tree({"a.txt": "a", "b.txt": "b", "c.py": "c", "d": "d"})
def test_annex_get_annexed_files(path):
    """Check glob and content filtering of AnnexRepo.get_annexed_files."""
    repo = AnnexRepo(path)
    repo.add(".", commit=True)

    def listed(**kwargs):
        # Small helper: query and compare as a set (order is irrelevant).
        return set(repo.get_annexed_files(**kwargs))

    everything = {"a.txt", "b.txt", "c.py", "d"}
    eq_(listed(), everything)

    # Dropped files are still annexed; they only vanish from the
    # with_content_only view.
    repo.drop("a.txt", options=["--force"])
    eq_(listed(), everything)
    eq_(listed(with_content_only=True), {"b.txt", "c.py", "d"})

    eq_(listed(patterns=["*.txt"]), {"a.txt", "b.txt"})
    eq_(listed(with_content_only=True, patterns=["*.txt"]), {"b.txt"})

    eq_(listed(patterns=["*.txt", "*.py"]), {"a.txt", "b.txt", "c.py"})

    # A bare "*" glob is equivalent to no pattern at all...
    eq_(listed(), listed(patterns=["*"]))

    # ... with or without the content restriction.
    eq_(listed(with_content_only=True),
        listed(with_content_only=True, patterns=["*"]))


@with_testrepos('basic_annex', flavors=['clone'])
def test_annex_remove(path):
repo = AnnexRepo(path, create=False)
Expand Down
Loading

0 comments on commit 0e536e0

Please sign in to comment.