From 87010b623b2172e36523ba7021da2d65311f3dbc Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Fri, 11 Sep 2020 21:49:02 -1000 Subject: [PATCH] [WDL 2.0] local Directory outputs --- WDL/Tree.py | 21 ---- WDL/Value.py | 7 +- WDL/_util.py | 7 +- WDL/runtime/task.py | 93 ++++++++++++------ WDL/runtime/task_container.py | 32 +++++-- tests/test_7runner.py | 174 ++++++++++++++++++++++++++++++++-- 6 files changed, 264 insertions(+), 70 deletions(-) diff --git a/WDL/Tree.py b/WDL/Tree.py index c7cacd8a..1a12e6a4 100644 --- a/WDL/Tree.py +++ b/WDL/Tree.py @@ -405,11 +405,6 @@ def typecheck( errors.try1( lambda: decl.typecheck(type_env, stdlib=stdlib, check_quant=check_quant) ) - if _has_directories(decl.type): - # FIXME - raise Error.ValidationError( - decl, "Directory outputs aren't supported in this version of miniwdl" - ) # check for cyclic dependencies among decls _detect_cycles( @@ -1067,11 +1062,6 @@ def typecheck(self, doc: "Document", check_quant: bool) -> None: ) ) output_type_env = output_type_env2 - if _has_directories(output.type): - # FIXME - raise Error.ValidationError( - output, "Directory outputs aren't supported in this version of miniwdl" - ) # 6. check for cyclic dependencies _detect_cycles(_workflow_dependency_matrix(self)) @@ -1814,14 +1804,3 @@ def _add_struct_instance_to_type_env( else: ans = ans.bind(namespace + "." + member_name, member_type, ctx) return ans - - -def _has_directories(t: Type.Base): - """ - used to check output declarations for Directory types while we don't support them - """ - if isinstance(t, Type.Directory) or next( - (p for p in t.parameters if _has_directories(p)), None - ): - return True - return False diff --git a/WDL/Value.py b/WDL/Value.py index 24de3b07..e2da40b9 100644 --- a/WDL/Value.py +++ b/WDL/Value.py @@ -199,7 +199,12 @@ def __init__(self, value: str, expr: "Optional[Expr.Base]" = None) -> None: def coerce(self, desired_type: Optional[Type.Base] = None) -> Base: "" - # TODO: similar coercion logic for Directory? outputs when we support those + if self.value is None: + # as above + if isinstance(desired_type, Type.Directory) and desired_type.optional: + return Null(self.expr) + else: + raise FileNotFoundError() return super().coerce(desired_type) diff --git a/WDL/_util.py b/WDL/_util.py index aa0fa848..c2b23158 100644 --- a/WDL/_util.py +++ b/WDL/_util.py @@ -598,12 +598,15 @@ def chmod_R_plus(path: str, file_bits: int = 0, dir_bits: int = 0) -> None: def do1(path1: str, bits: int) -> None: assert 0 <= bits < 0o10000 - if path_really_within(path1, path): + if not os.path.islink(path1) and path_really_within(path1, path): os.chmod(path1, (os.stat(path1).st_mode & 0o7777) | bits) + def raiser(exc: OSError): + raise exc + if os.path.isdir(path): do1(path, dir_bits) - for root, subdirs, files in os.walk(path, followlinks=False): + for root, subdirs, files in os.walk(path, onerror=raiser, followlinks=False): for dn in subdirs: do1(os.path.join(root, dn), dir_bits) for fn in files: diff --git a/WDL/runtime/task.py b/WDL/runtime/task.py index 44d1d001..25a1d98d 100644 --- a/WDL/runtime/task.py +++ b/WDL/runtime/task.py @@ -546,23 +546,31 @@ def _eval_task_outputs( logger: logging.Logger, task: Tree.Task, env: Env.Bindings[Value.Base], container: TaskContainer ) -> Env.Bindings[Value.Base]: - # helper to rewrite Files from in-container paths to host paths - def rewriter(fn: str, output_name: str) -> str: - host_file = container.host_path(fn) - if host_file is None: + # helper to rewrite File/Directory from in-container paths to host paths + def rewriter(v: Union[Value.File, Value.Directory], output_name: str) -> str: + container_path = v.value + if isinstance(v, Value.Directory) and not container_path.endswith("/"): + container_path += "/" + host_path = container.host_path(container_path) + if host_path is None: logger.warning( _( - "output file not found in container (error unless declared type is optional)", - name=output_name, - file=fn, + "output path not found in container (error unless declared type is optional)", + output=output_name, + path=container_path, ) ) + elif isinstance(v, Value.Directory): + if host_path.endswith("/"): + host_path = host_path[:-1] + _check_directory(host_path, output_name) + logger.debug(_("output dir", container=container_path, host=host_path)) else: - logger.debug(_("output file", container=fn, host=host_file)) + logger.debug(_("output file", container=container_path, host=host_path)) # We may overwrite File.value with None, which is an invalid state, then we'll fix it # up (or abort) below. This trickery is because we don't, at this point, know whether - # the 'desired' output type is File or File?. - return host_file # pyre-fixme + # the -declared- output type is optional. + return host_path # pyre-fixme stdlib = OutputStdLib(logger, container) outputs = Env.Bindings() @@ -588,15 +596,15 @@ def rewriter(fn: str, output_name: str) -> str: # First bind the value as-is in the environment, so that subsequent output expressions will # "see" the in-container path(s) if they use this binding. env = env.bind(decl.name, v) - # Rewrite each File.value to either a host path, or None if the file doesn't exist. - v = Value.rewrite_files(v, lambda fn: rewriter(fn, decl.name)) + # Rewrite each File/Directory path to a host path, or None if it doesn't exist. + v = Value.rewrite_paths(v, lambda v: rewriter(v, decl.name)) # File.coerce has a special behavior for us so that, if the value is None: # - produces Value.Null() if the desired type is File? # - raises FileNotFoundError otherwise. try: v = v.coerce(decl.type) except FileNotFoundError: - exn = OutputError("File not found in task output " + decl.name) + exn = OutputError("File/Directory path not found in task output " + decl.name) setattr(exn, "job_id", decl.workflow_node_id) raise exn outputs = outputs.bind(decl.name, v) @@ -604,6 +612,25 @@ def rewriter(fn: str, output_name: str) -> str: return outputs +def _check_directory(host_path: str, output_name: str) -> None: + """ + traverse output directory to check that all symlinks are relative & resolve inside the dir + """ + + def raiser(exc: OSError): + raise exc + + for root, subdirs, files in os.walk(host_path, onerror=raiser, followlinks=False): + for fn in files: + fn = os.path.join(root, fn) + if os.path.islink(fn) and ( + not os.path.exists(fn) + or os.path.isabs(os.readlink(fn)) + or not path_really_within(fn, host_path) + ): + raise OutputError(f"Directory in output {output_name} contains unusable symlink") + + def link_outputs( outputs: Env.Bindings[Value.Base], run_dir: str, hardlinks: bool = False ) -> Env.Bindings[Value.Base]: @@ -614,23 +641,29 @@ def link_outputs( outputs env to use these symlinks. """ - def map_files(v: Value.Base, dn: str) -> Value.Base: - if isinstance(v, Value.File): - if os.path.isfile(v.value): - hardlink = os.path.realpath(v.value) - assert os.path.isfile(hardlink) - newlink = os.path.join(dn, os.path.basename(v.value)) - os.makedirs(dn, exist_ok=False) - if not hardlinks and path_really_within(hardlink, os.path.dirname(run_dir)): + def map_paths(v: Value.Base, dn: str) -> Value.Base: + if isinstance(v, (Value.File, Value.Directory)): + if os.path.exists(v.value): + target = os.path.realpath(v.value) + if not hardlinks and path_really_within(target, os.path.dirname(run_dir)): # make symlink relative - hardlink = os.path.relpath(hardlink, start=os.path.realpath(dn)) - (os.link if hardlinks else os.symlink)(hardlink, newlink) - v.value = newlink + target = os.path.relpath(target, start=os.path.realpath(dn)) + link = os.path.join(dn, os.path.basename(v.value)) + os.makedirs(dn, exist_ok=False) + if hardlinks: + # TODO: what if target is an input from a different filesystem? + if isinstance(v, Value.Directory): + shutil.copytree(target, link, symlinks=True, copy_function=os.link) + else: + os.link(target, link) + else: + os.symlink(target, link) + v.value = link # recurse into compound values elif isinstance(v, Value.Array) and v.value: d = int(math.ceil(math.log10(len(v.value)))) # how many digits needed for i in range(len(v.value)): - v.value[i] = map_files(v.value[i], os.path.join(dn, str(i).rjust(d, "0"))) + v.value[i] = map_paths(v.value[i], os.path.join(dn, str(i).rjust(d, "0"))) elif isinstance(v, Value.Map): # create a subdirectory for each key, as long as the key names seem to make reasonable # path components; otherwise, treat the dict as a list of its values @@ -646,18 +679,18 @@ def map_files(v: Value.Base, dn: str) -> Value.Base: for i, b in enumerate(v.value): v.value[i] = ( b[0], - map_files( + map_paths( b[1], os.path.join(dn, str(b[0]) if keys_ok else str(i).rjust(d, "0")) ), ) elif isinstance(v, Value.Pair): v.value = ( - map_files(v.value[0], os.path.join(dn, "left")), - map_files(v.value[1], os.path.join(dn, "right")), + map_paths(v.value[0], os.path.join(dn, "left")), + map_paths(v.value[1], os.path.join(dn, "right")), ) elif isinstance(v, Value.Struct): for key in v.value: - v.value[key] = map_files(v.value[key], os.path.join(dn, key)) + v.value[key] = map_paths(v.value[key], os.path.join(dn, key)) return v os.makedirs(os.path.join(run_dir, "out"), exist_ok=False) @@ -666,7 +699,7 @@ def map_files(v: Value.Base, dn: str) -> Value.Base: return outputs.map( lambda binding: Env.Binding( binding.name, - map_files(copy.deepcopy(binding.value), os.path.join(run_dir, "out", binding.name)), + map_paths(copy.deepcopy(binding.value), os.path.join(run_dir, "out", binding.name)), ) ) diff --git a/WDL/runtime/task_container.py b/WDL/runtime/task_container.py index 3d61cc2e..72eee933 100644 --- a/WDL/runtime/task_container.py +++ b/WDL/runtime/task_container.py @@ -271,19 +271,39 @@ def host_path(self, container_path: str, inputs_only: bool = False) -> Optional[ + container_path ) # relativize the path to the provisioned working directory - container_path = os.path.relpath( + container_relpath = os.path.relpath( container_path, os.path.join(self.container_dir, "work") ) + if container_path.endswith("/") and not container_relpath.endswith("/"): + container_relpath += "/" + container_path = container_relpath ans = os.path.join(self.host_work_dir(), container_path) - if os.path.isfile(ans): - if path_really_within(ans, self.host_work_dir()): - return ans + if container_path.endswith("/") and not ans.endswith("/"): + ans += "/" + if not ( + (container_path.endswith("/") and os.path.isdir(ans)) + or (not container_path.endswith("/") and os.path.isfile(ans)) + ): + return None + if not path_really_within(ans, self.host_work_dir()): raise OutputError( - "task outputs attempted to use a file outside its working directory: " + "task outputs attempted to use a path outside its working directory: " + container_path ) - return None + if ( + ans.endswith("/") + and self.input_path_map + and ( + path_really_within(self.host_work_dir(), ans[:-1]) + or path_really_within( + ans[:-1], os.path.join(self.host_work_dir(), "_miniwdl_inputs") + ) + ) + ): + # prevent output of an input mount point + raise OutputError("unusable output directory: " + container_path) + return ans def host_work_dir(self): return os.path.join( diff --git a/tests/test_7runner.py b/tests/test_7runner.py index b4ceeb51..a3302f80 100644 --- a/tests/test_7runner.py +++ b/tests/test_7runner.py @@ -113,28 +113,182 @@ def test_basic_directory(self): outp = self._run(wdl, {"d": os.path.join(self._dir, "d"), "t.touch": True}, cfg=cfg) assert outp["dsz"] == 10 - def test_no_outputs(self): - with self.assertRaisesRegex(WDL.Error.ValidationError, "Directory outputs"): - self._run(""" + def test_directory_output(self): + wdl = R""" + version development + workflow w { + input { + Directory d + } + call t { + input: + d = d + } + output { + Array[Directory] d_out = t.d_out + } + } + task t { + input { + Directory d + } + command { + set -euxo pipefail + mkdir -p outdir/foo + cd outdir + echo foobar > foo/bar + ln -s foo/bar baz + >&2 ls -Rl + } + output { + Array[Directory] d_out = ["~{d}", "outdir"] + } + } + """ + + os.makedirs(os.path.join(self._dir, "d")) + with open(os.path.join(self._dir, "d/alice.txt"), mode="w") as outfile: + print("Alice", file=outfile) + with open(os.path.join(self._dir, "d/bob.txt"), mode="w") as outfile: + print("Bob", file=outfile) + outp = self._run(wdl, {"d": os.path.join(self._dir, "d")}) + + assert len(outp["d_out"]) == 2 + assert os.path.islink(outp["d_out"][0]) + assert os.path.realpath(outp["d_out"][0]) == os.path.join(self._dir, "d") + assert os.path.isdir(outp["d_out"][1]) + assert os.path.islink(outp["d_out"][1]) + assert os.path.basename(outp["d_out"][1]) == "outdir" + assert os.path.isfile(os.path.join(outp["d_out"][1], "foo/bar")) + assert os.path.islink(os.path.join(outp["d_out"][1], "baz")) + assert os.path.isfile(os.path.join(outp["d_out"][1], "baz")) + + cfg = WDL.runtime.config.Loader(logging.getLogger(self.id()), []) + cfg.override({"file_io": {"output_hardlinks": True}}) + outp = self._run(wdl, {"d": os.path.join(self._dir, "d")}, cfg=cfg) + assert len(outp["d_out"]) == 2 + assert not os.path.islink(outp["d_out"][0]) + assert os.path.realpath(outp["d_out"][0]) != os.path.join(self._dir, "d") + assert os.path.isdir(outp["d_out"][1]) + assert not os.path.islink(outp["d_out"][1]) + assert os.path.basename(outp["d_out"][1]) == "outdir" + assert os.path.isfile(os.path.join(outp["d_out"][1], "foo/bar")) + assert os.path.islink(os.path.join(outp["d_out"][1], "baz")) + assert os.path.isfile(os.path.join(outp["d_out"][1], "baz")) + + outp = self._run(R""" version development task t { command {} output { - Directory d = "." + Directory? d_out = "bogus/dirname" } } """, {}) + assert outp["d_out"] is None - with self.assertRaisesRegex(WDL.Error.ValidationError, "Directory outputs"): - self._run(""" + def test_errors(self): + self._run(R""" version development - workflow w { - Directory d = "." + task t { + command <<< + mkdir outdir + ln -s /etc/passwd outdir/owned + >>> output { - Directory d2 = d + Directory d_out = "outdir" } } - """, {}) + """, {}, expected_exception=WDL.runtime.error.OutputError) + + self._run(R""" + version development + task t { + command <<< + touch secret + mkdir outdir + ln -s ../secret outdir/owned + >&2 ls -Rl + >>> + output { + Directory d_out = "outdir/" + } + } + """, {}, expected_exception=WDL.runtime.error.OutputError) + + self._run(R""" + version development + task t { + command <<< + mkdir outdir + touch outdir/secret + ln -s outdir/secret outdir/owned + rm outdir/secret + >&2 ls -Rl + >>> + output { + Directory d_out = "outdir" + } + } + """, {}, expected_exception=WDL.runtime.error.OutputError) + + self._run(R""" + version development + task t { + command <<< + touch outdir + >>> + output { + Directory d_out = "outdir" + } + } + """, {}, expected_exception=WDL.runtime.error.OutputError) + + self._run(R""" + version development + task t { + command <<< + mkdir outdir + >>> + output { + File f_out = "outdir" + } + } + """, {}, expected_exception=WDL.runtime.error.OutputError) + + with open(os.path.join(self._dir, "foo.txt"), mode="w") as outfile: + print("foo", file=outfile) + self._run(R""" + version development + task t { + input { + File f + } + command <<< + echo `dirname "~{f}"` > outdir + >>> + output { + Directory d_out = read_string("outdir") + } + } + """, {"f": os.path.join(self._dir, "foo.txt")}, + expected_exception=WDL.runtime.error.OutputError) + + self._run(R""" + version development + task t { + input { + File f + } + command <<< + echo $(pwd) > outdir + >>> + output { + Directory d_out = read_string("outdir") + } + } + """, {"f": os.path.join(self._dir, "foo.txt")}, + expected_exception=WDL.runtime.error.OutputError) class TestNoneLiteral(RunnerTestCase): def test_none_eval(self):