From 8bf7090f61ae654e6d6fc1d410b40d9954567e49 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 23 Sep 2022 12:49:29 +0200 Subject: [PATCH 01/13] Add interface for use of relative output paths --- WDL/runtime/task.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/WDL/runtime/task.py b/WDL/runtime/task.py index c157b594..d47b5ee7 100644 --- a/WDL/runtime/task.py +++ b/WDL/runtime/task.py @@ -125,7 +125,8 @@ def run_local_task( ) # create out/ and outputs.json _outputs = link_outputs( - cache, cached, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks") + cache, cached, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), + use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths") ) write_values_json( cached, os.path.join(run_dir, "outputs.json"), namespace=task.name @@ -203,7 +204,8 @@ def run_local_task( # create output_links outputs = link_outputs( - cache, outputs, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks") + cache, outputs, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), + use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths") ) # process outputs through plugins @@ -719,7 +721,8 @@ def raiser(exc: OSError): def link_outputs( - cache: CallCache, outputs: Env.Bindings[Value.Base], run_dir: str, hardlinks: bool = False + cache: CallCache, outputs: Env.Bindings[Value.Base], run_dir: str, + hardlinks: bool = False, use_relative_output_paths: bool = False, ) -> Env.Bindings[Value.Base]: """ Following a successful run, the output files may be scattered throughout a complex directory From ac32a4fa08aea18224fa6fc8ca4f23e1bb5927d6 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 23 Sep 2022 15:54:09 +0200 Subject: [PATCH 02/13] Add code for relative paths to the working directory --- WDL/runtime/config_templates/default.cfg | 3 ++ WDL/runtime/task.py | 49 ++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/WDL/runtime/config_templates/default.cfg b/WDL/runtime/config_templates/default.cfg index bb3b6cd7..b07ca874 100644 --- a/WDL/runtime/config_templates/default.cfg +++ b/WDL/runtime/config_templates/default.cfg @@ -92,6 +92,9 @@ output_hardlinks = false # output files would be deleted too. Input/output JSON, logs, and stdout/stderr are always retained # in the task run directory (above the container working directory). delete_work = false +# Create an output directory structure based on the positions of output files relative to the +# working directory. +use_relative_output_paths = false # Suggest that each task's temporary directory should reside within the mounted task working # directory, instead of the storage backing the container's root filesystem. The latter (default) # is usually preferred because the working directory is more likely to reside on slower network- diff --git a/WDL/runtime/task.py b/WDL/runtime/task.py index d47b5ee7..a45ab7dc 100644 --- a/WDL/runtime/task.py +++ b/WDL/runtime/task.py @@ -741,11 +741,26 @@ def map_paths(v: Value.Base, dn: str) -> Value.Base: if target: target = os.path.realpath(target) assert os.path.exists(target) + work_dir = os.path.join(os.path.dirname(run_dir), "work") + if use_relative_output_paths: + rel_dir = os.path.dirname(target) + while rel_dir: + if os.path.basename(rel_dir) == "work": + relative_output = os.path.relpath(target, rel_dir) + relpath_dir = os.path.dirname(relative_output) + if relpath_dir: + dn = os.path.join(dn, relpath_dir) + break + rel_dir = os.path.dirname(rel_dir) if not hardlinks and path_really_within(target, os.path.dirname(run_dir)): # make symlink relative target = os.path.relpath(target, start=os.path.realpath(dn)) link = os.path.join(dn, os.path.basename(v.value.rstrip("/"))) - os.makedirs(dn, exist_ok=False) + if os.path.exists(link) and use_relative_output_paths: + raise FileExistsError( + f"File collision: can not create {link} using relative output paths" + ) + os.makedirs(dn, exist_ok=use_relative_output_paths) if hardlinks: # TODO: what if target is an input from a different filesystem? if isinstance(v, Value.Directory): @@ -796,11 +811,39 @@ def map_paths(v: Value.Base, dn: str) -> Value.Base: v.value[key] = map_paths(v.value[key], os.path.join(dn, key)) return v - os.makedirs(os.path.join(run_dir, "out"), exist_ok=False) + def map_paths_relative(v: Value.Base, dn: str): + if isinstance(v, (Value.File, Value.Directory)): + # Fall back on map paths to use all the correct linking code. + return map_paths(v, dn) + elif isinstance(v, Value.Array) and v.value: + for i in range(len(v.value)): + v.value[i] = map_paths_relative(v.value[i], dn) + elif isinstance(v, Value.Map): + for i, b in enumerate(v.value): + v.value[i] = (b[0], map_paths_relative(b[1], dn)) + elif isinstance(v, Value.Pair): + v.value = ( + map_paths_relative(v.value[0], dn), + map_paths_relative(v.value[1], dn), + ) + elif isinstance(v, Value.Struct): + for key in v.value: + v.value[key] = map_paths_relative(v.value[key], dn) + return v + + out_dir = os.path.join(run_dir, "out") + os.makedirs(out_dir, exist_ok=False) + + if use_relative_output_paths: + return outputs.map( + lambda binding: Env.Binding( + binding.name, + map_paths_relative(copy.deepcopy(binding.value), out_dir)) + ) return outputs.map( lambda binding: Env.Binding( binding.name, - map_paths(copy.deepcopy(binding.value), os.path.join(run_dir, "out", binding.name)), + map_paths(copy.deepcopy(binding.value), os.path.join(out_dir, binding.name)), ) ) From e5c8e8ed43bd939d105ff31c189c8b093b66c6a1 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 26 Sep 2022 11:22:44 +0200 Subject: [PATCH 03/13] Keep track of link destinations --- WDL/runtime/task.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/WDL/runtime/task.py b/WDL/runtime/task.py index a45ab7dc..8e346e6c 100644 --- a/WDL/runtime/task.py +++ b/WDL/runtime/task.py @@ -730,6 +730,7 @@ def link_outputs( containing nicely organized symlinks to the output files, and rewrite File values in the outputs env to use these symlinks. """ + link_destinations: Dict[str, str] = dict() def map_paths(v: Value.Base, dn: str) -> Value.Base: if isinstance(v, (Value.File, Value.Directory)): @@ -756,10 +757,16 @@ def map_paths(v: Value.Base, dn: str) -> Value.Base: # make symlink relative target = os.path.relpath(target, start=os.path.realpath(dn)) link = os.path.join(dn, os.path.basename(v.value.rstrip("/"))) - if os.path.exists(link) and use_relative_output_paths: - raise FileExistsError( - f"File collision: can not create {link} using relative output paths" - ) + if use_relative_output_paths: + known_target = link_destinations.get(link, None) + if known_target: + if known_target != target: + raise FileExistsError( + f"Two files have the same link destination: " + f"{known_target} and {target} are both written " + f"to {link}") + else: + link_destinations[link] = target os.makedirs(dn, exist_ok=use_relative_output_paths) if hardlinks: # TODO: what if target is an input from a different filesystem? From 37810fa7a9b5f27aee90001bd2bef5f84628fd66 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 26 Sep 2022 11:42:35 +0200 Subject: [PATCH 04/13] Enable relative output paths for workflows --- WDL/runtime/task.py | 2 +- WDL/runtime/workflow.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/WDL/runtime/task.py b/WDL/runtime/task.py index 8e346e6c..f529c7cb 100644 --- a/WDL/runtime/task.py +++ b/WDL/runtime/task.py @@ -745,7 +745,7 @@ def map_paths(v: Value.Base, dn: str) -> Value.Base: work_dir = os.path.join(os.path.dirname(run_dir), "work") if use_relative_output_paths: rel_dir = os.path.dirname(target) - while rel_dir: + while rel_dir and rel_dir != '/': if os.path.basename(rel_dir) == "work": relative_output = os.path.relpath(target, rel_dir) relpath_dir = os.path.dirname(relative_output) diff --git a/WDL/runtime/workflow.py b/WDL/runtime/workflow.py index 26568e3c..8e587b93 100644 --- a/WDL/runtime/workflow.py +++ b/WDL/runtime/workflow.py @@ -780,7 +780,8 @@ def run_local_workflow( ) ) _outputs = link_outputs( - cache, cached, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks") + cache, cached, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), + use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths"), ) write_values_json( cached, os.path.join(run_dir, "outputs.json"), namespace=workflow.name @@ -932,7 +933,8 @@ def _workflow_main_loop( # create output_links outputs = link_outputs( - cache, state.outputs, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks") + cache, state.outputs, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), + use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths"), ) # process outputs through plugins From b9c109f8983e7cfecc164ca67ba6c4acab764246 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 26 Sep 2022 12:11:29 +0200 Subject: [PATCH 05/13] Only use relative output paths at the top-level --- WDL/runtime/task.py | 6 ++---- WDL/runtime/workflow.py | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/WDL/runtime/task.py b/WDL/runtime/task.py index f529c7cb..c166092d 100644 --- a/WDL/runtime/task.py +++ b/WDL/runtime/task.py @@ -125,8 +125,7 @@ def run_local_task( ) # create out/ and outputs.json _outputs = link_outputs( - cache, cached, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), - use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths") + cache, cached, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks") ) write_values_json( cached, os.path.join(run_dir, "outputs.json"), namespace=task.name @@ -204,8 +203,7 @@ def run_local_task( # create output_links outputs = link_outputs( - cache, outputs, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), - use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths") + cache, outputs, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks") ) # process outputs through plugins diff --git a/WDL/runtime/workflow.py b/WDL/runtime/workflow.py index 8e587b93..f8553cfc 100644 --- a/WDL/runtime/workflow.py +++ b/WDL/runtime/workflow.py @@ -780,8 +780,7 @@ def run_local_workflow( ) ) _outputs = link_outputs( - cache, cached, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), - use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths"), + cache, cached, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks") ) write_values_json( cached, os.path.join(run_dir, "outputs.json"), namespace=workflow.name @@ -934,6 +933,7 @@ def _workflow_main_loop( # create output_links outputs = link_outputs( cache, state.outputs, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), + # Relative output paths only make sense at the top level, and hence is only used here. use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths"), ) From b6371f3d7b8e281c67ff0bf1619a318e647904b0 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 26 Sep 2022 12:19:33 +0200 Subject: [PATCH 06/13] Simplify link destination check --- WDL/runtime/task.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/WDL/runtime/task.py b/WDL/runtime/task.py index c166092d..083bad0b 100644 --- a/WDL/runtime/task.py +++ b/WDL/runtime/task.py @@ -755,16 +755,12 @@ def map_paths(v: Value.Base, dn: str) -> Value.Base: # make symlink relative target = os.path.relpath(target, start=os.path.realpath(dn)) link = os.path.join(dn, os.path.basename(v.value.rstrip("/"))) - if use_relative_output_paths: - known_target = link_destinations.get(link, None) - if known_target: - if known_target != target: - raise FileExistsError( - f"Two files have the same link destination: " - f"{known_target} and {target} are both written " - f"to {link}") - else: - link_destinations[link] = target + if link_destinations.get(link, target) != target: + raise FileExistsError( + f"Two files have the same link destination: " + f"{link_destinations[link]} and {target} are both " + f"written to {link}.") + link_destinations[link] = target os.makedirs(dn, exist_ok=use_relative_output_paths) if hardlinks: # TODO: what if target is an input from a different filesystem? From d65c1599042c9699e3bcb3ccac1bd22e26b9df9f Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 26 Sep 2022 12:41:24 +0200 Subject: [PATCH 07/13] Remove unused variable --- WDL/runtime/task.py | 1 - 1 file changed, 1 deletion(-) diff --git a/WDL/runtime/task.py b/WDL/runtime/task.py index 083bad0b..c2b59764 100644 --- a/WDL/runtime/task.py +++ b/WDL/runtime/task.py @@ -740,7 +740,6 @@ def map_paths(v: Value.Base, dn: str) -> Value.Base: if target: target = os.path.realpath(target) assert os.path.exists(target) - work_dir = os.path.join(os.path.dirname(run_dir), "work") if use_relative_output_paths: rel_dir = os.path.dirname(target) while rel_dir and rel_dir != '/': From 3d2b198daf0c1c8e55884f97ccc7a184202c4490 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 26 Sep 2022 12:58:21 +0200 Subject: [PATCH 08/13] Simplify relativize directory algorithm --- WDL/runtime/task.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/WDL/runtime/task.py b/WDL/runtime/task.py index c2b59764..b8e6a9f2 100644 --- a/WDL/runtime/task.py +++ b/WDL/runtime/task.py @@ -741,15 +741,13 @@ def map_paths(v: Value.Base, dn: str) -> Value.Base: target = os.path.realpath(target) assert os.path.exists(target) if use_relative_output_paths: - rel_dir = os.path.dirname(target) - while rel_dir and rel_dir != '/': - if os.path.basename(rel_dir) == "work": - relative_output = os.path.relpath(target, rel_dir) - relpath_dir = os.path.dirname(relative_output) - if relpath_dir: - dn = os.path.join(dn, relpath_dir) + # Look for the highest-level "work" directory and make sure + # any directories in there are present in the output structure + dir_parts = os.path.dirname(target).split(os.sep) + for i, part in enumerate(reversed(dir_parts)): + if part == "work": + dn = os.path.join(dn, *dir_parts[len(dir_parts) - i:]) break - rel_dir = os.path.dirname(rel_dir) if not hardlinks and path_really_within(target, os.path.dirname(run_dir)): # make symlink relative target = os.path.relpath(target, start=os.path.realpath(dn)) From 18797067a4e47827ce744dd3995302c3c3b539d8 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 26 Sep 2022 14:48:21 +0200 Subject: [PATCH 09/13] Add test and fix relative output paths for hardlinks --- WDL/runtime/task.py | 9 ++++--- WDL/runtime/workflow.py | 3 ++- tests/runner.t | 59 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 66 insertions(+), 5 deletions(-) diff --git a/WDL/runtime/task.py b/WDL/runtime/task.py index b8e6a9f2..42e72d44 100644 --- a/WDL/runtime/task.py +++ b/WDL/runtime/task.py @@ -125,7 +125,8 @@ def run_local_task( ) # create out/ and outputs.json _outputs = link_outputs( - cache, cached, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks") + cache, cached, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), + use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths"), ) write_values_json( cached, os.path.join(run_dir, "outputs.json"), namespace=task.name @@ -203,7 +204,8 @@ def run_local_task( # create output_links outputs = link_outputs( - cache, outputs, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks") + cache, outputs, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), + use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths"), ) # process outputs through plugins @@ -745,7 +747,8 @@ def map_paths(v: Value.Base, dn: str) -> Value.Base: # any directories in there are present in the output structure dir_parts = os.path.dirname(target).split(os.sep) for i, part in enumerate(reversed(dir_parts)): - if part == "work": + # Also look for "out" directory in case of hardlinks + if part == "work" or (part == "out" and hardlinks): dn = os.path.join(dn, *dir_parts[len(dir_parts) - i:]) break if not hardlinks and path_really_within(target, os.path.dirname(run_dir)): diff --git a/WDL/runtime/workflow.py b/WDL/runtime/workflow.py index f8553cfc..fa7c5e28 100644 --- a/WDL/runtime/workflow.py +++ b/WDL/runtime/workflow.py @@ -780,7 +780,8 @@ def run_local_workflow( ) ) _outputs = link_outputs( - cache, cached, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks") + cache, cached, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), + use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths"), ) write_values_json( cached, os.path.join(run_dir, "outputs.json"), namespace=workflow.name diff --git a/tests/runner.t b/tests/runner.t index 38be67de..e9e4084e 100644 --- a/tests/runner.t +++ b/tests/runner.t @@ -11,7 +11,7 @@ source tests/bash-tap/bash-tap-bootstrap export PYTHONPATH="$SOURCE_DIR:$PYTHONPATH" miniwdl="python3 -m WDL" -plan tests 82 +plan tests 88 $miniwdl run_self_test is "$?" "0" "run_self_test" @@ -524,3 +524,60 @@ workflow outer { EOF MINIWDL__SCHEDULER__SUBWORKFLOW_CONCURRENCY=2 $miniwdl run --dir nested_deadlock outer.wdl is "$?" "0" "avoid deadlocking on nested subworkflows" + +cat << 'EOF' > create_files.wdl +version 1.0 +task create_files { + command <<< + mkdir -p large_matryoshka/medium_matryoskha + touch large_matryoshka/medium_matryoskha/small_matryoshka + mkdir -p utensils + touch utensils/fork utensils/knife utensils/spoon + touch unboxed_item + mkdir -p turtle/turtle/turtle/turtle/turtle/turtle/ + touch turtle/turtle/turtle/turtle/turtle/turtle/turtle + >>> + output { + File small_matryoshka = "large_matryoshka/medium_matryoskha/small_matryoshka" + File fork = "utensils/fork" + File knife = "utensils/knife" + File spoon = "utensils/spoon" + File unboxed_item = "unboxed_item" + File all_the_way_down = "turtle/turtle/turtle/turtle/turtle/turtle/turtle" + # Below arrays will create collisions as these are the same paths as above, + # localized to the same links. This should be handled properly. + Array[File] utensils = [fork, knife, spoon] + Array[File] everything = [small_matryoshka, fork, knife, spoon, unboxed_item, all_the_way_down] + } +} +EOF + +cat << 'EOF' > correct_workflow.wdl +version 1.0 +import "create_files.wdl" as create_files + +workflow filecreator { + call create_files.create_files as createFiles {} + + output { + Array[File] utensils = createFiles.utensils + Array[File] all_stuff = createFiles.everything + } +} +EOF + +OUTPUT_DIR=use_relative_paths/_LAST/out/ +MINIWDL__FILE_IO__USE_RELATIVE_OUTPUT_PATHS=true $miniwdl run --dir use_relative_paths correct_workflow.wdl +is "$?" "0" relative_output_paths +test -L $OUTPUT_DIR/large_matryoshka/medium_matryoskha/small_matryoshka +is "$?" "0" "outputs are relative" +test -d $OUTPUT_DIR/turtle/turtle/turtle/turtle +is "$?" "0" "outputs are relative all the way down" + +MINIWDL__FILE_IO__USE_RELATIVE_OUTPUT_PATHS=true MINIWDL__FILE_IO__OUTPUT_HARDLINKS=true \ +$miniwdl run --dir use_relative_paths correct_workflow.wdl +is "$?" "0" relative_output_paths_with_hardlink +test -f $OUTPUT_DIR/large_matryoshka/medium_matryoskha/small_matryoshka +is "$?" "0" "outputs are relative using hardlinks" +test -d $OUTPUT_DIR/turtle/turtle/turtle/turtle +is "$?" "0" "outputs are relative all the way down using hardlinks" From 5288806e5255d1da96a293004a479fae47e45656 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 26 Sep 2022 14:55:25 +0200 Subject: [PATCH 10/13] black reformatting --- WDL/runtime/task.py | 26 ++++++++++++++++++-------- WDL/runtime/workflow.py | 10 ++++++++-- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/WDL/runtime/task.py b/WDL/runtime/task.py index 42e72d44..7cb1a47b 100644 --- a/WDL/runtime/task.py +++ b/WDL/runtime/task.py @@ -125,7 +125,10 @@ def run_local_task( ) # create out/ and outputs.json _outputs = link_outputs( - cache, cached, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), + cache, + cached, + run_dir, + hardlinks=cfg["file_io"].get_bool("output_hardlinks"), use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths"), ) write_values_json( @@ -204,7 +207,10 @@ def run_local_task( # create output_links outputs = link_outputs( - cache, outputs, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), + cache, + outputs, + run_dir, + hardlinks=cfg["file_io"].get_bool("output_hardlinks"), use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths"), ) @@ -721,8 +727,11 @@ def raiser(exc: OSError): def link_outputs( - cache: CallCache, outputs: Env.Bindings[Value.Base], run_dir: str, - hardlinks: bool = False, use_relative_output_paths: bool = False, + cache: CallCache, + outputs: Env.Bindings[Value.Base], + run_dir: str, + hardlinks: bool = False, + use_relative_output_paths: bool = False, ) -> Env.Bindings[Value.Base]: """ Following a successful run, the output files may be scattered throughout a complex directory @@ -749,7 +758,7 @@ def map_paths(v: Value.Base, dn: str) -> Value.Base: for i, part in enumerate(reversed(dir_parts)): # Also look for "out" directory in case of hardlinks if part == "work" or (part == "out" and hardlinks): - dn = os.path.join(dn, *dir_parts[len(dir_parts) - i:]) + dn = os.path.join(dn, *dir_parts[len(dir_parts) - i :]) break if not hardlinks and path_really_within(target, os.path.dirname(run_dir)): # make symlink relative @@ -759,7 +768,8 @@ def map_paths(v: Value.Base, dn: str) -> Value.Base: raise FileExistsError( f"Two files have the same link destination: " f"{link_destinations[link]} and {target} are both " - f"written to {link}.") + f"written to {link}." + ) link_destinations[link] = target os.makedirs(dn, exist_ok=use_relative_output_paths) if hardlinks: @@ -838,8 +848,8 @@ def map_paths_relative(v: Value.Base, dn: str): if use_relative_output_paths: return outputs.map( lambda binding: Env.Binding( - binding.name, - map_paths_relative(copy.deepcopy(binding.value), out_dir)) + binding.name, map_paths_relative(copy.deepcopy(binding.value), out_dir) + ) ) return outputs.map( lambda binding: Env.Binding( diff --git a/WDL/runtime/workflow.py b/WDL/runtime/workflow.py index fa7c5e28..ac7ce6e2 100644 --- a/WDL/runtime/workflow.py +++ b/WDL/runtime/workflow.py @@ -780,7 +780,10 @@ def run_local_workflow( ) ) _outputs = link_outputs( - cache, cached, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), + cache, + cached, + run_dir, + hardlinks=cfg["file_io"].get_bool("output_hardlinks"), use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths"), ) write_values_json( @@ -933,7 +936,10 @@ def _workflow_main_loop( # create output_links outputs = link_outputs( - cache, state.outputs, run_dir, hardlinks=cfg["file_io"].get_bool("output_hardlinks"), + cache, + state.outputs, + run_dir, + hardlinks=cfg["file_io"].get_bool("output_hardlinks"), # Relative output paths only make sense at the top level, and hence is only used here. use_relative_output_paths=cfg["file_io"].get_bool("use_relative_output_paths"), ) From 45dba217e58d201679ac58ff991ed49f90c6c6cd Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 26 Sep 2022 14:58:17 +0200 Subject: [PATCH 11/13] Add correct return type --- WDL/runtime/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WDL/runtime/task.py b/WDL/runtime/task.py index 7cb1a47b..1b9cc7b1 100644 --- a/WDL/runtime/task.py +++ b/WDL/runtime/task.py @@ -822,7 +822,7 @@ def map_paths(v: Value.Base, dn: str) -> Value.Base: v.value[key] = map_paths(v.value[key], os.path.join(dn, key)) return v - def map_paths_relative(v: Value.Base, dn: str): + def map_paths_relative(v: Value.Base, dn: str) -> Value.Base: if isinstance(v, (Value.File, Value.Directory)): # Fall back on map paths to use all the correct linking code. return map_paths(v, dn) From fe9dfbdbcf9893b0af2f6162058d1fb2b43f5c3b Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 26 Sep 2022 15:14:18 +0200 Subject: [PATCH 12/13] more extensive description of use_relative_output_paths --- WDL/runtime/config_templates/default.cfg | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/WDL/runtime/config_templates/default.cfg b/WDL/runtime/config_templates/default.cfg index b07ca874..40b7d8b0 100644 --- a/WDL/runtime/config_templates/default.cfg +++ b/WDL/runtime/config_templates/default.cfg @@ -93,7 +93,10 @@ output_hardlinks = false # in the task run directory (above the container working directory). delete_work = false # Create an output directory structure based on the positions of output files relative to the -# working directory. +# working directory for each task. For example, a task has the following output: +# File my_report = "reports/subdir/myreport.txt". +# By default this will be in out/my_report/my_report.txt. With use_relative_output_paths=true +# it will be in out/reports/subdir/myreport.txt use_relative_output_paths = false # Suggest that each task's temporary directory should reside within the mounted task working # directory, instead of the storage backing the container's root filesystem. The latter (default) From f168cbe6afffa29542a49e3f98a530bfe47127b8 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 26 Sep 2022 15:43:36 +0200 Subject: [PATCH 13/13] Add collision test for use_relative_output_paths --- tests/runner.t | 134 ++++++++++++++++++++++++++++--------------------- 1 file changed, 76 insertions(+), 58 deletions(-) diff --git a/tests/runner.t b/tests/runner.t index e9e4084e..89bde96c 100644 --- a/tests/runner.t +++ b/tests/runner.t @@ -11,7 +11,7 @@ source tests/bash-tap/bash-tap-bootstrap export PYTHONPATH="$SOURCE_DIR:$PYTHONPATH" miniwdl="python3 -m WDL" -plan tests 88 +plan tests 89 $miniwdl run_self_test is "$?" "0" "run_self_test" @@ -163,6 +163,81 @@ is "$?" "0" "copy_source scatter_echo.wdl" cmp -s echo_task.wdl scatterrun/wdl/echo_task.wdl is "$?" "0" "copy_source echo_task.wdl" +cat << 'EOF' > create_files.wdl +version 1.0 +task create_files { + command <<< + mkdir -p large_matryoshka/medium_matryoskha + touch large_matryoshka/medium_matryoskha/small_matryoshka + mkdir -p utensils + touch utensils/fork utensils/knife utensils/spoon + touch unboxed_item + mkdir -p turtle/turtle/turtle/turtle/turtle/turtle/ + touch turtle/turtle/turtle/turtle/turtle/turtle/turtle + >>> + output { + File small_matryoshka = "large_matryoshka/medium_matryoskha/small_matryoshka" + File fork = "utensils/fork" + File knife = "utensils/knife" + File spoon = "utensils/spoon" + File unboxed_item = "unboxed_item" + File all_the_way_down = "turtle/turtle/turtle/turtle/turtle/turtle/turtle" + # Below arrays will create collisions as these are the same paths as above, + # localized to the same links. This should be handled properly. + Array[File] utensils = [fork, knife, spoon] + Array[File] everything = [small_matryoshka, fork, knife, spoon, unboxed_item, all_the_way_down] + } +} +EOF + +cat << 'EOF' > correct_workflow.wdl +version 1.0 +import "create_files.wdl" as create_files + +workflow filecreator { + call create_files.create_files as createFiles {} + + output { + Array[File] utensils = createFiles.utensils + Array[File] all_stuff = createFiles.everything + } +} +EOF + +OUTPUT_DIR=use_relative_paths/_LAST/out/ +MINIWDL__FILE_IO__USE_RELATIVE_OUTPUT_PATHS=true $miniwdl run --dir use_relative_paths correct_workflow.wdl +is "$?" "0" relative_output_paths +test -L $OUTPUT_DIR/large_matryoshka/medium_matryoskha/small_matryoshka +is "$?" "0" "outputs are relative" +test -d $OUTPUT_DIR/turtle/turtle/turtle/turtle +is "$?" "0" "outputs are relative all the way down" + +MINIWDL__FILE_IO__USE_RELATIVE_OUTPUT_PATHS=true MINIWDL__FILE_IO__OUTPUT_HARDLINKS=true \ +$miniwdl run --dir use_relative_paths correct_workflow.wdl +is "$?" "0" relative_output_paths_with_hardlink +test -f $OUTPUT_DIR/large_matryoshka/medium_matryoskha/small_matryoshka +is "$?" "0" "outputs are relative using hardlinks" +test -d $OUTPUT_DIR/turtle/turtle/turtle/turtle +is "$?" "0" "outputs are relative all the way down using hardlinks" + +cat << 'EOF' > colliding_workflow.wdl +version 1.0 +import "create_files.wdl" as create_files + +workflow filecollider { + call create_files.create_files as createFiles {} + call create_files.create_files as createFiles2 {} + output { + Array[File] all_stuff1 = createFiles.everything + Array[File] all_stuff2 = createFiles2.everything + } +} +EOF + +MINIWDL__FILE_IO__USE_RELATIVE_OUTPUT_PATHS=true $miniwdl run --dir use_relative_paths colliding_workflow.wdl 2> errors.txt +grep -q "Two files have the same link destination" errors.txt +is "$?" "0" "use_relative_output_paths throws error on collisions" + cat << 'EOF' > failer2000.wdl version 1.0 @@ -524,60 +599,3 @@ workflow outer { EOF MINIWDL__SCHEDULER__SUBWORKFLOW_CONCURRENCY=2 $miniwdl run --dir nested_deadlock outer.wdl is "$?" "0" "avoid deadlocking on nested subworkflows" - -cat << 'EOF' > create_files.wdl -version 1.0 -task create_files { - command <<< - mkdir -p large_matryoshka/medium_matryoskha - touch large_matryoshka/medium_matryoskha/small_matryoshka - mkdir -p utensils - touch utensils/fork utensils/knife utensils/spoon - touch unboxed_item - mkdir -p turtle/turtle/turtle/turtle/turtle/turtle/ - touch turtle/turtle/turtle/turtle/turtle/turtle/turtle - >>> - output { - File small_matryoshka = "large_matryoshka/medium_matryoskha/small_matryoshka" - File fork = "utensils/fork" - File knife = "utensils/knife" - File spoon = "utensils/spoon" - File unboxed_item = "unboxed_item" - File all_the_way_down = "turtle/turtle/turtle/turtle/turtle/turtle/turtle" - # Below arrays will create collisions as these are the same paths as above, - # localized to the same links. This should be handled properly. - Array[File] utensils = [fork, knife, spoon] - Array[File] everything = [small_matryoshka, fork, knife, spoon, unboxed_item, all_the_way_down] - } -} -EOF - -cat << 'EOF' > correct_workflow.wdl -version 1.0 -import "create_files.wdl" as create_files - -workflow filecreator { - call create_files.create_files as createFiles {} - - output { - Array[File] utensils = createFiles.utensils - Array[File] all_stuff = createFiles.everything - } -} -EOF - -OUTPUT_DIR=use_relative_paths/_LAST/out/ -MINIWDL__FILE_IO__USE_RELATIVE_OUTPUT_PATHS=true $miniwdl run --dir use_relative_paths correct_workflow.wdl -is "$?" "0" relative_output_paths -test -L $OUTPUT_DIR/large_matryoshka/medium_matryoskha/small_matryoshka -is "$?" "0" "outputs are relative" -test -d $OUTPUT_DIR/turtle/turtle/turtle/turtle -is "$?" "0" "outputs are relative all the way down" - -MINIWDL__FILE_IO__USE_RELATIVE_OUTPUT_PATHS=true MINIWDL__FILE_IO__OUTPUT_HARDLINKS=true \ -$miniwdl run --dir use_relative_paths correct_workflow.wdl -is "$?" "0" relative_output_paths_with_hardlink -test -f $OUTPUT_DIR/large_matryoshka/medium_matryoskha/small_matryoshka -is "$?" "0" "outputs are relative using hardlinks" -test -d $OUTPUT_DIR/turtle/turtle/turtle/turtle -is "$?" "0" "outputs are relative all the way down using hardlinks"