From 9b6a9dc0c73dfc0625e55a3db4c262174bda0dff Mon Sep 17 00:00:00 2001 From: John Chilton Date: Mon, 13 Feb 2017 14:20:13 -0500 Subject: [PATCH 1/6] Start work on a relabelling collection operation. I think it needs to be touched up but the basic operation seems to work so far. I think what remains to be done is: - Validate uniqueness of identifiers and provide nice messages if they are not unique. - Validate that at least the required number of lines are present in the file and provide a nice message if not. - Add strict mode to ensure exactly the correct number of lines is added. - Find where validation of identifiers happens in the API and apply same validation here - try not to let unsafe identifiers be created. - Consider more advanced modes - selecting a column, apply a regex replace, pick two columns for nested lists, etc.... None of this may be needed in the first iteration. - Consider another mode where a collection is labelled against an existing collection - should that be a separate tool or the same tool? --- config/tool_conf.xml.sample | 1 + lib/galaxy/tools/__init__.py | 22 ++++++++- lib/galaxy/tools/relabel_from_file.xml | 53 +++++++++++++++++++++ test-data/new_labels_1.txt | 1 + test/functional/tools/samples_tool_conf.xml | 1 + 5 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 lib/galaxy/tools/relabel_from_file.xml create mode 100644 test-data/new_labels_1.txt diff --git a/config/tool_conf.xml.sample b/config/tool_conf.xml.sample index 3e19d92f2658..2f0399853748 100644 --- a/config/tool_conf.xml.sample +++ b/config/tool_conf.xml.sample @@ -36,6 +36,7 @@ +
diff --git a/lib/galaxy/tools/__init__.py b/lib/galaxy/tools/__init__.py index b84709615f6a..94c65ed43d28 100755 --- a/lib/galaxy/tools/__init__.py +++ b/lib/galaxy/tools/__init__.py @@ -2506,11 +2506,31 @@ def add_elements(collection, prefix=""): ) +class RelabelFromFileTool( DatabaseOperationTool ): + tool_type = 'relabel_from_file' + + def produce_outputs( self, trans, out_data, output_collections, incoming, history ): + hdca = incoming[ "input" ] + new_labels_dataset_assoc = incoming[ "labels" ] + new_elements = odict() + log.info(new_labels_dataset_assoc) + new_labels_path = new_labels_dataset_assoc.file_name + new_labels = open(new_labels_path, "r").readlines(1024 * 1000000) + + for i, dce in enumerate(hdca.collection.elements): + dce_object = dce.element_object + new_elements[new_labels[i].strip()] = dce_object.copy() + + output_collections.create_collection( + next(iter(self.outputs.values())), "output", elements=new_elements + ) + + # Populate tool_type to ToolClass mappings tool_types = {} for tool_class in [ Tool, SetMetadataTool, OutputParameterJSONTool, DataManagerTool, DataSourceTool, AsyncDataSourceTool, - UnzipCollectionTool, ZipCollectionTool, MergeCollectionTool, + UnzipCollectionTool, ZipCollectionTool, MergeCollectionTool, RelabelFromFileTool, DataDestinationTool ]: tool_types[ tool_class.tool_type ] = tool_class diff --git a/lib/galaxy/tools/relabel_from_file.xml b/lib/galaxy/tools/relabel_from_file.xml new file mode 100644 index 000000000000..f52c9e38ef90 --- /dev/null +++ b/lib/galaxy/tools/relabel_from_file.xml @@ -0,0 +1,53 @@ + + from contents of a file + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This tool will take an input list and a text file with new identifiers + and build a new list with the same datasets but these new identifiers. + + This tool will create new history datasets from your collection + but your quota usage will not increase. 
+ + diff --git a/test-data/new_labels_1.txt b/test-data/new_labels_1.txt new file mode 100644 index 000000000000..3fe0afa80c6f --- /dev/null +++ b/test-data/new_labels_1.txt @@ -0,0 +1 @@ +new_i1 \ No newline at end of file diff --git a/test/functional/tools/samples_tool_conf.xml b/test/functional/tools/samples_tool_conf.xml index 59f76c37d1d1..35a1480f8172 100644 --- a/test/functional/tools/samples_tool_conf.xml +++ b/test/functional/tools/samples_tool_conf.xml @@ -146,5 +146,6 @@ + From d21c14f0096b1fa7954396e7e9878b1d1e712331 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Mon, 20 Mar 2017 18:09:10 +0100 Subject: [PATCH 2/6] Add possibility to rename collection items from a tabular file --- lib/galaxy/tools/__init__.py | 18 ++++++++++--- lib/galaxy/tools/relabel_from_file.xml | 37 +++++++++++++++++++++++++- test-data/new_labels_2.txt | 1 + 3 files changed, 52 insertions(+), 4 deletions(-) create mode 100644 test-data/new_labels_2.txt diff --git a/lib/galaxy/tools/__init__.py b/lib/galaxy/tools/__init__.py index 94c65ed43d28..f279af4490a2 100755 --- a/lib/galaxy/tools/__init__.py +++ b/lib/galaxy/tools/__init__.py @@ -2516,10 +2516,22 @@ def produce_outputs( self, trans, out_data, output_collections, incoming, histor log.info(new_labels_dataset_assoc) new_labels_path = new_labels_dataset_assoc.file_name new_labels = open(new_labels_path, "r").readlines(1024 * 1000000) + if new_labels_dataset_assoc.ext == 'tabular' and new_labels_dataset_assoc.metadata.get('columns') == 2: + # We have a tabular file, where the first column is an existing element identifier, + # and the second column is the new element identifier. 
+ source_new_label = (line.strip().split('\t') for line in new_labels) + new_labels_dict = {source: new_label for source, new_label in source_new_label} + for i, dce in enumerate(hdca.collection.elements): + dce_object = dce.element_object + element_identifier = dce.element_identifier + new_label = new_labels_dict.get(element_identifier, element_identifier) + new_elements[new_label] = dce_object.copy() - for i, dce in enumerate(hdca.collection.elements): - dce_object = dce.element_object - new_elements[new_labels[i].strip()] = dce_object.copy() + else: + # If new_labels_dataset_assoc is not a two-column tabular dataset we label with the current line of the dataset + for i, dce in enumerate(hdca.collection.elements): + dce_object = dce.element_object + new_elements[new_labels[i].strip()] = dce_object.copy() output_collections.create_collection( next(iter(self.outputs.values())), "output", elements=new_elements diff --git a/lib/galaxy/tools/relabel_from_file.xml b/lib/galaxy/tools/relabel_from_file.xml index f52c9e38ef90..d4194fefaa41 100644 --- a/lib/galaxy/tools/relabel_from_file.xml +++ b/lib/galaxy/tools/relabel_from_file.xml @@ -8,7 +8,7 @@ class="ModelOperationToolAction"/> - + @@ -42,10 +42,45 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + This tool will take an input list and a text file with new identifiers and build a new list with the same datasets but these new identifiers. + The order and number of entries in the text file must match the order + of the items you want to rename in your dataset collection. + + Alternatively a tabular file may be supplied, where the first column + is the current identifier that should be renamed, and the second column + contains the new label. This file may contain fewer entries than items + in the collection. In that case only matching list identifiers will be + relabeled. This tool will create new history datasets from your collection but your quota usage will not increase. 
diff --git a/test-data/new_labels_2.txt b/test-data/new_labels_2.txt new file mode 100644 index 000000000000..7f11cbd1aafa --- /dev/null +++ b/test-data/new_labels_2.txt @@ -0,0 +1 @@ +i1 new_i1 From 338130a979c99f60a171ba6b9ffca5db0a0e16a8 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 12 Apr 2017 11:54:18 -0400 Subject: [PATCH 3/6] Fix failing test case in relable_from_file based on comment from @mvdbeek. --- lib/galaxy/tools/relabel_from_file.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/tools/relabel_from_file.xml b/lib/galaxy/tools/relabel_from_file.xml index d4194fefaa41..8bab52cea641 100644 --- a/lib/galaxy/tools/relabel_from_file.xml +++ b/lib/galaxy/tools/relabel_from_file.xml @@ -53,7 +53,7 @@ - + From 09bcdd9185e969c0eec34770faf8cb418c0f3e47 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Thu, 13 Apr 2017 09:58:45 -0400 Subject: [PATCH 4/6] Various improvements to relabel from file collection operation. - Better error handling (check for bad characters when creating collections). - Implement a strict mode parameter to do even more validation. - Rework tabular vs txt mode to be explicit user choice. 
- Mirror fix in release_17.01 for datasets not having a history, --- lib/galaxy/tools/__init__.py | 35 +++++++++++------ lib/galaxy/tools/relabel_from_file.xml | 52 +++++++++++++++++++++++++- test-data/new_labels_bad_1.txt | 1 + 3 files changed, 76 insertions(+), 12 deletions(-) create mode 100644 test-data/new_labels_bad_1.txt diff --git a/lib/galaxy/tools/__init__.py b/lib/galaxy/tools/__init__.py index f279af4490a2..1d4861face62 100755 --- a/lib/galaxy/tools/__init__.py +++ b/lib/galaxy/tools/__init__.py @@ -2506,17 +2506,20 @@ def add_elements(collection, prefix=""): ) -class RelabelFromFileTool( DatabaseOperationTool ): +class RelabelFromFileTool(DatabaseOperationTool): tool_type = 'relabel_from_file' - def produce_outputs( self, trans, out_data, output_collections, incoming, history ): - hdca = incoming[ "input" ] - new_labels_dataset_assoc = incoming[ "labels" ] + def produce_outputs(self, trans, out_data, output_collections, incoming, history): + hdca = incoming["input"] + how_type = incoming["how"]["how_select"] + new_labels_dataset_assoc = incoming["how"]["labels"] + strict = string_as_bool(incoming["how"]["strict"]) new_elements = odict() - log.info(new_labels_dataset_assoc) new_labels_path = new_labels_dataset_assoc.file_name new_labels = open(new_labels_path, "r").readlines(1024 * 1000000) - if new_labels_dataset_assoc.ext == 'tabular' and new_labels_dataset_assoc.metadata.get('columns') == 2: + if strict and len(hdca.collection.elements) != len(new_labels): + raise Exception("Relabel mapping file contains incorrect number of identifiers") + if how_type == "tabular": # We have a tabular file, where the first column is an existing element identifier, # and the second column is the new element identifier. 
source_new_label = (line.strip().split('\t') for line in new_labels) @@ -2524,15 +2527,25 @@ def produce_outputs( self, trans, out_data, output_collections, incoming, histor for i, dce in enumerate(hdca.collection.elements): dce_object = dce.element_object element_identifier = dce.element_identifier - new_label = new_labels_dict.get(element_identifier, element_identifier) - new_elements[new_label] = dce_object.copy() - + default = element_identifier if strict else None + new_label = new_labels_dict.get(element_identifier, default) + if not new_label: + raise Exception("Failed to find new label for identifier [%s]" % element_identifier) + copied_value = dce_object.copy() + if getattr(copied_value, "history_content_type", None) == "dataset": + history.add_dataset(copied_value, set_hid=False) + new_elements[new_label] = copied_value else: # If new_labels_dataset_assoc is not a two-column tabular dataset we label with the current line of the dataset for i, dce in enumerate(hdca.collection.elements): dce_object = dce.element_object - new_elements[new_labels[i].strip()] = dce_object.copy() - + copied_value = dce_object.copy() + if getattr(copied_value, "history_content_type", None) == "dataset": + history.add_dataset(copied_value, set_hid=False) + new_elements[new_labels[i].strip()] = copied_value + for key in new_elements.keys(): + if not re.match("^[\w\-_]+$", key): + raise Exception("Invalid new colleciton identifier [%s]" % key) output_collections.create_collection( next(iter(self.outputs.values())), "output", elements=new_elements ) diff --git a/lib/galaxy/tools/relabel_from_file.xml b/lib/galaxy/tools/relabel_from_file.xml index 8bab52cea641..8d37e671c7c4 100644 --- a/lib/galaxy/tools/relabel_from_file.xml +++ b/lib/galaxy/tools/relabel_from_file.xml @@ -8,7 +8,20 @@ class="ModelOperationToolAction"/> - + + + + + + + + + + + + + + @@ -26,6 +39,7 @@ + @@ -53,6 +67,7 @@ + @@ -69,6 +84,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This 
tool will take an input list and a text file with new identifiers diff --git a/test-data/new_labels_bad_1.txt b/test-data/new_labels_bad_1.txt new file mode 100644 index 000000000000..5177cef38e05 --- /dev/null +++ b/test-data/new_labels_bad_1.txt @@ -0,0 +1 @@ +new_i; rm -rf \ No newline at end of file From 2288a88ab793198147fdb5a1050d0325a831d9f8 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Mon, 17 Apr 2017 13:12:57 -0400 Subject: [PATCH 5/6] Fix relabel_from_file collection operation error handling if duplicate identifiers. --- lib/galaxy/tools/__init__.py | 20 ++++++++++++-------- lib/galaxy/tools/relabel_from_file.xml | 11 +++++++++++ test-data/new_labels_bad_2.txt | 2 ++ 3 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 test-data/new_labels_bad_2.txt diff --git a/lib/galaxy/tools/__init__.py b/lib/galaxy/tools/__init__.py index 1d4861face62..e442812a3e94 100755 --- a/lib/galaxy/tools/__init__.py +++ b/lib/galaxy/tools/__init__.py @@ -2515,6 +2515,16 @@ def produce_outputs(self, trans, out_data, output_collections, incoming, history new_labels_dataset_assoc = incoming["how"]["labels"] strict = string_as_bool(incoming["how"]["strict"]) new_elements = odict() + + def add_copied_value_to_new_elements(new_label, dce_object): + new_label = new_label.strip() + if new_label in new_elements: + raise Exception("New identifier [%s] appears twice in resulting collection, these values must be unique." 
% new_label) + copied_value = dce_object.copy() + if getattr(copied_value, "history_content_type", None) == "dataset": + history.add_dataset(copied_value, set_hid=False) + new_elements[new_label] = copied_value + new_labels_path = new_labels_dataset_assoc.file_name new_labels = open(new_labels_path, "r").readlines(1024 * 1000000) if strict and len(hdca.collection.elements) != len(new_labels): @@ -2531,18 +2541,12 @@ def produce_outputs(self, trans, out_data, output_collections, incoming, history new_label = new_labels_dict.get(element_identifier, default) if not new_label: raise Exception("Failed to find new label for identifier [%s]" % element_identifier) - copied_value = dce_object.copy() - if getattr(copied_value, "history_content_type", None) == "dataset": - history.add_dataset(copied_value, set_hid=False) - new_elements[new_label] = copied_value + add_copied_value_to_new_elements(new_label, dce_object) else: # If new_labels_dataset_assoc is not a two-column tabular dataset we label with the current line of the dataset for i, dce in enumerate(hdca.collection.elements): dce_object = dce.element_object - copied_value = dce_object.copy() - if getattr(copied_value, "history_content_type", None) == "dataset": - history.add_dataset(copied_value, set_hid=False) - new_elements[new_labels[i].strip()] = copied_value + add_copied_value_to_new_elements(new_labels[i], dce_object) for key in new_elements.keys(): if not re.match("^[\w\-_]+$", key): raise Exception("Invalid new colleciton identifier [%s]" % key) diff --git a/lib/galaxy/tools/relabel_from_file.xml b/lib/galaxy/tools/relabel_from_file.xml index 8d37e671c7c4..ca806b3b5ec5 100644 --- a/lib/galaxy/tools/relabel_from_file.xml +++ b/lib/galaxy/tools/relabel_from_file.xml @@ -119,6 +119,17 @@ + + + + + + + + + + + This tool will take an input list and a text file with new identifiers diff --git a/test-data/new_labels_bad_2.txt b/test-data/new_labels_bad_2.txt new file mode 100644 index 000000000000..591022662d72 --- 
/dev/null +++ b/test-data/new_labels_bad_2.txt @@ -0,0 +1,2 @@ +newi1 +newi1 \ No newline at end of file From 49abfced0918e2722ad0325aa7df6abbd2c5019e Mon Sep 17 00:00:00 2001 From: John Chilton Date: Mon, 17 Apr 2017 15:53:51 -0400 Subject: [PATCH 6/6] Implement collection operation for filtering lists from a file. --- config/tool_conf.xml.sample | 1 + lib/galaxy/tools/__init__.py | 40 ++++++++- lib/galaxy/tools/filter_from_file.xml | 91 +++++++++++++++++++++ lib/galaxy/tools/relabel_from_file.xml | 7 +- test-data/filter_labels_1.txt | 1 + test/functional/tools/samples_tool_conf.xml | 1 + 6 files changed, 138 insertions(+), 3 deletions(-) create mode 100644 lib/galaxy/tools/filter_from_file.xml create mode 100644 test-data/filter_labels_1.txt diff --git a/config/tool_conf.xml.sample b/config/tool_conf.xml.sample index 2f0399853748..b8b98a82d778 100644 --- a/config/tool_conf.xml.sample +++ b/config/tool_conf.xml.sample @@ -37,6 +37,7 @@ +
diff --git a/lib/galaxy/tools/__init__.py b/lib/galaxy/tools/__init__.py index e442812a3e94..686c4e6e8a5b 100755 --- a/lib/galaxy/tools/__init__.py +++ b/lib/galaxy/tools/__init__.py @@ -2555,11 +2555,49 @@ def add_copied_value_to_new_elements(new_label, dce_object): ) +class FilterFromFileTool(DatabaseOperationTool): + tool_type = 'filter_from_file' + + def produce_outputs(self, trans, out_data, output_collections, incoming, history): + hdca = incoming["input"] + how_filter = incoming["how"]["how_filter"] + filter_dataset_assoc = incoming["how"]["filter_source"] + filtered_elements = odict() + discarded_elements = odict() + + filtered_path = filter_dataset_assoc.file_name + filtered_identifiers_raw = open(filtered_path, "r").readlines(1024 * 1000000) + filtered_identifiers = [i.strip() for i in filtered_identifiers_raw] + + # If filtered_dataset_assoc is not a two-column tabular dataset we label with the current line of the dataset + for i, dce in enumerate(hdca.collection.elements): + dce_object = dce.element_object + element_identifier = dce.element_identifier + in_filter_file = element_identifier in filtered_identifiers + passes_filter = in_filter_file if how_filter == "remove_if_absent" else not in_filter_file + + copied_value = dce_object.copy() + if getattr(copied_value, "history_content_type", None) == "dataset": + history.add_dataset(copied_value, set_hid=False) + + if passes_filter: + filtered_elements[element_identifier] = copied_value + else: + discarded_elements[element_identifier] = copied_value + + output_collections.create_collection( + self.outputs["output_filtered"], "output_filtered", elements=filtered_elements + ) + output_collections.create_collection( + self.outputs["output_discarded"], "output_discarded", elements=discarded_elements + ) + + # Populate tool_type to ToolClass mappings tool_types = {} for tool_class in [ Tool, SetMetadataTool, OutputParameterJSONTool, DataManagerTool, DataSourceTool, AsyncDataSourceTool, - UnzipCollectionTool, 
ZipCollectionTool, MergeCollectionTool, RelabelFromFileTool, + UnzipCollectionTool, ZipCollectionTool, MergeCollectionTool, RelabelFromFileTool, FilterFromFileTool, DataDestinationTool ]: tool_types[ tool_class.tool_type ] = tool_class diff --git a/lib/galaxy/tools/filter_from_file.xml b/lib/galaxy/tools/filter_from_file.xml new file mode 100644 index 000000000000..6dbb28277a4b --- /dev/null +++ b/lib/galaxy/tools/filter_from_file.xml @@ -0,0 +1,91 @@ + + from contents of a file + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/galaxy/tools/relabel_from_file.xml b/lib/galaxy/tools/relabel_from_file.xml index ca806b3b5ec5..797e647819a7 100644 --- a/lib/galaxy/tools/relabel_from_file.xml +++ b/lib/galaxy/tools/relabel_from_file.xml @@ -131,7 +131,10 @@ - + + ]]> diff --git a/test-data/filter_labels_1.txt b/test-data/filter_labels_1.txt new file mode 100644 index 000000000000..928f40b2d212 --- /dev/null +++ b/test-data/filter_labels_1.txt @@ -0,0 +1 @@ +i2 diff --git a/test/functional/tools/samples_tool_conf.xml b/test/functional/tools/samples_tool_conf.xml index 35a1480f8172..415766d06035 100644 --- a/test/functional/tools/samples_tool_conf.xml +++ b/test/functional/tools/samples_tool_conf.xml @@ -147,5 +147,6 @@ +