Merge pull request #3940 from jmchilton/collection_op_filter_from_file

Collection Operation - Filtering from a File
galaxyproject · Apr 20, 2017 · 01b3eb4 · 01b3eb4
2 parents 9f4fd7c + 49abfce
commit 01b3eb4
Show file tree

Hide file tree

Showing 10 changed files with 341 additions and 1 deletion.
diff --git a/config/tool_conf.xml.sample b/config/tool_conf.xml.sample
@@ -36,6 +36,8 @@
     <tool file="${model_tools_path}/filter_failed_collection.xml" />
     <tool file="${model_tools_path}/flatten_collection.xml" />
     <tool file="${model_tools_path}/merge_collection.xml" />
+    <tool file="${model_tools_path}/relabel_from_file.xml" />
+    <tool file="${model_tools_path}/filter_from_file.xml" />
   </section>
   <section id="liftOver" name="Lift-Over">
     <tool file="extract/liftOver_wrapper.xml" />

diff --git a/lib/galaxy/tools/__init__.py b/lib/galaxy/tools/__init__.py
@@ -2506,11 +2506,98 @@ def add_elements(collection, prefix=""):
         )
 
 
+class RelabelFromFileTool(DatabaseOperationTool):
+    tool_type = 'relabel_from_file'
+
+    def produce_outputs(self, trans, out_data, output_collections, incoming, history):
+        hdca = incoming["input"]
+        how_type = incoming["how"]["how_select"]
+        new_labels_dataset_assoc = incoming["how"]["labels"]
+        strict = string_as_bool(incoming["how"]["strict"])
+        new_elements = odict()
+
+        def add_copied_value_to_new_elements(new_label, dce_object):
+            new_label = new_label.strip()
+            if new_label in new_elements:
+                raise Exception("New identifier [%s] appears twice in resulting collection, these values must be unique." % new_label)
+            copied_value = dce_object.copy()
+            if getattr(copied_value, "history_content_type", None) == "dataset":
+                history.add_dataset(copied_value, set_hid=False)
+            new_elements[new_label] = copied_value
+
+        new_labels_path = new_labels_dataset_assoc.file_name
+        new_labels = open(new_labels_path, "r").readlines(1024 * 1000000)
+        if strict and len(hdca.collection.elements) != len(new_labels):
+            raise Exception("Relabel mapping file contains incorrect number of identifiers")
+        if how_type == "tabular":
+            # We have a tabular file, where the first column is an existing element identifier,
+            # and the second column is the new element identifier.
+            source_new_label = (line.strip().split('\t') for line in new_labels)
+            new_labels_dict = {source: new_label for source, new_label in source_new_label}
+            for i, dce in enumerate(hdca.collection.elements):
+                dce_object = dce.element_object
+                element_identifier = dce.element_identifier
+                default = element_identifier if strict else None
+                new_label = new_labels_dict.get(element_identifier, default)
+                if not new_label:
+                    raise Exception("Failed to find new label for identifier [%s]" % element_identifier)
+                add_copied_value_to_new_elements(new_label, dce_object)
+        else:
+            # If new_labels_dataset_assoc is not a two-column tabular dataset we label with the current line of the dataset
+            for i, dce in enumerate(hdca.collection.elements):
+                dce_object = dce.element_object
+                add_copied_value_to_new_elements(new_labels[i], dce_object)
+        for key in new_elements.keys():
+            if not re.match("^[\w\-_]+$", key):
+                raise Exception("Invalid new colleciton identifier [%s]" % key)
+        output_collections.create_collection(
+            next(iter(self.outputs.values())), "output", elements=new_elements
+        )
+
+
+class FilterFromFileTool(DatabaseOperationTool):
+    tool_type = 'filter_from_file'
+
+    def produce_outputs(self, trans, out_data, output_collections, incoming, history):
+        hdca = incoming["input"]
+        how_filter = incoming["how"]["how_filter"]
+        filter_dataset_assoc = incoming["how"]["filter_source"]
+        filtered_elements = odict()
+        discarded_elements = odict()
+
+        filtered_path = filter_dataset_assoc.file_name
+        filtered_identifiers_raw = open(filtered_path, "r").readlines(1024 * 1000000)
+        filtered_identifiers = [i.strip() for i in filtered_identifiers_raw]
+
+        # If filtered_dataset_assoc is not a two-column tabular dataset we label with the current line of the dataset
+        for i, dce in enumerate(hdca.collection.elements):
+            dce_object = dce.element_object
+            element_identifier = dce.element_identifier
+            in_filter_file = element_identifier in filtered_identifiers
+            passes_filter = in_filter_file if how_filter == "remove_if_absent" else not in_filter_file
+
+            copied_value = dce_object.copy()
+            if getattr(copied_value, "history_content_type", None) == "dataset":
+                history.add_dataset(copied_value, set_hid=False)
+
+            if passes_filter:
+                filtered_elements[element_identifier] = copied_value
+            else:
+                discarded_elements[element_identifier] = copied_value
+
+        output_collections.create_collection(
+            self.outputs["output_filtered"], "output_filtered", elements=filtered_elements
+        )
+        output_collections.create_collection(
+            self.outputs["output_discarded"], "output_discarded", elements=discarded_elements
+        )
+
+
 # Populate tool_type to ToolClass mappings
 tool_types = {}
 for tool_class in [ Tool, SetMetadataTool, OutputParameterJSONTool,
                     DataManagerTool, DataSourceTool, AsyncDataSourceTool,
-                    UnzipCollectionTool, ZipCollectionTool, MergeCollectionTool,
+                    UnzipCollectionTool, ZipCollectionTool, MergeCollectionTool, RelabelFromFileTool, FilterFromFileTool,
                     DataDestinationTool ]:
     tool_types[ tool_class.tool_type ] = tool_class
 

diff --git a/lib/galaxy/tools/filter_from_file.xml b/lib/galaxy/tools/filter_from_file.xml
@@ -0,0 +1,91 @@
+<tool id="__FILTER_FROM_FILE__"
+      name="Filter List"
+      version="1.0.0"
+      tool_type="filter_from_file">
+    <description>from contents of a file</description>
+    <type class="FilterFromFileTool" module="galaxy.tools" />
+    <action module="galaxy.tools.actions.model_operations"
+            class="ModelOperationToolAction"/>
+    <inputs>
+        <param type="data_collection" name="input" label="Input Collection" help="A list whose elements will be filtered."/>
+        <conditional name="how">
+            <param type="select" name="how_filter" label="How should the elements to remove be determined?">
+                <option value="remove_if_absent" selected="true">Remove if identifiers are absent from supplied text file.</option>
+                <option value="remove_if_present">Remove if identifiers are present in supplied text file.</option>
+            </param>
+            <when value="remove_if_absent">
+                <param type="data" name="filter_source" format="txt" label="Filter out identifiers absent from" />
+            </when>
+            <when value="remove_if_present">
+                <param type="data" name="filter_source" format="txt" label="Filter out identifiers present in" />
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <collection name="output_filtered" format_source="input" type_source="input" label="${on_string} (filtered)" >
+        </collection>
+        <collection name="output_discarded" format_source="input" type_source="input" label="${on_string} (discarded)" >
+        </collection>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input">
+                <collection type="list">
+                    <element name="i1" value="simple_line.txt" />
+                    <element name="i2" value="simple_line_alternative.txt" />
+                </collection>
+            </param>
+            <param name="how_filter" value="remove_if_present" />
+            <param name="filter_source" value="filter_labels_1.txt" ftype="txt" />
+            <output_collection name="output_filtered" type="list">
+                <element name="i1">
+                  <assert_contents>
+                    <has_text_matching expression="^This is a line of text.\n$" />
+                  </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="output_discarded" type="list">
+                <element name="i2">
+                  <assert_contents>
+                    <has_text_matching expression="^This is a different line of text.\n$" />
+                  </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <test>
+            <param name="input">
+                <collection type="list">
+                    <element name="i1" value="simple_line.txt" />
+                    <element name="i2" value="simple_line_alternative.txt" />
+                </collection>
+            </param>
+            <param name="how_filter" value="remove_if_absent" />
+            <param name="filter_source" value="filter_labels_1.txt" ftype="txt" />
+            <output_collection name="output_filtered" type="list">
+                <element name="i2">
+                  <assert_contents>
+                    <has_text_matching expression="^This is a different line of text.\n$" />
+                  </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="output_discarded" type="list">
+                <element name="i1">
+                  <assert_contents>
+                    <has_text_matching expression="^This is a line of text.\n$" />
+                  </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+    
+.. class:: infomark
+
+        This tool will take an input list and a text file with identifiers to
+        filter the list with. It will build two new lists - one "filtered" to
+        contain only the supplied identifiers and one of the discarded elements.
+
+        This tool will create new history datasets from your collection
+        but your quota usage will not increase.
+    ]]></help>
+</tool>
diff --git a/lib/galaxy/tools/relabel_from_file.xml b/lib/galaxy/tools/relabel_from_file.xml
@@ -0,0 +1,152 @@
+<tool id="__RELABEL_FROM_FILE__"
+      name="Relabel List Identifiers"
+      version="1.0.0"
+      tool_type="relabel_from_file">
+    <description>from contents of a file</description>
+    <type class="RelabelFromFileTool" module="galaxy.tools" />
+    <action module="galaxy.tools.actions.model_operations"
+            class="ModelOperationToolAction"/>
+    <inputs>
+        <param type="data_collection" name="input" label="Input Collection" help="A list whose identifiers will be relabelled."/>
+        <conditional name="how">
+            <param type="select" name="how_select" label="How should the new labels be specified?">
+                <option value="txt">Using lines in a simple text file.</option>
+                <option value="tabular">Map original identifiers to new ones using a two column table.</option>
+            </param>
+            <when value="txt">
+                <param type="data" name="labels" format="txt" label="New Identifiers" />
+                <param name="strict" type="boolean" label="Ensure strict mapping" help="If selected, the target file must contain exactly the correct number of lines." truevalue="true" falsevalue="false" />
+            </when>
+            <when value="tabular">
+                <param type="data" name="labels" format="tabular" label="New Identifiers" />
+                <param name="strict" type="boolean" label="Ensure strict mapping" help="If selected, the target file must contain exactly the correct number of lines and each input identifier must match exactly one element of the input collection." truevalue="true" falsevalue="false" />
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <collection name="output" format_source="input" type_source="input" label="${on_string} (relabelled)" >
+        </collection>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input">
+                <collection type="list:paired">
+                    <element name="i1">
+                        <collection type="paired">
+                            <element name="forward" value="simple_line.txt" />
+                            <element name="reverse" value="simple_line_alternative.txt" />
+                        </collection>
+                    </element>
+                </collection>
+            </param>
+            <param name="how_select" value="txt" />
+            <param name="labels" value="new_labels_1.txt" ftype="txt" />
+            <output_collection name="output" type="list:paired">
+              <element name="new_i1">
+                <element name="forward">
+                  <assert_contents>
+                    <has_text_matching expression="^This is a line of text.\n$" />
+                  </assert_contents>
+                </element>
+                <element name="reverse">
+                  <assert_contents>
+                    <has_text_matching expression="^This is a different line of text.\n$" />
+                  </assert_contents>
+                </element>
+              </element>
+            </output_collection>
+        </test>
+        <test>
+            <param name="input">
+                <collection type="list:paired">
+                    <element name="i1">
+                        <collection type="paired">
+                            <element name="forward" value="simple_line.txt" />
+                            <element name="reverse" value="simple_line_alternative.txt" />
+                        </collection>
+                    </element>
+                </collection>
+            </param>
+            <param name="how_select" value="tabular" />
+            <param name="labels" value="new_labels_2.txt" ftype="tabular" />
+            <output_collection name="output" type="list:paired">
+              <element name="new_i1">
+                <element name="forward">
+                  <assert_contents>
+                    <has_text_matching expression="^This is a line of text.\n$" />
+                  </assert_contents>
+                </element>
+                <element name="reverse">
+                  <assert_contents>
+                    <has_text_matching expression="^This is a different line of text.\n$" />
+                  </assert_contents>
+                </element>
+              </element>
+            </output_collection>
+        </test>
+        <!-- test strict -->
+        <test expect_failure="true">
+            <param name="input">
+                <collection type="list:paired">
+                    <element name="wrongi">
+                        <collection type="paired">
+                            <element name="forward" value="simple_line.txt" />
+                            <element name="reverse" value="simple_line_alternative.txt" />
+                        </collection>
+                    </element>
+                </collection>
+            </param>
+            <param name="how_select" value="tabular" />
+            <param name="labels" value="new_labels_2.txt" ftype="tabular" />
+        </test>
+        <test expect_failure="true">
+            <param name="input">
+                <collection type="list">
+                    <element name="i1" value="simple_line.txt" />
+                    <element name="i3" value="simple_line_alternative.txt" />
+                </collection>
+            </param>
+            <param name="how_select" value="txt" />
+            <param name="labels" value="new_labels_1.txt" ftype="txt" />
+        </test>
+        <!-- test label bad characters -->
+        <test expect_failure="true">
+            <param name="input">
+                <collection type="list">
+                    <element name="i1" value="simple_line.txt" />
+                </collection>
+            </param>
+            <param name="how_select" value="txt" />
+            <param name="labels" value="new_labels_bad_1.txt" ftype="txt" />
+        </test>
+        <!-- test label bad because of duplicates -->
+        <test expect_failure="true">
+            <param name="input">
+                <collection type="list">
+                    <element name="i1" value="simple_line.txt" />
+                    <element name="i2" value="simple_line.txt" />
+                </collection>
+            </param>
+            <param name="how_select" value="txt" />
+            <param name="labels" value="new_labels_bad_2.txt" ftype="txt" />
+        </test>
+    </tests>
+    <help><![CDATA[
+    
+.. class:: infomark
+
+        This tool will take an input list and a text file with new identifiers
+        and build a new list with the same datasets but these new identifiers.
+        The order and number of entries in the text file must match the order
+        of the items you want to rename in your dataset collection.
+
+        Alternatively a tabular file may be supplied, where the first column
+        is the current identifier that should be renamed, and the second column
+        contains the new label. This file may contain fewer entries than items
+        in the collection. In that case only matching list identifiers will be
+        relabeled.
+
+        This tool will create new history datasets from your collection
+        but your quota usage will not increase.
+    ]]></help>
+</tool>
diff --git a/test-data/filter_labels_1.txt b/test-data/filter_labels_1.txt
@@ -0,0 +1 @@
+i2
diff --git a/test-data/new_labels_1.txt b/test-data/new_labels_1.txt
@@ -0,0 +1 @@
+new_i1
diff --git a/test-data/new_labels_2.txt b/test-data/new_labels_2.txt
@@ -0,0 +1 @@
+i1	new_i1
diff --git a/test-data/new_labels_bad_1.txt b/test-data/new_labels_bad_1.txt
@@ -0,0 +1 @@
+new_i; rm -rf
diff --git a/test-data/new_labels_bad_2.txt b/test-data/new_labels_bad_2.txt
@@ -0,0 +1,2 @@
+newi1
+newi1
diff --git a/test/functional/tools/samples_tool_conf.xml b/test/functional/tools/samples_tool_conf.xml
@@ -146,5 +146,7 @@
   <tool file="${model_tools_path}/filter_failed_collection.xml" />
   <tool file="${model_tools_path}/flatten_collection.xml" />
   <tool file="${model_tools_path}/merge_collection.xml" />
+  <tool file="${model_tools_path}/relabel_from_file.xml" />
+  <tool file="${model_tools_path}/filter_from_file.xml" />
 
 </toolbox>