galaxyproject · stephanflemming · Oct 16, 2020 · Oct 16, 2020 · Oct 16, 2020 · Oct 16, 2020
@@ -0,0 +1,169 @@
+<tool id="cd_hit_454" name="CD-HIT 454" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
+    <!-- One-line summary of what the tool does. -->
+    <description>identifies duplicates from 454 reads</description>
+    <!-- macros.xml is imported here; the <expand/> targets and the @...@ tokens used
+         throughout this file are presumably defined there (not visible in this file). -->
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <!-- Version probe: keeps the line containing "CD-HIT version" from the `cd-hit`
+         output and prints its 4th space-separated field.
+         NOTE(review): this queries `cd-hit`, not `cd-hit-454`; presumably both binaries
+         come from the same package and report the same version (confirm). -->
+    <version_command><![CDATA[cd-hit | grep "CD-HIT version" | cut -d" " -f 4]]></version_command>
+    <!-- Command line for cd-hit-454.
+         * The dataset selected as `i` is the input; output is written under the fixed
+           relative name `result`.
+         * Each numeric option is passed straight from the parameter of the same name.
+         * $g and $out.bak are emitted bare, so the expanded macros presumably supply the
+           complete flag text, or nothing (confirm in macros.xml).
+         * For -M and -T the backslash leaves the ${...} expression for the shell, which
+           falls back to 0 (memory) and 1 (threads) when GALAXY_MEMORY_MB / GALAXY_SLOTS
+           are unset.
+         * @LOG@ is a token that is not defined in this file; presumably it handles log
+           capture and therefore has to stay last (confirm in macros.xml).
+         detect_errors="exit_code": a non-zero exit status marks the job as failed. -->
+    <command detect_errors="exit_code"><![CDATA[
+cd-hit-454
+-i '$i'
+-o 'result'
+-b $b
+-c $c
+-n $n
+-aL $aL
+-AL $AL
+-aS $aS
+-AS $AS
+$g
+-D $D
+-match $match
+-mismatch $mismatch
+-gap $gap
+-gap-ext $gapext
+$out.bak
+-M \${GALAXY_MEMORY_MB:-0}
+-T \${GALAXY_SLOTS:-1}
+@LOG@
+    ]]></command>
+    <inputs>
+        <!-- Reads to be checked for duplicates; only FASTA datasets are selectable. -->
+        <param argument="-i" type="data" format="fasta" label="Select file with reads"/>
+        <!-- The parameters below are expanded from macros that are not defined in this file;
+             a `default` attribute presumably sets the tool-specific default value (confirm in macros.xml). -->
+        <expand macro="c" default="0.98"/>
+        <expand macro="b" default="10"/>
+        <expand macro="n" default="10"/>
+        <expand macro="aL"/>
+        <expand macro="AL"/>
+        <expand macro="aS"/>
+        <expand macro="AS"/>
+        <expand macro="g"/>
+        <!-- An indel size cannot be negative: min="0" makes the form reject such values
+             instead of handing them to cd-hit-454. The default of 1 is unchanged. -->
+        <param argument="-D" type="integer" min="0" value="1" label="Set maximum size per indel" help="Maximum number of bases allowed in a single insertion or deletion."/>
+        <expand macro="match"/>
+        <expand macro="mismatch" default="-1"/>
+        <expand macro="gap" default="-3"/>
+        <expand macro="gapext"/>
+        <!-- Optional outputs: referenced in the command as $out.bak and by the tests as out/bak and out/log. -->
+        <section name="out" title="Output options">
+            <expand macro="bak"/>
+            <expand macro="log"/>
+        </section>
+    </inputs>
+    <!-- Output datasets, each declared through a macro that is not defined in this file.
+         Both tests enable out/bak and out/log and expect four outputs, so
+         out_clusters_backup and out_log are presumably conditional on those options
+         (confirm in macros.xml). -->
+    <outputs>
+        <expand macro="out_clusters"/>
+        <expand macro="out_clusters_backup"/>
+        <expand macro="out_sequences"/>
+        <expand macro="out_log"/>
+    </outputs>
+    <tests>
+        <!-- #1 default: every clustering parameter is left at its default value; only the
+             optional backup and log outputs are switched on so that all four outputs
+             can be asserted. -->
+        <test expect_num_outputs="4">
+            <param name="i" value="nucleotide_sequences.fasta"/>
+            <section name="out">
+                <param name="bak" value="true"/>
+                <param name="log" value="true"/>
+            </section>
+            <output name="out_clusters">
+                <assert_contents>
+                    <has_n_lines n="65"/>
+                    <has_line line=">Cluster 27"/>
+                    <has_text_matching expression="0.+183nt.+"/>
+                </assert_contents>
+            </output>
+            <output name="out_clusters_backup">
+                <assert_contents>
+                    <has_n_lines n="37"/>
+                    <has_text_matching expression="6.+241nt"/>
+                </assert_contents>
+            </output>
+            <output name="out_sequences">
+                <assert_contents>
+                    <has_n_lines n="56"/>
+                    <has_line line=">M44Fcsw_200453"/>
+                </assert_contents>
+            </output>
+            <output name="out_log">
+                <assert_contents>
+                    <has_line line="program completed !"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- #2 custom: same input as test #1, but every tool parameter is set to a
+             non-default value; backup and log outputs are again switched on so that
+             all four outputs can be asserted. -->
+        <test expect_num_outputs="4">
+            <param name="i" value="nucleotide_sequences.fasta"/>
+            <param name="c" value="0.97"/>
+            <param name="b" value="9"/>
+            <param name="n" value="11"/>
+            <param name="aL" value="0.9"/>
+            <param name="AL" value="99999998"/>
+            <param name="aS" value="0.9"/>
+            <param name="AS" value="99999998"/>
+            <param name="g" value="true"/>
+            <param name="D" value="2"/>
+            <param name="match" value="3"/>
+            <param name="mismatch" value="-2"/>
+            <param name="gap" value="-4"/>
+            <param name="gapext" value="-2"/>
+            <section name="out">
+                <param name="bak" value="true"/>
+                <param name="log" value="true"/>
+            </section>
+            <output name="out_clusters">
+                <assert_contents>
+                    <has_n_lines n="60"/>
+                    <has_line line=">Cluster 22"/>
+                    <has_text_matching expression="0.+183nt.+"/>
+                </assert_contents>
+            </output>
+            <output name="out_clusters_backup">
+                <assert_contents>
+                    <has_n_lines n="37"/>
+                    <has_text_matching expression="3.+241nt.+"/>
+                </assert_contents>
+            </output>
+            <output name="out_sequences">
+                <assert_contents>
+                    <has_n_lines n="46"/>
+                    <has_line line=">M43Fcsw_250770"/>
+                </assert_contents>
+            </output>
+            <output name="out_log">
+                <assert_contents>
+                    <has_line line="program completed !"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+.. class:: infomark
+
+**What it does**
+
+@WID@
+
+*CD-HIT-454* is part of CD-HIT and identifies duplicated 454 reads. It was developed by reengineering *CD-HIT-EST*. Duplicates are either exactly identical or meet these criteria:
+
+    (1) they start at the same position
+    (2) their lengths can be different, but the shorter one must be fully aligned with the longer one (the seed)
+    (3) they can only have 4% mismatches (insertion, deletion, and substitution)
+    (4) only 1 base is allowed per insertion or deletion
+
+Here, criteria (3) and (4) can be adjusted by users. Mismatches are allowed in order to tolerate sequencing errors. Visit the `project wiki <https://github.com/weizhongli/cdhit/wiki/3.-User's-Guide#CDHIT454_clustering>`_ for a detailed description.
+
+**Input**
+
+Reads in FASTA format.
+
+**Output**
+
+- Representative sequences in FASTA format
+@CLUSTER@
+@CLUSTER_BACKUP@
+- Log file
+
+.. class:: infomark
+
+**References**
+
+@REFERENCES@
+    ]]></help>
+    <expand macro="citations"/>
+</tool>