adding rule to intersect bed (#65)

* when using bedtools intersect, keep only the highest intersection of annotation * Update merge_annotations.nf * updating versioning information * change modules to use named outputs for better readability * fixed optional emissions * fix awk print statement * Update CHANGELOG.md * Update CHANGELOG.md
fmalmeida · Sep 17, 2022 · 8841acf · 8841acf
1 parent 620b3a5
commit 8841acf
Show file tree

Hide file tree

Showing 40 changed files with 277 additions and 343 deletions.
diff --git a/.zenodo.json b/.zenodo.json
@@ -2,7 +2,7 @@
     "description": "<p>The pipeline</p>\n\n<p>bacannot, is a customisable, easy to use, pipeline that uses state-of-the-art software for comprehensively annotating prokaryotic genomes having only Docker and Nextflow as dependencies. It is able to annotate and detect virulence and resistance genes, plasmids, secondary metabolites, genomic islands, prophages, ICEs, KO, and more, while providing nice an beautiful interactive documents for results exploration.</p>", 
     "license": "other-open", 
     "title": "fmalmeida/bacannot: A generic but comprehensive bacterial annotation pipeline", 
-    "version": "v3.1.4", 
+    "version": "v3.1.5", 
     "upload_type": "software",
     "creators": [
         {

diff --git a/markdown/CHANGELOG.md b/markdown/CHANGELOG.md
@@ -2,6 +2,12 @@
 
 The tracking for changes started in v2.1
 
+## v3.1.5 [17-September-2022]
+
+* Fixes https://github.com/fmalmeida/bacannot/issues/64 reported by @fmalmeida, which highlights that resfinder annotations were sometimes duplicated because a single gene intersected more than one CDS region, even though only one of those intersections was the "true" one.
+  - To solve this issue, intersections are now sorted by overlap length, and only the first occurrence (per gene) is kept.
+* As reported in issue #39, named outputs are now used in modules for better readability.
+
 ## v3.1.4 [13-September-2022]
 
 * Fixes https://github.com/fmalmeida/bacannot/issues/62 reported by @rujinlong, where the Island-Path tool was failing because it was running on GenBank files with no true CDS. This was happening because Bakta writes in the comments that the GBK has 0 CDS and, at first, the module was selecting GBK files by checking whether the CDS string was present. It has now been modified to also work with Bakta.

diff --git a/modules/KOs/kegg-decoder.nf b/modules/KOs/kegg-decoder.nf
@@ -7,9 +7,8 @@ process KEGG_DECODER {
   tuple val(prefix), path('input_mapper.txt')
 
   output:
-  // Grab all outputs
-  path("*") // Get all files to input directory
-  tuple val(prefix), path("*.svg") // get svg
+  path("*")                       , emit: all     // Get all files to input directory
+  tuple val(prefix), path("*.svg"), emit: results // get svg
 
   script:
   """

diff --git a/modules/KOs/kofamscan.nf b/modules/KOs/kofamscan.nf
@@ -12,8 +12,8 @@ process KOFAMSCAN {
 
   output:
   // Grab all outputs
-  file("KOfamscan")
-  tuple val(prefix), file("KOfamscan/${prefix}_ko_forKEGGMapper.txt")
+  path("KOfamscan"), emit: all
+  tuple val(prefix), path("KOfamscan/${prefix}_ko_forKEGGMapper.txt"), emit: results
 
   script:
   """

diff --git a/modules/MGEs/digIS.nf b/modules/MGEs/digIS.nf
@@ -12,10 +12,9 @@ process DIGIS {
   tuple val(prefix), path(genome), path(genbank)
 
   output:
-  // Grab results
-  file("digIS")
-  tuple val(prefix), path("digIS/results/${prefix}.gff")
-  tuple val(prefix), path("${prefix}_IS.gff"), path("digIS/results/fastas/${prefix}_IS.fa"), path("digIS/results/fastas/${prefix}_IS.faa")
+  path("digIS")                                         , emit: all
+  tuple val(prefix), path("digIS/results/${prefix}.gff"), emit: gff
+  tuple val(prefix), path("${prefix}_IS.gff"), path("digIS/results/fastas/${prefix}_IS.fa"), path("digIS/results/fastas/${prefix}_IS.faa"), emit: gff_and_sequences
 
   script:
   """

diff --git a/modules/MGEs/draw_gis.nf b/modules/MGEs/draw_gis.nf
@@ -11,9 +11,8 @@ process DRAW_GIS {
   tuple val(prefix), file(gff), file(gis_bed)
 
   output:
-  // Outputs must be linked to each prefix (tag)
-  tuple val(prefix), file("plots")     optional true
-  tuple val(prefix), file("teste.png") optional true
+  tuple val(prefix), file("plots")    , emit: all     optional true
+  tuple val(prefix), file("teste.png"), emit: example optional true
 
   script:
   """

diff --git a/modules/MGEs/iceberg.nf b/modules/MGEs/iceberg.nf
@@ -9,11 +9,10 @@ process ICEBERG {
   file(bacannot_db)
 
   output:
-  // Outputs must be linked to each prefix (tag)
-  tuple val(prefix), file("${prefix}_iceberg_blastp_onGenes.summary.txt")
-  tuple val(prefix), file("${prefix}_iceberg_blastp_onGenes.txt")
-  tuple val(prefix), file("${prefix}_iceberg_blastn_onGenome.summary.txt")
-  file('*.txt') // Grab all
+  tuple val(prefix), path("${prefix}_iceberg_blastp_onGenes.summary.txt") , emit: genes_summary
+  tuple val(prefix), path("${prefix}_iceberg_blastp_onGenes.txt")         , emit: results
+  tuple val(prefix), path("${prefix}_iceberg_blastn_onGenome.summary.txt"), emit: genome_summary
+  path('*.txt')                                                           , emit: all
 
   script:
   """

diff --git a/modules/MGEs/islandpath.nf b/modules/MGEs/islandpath.nf
@@ -7,8 +7,7 @@ process ISLANDPATH {
   tuple val(prefix), file("annotation.gbk")
 
   output:
-  // Outputs must be linked to each prefix (tag)
-  tuple val(prefix), path("${prefix}_predicted_GIs.bed")
+  tuple val(prefix), path("${prefix}_predicted_GIs.bed"), emit: results
 
   script:
   """

diff --git a/modules/MGEs/plasmidfinder.nf b/modules/MGEs/plasmidfinder.nf
@@ -11,8 +11,8 @@ process PLASMIDFINDER {
   file(bacannot_db)
 
   output:
-  tuple val(prefix), path("plasmidfinder")
-  tuple val(prefix), path("plasmidfinder/results_tab.tsv")
+  tuple val(prefix), path("plasmidfinder")                , emit: all
+  tuple val(prefix), path("plasmidfinder/results_tab.tsv"), emit: results
 
   script:
   """

diff --git a/modules/MGEs/platon.nf b/modules/MGEs/platon.nf
@@ -12,9 +12,9 @@ process PLATON {
   file(bacannot_db)
 
   output:
-  path("platon")
-  tuple val(prefix), path("platon/${prefix}.tsv")
-  path("platon_version.txt")
+  path("platon")                                 , emit: all
+  tuple val(prefix), path("platon/${prefix}.tsv"), emit: results
+  path("platon_version.txt")                     , emit: version
 
   script:
   """

diff --git a/modules/assembly/flye.nf b/modules/assembly/flye.nf
@@ -11,10 +11,10 @@ process FLYE {
   tuple val(prefix), val(entrypoint), file(sread1), file(sread2), file(sreads), file(lreads), val(lr_type), file(fast5), val(assembly), val(resfinder_species)
 
   output:
-  file "flye_${prefix}" // Saves all files
+  path "flye_${prefix}", emit: all // Saves all files
   // Keep tuple structure to mixing channels
-  tuple val("${prefix}"), val("${entrypoint}"), val("${sread1}"), val("${sread2}"), val("${sreads}"), file("${lreads}"), val("${lr_type}"), file("${fast5}"), file("flye_${prefix}.fasta"), val("${resfinder_species}")
-  file('flye_version.txt')
+  tuple val("${prefix}"), val("${entrypoint}"), val("${sread1}"), val("${sread2}"), val("${sreads}"), path("${lreads}"), val("${lr_type}"), path("${fast5}"), path("flye_${prefix}.fasta"), val("${resfinder_species}"), emit: results
+  path('flye_version.txt'), emit: version
 
   script:
   lr = (lr_type == 'nanopore') ? '--nano-raw' : '--pacbio-raw'

diff --git a/modules/assembly/unicycler.nf b/modules/assembly/unicycler.nf
@@ -11,10 +11,10 @@ process UNICYCLER {
   tuple val(prefix), val(entrypoint), file(sread1), file(sread2), file(sreads), file(lreads), val(lr_type), file(fast5), val(assembly), val(resfinder_species)
 
   output:
-  file "unicycler_${prefix}" // Save everything
+  path "unicycler_${prefix}", emit: all // Save everything
   // Keep tuple structure to mixing channels
-  tuple val("${prefix}"), val("${entrypoint}"), val("${sread1}"), val("${sread2}"), val("${sreads}"), file("${lreads}"), val("${lr_type}"), file("${fast5}"), file("unicycler_${prefix}.fasta"), val("${resfinder_species}")
-  file('unicycler_version.txt')
+  tuple val("${prefix}"), val("${entrypoint}"), val("${sread1}"), val("${sread2}"), val("${sreads}"), path("${lreads}"), val("${lr_type}"), path("${fast5}"), path("unicycler_${prefix}.fasta"), val("${resfinder_species}"), emit: results
+  path('unicycler_version.txt'), emit: version
 
   script:
   unpaired_param = (sreads.getName() != "input.3") ? "-s $sreads" : ""

diff --git a/modules/generic/antismash.nf b/modules/generic/antismash.nf
@@ -11,10 +11,9 @@ process ANTISMASH {
   file(bacannot_db)
 
   output:
-  // Grab results
-  tuple val(prefix), path("antiSMASH/regions.gff")
-  path("antiSMASH")
-  path("*_version.txt")
+  tuple val(prefix), path("antiSMASH/regions.gff"), emit: gff
+  path("antiSMASH")                               , emit: all
+  path("*_version.txt")                           , emit: version
 
   script:
   def gbk_suffix = (params.bakta_db) ? "gbff" : "gbk"

diff --git a/modules/generic/bakta.nf b/modules/generic/bakta.nf
@@ -13,17 +13,17 @@ process BAKTA {
 
     output:
     // Grab all outputs
-    file "annotation"
+    path "annotation", emit: all
     // Outputs must be linked to each prefix (tag)
-    tuple val(prefix), file("annotation/${prefix}.gff3") // annotation in gff format
-    tuple val(prefix), file("annotation/${prefix}.gbff") // annotation in gbk format
-    tuple val(prefix), file("annotation/${prefix}.fna") // renamed genome
-    tuple val(prefix), file("annotation/${prefix}.faa") // gene aa sequences
-    tuple val(prefix), file("annotation/${prefix}.ffn") // gene nt sequences
-    tuple val(prefix), file("annotation/${prefix}.fna"), file("${lreads}"), file("${fast5}") // For methylation calling
-    tuple val(prefix), file("annotation/${prefix}.fna"), val("${resfinder_species}") // For resfinder
-    tuple val(prefix), file("annotation/${prefix}.txt") // bakta stats
-    file('bakta_version.txt') // Save bakta version
+    tuple val(prefix), path("annotation/${prefix}.gff3"), emit: gff // annotation in gff format
+    tuple val(prefix), path("annotation/${prefix}.gbff"), emit: gbk // annotation in gbk format
+    tuple val(prefix), path("annotation/${prefix}.fna") , emit: genome // renamed genome
+    tuple val(prefix), path("annotation/${prefix}.faa") , emit: proteins // gene aa sequences
+    tuple val(prefix), path("annotation/${prefix}.ffn") , emit: genes // gene nt sequences
+    tuple val(prefix), path("annotation/${prefix}.fna"), path("${lreads}"), path("${fast5}"), emit: genome_with_fast5 // For methylation calling
+    tuple val(prefix), path("annotation/${prefix}.fna"), val("${resfinder_species}"), emit: genome_with_species // For resfinder
+    tuple val(prefix), path("annotation/${prefix}.txt") , emit: summary // bakta stats
+    path('bakta_version.txt'), emit: version // Save bakta version
 
     script:
     """

diff --git a/modules/generic/barrnap.nf b/modules/generic/barrnap.nf
@@ -10,9 +10,9 @@ process BARRNAP {
    tuple val(prefix), file(genome)
 
    output:
-   tuple val(prefix), path("${prefix}_rRNA.gff")
-   tuple val(prefix), path("${prefix}_rRNA.fa")
-   path('barrnap_version.txt')
+   tuple val(prefix), path("${prefix}_rRNA.gff"), emit: gff
+   tuple val(prefix), path("${prefix}_rRNA.fa") , emit: fasta
+   path('barrnap_version.txt')                  , emit: version
 
    script:
    """

diff --git a/modules/generic/compute_gc.nf b/modules/generic/compute_gc.nf
@@ -6,8 +6,7 @@ process COMPUTE_GC {
   tuple val(prefix), file(genome)
 
   output:
-  // Outputs must be linked to each prefix (tag)
-  tuple val(prefix), path("input_GC_500_bps.sorted.bedGraph"), path("input.sizes")
+  tuple val(prefix), path("input_GC_500_bps.sorted.bedGraph"), path("input.sizes"), emit: results
 
   script:
   """

diff --git a/modules/generic/custom_database.nf b/modules/generic/custom_database.nf
@@ -8,11 +8,10 @@ process CUSTOM_DATABASE {
   each file(customDB)
 
   output:
-  // Outputs must be linked to each prefix (tag)
-  tuple val(prefix), val("${customDB.baseName}"), path("${prefix}_${customDB.baseName}*.summary.txt")
-  tuple val(prefix), path("${customDB.baseName}_custom_db.gff")
-  path('*.txt') // Grab all
-  path(customDB)
+  tuple val(prefix), val("${customDB.baseName}"), path("${prefix}_${customDB.baseName}*.summary.txt"), emit: summary
+  tuple val(prefix), path("${customDB.baseName}_custom_db.gff")                                      , emit: gff
+  path('*.txt')                                                                                      , emit: all
+  path(customDB)                                                                                     , emit: db
 
   script:
   """

diff --git a/modules/generic/custom_database_report.nf b/modules/generic/custom_database_report.nf
@@ -10,7 +10,7 @@ process CUSTOM_DATABASE_REPORT {
   tuple val(prefix), val(customDB), file(custom_blast), file(custom_gff)
 
   output:
-  file '*.html'
+  path '*.html', emit: results
 
   script:
   """

diff --git a/modules/generic/gff2gbk.nf b/modules/generic/gff2gbk.nf
@@ -7,7 +7,7 @@ process GFF2GBK {
   tuple val(prefix), file(gff), file(input)
 
   output:
-  file "*.genbank"
+  path "*.genbank", emit: results
 
   """
   # Activate env

diff --git a/modules/generic/gff2sql.nf b/modules/generic/gff2sql.nf
@@ -10,8 +10,8 @@ process CREATE_SQL {
     tuple val(prefix), file(gff), file(genes_nt), file(genes_aa), file(genome), file("digIS.gff"), file("digIS.fa"), file("digIS.faa")
 
   output:
-    file "${prefix}.sqlite"
-    file "run_server.sh"
+  path "${prefix}.sqlite", emit: results
+  path "run_server.sh"   , emit: script
 
   script:
   """

diff --git a/modules/generic/jbrowse.nf b/modules/generic/jbrowse.nf
@@ -7,7 +7,7 @@ process JBROWSE {
   tuple val(prefix), file(merged_gff), file(draft), file("prokka_gff"), file(barrnap), file(gc_bedGraph), file(gc_chrSizes), file(resfinder_gff), file(phigaro), file(genomic_islands), file("methylation"), file("chr.sizes"), file(phispy_tsv), file(digIS_gff), file(antiSMASH), file(custom_annotations)
 
   output:
-  file "*"
+  path "*", emit: results
 
   script:
   """

diff --git a/modules/generic/mash.nf b/modules/generic/mash.nf
@@ -10,9 +10,8 @@ process REFSEQ_MASHER {
   tuple val(prefix), path(genome)
 
   output:
-  // Grab results
-  tuple val(prefix), path("refseq_masher_results.txt")
-  path("*_version.txt")
+  tuple val(prefix), path("refseq_masher_results.txt"), emit: results
+  path("*_version.txt")                               , emit: version
 
   script:
   """

diff --git a/modules/generic/merge_annotations.nf b/modules/generic/merge_annotations.nf
@@ -7,10 +7,10 @@ process MERGE_ANNOTATIONS {
   tuple val(prefix), file('prokka_gff'), file(kofamscan), file(vfdb), file(victors), file(amrfinder), file(resfinder), file(rgi), file(iceberg), file(phast), file('digis_gff'), file(custom_databases)
 
   output:
-  tuple val(prefix), path("${prefix}.gff")
-  tuple val(prefix), path("transposable_elements_digis.gff")
-  tuple val(prefix), path("custom_database_*.gff") optional true
-  path("*.gff")
+  tuple val(prefix), path("${prefix}.gff")                  , emit: gff
+  tuple val(prefix), path("transposable_elements_digis.gff"), emit: digis_gff
+  tuple val(prefix), path("custom_database_*.gff")          , emit: customdb_gff optional true
+  path("*.gff")                                             , emit: all
 
   script:
   """
@@ -81,7 +81,7 @@ process MERGE_ANNOTATIONS {
   #### Resfinder
   if [ ! \$(cat $resfinder | wc -l) -eq 0 ]
   then
-    bedtools intersect -a $resfinder -b ${prefix}.gff -wo > resfinder_intersected.txt ;
+    # Keep one intersection per resfinder hit: order rows by overlap size (field 19 of the -wo output, assuming 9-column GFF on both sides) numerically and descending, splitting on tabs only so spaces inside GFF attributes do not shift the field count; awk then keeps the first row seen for each column-9 value.
+    bedtools intersect -a $resfinder -b ${prefix}.gff -wo | sort -t \$'\\t' -k19,19nr | awk -F '\\t' '!seen[\$9]++' > resfinder_intersected.txt ;
     addBedtoolsIntersect.R -g ${prefix}.gff -t resfinder_intersected.txt --type Resistance --source Resfinder -o ${prefix}.gff ;
     grep "Resfinder" ${prefix}.gff > resistance_resfinder.gff ;
     rm -f resfinder_intersected.txt ;
@@ -93,7 +93,7 @@ process MERGE_ANNOTATIONS {
     if [ ! \$(cat \$file | wc -l) -eq 0 ]
     then
       db=\${file%%_custom_db.gff} ;
-      bedtools intersect -a \${file} -b ${prefix}.gff -wo > bedtools_intersected.txt ;
+      # Keep one intersection per custom-db hit: order rows by overlap size (field 19 of the -wo output, assuming 9-column GFF on both sides) numerically and descending, splitting on tabs only so spaces inside GFF attributes do not shift the field count; awk then keeps the first row seen for each column-9 value.
+      bedtools intersect -a \${file} -b ${prefix}.gff -wo | sort -t \$'\\t' -k19,19nr | awk -F '\\t' '!seen[\$9]++' > bedtools_intersected.txt ;
       addBedtoolsIntersect.R -g ${prefix}.gff -t bedtools_intersected.txt --type "CDS" --source "\${db}" -o ${prefix}.gff ;
       grep "\${db}" ${prefix}.gff > custom_database_\${db}.gff ;
       rm -f bedtools_intersected.txt ;

diff --git a/modules/generic/methylation.nf b/modules/generic/methylation.nf
@@ -10,12 +10,11 @@ process CALL_METHYLATION {
   tuple val(prefix), file(draft), file(reads), file(fast5)
 
   output:
-  // Grab all outputs
-  file "*_calls.tsv" optional true
-  file "*_frequency.tsv" optional true
-  tuple val(prefix), file("methylation_frequency.bedGraph") optional true
-  tuple val(prefix), file("chr.sizes") optional true
-  file('nanopolish_version.txt')
+  path "*_calls.tsv"                                       , emit: results     optional true
+  path "*_frequency.tsv"                                   , emit: frequencies optional true
+  tuple val(prefix), path("methylation_frequency.bedGraph"), emit: bedgraph    optional true
+  tuple val(prefix), path("chr.sizes")                     , emit: chr_sizes   optional true
+  path('nanopolish_version.txt')                           , emit: version
 
   when:
   // When an entry does not exist, it is created as 'input'

diff --git a/modules/generic/mlst.nf b/modules/generic/mlst.nf
@@ -11,9 +11,9 @@ process MLST {
    file(bacannot_db)
 
    output:
-   tuple val(prefix), path("${prefix}_mlst_analysis.txt")   optional true
-   tuple val(prefix), path("${prefix}_novel_alleles.fasta") optional true
-   path('mlst_version.txt')
+   tuple val(prefix), path("${prefix}_mlst_analysis.txt")  , emit: results optional true
+   tuple val(prefix), path("${prefix}_novel_alleles.fasta"), emit: alleles optional true
+   path('mlst_version.txt')                                , emit: version
 
    script:
    """

diff --git a/modules/generic/ncbi_protein.nf b/modules/generic/ncbi_protein.nf
@@ -5,7 +5,7 @@ process GET_NCBI_PROTEIN {
   file(ncbi_accs)
 
   output:
-  path("ncbi_protein.faa")
+  path("ncbi_protein.faa"), emit: proteins
 
   script:
   """

diff --git a/modules/generic/prokka.nf b/modules/generic/prokka.nf
@@ -13,17 +13,17 @@ process PROKKA {
 
     output:
     // Grab all outputs
-    path("annotation")
+    path("annotation"), emit: all
     // Outputs must be linked to each prefix (tag)
-    tuple val(prefix), path("annotation/${prefix}.gff")
-    tuple val(prefix), path("annotation/${prefix}.gbk")
-    tuple val(prefix), path("annotation/${prefix}.fna")
-    tuple val(prefix), path("annotation/${prefix}.faa")
-    tuple val(prefix), path("annotation/${prefix}.ffn")
-    tuple val(prefix), path("annotation/${prefix}.fna"), path("${lreads}"), path("${fast5}")
-    tuple val(prefix), path("annotation/${prefix}.fna"), val("${resfinder_species}")
-    tuple val(prefix), path("annotation/${prefix}.txt")
-    path('prokka_version.txt')
+    tuple val(prefix), path("annotation/${prefix}.gff"), emit: gff
+    tuple val(prefix), path("annotation/${prefix}.gbk"), emit: gbk
+    tuple val(prefix), path("annotation/${prefix}.fna"), emit: genome
+    tuple val(prefix), path("annotation/${prefix}.faa"), emit: proteins
+    tuple val(prefix), path("annotation/${prefix}.ffn"), emit: genes
+    tuple val(prefix), path("annotation/${prefix}.fna"), path("${lreads}"), path("${fast5}"), emit: genome_with_fast5
+    tuple val(prefix), path("annotation/${prefix}.fna"), val("${resfinder_species}"), emit: genome_with_species
+    tuple val(prefix), path("annotation/${prefix}.txt"), emit: summary
+    path('prokka_version.txt'), emit: version
 
     script:
     kingdom = (params.prokka_kingdom)      ? "--kingdom ${params.prokka_kingdom}"        : ''

diff --git a/modules/generic/reports.nf b/modules/generic/reports.nf
@@ -7,7 +7,7 @@ process REPORT {
   tuple val(prefix), file('annotation_stats.tsv'), file(gff), file(barrnap), file(mlst), file(keggsvg), file(refseq_masher_txt), file(amrfinder), file(rgi), file(rgi_parsed), file(rgi_heatmap), file(argminer_out), file(resfinder_tab), file(resfinder_point), file(resfinder_phenotable), file(vfdb_blastn), file(victors_blastp), file(phigaro_txt), file(phispy_tsv), file(iceberg_blastp), file(iceberg_blastn), file(plasmids_tsv), file(platon_tsv), file(gi_image), file(phast_blastp), file(digIS)
 
   output:
-  file '*.html'
+  path '*.html', emit: results
 
   script:
   def generic_annotator = (params.bakta_db) ? "bakta" : "prokka"