addition of ncbi_protein annotation module #49

Closed · wants to merge 12 commits
54 changes: 54 additions & 0 deletions bin/gbk2faa.py
@@ -0,0 +1,54 @@
#!/usr/bin/env python

# import required libraries
from Bio import SeqIO
import sys

# get filename from command line
filename = sys.argv[-1]
gbk_filename = filename

# open file connection
input_handle = open(gbk_filename, "r")

# define a helper for printing to stderr
def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)

for seq_record in SeqIO.parse(input_handle, "genbank"):

    # start all blank to check for errors later
    organism = None
    product = None
    gene = None

    for seq_feature in seq_record.features:

        # get organism
        if seq_feature.type.lower() == "source":
            organism = seq_feature.qualifiers['organism'][0]

        # get product (leave as None if the qualifier is absent, so it is reported below)
        if seq_feature.type.lower() == "protein":
            product = seq_feature.qualifiers.get('product', [None])[0]
            if product is not None:
                product = product.replace(" ", "_")

        # get gene name if entry has only a CDS definition
        if seq_feature.type.lower() == "cds":
            gene = seq_feature.qualifiers.get('gene', [gene])[0]

        # get gene name if entry has only a Gene definition
        if seq_feature.type.lower() == "gene":
            gene = seq_feature.qualifiers.get('gene', [gene])[0]

    # save sequence info
    seq = seq_record.seq
    acc = seq_record.name

    # print
    if gene is None or product is None:
        eprint(f"An error has been found with entry {acc}. Either its gene (value found: {gene}) or product (value found: {product}) was not found in the GenBank record.\nPlease make sure this entry comes from the NCBI Protein db and has its Gene/Protein information properly formatted.")
    else:
        print(f">NCBI_PROTEIN~~~{gene}~~~{acc}~~~{product}~~~[{organism}]\n{seq}")

# close file connection
input_handle.close()
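For context, a minimal usage sketch of the new script (not part of the diff; file names are hypothetical, and the header layout is the one printed by the script above):

```bash
# Hypothetical example: convert a GenBank file of NCBI Protein entries into the
# "~~~"-separated FASTA format expected by the pipeline. Entries with missing
# gene/product information are reported on stderr.
python bin/gbk2faa.py ncbi_proteins.gbk > ncbi_proteins.faa 2> gbk2faa_errors.log

# Each valid record becomes a header of the form:
# >NCBI_PROTEIN~~~<gene>~~~<accession>~~~<product>~~~[<organism>]
head -n 2 ncbi_proteins.faa
```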
File renamed without changes.
26 changes: 13 additions & 13 deletions docker/scripts/pscripts/run_blasts.py → bin/run_blasts.py
100644 → 100755
@@ -54,10 +54,10 @@ def filter(string, substr):
def blastn(query, db, culling, minid, mincov, out, threads):

# Outfmt
outfmt="6 qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen stitle"
outfmt="6 qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen"

# Run blastn
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\tstitle\" > {out}")
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\" > {out}")

if arguments['--2way']:
os.system(f"blastn -query {query} -db {db} -outfmt \"{outfmt}\" -num_threads {threads} -culling_limit {culling} -perc_identity {minid} | \
@@ -72,10 +72,10 @@ def blastn(query, db, culling, minid, mincov, out, threads):
def tblastn(query, db, culling, minid, mincov, out, threads):

# Outfmt
outfmt="6 sseqid sstart send slen qseqid qstart qend qlen evalue length pident gaps gapopen stitle"
outfmt="6 sseqid sstart send slen qseqid qstart qend qlen evalue length pident gaps gapopen"

# Run blastn
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\tstitle\" > {out}")
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\" > {out}")

if arguments['--2way']:
os.system(f"tblastn -subject {query} -query {db} -outfmt \"{outfmt}\" -num_threads {threads} -culling_limit {culling} | \
@@ -90,10 +90,10 @@ def tblastn(query, db, culling, minid, mincov, out, threads):
def blastx(query, db, culling, minid, mincov, out, threads):

# Outfmt
outfmt="6 qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen stitle"
outfmt="6 qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen"

# Run blastn
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\tstitle\" > {out}")
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\" > {out}")

if arguments['--2way']:
os.system(f"diamond blastx --query {query} --db {db} --outfmt {outfmt} --max-target-seqs {culling} \
@@ -108,10 +108,10 @@ def blastx(query, db, culling, minid, mincov, out, threads):
def blastp(query, db, culling, minid, mincov, out, threads):

# Outfmt
outfmt="6 qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen stitle"
outfmt="6 qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen"

# Run blastn
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\tstitle\" > {out}")
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\" > {out}")

if arguments['--2way']:
os.system(f"diamond blastp --query {query} --db {db} --outfmt {outfmt} --max-target-seqs {culling} \
@@ -143,11 +143,11 @@ def summary(output):
else:
cov_map=str(line["sstart"]) + '-' + str(line["send"]) + "/" + str(line["slen"])
# Parse headers
db=line["stitle"].split('~~~')[0]
gene=line["stitle"].split('~~~')[1]
acc=line["stitle"].split('~~~')[2]
prodc=line["stitle"].split('~~~')[3].split(" ", 1)[0]
desc=line["stitle"].split('~~~')[3].split(" ", 1)[1]
db=line["sseqid"].split('~~~')[0]
gene=line["sseqid"].split('~~~')[1]
acc=line["sseqid"].split('~~~')[2]
prodc=line["sseqid"].split('~~~')[4]
desc=line["sseqid"].split('~~~')[5]
# Subject coverage
cov=round((100 * (line["length"] - line["gaps"]) / line["slen"]), 2)
# Identity
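To illustrate the parsing change above (an editor's sketch, not part of the diff): the summary step now takes the database, gene, accession, product and description from the `~~~`-separated `sseqid` value instead of `stitle`. The header below is made up, but its fields line up with the indices used in the code (`prodc` from field 5, `desc` from field 6):

```bash
# Made-up sseqid following the "~~~"-separated header convention
sseqid='VFDB~~~plc1~~~VFG000001~~~[Acinetobacter_baumannii]~~~phospholipase_C~~~phospholipase C, a membrane-associated enzyme'

# Split on "~~~" (awk fields are 1-based; Python's split() indices are 0-based)
echo "$sseqid" | awk -F'~~~' '{print "db="$1, "gene="$2, "acc="$3, "prodc="$5, "desc="$6}'
```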
File renamed without changes.
Empty file modified bin/update_database_image.sh
100644 → 100755
Empty file.
2 changes: 2 additions & 0 deletions conf/small_dataset_test_profile.config
@@ -59,6 +59,8 @@ params {
*/
// Custom nucleotide fastas
custom_db = 'https://github.com/fmalmeida/test_datasets/raw/main/small_custom_db.fasta'
// Custom annotation based on NCBI protein IDs
ncbi_proteins = 'https://github.com/fmalmeida/test_datasets/raw/main/haemophilus_ncbi_proteins.txt'

}

2 changes: 2 additions & 0 deletions conf/test_profile.config
@@ -59,6 +59,8 @@ params {
*/
// Custom nucleotide fastas
custom_db = 'https://github.com/fmalmeida/test_datasets/raw/main/small_custom_db.fasta'
// Custom annotation based on NCBI protein IDs
ncbi_proteins = 'https://github.com/fmalmeida/test_datasets/raw/main/haemophilus_ncbi_proteins.txt'

}

4 changes: 2 additions & 2 deletions docker/Dockerfile_bacannot
@@ -52,7 +52,7 @@ WORKDIR /work/dbs
RUN mkdir vfdb && \
wget http://www.mgc.ac.cn/VFs/Down/VFDB_setA_nt.fas.gz && \
gzip -d VFDB_setA_nt.fas.gz && \
awk -v db=VFDB '/>/{ split($0,name," "); split($0,id," \\["); all=$0; $0=">" db "~~~" name[2] "~~~" name[1] "~~~[" id[2] " " all }1' VFDB_setA_nt.fas | \
awk -v db=VFDB '/>/{ split($0,name," "); split($0,id," \\["); all=$0; $0=">" db "~~~" name[2] "~~~" name[1] "~~~[" id[2] "~~~" all }1' VFDB_setA_nt.fas | \
sed -e 's/~>/~/g' -e 's/ ~/~/g' -e 's/]~/~/g' -e 's/ >/ /' | \
awk -F "]" ' { if ($0 ~ />/) { gsub(" ", "_", $1); print $1 "] " $2 "]"} else { print $0 }}' > vfdb/sequences && \
makeblastdb -in vfdb/sequences -title 'vfdb' -dbtype nucl -logfile /dev/null && \
@@ -61,7 +61,7 @@ RUN mkdir vfdb && \
## ICEberg nt (ICEs)
RUN mkdir iceberg && \
wget https://bioinfo-mml.sjtu.edu.cn/ICEberg2/download/ICE_seq_experimental.fas && \
awk -v db=ICEberg '/>/{ split($0,a,"|"); all=$0; $0=">" db "~~~" "ICE_" a[2] "~~~" a[5] "~~~" a[3] " " all }1' ICE_seq_experimental.fas | \
awk -v db=ICEberg '/>/{ split($0,a,"|"); all=$0; $0=">" db "~~~" "ICE_" a[2] "~~~" a[5] "~~~" a[3] "~~~" all }1' ICE_seq_experimental.fas | \
sed -e 's/ >/ /g' > iceberg/sequences && \
rm ICE_seq_experimental.fas && \
makeblastdb -in iceberg/sequences -title 'iceberg' -dbtype nucl -logfile /dev/null
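The awk change above only swaps the space before the appended original FASTA title for another `~~~`, so the original title becomes a field of its own. A made-up before/after illustration (not a real VFDB/ICEberg header):

```bash
# Before: the original title was appended after a space
echo '>ICEberg~~~ICE_123~~~ABC12345~~~Tn916 >original FASTA title here'

# After: the original title is carried as an extra "~~~"-separated field
echo '>ICEberg~~~ICE_123~~~ABC12345~~~Tn916~~~>original FASTA title here'
```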
7 changes: 0 additions & 7 deletions docker/Dockerfile_main_tools
@@ -59,10 +59,3 @@ RUN conda create -y -n digIS -c bioconda 'hmmer==3.1b2' 'biopython==1.77' nomkl
### my custom scripts ###
#########################
COPY scripts/bscripts /work/bscripts
COPY scripts/pscripts/splitgenbank.py /usr/local/bin/splitgenbank.py
COPY scripts/pscripts/resfinder2gff.py /usr/local/bin/resfinder2gff.py
COPY scripts/pscripts/run_blasts.py /usr/local/bin/run_blasts.py
RUN chmod a+x /usr/local/bin/run_blasts.py && \
chmod a+rwx /work/bscripts/* && \
chmod a+x /usr/local/bin/splitgenbank.py && \
chmod a+x /usr/local/bin/resfinder2gff.py
12 changes: 6 additions & 6 deletions docker/reports/report_custom_blast.Rmd
@@ -77,7 +77,7 @@ dt_opt_lst <- list(pageLength = 5,

## About

This report is based on the user's custom database input called `r params$blast_db`. This database have been blasted against the query genome via BLASTn. The BLASTn raw results are shown in the table \@ref(tab:blastn-raw-table). Additionally, the BLAST results have been used to detect intersection points with the annotation using the software bedtools. Therefore, in table \@ref(tab:blastn-gff-table), we show the gene features from the query genome that have intersection points with the BLAST results.
This report is based on the user's custom database input called `r params$blast_db`. This database has been blasted against the query genome via BLASTn or tBLASTn, depending on the query input type. The raw BLAST results are shown in table \@ref(tab:blast-raw-table). Additionally, the BLAST results have been intersected with the genome annotation using the software bedtools; therefore, table \@ref(tab:blast-gff-table) shows the gene features from the query genome that intersect the BLAST results.

> Take note that DT tables will only be rendered when input is available. Thus, whenever a table is missing it is because no results have been found.

@@ -90,12 +90,12 @@ All the predictions were passed through a user defined threshold for minimum cov

## Results

### BLASTn results
### Raw BLAST results

The BLASTn results of the custom database against the query genome is shown in the table \@ref(tab:blastn-raw-table).
The BLAST results of the custom database against the query genome are shown in table \@ref(tab:blast-raw-table).

<br>
<caption>(#tab:blastn-raw-table) Alignment of the `r params$blast_db` custom database against the query genome via BLASTn</caption>
<caption>(#tab:blast-raw-table) Alignment of the `r params$blast_db` custom database against the query genome.</caption>
```{r}
blast <- custom_blast
# Render dt
@@ -109,10 +109,10 @@ datatable(blast,

### Annotation intersection

Using the software bedtools intersect, the BLASTn results have been used to search for intersections against the query gene annotation. This result is shown in table \@ref(tab:blastn-gff-table).
Using the software bedtools intersect, the BLAST results have been checked for intersections with the query gene annotation. The result is shown in table \@ref(tab:blast-gff-table).

<br>
<caption>(#tab:blastn-gff-table) Annotation intersection of the `r params$blast_db` custom database BLASTn results</caption>
<caption>(#tab:blast-gff-table) Annotation intersection of the `r params$blast_db` custom database BLAST results with the pipeline's core annotation.</caption>
```{r}
blast <- blast_gff
colnames(blast) <-
29 changes: 22 additions & 7 deletions docs/custom-db.rst
@@ -3,17 +3,32 @@
Custom database configuration
=============================

It is also possible that users use custom databases for the annotation of genomes. Currently, the pipeline only executes BLASTn alignments using the genome as query.
Therefore, the given databases must be files of **target gene sequences in nucleotide** FASTA.
It is also possible for users to employ custom databases for genome annotation, either by providing custom nucleotide FASTAs, properly formatted with **target gene sequences in nucleotide** to be searched against the genome with BLASTn, or by providing a file containing a list of NCBI Protein database accessions, from which the pipeline will retrieve the sequences and annotate the genome against them using BLASTp.

The custom annotation is triggered with the ``--custom_db`` parameter. The pipeline accepts more than one custom database at once, separated by commas, e.g.
NCBI Protein database
---------------------

To perform an additional annotation of the genome using proteins from the NCBI Protein database, one must use the parameter ``--ncbi_proteins`` and provide a file containing a list of protein IDs such as this one:

.. code-block:: bash

WP_118891437.1
VTX70803.1
VTX49335.1
WP_005693332.1
WP_138172127.1

Custom nucleotide database (in FASTA)
-------------------------------------

This custom annotation is triggered with the ``--custom_db`` parameter. The pipeline accepts more than one custom database at once, separated by commas, e.g.
``--custom_db db1.fasta,db2.fasta``.

Although simple, the custom database must follow some rules about the sequence header format so that the pipeline can summarize the alignments and render the custom HTML reports, which are made available under the ``report_files`` directory.

Sequence header format
----------------------
""""""""""""""""""""""

Sequence headers must follow a 5-field rule separated by "~~~" and spaces. The first 4 fields must be separated by "~~~" and the last one by one space, following the
example shown below:
@@ -34,10 +49,10 @@

It is very important to follow this header format so that summaries and reports of the BLAST results can be rendered properly, such as the example below:

BLASTn summary example
----------------------
Summary example for custom database annotations
-----------------------------------------------

When the header is followed, the summaries and reports are very well rendered such as in this example:
The custom annotations, triggered either with ``--custom_db`` or with ``--ncbi_proteins``, will produce useful summaries and reports (:ref:`outputs`) such as in this example:

.. code-block:: bash

21 changes: 14 additions & 7 deletions docs/manual.rst
@@ -195,8 +195,10 @@ On/Off processes
- | Tells whether or not to run antiSMASH (secondary metabolite) annotation.
| AntiSMASH is executed using only its core annotation modules in order to keep it fast

Custom nucl databases
"""""""""""""""""""""
Custom databases
""""""""""""""""

Parameters for additional annotation with custom databases, using either nucleotide sequences provided by the user or protein sequences from NCBI (given a list of accessions). To learn more about how to format a custom nucleotide database or how to use sequences from the NCBI Protein database, please refer to :ref:`custom-db`.

.. list-table::
:widths: 20 10 20 30
@@ -210,7 +212,12 @@
* - ``--custom_db``
- N
- NA
- Custom gene nucleotide databases to be used for additional annotations against the genome. See :ref:`custom-db`.
- Path to one or more nucleotide FASTA files containing custom databases to be used for additional annotations against the genome. Multiple FASTAs can be provided separated by commas, e.g. ``db1.fasta,db2.fasta``.

* - ``--ncbi_proteins``
- N
- NA
- File containing a list of accessions from the `NCBI Protein database <https://www.ncbi.nlm.nih.gov/protein/>`_. The selected proteins will be downloaded and the input genomes will be scanned for these sequences using ``blastp``.

Annotation thresholds
"""""""""""""""""""""
@@ -254,25 +261,25 @@
- 60
- Coverage (%) threshold to be used when detecting plasmids with Plasmidfinder

* - ``--blast_MGEs_minid``
* - ``--blast_mge_minid``
- N
- 85
- Identity (%) threshold to be used when annotating prophages and mobile elements from PHAST and ICEberg databases

* - ``--blast_MGEs_mincov``
* - ``--blast_mge_mincov``
- N
- 85
- Coverage (%) threshold to be used when annotating prophages and mobile elements from PHAST and ICEberg databases

* - ``--blast_custom_minid``
- N
- 0
- Identity (%) threshold to be used when annotating with user's custom databases
- Identity (%) threshold to be used when annotating with custom databases (user nucleotide FASTAs or NCBI Protein)

* - ``--blast_custom_mincov``
- N
- 0
- Coverage (%) threshold to be used when annotating with custom databases (user nucleotide FASTAs or NCBI Protein)
- Coverage (%) threshold to be used when annotating with custom databases (User's Nucl or NCBI Protein)

Merge distance
""""""""""""""
10 changes: 6 additions & 4 deletions main.nf
@@ -10,7 +10,7 @@ import org.yaml.snakeyaml.Yaml
* Include functions
*/
include { helpMessage } from './nf_functions/help.nf'
include { logMessage } from './nf_functions/log.nf'
include { logMessage } from './nf_functions/log.nf'
include { paramsCheck } from './nf_functions/paramsCheck.nf'


@@ -70,6 +70,7 @@ params.prokka_kingdom = ''
params.prokka_genetic_code = false
params.prokka_use_rnammer = false
// User custom db
params.ncbi_proteins = ''
params.custom_db = ''
params.blast_custom_minid = 0
params.blast_custom_mincov = 0
@@ -82,8 +83,8 @@ params.blast_virulence_minid = 90
params.blast_virulence_mincov = 80
params.blast_resistance_minid = 90
params.blast_resistance_mincov = 80
params.blast_MGEs_minid = 65
params.blast_MGEs_mincov = 65
params.blast_mge_minid = 65
params.blast_mge_mincov = 65
// Workflow parameters
params.skip_plasmid_search = false
params.skip_virulence_search = false
@@ -132,7 +133,8 @@ workflow {
// Run annotation
BACANNOT(
parse_samplesheet.out,
(params.custom_db) ? Channel.fromPath( params.custom_db.split(',').collect{ it } ) : Channel.empty()
(params.custom_db) ? Channel.fromPath( params.custom_db.split(',').collect{ it } ) : Channel.empty(),
(params.ncbi_proteins) ? Channel.fromPath( params.ncbi_proteins ) : Channel.empty()
)

} else {
4 changes: 2 additions & 2 deletions modules/MGEs/iceberg.nf
@@ -23,8 +23,8 @@ process ICEBERG {
## Checking ICE genes

## With predicted gene sequences
run_blasts.py blastp --query $genes_aa --db /work/dbs/iceberg/diamond.dmnd --minid ${params.blast_MGEs_minid} \
--mincov ${params.blast_MGEs_mincov} --threads ${params.threads} --out ${prefix}_iceberg_blastp_onGenes.txt --2way | \
run_blasts.py blastp --query $genes_aa --db /work/dbs/iceberg/diamond.dmnd --minid ${params.blast_mge_minid} \
--mincov ${params.blast_mge_mincov} --threads ${params.threads} --out ${prefix}_iceberg_blastp_onGenes.txt --2way | \
sed -e 's/GENE/ICEBERG_ID/g' > ${prefix}_iceberg_blastp_onGenes.summary.txt ;

## Checking for full-length ICEs
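For reference, with the rename above the MGE thresholds are exposed as `--blast_mge_minid` and `--blast_mge_mincov`; an illustrative override on the command line (the rest of the invocation is omitted and depends on your setup):

```bash
# Illustrative only: override the renamed MGE annotation thresholds at runtime
nextflow run fmalmeida/bacannot \
  --blast_mge_minid 85 \
  --blast_mge_mincov 85
```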
11 changes: 9 additions & 2 deletions modules/assembly/flye.nf
@@ -23,9 +23,16 @@ process FLYE {
flye -v > flye_version.txt ;

# Run flye
flye ${lr} $lreads --plasmids --out-dir flye_${prefix} --threads ${params.threads} &> flye.log ;
flye \\
${lr} \\
$lreads \\
--out-dir flye_${prefix} \\
--threads ${params.threads} \\
&> flye.log ;

# Save a copy for annotation
cp flye_${prefix}/assembly.fasta flye_${prefix}.fasta
cp \\
flye_${prefix}/assembly.fasta \\
flye_${prefix}.fasta
"""
}