addition of ncbi_protein annotation module #49

Closed · wants to merge 12 commits
54 changes: 54 additions & 0 deletions bin/gbk2faa.py
@@ -0,0 +1,54 @@
#!/usr/bin/env python

# import required libraries
from Bio import SeqIO
import sys

# get filename from command line
filename = sys.argv[-1]
gbk_filename = filename

# open file connection
input_handle = open(gbk_filename, "r")

# define a helper for printing to stderr
def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)

for seq_record in SeqIO.parse(input_handle, "genbank"):

    # start all blank to check for errors later
    organism = None
    product = None
    gene = None

    for seq_feature in seq_record.features:

        # get organism
        if seq_feature.type.lower() == "source":
            organism = seq_feature.qualifiers['organism'][0]

        # get product (leave as None if the qualifier is absent, so it is reported below)
        if seq_feature.type.lower() == "protein":
            product = seq_feature.qualifiers.get('product', [None])[0]
            if product is not None:
                product = product.replace(" ", "_")

        # get gene name if entry has only a CDS definition
        if seq_feature.type.lower() == "cds":
            gene = seq_feature.qualifiers.get('gene', [gene])[0]

        # get gene name if entry has only a Gene definition
        if seq_feature.type.lower() == "gene":
            gene = seq_feature.qualifiers.get('gene', [gene])[0]

    # save sequence info
    seq = seq_record.seq
    acc = seq_record.name

    # print
    if gene is None or product is None:
        eprint(f"An error has been found with entry {acc}. Either its gene (value found: {gene}) or product (value found: {product}) was not found in the GenBank record.\nPlease make sure this entry comes from the NCBI Protein db and has its Gene/Protein information properly formatted.")
    else:
        print(f">NCBI_PROTEIN~~~{gene}~~~{acc}~~~{product}~~~[{organism}]\n{seq}")

# close file connection
input_handle.close()
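For context, a minimal usage sketch of the new script (not part of the diff; file names are hypothetical, and the header layout is the one printed by the script above):

```bash
# Hypothetical example: convert a GenBank file of NCBI Protein entries into the
# "~~~"-separated FASTA format expected by the pipeline. Entries with missing
# gene/product information are reported on stderr.
python bin/gbk2faa.py ncbi_proteins.gbk > ncbi_proteins.faa 2> gbk2faa_errors.log

# Each valid record becomes a header of the form:
# >NCBI_PROTEIN~~~<gene>~~~<accession>~~~<product>~~~[<organism>]
head -n 2 ncbi_proteins.faa
```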
File renamed without changes.
26 changes: 13 additions & 13 deletions docker/scripts/pscripts/run_blasts.py → bin/run_blasts.py
100644 → 100755
@@ -54,10 +54,10 @@ def filter(string, substr):
def blastn(query, db, culling, minid, mincov, out, threads):

# Outfmt
outfmt="6 qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen stitle"
outfmt="6 qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen"

# Run blastn
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\tstitle\" > {out}")
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\" > {out}")

if arguments['--2way']:
os.system(f"blastn -query {query} -db {db} -outfmt \"{outfmt}\" -num_threads {threads} -culling_limit {culling} -perc_identity {minid} | \
@@ -72,10 +72,10 @@ def blastn(query, db, culling, minid, mincov, out, threads):
def tblastn(query, db, culling, minid, mincov, out, threads):

# Outfmt
outfmt="6 sseqid sstart send slen qseqid qstart qend qlen evalue length pident gaps gapopen stitle"
outfmt="6 sseqid sstart send slen qseqid qstart qend qlen evalue length pident gaps gapopen"

# Run blastn
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\tstitle\" > {out}")
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\" > {out}")

if arguments['--2way']:
os.system(f"tblastn -subject {query} -query {db} -outfmt \"{outfmt}\" -num_threads {threads} -culling_limit {culling} | \
@@ -90,10 +90,10 @@ def tblastn(query, db, culling, minid, mincov, out, threads):
def blastx(query, db, culling, minid, mincov, out, threads):

# Outfmt
outfmt="6 qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen stitle"
outfmt="6 qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen"

# Run blastn
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\tstitle\" > {out}")
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\" > {out}")

if arguments['--2way']:
os.system(f"diamond blastx --query {query} --db {db} --outfmt {outfmt} --max-target-seqs {culling} \
@@ -108,10 +108,10 @@ def blastx(query, db, culling, minid, mincov, out, threads):
def blastp(query, db, culling, minid, mincov, out, threads):

# Outfmt
outfmt="6 qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen stitle"
outfmt="6 qseqid qstart qend qlen sseqid sstart send slen evalue length pident gaps gapopen"

# Run blastn
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\tstitle\" > {out}")
os.system(f"echo \"qseqid\tqstart\tqend\tqlen\tsseqid\tsstart\tsend\tslen\tevalue\tlength\tpident\tgaps\tgapopen\" > {out}")

if arguments['--2way']:
os.system(f"diamond blastp --query {query} --db {db} --outfmt {outfmt} --max-target-seqs {culling} \
@@ -143,11 +143,11 @@ def summary(output):
else:
cov_map=str(line["sstart"]) + '-' + str(line["send"]) + "/" + str(line["slen"])
# Parse headers
db=line["stitle"].split('~~~')[0]
gene=line["stitle"].split('~~~')[1]
acc=line["stitle"].split('~~~')[2]
prodc=line["stitle"].split('~~~')[3].split(" ", 1)[0]
desc=line["stitle"].split('~~~')[3].split(" ", 1)[1]
db=line["sseqid"].split('~~~')[0]
gene=line["sseqid"].split('~~~')[1]
acc=line["sseqid"].split('~~~')[2]
prodc=line["sseqid"].split('~~~')[4]
desc=line["sseqid"].split('~~~')[5]
# Subject coverage
cov=round((100 * (line["length"] - line["gaps"]) / line["slen"]), 2)
# Identity
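To illustrate the parsing change above (an editor's sketch, not part of the diff): the summary step now takes the database, gene, accession, product and description from the `~~~`-separated `sseqid` value instead of `stitle`. The header below is made up, but its fields line up with the indices used in the code (`prodc` from field 5, `desc` from field 6):

```bash
# Made-up sseqid following the "~~~"-separated header convention
sseqid='VFDB~~~plc1~~~VFG000001~~~[Acinetobacter_baumannii]~~~phospholipase_C~~~phospholipase C, a membrane-associated enzyme'

# Split on "~~~" (awk fields are 1-based; Python's split() indices are 0-based)
echo "$sseqid" | awk -F'~~~' '{print "db="$1, "gene="$2, "acc="$3, "prodc="$5, "desc="$6}'
```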
File renamed without changes.
Empty file modified bin/update_database_image.sh
100644 → 100755
Empty file.
2 changes: 2 additions & 0 deletions conf/small_dataset_test_profile.config
@@ -59,6 +59,8 @@ params {
*/
// Custom nucleotide fastas
custom_db = 'https://github.com/fmalmeida/test_datasets/raw/main/small_custom_db.fasta'
// Custom annotation based on NCBI protein IDs
ncbi_proteins = 'https://github.com/fmalmeida/test_datasets/raw/main/haemophilus_ncbi_proteins.txt'

}

2 changes: 2 additions & 0 deletions conf/test_profile.config
@@ -59,6 +59,8 @@ params {
*/
// Custom nucleotide fastas
custom_db = 'https://github.com/fmalmeida/test_datasets/raw/main/small_custom_db.fasta'
// Custom annotation based on NCBI protein IDs
ncbi_proteins = 'https://github.com/fmalmeida/test_datasets/raw/main/haemophilus_ncbi_proteins.txt'

}

4 changes: 2 additions & 2 deletions docker/Dockerfile_bacannot
@@ -52,7 +52,7 @@ WORKDIR /work/dbs
RUN mkdir vfdb && \
wget http://www.mgc.ac.cn/VFs/Down/VFDB_setA_nt.fas.gz && \
gzip -d VFDB_setA_nt.fas.gz && \
awk -v db=VFDB '/>/{ split($0,name," "); split($0,id," \\["); all=$0; $0=">" db "~~~" name[2] "~~~" name[1] "~~~[" id[2] " " all }1' VFDB_setA_nt.fas | \
awk -v db=VFDB '/>/{ split($0,name," "); split($0,id," \\["); all=$0; $0=">" db "~~~" name[2] "~~~" name[1] "~~~[" id[2] "~~~" all }1' VFDB_setA_nt.fas | \
sed -e 's/~>/~/g' -e 's/ ~/~/g' -e 's/]~/~/g' -e 's/ >/ /' | \
awk -F "]" ' { if ($0 ~ />/) { gsub(" ", "_", $1); print $1 "] " $2 "]"} else { print $0 }}' > vfdb/sequences && \
makeblastdb -in vfdb/sequences -title 'vfdb' -dbtype nucl -logfile /dev/null && \
@@ -61,7 +61,7 @@ RUN mkdir vfdb && \
## ICEberg nt (ICEs)
RUN mkdir iceberg && \
wget https://bioinfo-mml.sjtu.edu.cn/ICEberg2/download/ICE_seq_experimental.fas && \
awk -v db=ICEberg '/>/{ split($0,a,"|"); all=$0; $0=">" db "~~~" "ICE_" a[2] "~~~" a[5] "~~~" a[3] " " all }1' ICE_seq_experimental.fas | \
awk -v db=ICEberg '/>/{ split($0,a,"|"); all=$0; $0=">" db "~~~" "ICE_" a[2] "~~~" a[5] "~~~" a[3] "~~~" all }1' ICE_seq_experimental.fas | \
sed -e 's/ >/ /g' > iceberg/sequences && \
rm ICE_seq_experimental.fas && \
makeblastdb -in iceberg/sequences -title 'iceberg' -dbtype nucl -logfile /dev/null
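The awk change above only swaps the space before the appended original FASTA title for another `~~~`, so the original title becomes a field of its own. A made-up before/after illustration (not a real VFDB/ICEberg header):

```bash
# Before: the original title was appended after a space
echo '>ICEberg~~~ICE_123~~~ABC12345~~~Tn916 >original FASTA title here'

# After: the original title is carried as an extra "~~~"-separated field
echo '>ICEberg~~~ICE_123~~~ABC12345~~~Tn916~~~>original FASTA title here'
```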
7 changes: 0 additions & 7 deletions docker/Dockerfile_main_tools
@@ -59,10 +59,3 @@ RUN conda create -y -n digIS -c bioconda 'hmmer==3.1b2' 'biopython==1.77' nomkl
### my custom scripts ###
#########################
COPY scripts/bscripts /work/bscripts
COPY scripts/pscripts/splitgenbank.py /usr/local/bin/splitgenbank.py
COPY scripts/pscripts/resfinder2gff.py /usr/local/bin/resfinder2gff.py
COPY scripts/pscripts/run_blasts.py /usr/local/bin/run_blasts.py
RUN chmod a+x /usr/local/bin/run_blasts.py && \
chmod a+rwx /work/bscripts/* && \
chmod a+x /usr/local/bin/splitgenbank.py && \
chmod a+x /usr/local/bin/resfinder2gff.py
12 changes: 6 additions & 6 deletions docker/reports/report_custom_blast.Rmd
@@ -77,7 +77,7 @@ dt_opt_lst <- list(pageLength = 5,

## About

This report is based on the user's custom database input called `r params$blast_db`. This database have been blasted against the query genome via BLASTn. The BLASTn raw results are shown in the table \@ref(tab:blastn-raw-table). Additionally, the BLAST results have been used to detect intersection points with the annotation using the software bedtools. Therefore, in table \@ref(tab:blastn-gff-table), we show the gene features from the query genome that have intersection points with the BLAST results.
This report is based on the user's custom database input called `r params$blast_db`. This database has been blasted against the query genome via BLASTn or tBLASTn, depending on the query input type. The raw BLAST results are shown in table \@ref(tab:blast-raw-table). Additionally, the BLAST results have been intersected with the genome annotation using the software bedtools; therefore, table \@ref(tab:blast-gff-table) shows the gene features from the query genome that intersect the BLAST results.

> Take note that DT tables will only be rendered when input is available. Thus, whenever a table is missing it is because no results have been found.

@@ -90,12 +90,12 @@ All the predictions were passed through a user defined threshold for minimum cov

## Results

### BLASTn results
### Raw BLAST results

The BLASTn results of the custom database against the query genome is shown in the table \@ref(tab:blastn-raw-table).
The BLAST results of the custom database against the query genome are shown in table \@ref(tab:blast-raw-table).

<br>
<caption>(#tab:blastn-raw-table) Alignment of the `r params$blast_db` custom database against the query genome via BLASTn</caption>
<caption>(#tab:blast-raw-table) Alignment of the `r params$blast_db` custom database against the query genome.</caption>
```{r}
blast <- custom_blast
# Render dt
@@ -109,10 +109,10 @@ datatable(blast,

### Annotation intersection

Using the software bedtools intersect, the BLASTn results have been used to search for intersections against the query gene annotation. This result is shown in table \@ref(tab:blastn-gff-table).
Using the software bedtools intersect, the BLAST results have been checked for intersections with the query gene annotation. The result is shown in table \@ref(tab:blast-gff-table).

<br>
<caption>(#tab:blastn-gff-table) Annotation intersection of the `r params$blast_db` custom database BLASTn results</caption>
<caption>(#tab:blast-gff-table) Annotation intersection of the `r params$blast_db` custom database BLAST results with the pipeline's core annotation.</caption>
```{r}
blast <- blast_gff
colnames(blast) <-
29 changes: 22 additions & 7 deletions docs/custom-db.rst
@@ -3,17 +3,32 @@
Custom database configuration
=============================

It is also possible that users use custom databases for the annotation of genomes. Currently, the pipeline only executes BLASTn alignments using the genome as query.
Therefore, the given databases must be files of **target gene sequences in nucleotide** FASTA.
It is also possible for users to employ custom databases for genome annotation, either by providing custom nucleotide FASTAs, properly formatted with **target gene sequences in nucleotide** to be searched against the genome with BLASTn, or by providing a file containing a list of NCBI Protein database accessions, from which the pipeline will retrieve the sequences and annotate the genome against them using BLASTp.

The custom annotation is triggered with the ``--custom_db`` parameter. The pipeline accepts more than one custom database at once, separated by commas, e.g.
NCBI Protein database
---------------------

To perform an additional annotation of the genome using proteins from the NCBI Protein database, one must use the parameter ``--ncbi_proteins`` and provide a file containing a list of protein IDs such as this one:

.. code-block:: bash

WP_118891437.1
VTX70803.1
VTX49335.1
WP_005693332.1
WP_138172127.1

Custom nucleotide database (in FASTA)
-------------------------------------

This custom annotation is triggered with the ``--custom_db`` parameter. The pipeline accepts more than one custom database at once, separated by commas, e.g.
``--custom_db db1.fasta,db2.fasta``.

Although simple, the custom database must follow some rules about the sequence header format so that the pipeline can summarize the alignments and render the custom HTML reports, which are made available under the ``report_files`` directory.

Sequence header format
----------------------
""""""""""""""""""""""

Sequence headers must follow a 5-field rule separated by "~~~" and spaces. The first 4 fields must be separated by "~~~" and the last one by one space, following the
example shown below:
@@ -34,10 +49,10 @@

It is very important to follow this header format so that summaries and reports of the BLAST results can be rendered properly, such as the example below:

BLASTn summary example
----------------------
Summary example for custom database annotations
-----------------------------------------------

When the header is followed, the summaries and reports are very well rendered such as in this example:
The custom annotations, triggered either with ``--custom_db`` or with ``--ncbi_proteins``, will produce useful summaries and reports (:ref:`outputs`) such as in this example:

.. code-block:: bash

21 changes: 14 additions & 7 deletions docs/manual.rst
@@ -195,8 +195,10 @@ On/Off processes
- | Tells whether or not to run antiSMASH (secondary metabolite) annotation.
| AntiSMASH is executed using only its core annotation modules in order to keep it fast

Custom nucl databases
"""""""""""""""""""""
Custom databases
""""""""""""""""

Parameters for additional annotation with custom databases, using either nucleotide sequences provided by the user or protein sequences from NCBI (given a list of accessions). To learn more about how to format a custom nucleotide database or how to use sequences from the NCBI Protein database, please refer to :ref:`custom-db`.

.. list-table::
:widths: 20 10 20 30
@@ -210,7 +212,12 @@
* - ``--custom_db``
- N
- NA
- Custom gene nucleotide databases to be used for additional annotations against the genome. See :ref:`custom-db`.
- Path to one or more nucleotide FASTA files containing custom databases to be used for additional annotations against the genome. Multiple FASTAs can be provided separated by commas, e.g. ``db1.fasta,db2.fasta``.

* - ``--ncbi_proteins``
- N
- NA
- File containing a list of accessions from the `NCBI Protein database <https://www.ncbi.nlm.nih.gov/protein/>`_. The selected proteins will be downloaded and the input genomes will be scanned for these sequences using ``blastp``.

Annotation thresholds
"""""""""""""""""""""
@@ -254,25 +261,25 @@
- 60
- Coverage (%) threshold to be used when detecting plasmids with Plasmidfinder

* - ``--blast_MGEs_minid``
* - ``--blast_mge_minid``
- N
- 85
- Identity (%) threshold to be used when annotating prophages and mobile elements from PHAST and ICEberg databases

* - ``--blast_MGEs_mincov``
* - ``--blast_mge_mincov``
- N
- 85
- Coverage (%) threshold to be used when annotating prophages and mobile elements from PHAST and ICEberg databases

* - ``--blast_custom_minid``
- N
- 0
- Identity (%) threshold to be used when annotating with user's custom databases
- Identity (%) threshold to be used when annotating with custom databases (user nucleotide FASTAs or NCBI Protein)

* - ``--blast_custom_mincov``
- N
- 0
- Coverage (%) threshold to be used when annotating with custom databases (user nucleotide FASTAs or NCBI Protein)
- Coverage (%) threshold to be used when annotating with custom databases (User's Nucl or NCBI Protein)

Merge distance
""""""""""""""
10 changes: 6 additions & 4 deletions main.nf
@@ -10,7 +10,7 @@ import org.yaml.snakeyaml.Yaml
* Include functions
*/
include { helpMessage } from './nf_functions/help.nf'
include { logMessage } from './nf_functions/log.nf'
include { logMessage } from './nf_functions/log.nf'
include { paramsCheck } from './nf_functions/paramsCheck.nf'


@@ -70,6 +70,7 @@ params.prokka_kingdom = ''
params.prokka_genetic_code = false
params.prokka_use_rnammer = false
// User custom db
params.ncbi_proteins = ''
params.custom_db = ''
params.blast_custom_minid = 0
params.blast_custom_mincov = 0
@@ -82,8 +83,8 @@ params.blast_virulence_minid = 90
params.blast_virulence_mincov = 80
params.blast_resistance_minid = 90
params.blast_resistance_mincov = 80
params.blast_MGEs_minid = 65
params.blast_MGEs_mincov = 65
params.blast_mge_minid = 65
params.blast_mge_mincov = 65
// Workflow parameters
params.skip_plasmid_search = false
params.skip_virulence_search = false
@@ -132,7 +133,8 @@ workflow {
// Run annotation
BACANNOT(
parse_samplesheet.out,
(params.custom_db) ? Channel.fromPath( params.custom_db.split(',').collect{ it } ) : Channel.empty()
(params.custom_db) ? Channel.fromPath( params.custom_db.split(',').collect{ it } ) : Channel.empty(),
(params.ncbi_proteins) ? Channel.fromPath( params.ncbi_proteins ) : Channel.empty()
)

} else {
4 changes: 2 additions & 2 deletions modules/MGEs/iceberg.nf
@@ -23,8 +23,8 @@ process ICEBERG {
## Checking ICE genes

## With predicted gene sequences
run_blasts.py blastp --query $genes_aa --db /work/dbs/iceberg/diamond.dmnd --minid ${params.blast_MGEs_minid} \
--mincov ${params.blast_MGEs_mincov} --threads ${params.threads} --out ${prefix}_iceberg_blastp_onGenes.txt --2way | \
run_blasts.py blastp --query $genes_aa --db /work/dbs/iceberg/diamond.dmnd --minid ${params.blast_mge_minid} \
--mincov ${params.blast_mge_mincov} --threads ${params.threads} --out ${prefix}_iceberg_blastp_onGenes.txt --2way | \
sed -e 's/GENE/ICEBERG_ID/g' > ${prefix}_iceberg_blastp_onGenes.summary.txt ;

## Checking for full-length ICEs
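For reference, with the rename above the MGE thresholds are exposed as `--blast_mge_minid` and `--blast_mge_mincov`; an illustrative override on the command line (the rest of the invocation is omitted and depends on your setup):

```bash
# Illustrative only: override the renamed MGE annotation thresholds at runtime
nextflow run fmalmeida/bacannot \
  --blast_mge_minid 85 \
  --blast_mge_mincov 85
```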
11 changes: 9 additions & 2 deletions modules/assembly/flye.nf
@@ -23,9 +23,16 @@ process FLYE {
flye -v > flye_version.txt ;

# Run flye
flye ${lr} $lreads --plasmids --out-dir flye_${prefix} --threads ${params.threads} &> flye.log ;
flye \\
${lr} \\
$lreads \\
--out-dir flye_${prefix} \\
--threads ${params.threads} \\
&> flye.log ;

# Save a copy for annotation
cp flye_${prefix}/assembly.fasta flye_${prefix}.fasta
cp \\
flye_${prefix}/assembly.fasta \\
flye_${prefix}.fasta
"""
}