-
Notifications
You must be signed in to change notification settings - Fork 92
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #137 from davebx/hisat
Add HISAT wrapper and data manager.
- Loading branch information
Showing
21 changed files
with
4,896 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
categories: | ||
- Data Managers | ||
description: HISAT is a fast and sensitive spliced alignment program. | ||
long_description: | | ||
As part of HISAT, we have developed a new indexing scheme based on | ||
the Burrows-Wheeler transform (BWT) and the FM index, called | ||
hierarchical indexing, that employs two types of indexes: (1) | ||
one global FM index representing the whole genome, and (2) many | ||
separate local FM indexes for small regions collectively covering | ||
the genome. Our hierarchical index for the human genome (about 3 | ||
billion bp) includes ~48,000 local FM indexes, each representing a | ||
genomic region of ~64,000bp. As the basis for non-gapped alignment, | ||
the FM index is extremely fast with a low memory footprint, as | ||
demonstrated by Bowtie. In addition, HISAT provides several alignment | ||
strategies specifically designed for mapping different types of | ||
RNA-seq reads. All these together, HISAT enables extremely fast and | ||
sensitive alignment of reads, in particular those spanning two exons | ||
or more. As a result, HISAT is much faster (>50 times) than TopHat2 | ||
with better alignment quality. Although it uses a large number of | ||
indexes, the memory requirement of HISAT is still modest, | ||
approximately 4.3 GB for human. HISAT uses the Bowtie2 | ||
implementation to handle most of the operations on the FM index. | ||
In addition to spliced alignment, HISAT handles reads involving | ||
indels and supports a paired-end alignment mode. Multiple processors | ||
can be used simultaneously to achieve greater alignment speed. | ||
HISAT outputs alignments in SAM format, enabling interoperation | ||
with a large number of other tools (e.g. SAMtools, GATK) that use | ||
SAM. HISAT is distributed under the GPLv3 license, and it runs on | ||
the command line under Linux, Mac OS X and Windows. | ||
http://ccb.jhu.edu/software/hisat/index.shtml | ||
name: data_manager_hisat_index_builder | ||
owner: devteam | ||
remote_repository_url: https://github.com/galaxyproject/tools-devteam/tree/master/data_managers/data_manager_hisat_index_builder | ||
type: unrestricted |
76 changes: 76 additions & 0 deletions
76
data_managers/data_manager_hisat_index_builder/data_manager/hisat_index_builder.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#!/usr/bin/env python | ||
# Based heavily on the Bowtie 2 data manager wrapper script by Dan Blankenberg | ||
|
||
import sys | ||
import os | ||
import optparse | ||
import subprocess | ||
|
||
from json import loads, dumps | ||
|
||
|
||
# Data table that receives the new entry when no --data_table_name option is given.
DEFAULT_DATA_TABLE_NAME = "hisat_indexes" | ||
|
||
def get_id_name( params, dbkey, fasta_description=None):
    """Choose the identifier and display name for the index being built.

    ``sequence_id`` and ``sequence_name`` are read from
    ``params['param_dict']``. An empty id falls back to ``dbkey``; an empty
    name falls back to ``fasta_description`` and, if that is empty too, to
    ``dbkey``.

    Returns a ``(sequence_id, sequence_name)`` tuple.
    """
    #TODO: ensure sequence_id is unique and does not already appear in location file
    param_dict = params['param_dict']
    chosen_id = param_dict['sequence_id'] or dbkey
    chosen_name = param_dict['sequence_name'] or fasta_description or dbkey
    return chosen_id, chosen_name
|
||
def build_hisat_index( data_manager_dict, fasta_filename, params, target_directory, dbkey, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME ):
    """Run ``hisat-build`` on a FASTA file and record the resulting index.

    The FASTA file is symlinked into ``target_directory`` and
    ``hisat-build <symlinked fasta> <sequence_id>`` is executed with
    ``target_directory`` as the working directory. On success a data table
    entry (value, dbkey, name, path) is appended to ``data_manager_dict``
    under ``data_table_name``; ``path`` is the bare ``sequence_id``.

    ``params`` is accepted for signature compatibility but is not used here.

    Exits the process with the return code of ``hisat-build`` if it is
    non-zero.
    """
    #TODO: allow multiple FASTA input files
    fasta_base_name = os.path.basename( fasta_filename )
    sym_linked_fasta_filename = os.path.join( target_directory, fasta_base_name )
    os.symlink( fasta_filename, sym_linked_fasta_filename )
    args = [ 'hisat-build', sym_linked_fasta_filename, sequence_id ]
    proc = subprocess.Popen( args=args, shell=False, cwd=target_directory )
    return_code = proc.wait()
    if return_code:
        # sys.stderr.write works on both Python 2 and 3; the former
        # "print >> sys.stderr" form raises TypeError on Python 3.
        sys.stderr.write( "Error building index.\n" )
        sys.exit( return_code )
    data_table_entry = dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=sequence_id )
    _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry )
|
||
def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ): | ||
data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) | ||
data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] ) | ||
data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry ) | ||
return data_manager_dict | ||
|
||
def main():
    """Command-line entry point: build a HISAT index and write the result as JSON.

    The first positional argument is the path of a JSON parameter file. It is
    read for ``output_data[0]['extra_files_path']`` (the directory to create
    and build the index in) and for the ``param_dict`` entries, then
    overwritten with the JSON-encoded data manager dictionary.

    Exits through ``parser.error`` when the positional argument is missing
    and raises ``Exception`` when the dbkey is ``None``, empty or ``'?'``.
    """
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option( '-f', '--fasta_filename', dest='fasta_filename', action='store', type="string", default=None, help='fasta_filename' )
    parser.add_option( '-d', '--fasta_dbkey', dest='fasta_dbkey', action='store', type="string", default=None, help='fasta_dbkey' )
    parser.add_option( '-t', '--fasta_description', dest='fasta_description', action='store', type="string", default=None, help='fasta_description' )
    parser.add_option( '-n', '--data_table_name', dest='data_table_name', action='store', type="string", default=None, help='data_table_name' )
    (options, args) = parser.parse_args()

    # Report a usage error instead of failing with a bare IndexError.
    if not args:
        parser.error( 'the path of the JSON parameter file is required as the first argument' )
    filename = args[0]

    # Context managers make sure the file handles are closed promptly.
    with open( filename ) as handle:
        params = loads( handle.read() )
    target_directory = params[ 'output_data' ][0]['extra_files_path']
    os.mkdir( target_directory )
    data_manager_dict = {}

    dbkey = options.fasta_dbkey

    if dbkey in [ None, '', '?' ]:
        raise Exception( '"%s" is not a valid dbkey. You must specify a valid dbkey.' % ( dbkey ) )

    sequence_id, sequence_name = get_id_name( params, dbkey=dbkey, fasta_description=options.fasta_description )

    #build the index
    build_hisat_index( data_manager_dict, options.fasta_filename, params, target_directory, dbkey, sequence_id, sequence_name, data_table_name=options.data_table_name or DEFAULT_DATA_TABLE_NAME )

    #save info to json file
    # dumps() returns text, so the file is opened in text mode; binary mode
    # rejects str on Python 3.
    with open( filename, 'w' ) as handle:
        handle.write( dumps( data_manager_dict ) )


if __name__ == "__main__":
    main()
67 changes: 67 additions & 0 deletions
67
data_managers/data_manager_hisat_index_builder/data_manager/hisat_index_builder.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
<tool id="hisat_index_builder_data_manager" name="HISAT index" tool_type="manage_data" version="1.0.0"> | ||
<description>builder</description> | ||
<requirements> | ||
<requirement type="package" version="0.1.6">hisat</requirement> | ||
</requirements> | ||
<stdio> | ||
<exit_code range=":-1" /> | ||
<exit_code range="1:" /> | ||
</stdio> | ||
<command interpreter="python">hisat_index_builder.py "${out_file}" --fasta_filename "${all_fasta_source.fields.path}" --fasta_dbkey "${all_fasta_source.fields.dbkey}" --fasta_description "${all_fasta_source.fields.name}" --data_table_name "hisat_indexes"</command> | ||
<inputs> | ||
<param label="Source FASTA Sequence" name="all_fasta_source" type="select"> | ||
<options from_data_table="all_fasta" /> | ||
</param> | ||
<param label="Name of sequence" name="sequence_name" type="text" value="" /> | ||
<param label="ID for sequence" name="sequence_id" type="text" value="" /> | ||
</inputs> | ||
<outputs> | ||
<data format="data_manager_json" name="out_file" /> | ||
</outputs> | ||
<help> | ||
<![CDATA[ | ||
.. class:: infomark | ||
**Notice:** If you leave the name or ID blank, it will be generated automatically. | ||
What is HISAT? | ||
-------------- | ||
`HISAT <http://ccb.jhu.edu/software/hisat>`__ is a fast and sensitive | ||
spliced alignment program. As part of HISAT, we have developed a new | ||
indexing scheme based on the Burrows-Wheeler transform | ||
(`BWT <http://en.wikipedia.org/wiki/Burrows-Wheeler_transform>`__) and | ||
the `FM index <http://en.wikipedia.org/wiki/FM-index>`__, called | ||
hierarchical indexing, that employs two types of indexes: (1) one global | ||
FM index representing the whole genome, and (2) many separate local FM | ||
indexes for small regions collectively covering the genome. Our | ||
hierarchical index for the human genome (about 3 billion bp) includes | ||
~48,000 local FM indexes, each representing a genomic region of | ||
~64,000bp. As the basis for non-gapped alignment, the FM index is | ||
extremely fast with a low memory footprint, as demonstrated by | ||
`Bowtie <http://bowtie-bio.sf.net>`__. In addition, HISAT provides | ||
several alignment strategies specifically designed for mapping different | ||
types of RNA-seq reads. All these together, HISAT enables extremely fast | ||
and sensitive alignment of reads, in particular those spanning two exons | ||
or more. As a result, HISAT is much faster (>50 times) than | ||
`TopHat2 <http://ccb.jhu.edu/software/tophat>`__ with better alignment | ||
quality. Although it uses a large number of indexes, the memory | ||
requirement of HISAT is still modest, approximately 4.3 GB for human. | ||
HISAT uses the `Bowtie2 <http://bowtie-bio.sf.net/bowtie2>`__ | ||
implementation to handle most of the operations on the FM index. In | ||
addition to spliced alignment, HISAT handles reads involving indels and | ||
supports a paired-end alignment mode. Multiple processors can be used | ||
simultaneously to achieve greater alignment speed. HISAT outputs | ||
alignments in `SAM <http://samtools.sourceforge.net/SAM1.pdf>`__ format, | ||
enabling interoperation with a large number of other tools (e.g. | ||
`SAMtools <http://samtools.sourceforge.net>`__, | ||
`GATK <http://www.broadinstitute.org/gsa/wiki/index.php/The_Genome_Analysis_Toolkit>`__) | ||
that use SAM. HISAT is distributed under the `GPLv3 | ||
license <http://www.gnu.org/licenses/gpl-3.0.html>`__, and it runs on | ||
the command line under Linux, Mac OS X and Windows. | ||
]]> | ||
</help> | ||
<citations> | ||
<citation type="doi">10.1038/nmeth.3317</citation> | ||
</citations> | ||
</tool> |
20 changes: 20 additions & 0 deletions
20
data_managers/data_manager_hisat_index_builder/data_manager_conf.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
<?xml version="1.0"?> | ||
<data_managers> | ||
<data_manager tool_file="data_manager/hisat_index_builder.xml" id="hisat_index_builder" version="0.0.1"> | ||
<data_table name="hisat_indexes"> | ||
<output> | ||
<column name="value" /> | ||
<column name="dbkey" /> | ||
<column name="name" /> | ||
<column name="path" output_ref="out_file" > | ||
<move type="directory" relativize_symlinks="True"> | ||
<!-- <source>${path}</source>--> <!-- out_file.extra_files_path is used as base by default --> <!-- if no source, eg for type=directory, then refers to base --> | ||
<target base="${GALAXY_DATA_MANAGER_DATA_PATH}">${dbkey}/hisat_index/${value}</target> | ||
</move> | ||
<value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/${dbkey}/hisat_index/${value}/${path}</value_translation> | ||
<value_translation type="function">abspath</value_translation> | ||
</column> | ||
</output> | ||
</data_table> | ||
</data_manager> | ||
</data_managers> |
18 changes: 18 additions & 0 deletions
18
data_managers/data_manager_hisat_index_builder/tool-data/all_fasta.loc.sample
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#This file lists the locations and dbkeys of all the fasta files | ||
#under the "genome" directory (a directory that contains a directory | ||
#for each build). The script extract_fasta.py will generate the file | ||
#all_fasta.loc. This file has the format (white space characters are | ||
#TAB characters): | ||
# | ||
#<unique_build_id> <dbkey> <display_name> <file_path> | ||
# | ||
#So, all_fasta.loc could look something like this: | ||
# | ||
#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa | ||
#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa | ||
#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa | ||
# | ||
#Your all_fasta.loc file should contain an entry for each individual | ||
#fasta file. So there will be multiple fasta files for each build, | ||
#such as with hg19 above. | ||
# |
37 changes: 37 additions & 0 deletions
37
data_managers/data_manager_hisat_index_builder/tool-data/hisat_indexes.loc.sample
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# hisat_indexes.loc.sample | ||
# This is a *.loc.sample file distributed with Galaxy that enables tools | ||
# to use a directory of indexed data files. This one is for HISAT. | ||
# See the wiki: http://wiki.galaxyproject.org/Admin/NGS%20Local%20Setup | ||
# First create these data files and save them in your own data directory structure. | ||
# Then, create a hisat_indexes.loc file to use those indexes with tools. | ||
# Copy this file, save it with the same name (minus the .sample), | ||
# follow the format examples, and store the result in this directory. | ||
# The file should include a one-line entry for each index set. | ||
# The path points to the "basename" for the set, not a specific file. | ||
# It has four text columns separated by TABS. | ||
# | ||
# <unique_build_id> <dbkey> <display_name> <file_base_path> | ||
# | ||
# So, for example, if you had hg19 indexes stored in: | ||
# | ||
# /depot/data2/galaxy/hg19/hisat/ | ||
# | ||
# containing the hg19 genome and hg19canon.*.bt2 files, such as: | ||
# -rw-rw-r-- 1 james james 914M Feb 10 18:56 hg19canon.fa | ||
# -rw-rw-r-- 1 james james 914M Feb 10 18:56 hg19canon.1.bt2 | ||
# -rw-rw-r-- 1 james james 683M Feb 10 18:56 hg19canon.2.bt2 | ||
# -rw-rw-r-- 1 james james 3.3K Feb 10 16:54 hg19canon.3.bt2 | ||
# -rw-rw-r-- 1 james james 683M Feb 10 16:54 hg19canon.4.bt2 | ||
# -rw-rw-r-- 1 james james 914M Feb 10 20:45 hg19canon.rev.1.bt2 | ||
# -rw-rw-r-- 1 james james 683M Feb 10 20:45 hg19canon.rev.2.bt2 | ||
# | ||
# then the hisat_indexes.loc entry could look like this: | ||
# | ||
#hg19 hg19 Human (hg19) /depot/data2/galaxy/hg19/hisat/hg19canon | ||
# | ||
#More examples: | ||
# | ||
#mm10 mm10 Mouse (mm10) /depot/data2/galaxy/mm10/hisat/mm10 | ||
#dm3 dm3 D. melanogaster (dm3) /depot/data2/galaxy/dm3/hisat/dm3 | ||
# | ||
# |
13 changes: 13 additions & 0 deletions
13
data_managers/data_manager_hisat_index_builder/tool_data_table_conf.xml.sample
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc--> | ||
<tables> | ||
<!-- Locations of all fasta files under genome directory --> | ||
<table name="all_fasta" comment_char="#"> | ||
<columns>value, dbkey, name, path</columns> | ||
<file path="tool-data/all_fasta.loc" /> | ||
</table> | ||
<!-- Locations of indexes in the hisat mapper format --> | ||
<table name="hisat_indexes" comment_char="#"> | ||
<columns>value, dbkey, name, path</columns> | ||
<file path="tool-data/hisat_indexes.loc" /> | ||
</table> | ||
</tables> |
6 changes: 6 additions & 0 deletions
6
data_managers/data_manager_hisat_index_builder/tool_dependencies.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
<?xml version="1.0"?> | ||
<tool_dependency> | ||
<package name="hisat" version="0.1.6"> | ||
<repository name="package_hisat_0_1_6" owner="iuc" /> | ||
</package> | ||
</tool_dependency> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
categories: | ||
- Assembly | ||
description: HISAT is a fast and sensitive spliced alignment program. | ||
long_description: | | ||
As part of HISAT, we have developed a new indexing scheme based on | ||
the Burrows-Wheeler transform (BWT) and the FM index, called | ||
hierarchical indexing, that employs two types of indexes: (1) | ||
one global FM index representing the whole genome, and (2) many | ||
separate local FM indexes for small regions collectively covering | ||
the genome. Our hierarchical index for the human genome (about 3 | ||
billion bp) includes ~48,000 local FM indexes, each representing a | ||
genomic region of ~64,000bp. As the basis for non-gapped alignment, | ||
the FM index is extremely fast with a low memory footprint, as | ||
demonstrated by Bowtie. In addition, HISAT provides several alignment | ||
strategies specifically designed for mapping different types of | ||
RNA-seq reads. All these together, HISAT enables extremely fast and | ||
sensitive alignment of reads, in particular those spanning two exons | ||
or more. As a result, HISAT is much faster (>50 times) than TopHat2 | ||
with better alignment quality. Although it uses a large number of | ||
indexes, the memory requirement of HISAT is still modest, | ||
approximately 4.3 GB for human. HISAT uses the Bowtie2 | ||
implementation to handle most of the operations on the FM index. | ||
In addition to spliced alignment, HISAT handles reads involving | ||
indels and supports a paired-end alignment mode. Multiple processors | ||
can be used simultaneously to achieve greater alignment speed. | ||
HISAT outputs alignments in SAM format, enabling interoperation | ||
with a large number of other tools (e.g. SAMtools, GATK) that use | ||
SAM. HISAT is distributed under the GPLv3 license, and it runs on | ||
the command line under Linux, Mac OS X and Windows. | ||
http://ccb.jhu.edu/software/hisat/index.shtml | ||
name: hisat | ||
owner: devteam | ||
remote_repository_url: https://github.com/galaxyproject/tools-devteam/tree/master/tools/hisat | ||
type: unrestricted |
Oops, something went wrong.