-
Notifications
You must be signed in to change notification settings - Fork 992
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Display preview feature for Bam datatype #4279
Changes from 3 commits
e88233a
2005a9d
bb67be8
72c196f
c66298d
d168115
8c8aa88
3ddf819
9a8c8b8
731d1db
dac7295
635df81
253d0c2
3ddd05d
c476090
8b3e492
0cfdfbf
90a0ab0
b370a9c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,11 +10,14 @@ | |
import subprocess | ||
import tempfile | ||
import zipfile | ||
from json import dumps | ||
|
||
import pysam | ||
from bx.seq.twobit import TWOBIT_MAGIC_NUMBER, TWOBIT_MAGIC_NUMBER_SWAP, TWOBIT_MAGIC_SIZE | ||
|
||
from galaxy import util | ||
from galaxy.datatypes import metadata | ||
from galaxy.datatypes.tabular import Sam | ||
from galaxy.datatypes.metadata import DictParameter, ListParameter, MetadataElement, MetadataParameter | ||
from galaxy.util import FILENAME_VALID_CHARS, nice_size, sqlite, which | ||
from . import data, dataproviders | ||
|
@@ -220,7 +223,7 @@ class GenericAsn1Binary( Binary ): | |
|
||
|
||
@dataproviders.decorators.has_dataproviders | ||
class Bam( Binary ): | ||
class Bam( Binary, Sam ): | ||
"""Class describing a BAM binary file""" | ||
edam_format = "format_2572" | ||
edam_data = "data_0863" | ||
|
@@ -236,6 +239,9 @@ class Bam( Binary ): | |
MetadataElement( name="reference_lengths", default=[], desc="Chromosome Lengths", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value=[] ) | ||
MetadataElement( name="bam_header", default={}, desc="Dictionary of BAM Headers", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value={} ) | ||
|
||
def __init__(self, **kwd): | ||
super( Bam, self ).__init__( **kwd ) | ||
|
||
def _get_samtools_version( self ): | ||
version = '0.0.0' | ||
samtools_exec = which('samtools') | ||
|
@@ -462,6 +468,52 @@ def to_archive(self, trans, dataset, name=""): | |
file_paths.append(dataset.metadata.bam_index.file_name) | ||
return zip(file_paths, rel_paths) | ||
|
||
def get_chunk(self, trans, dataset, offset=0, ck_size=None): | ||
index_file = dataset.metadata.bam_index | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @mvdbeek Do you happen to know if this function has the potential to consume unbounded memory if the bam file is problematic, or is it going to always be pretty manageable despite the bam file? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, bam lines (reads) can become huge. If someone did a genome-against-genome alignment I think this could be problematic. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We verified the problem on main wasn't data loaded into peak attributes - sorry for the false alarm. Maybe we should create an issue for known points in Galaxy where the amount of data loaded is unbounded, I don't know. |
||
with pysam.AlignmentFile(dataset.file_name, "rb", index_filename=index_file.file_name) as bamfile: | ||
ck_size = 1000 # 1000 lines | ||
ck_data = "" | ||
line_number = 0 | ||
if offset == 0: | ||
ck_data = bamfile.text | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @mvdbeek Like you mentioned already, what happens when the bam file has several lines of header? Do I need to trim them if they exceed a certain limit? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think for now we should cut them off as you're doing now. In the future we could work with the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. what do you think about doing There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. yes, I think it is a good idea. I will update my code. |
||
for line_number, alignment, in enumerate(bamfile): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can drop the |
||
if line_number > offset and line_number <= (offset + ck_size): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you probably want to change this to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, we can switch this around and break here if the
|
||
bamline = alignment.tostring(bamfile) | ||
# Galaxy displays each tag as a separate column because the 'tostring()' function puts spaces in between each tag of the tags column. | ||
# Below code will remove spaces between each tag. | ||
bamline_modified = ('\t').join(bamline.split()[:11] + [('').join(bamline.split()[11:])]) | ||
ck_data = ck_data +"\n" + bamline_modified | ||
elif line_number > (offset + ck_size): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can drop this if you implement https://github.com/galaxyproject/galaxy/pull/4279/files#r125709901 |
||
break | ||
last_read = offset + ck_size | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. here you can set a new offset using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd change this to |
||
return dumps( { 'ck_data': util.unicodify( ck_data ), | ||
'offset': last_read } ) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and then change |
||
|
||
def display_data( self, trans, dataset, preview=False, filename=None, to_ext=None, offset=None, ck_size=None, **kwd): | ||
preview = util.string_as_bool( preview ) | ||
if offset is not None: | ||
return self.get_chunk(trans, dataset, offset, ck_size) | ||
elif to_ext or not preview: | ||
return super( Bam, self ).display_data( trans, dataset, preview, filename, to_ext, **kwd ) | ||
else: | ||
column_names = '' | ||
if dataset.metadata.column_names: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. dataset.metadata.column_names will always be empty because I am not setting anything in the set_meta() function. Even in the Sam class I could see the same thing. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. yes, I think you could set the column_names |
||
column_names = dataset.metadata.column_names | ||
elif hasattr(dataset.datatype, 'column_names'): | ||
column_names = dataset.datatype.column_names | ||
column_types = dataset.metadata.column_types | ||
if not column_types: | ||
column_types = [] | ||
column_number = dataset.metadata.columns | ||
if column_number is None: | ||
column_number = 1 | ||
return trans.fill_template( "/dataset/tabular_chunked.mako", | ||
dataset=dataset, | ||
chunk=self.get_chunk(trans, dataset, 0), | ||
column_number=column_number, | ||
column_names=column_names, | ||
column_types=column_types ) | ||
|
||
# ------------- Dataproviders | ||
# pipe through samtools view | ||
# ALSO: (as Sam) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,7 +28,6 @@ | |
is_bz2, | ||
is_gzip | ||
) | ||
from galaxy.datatypes.binary import Binary | ||
|
||
log = logging.getLogger(__name__) | ||
|
||
|
@@ -478,7 +477,7 @@ def handle_uploaded_dataset_file( filename, datatypes_registry, ext='auto', is_m | |
ext = guess_ext( filename, sniff_order=datatypes_registry.sniff_order, is_multi_byte=is_multi_byte ) | ||
|
||
if check_binary( filename ): | ||
if not Binary.is_ext_unsniffable(ext) and not datatypes_registry.get_datatype_by_extension( ext ).sniff( filename ): | ||
if not galaxy.datatypes.binary.is_ext_unsniffable(ext) and not datatypes_registry.get_datatype_by_extension( ext ).sniff( filename ): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this change necessary? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Compilation breaks when I have both 'from galaxy.datatypes.binary import Binary' in sniff.py and 'from galaxy.datatypes.tabular import Sam' in binary.py. I do not understand the reason behind that. The code is compiling properly only when I remove 'from galaxy.datatypes.binary import Binary' from sniff.py. |
||
raise InappropriateDatasetContentError( 'The binary uploaded file contains inappropriate content.' ) | ||
elif check_html( filename ): | ||
raise InappropriateDatasetContentError( 'The uploaded file contains inappropriate HTML content.' ) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems that the above three lines are mandatory in order to show the column names in Galaxy