API, history contents: contents download endpoint
- Adds an API endpoint at 'histories/{id}/contents/archive' for
creating an archive from all history contents or a filtered subset
of contents
- Contents are currently returned in 'tgz' format
- Datasets are located in the archive in paths roughly matching the
nesting relationships of their parent collections
- Filters can be applied using the same syntax as the contents index
- Adding 'dry_run=True' will return a JSON list of archive paths and
dataset file paths for debugging
- Adds a map_datasets method for walking the datasets of histories and
HDCAs
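For orientation, a minimal client-side sketch of how the new endpoint might be exercised, assuming a Galaxy server at http://localhost:8080, a valid API key, the requests library, and the q/qv filter parameters used by the contents index; the history id and filter values are placeholders, not part of this commit:

import requests

GALAXY_URL = 'http://localhost:8080'
API_KEY = 'YOUR_API_KEY'           # placeholder
HISTORY_ID = 'f2db41e1fa331b3e'    # placeholder

url = '{}/api/histories/{}/contents/archive'.format( GALAXY_URL, HISTORY_ID )

# dry run: returns a JSON list of ( dataset file path, archive path ) pairs for debugging
dry = requests.get( url, params={ 'key': API_KEY, 'dry_run': 'True' } )
print( dry.json() )

# real run: stream the tgz archive to disk, restricting contents with index-style filters
params = { 'key': API_KEY, 'q': 'visible-eq', 'qv': 'True' }
response = requests.get( url, params=params, stream=True )
with open( 'history-archive.tgz', 'wb' ) as outfile:
    for chunk in response.iter_content( chunk_size=8192 ):
        outfile.write( chunk )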
carlfeberhard committed Sep 1, 2016
1 parent ba6a0a3 commit 21ccf76
Showing 5 changed files with 189 additions and 3 deletions.
21 changes: 19 additions & 2 deletions lib/galaxy/managers/datasets.py
@@ -1,6 +1,8 @@
"""
Manager and Serializer for Datasets.
"""
import glob
import os
from six import string_types

from galaxy import model
@@ -313,12 +315,13 @@ def purge( self, dataset_assoc, flush=True ):
return dataset_assoc

def by_user( self, user ):
"""
"""
raise galaxy.exceptions.NotImplemented( 'Abstract Method' )

# .... associated job
def creating_job( self, dataset_assoc ):
"""
Return the `Job` that created this dataset or None if not found.
"""
# TODO: is this needed? Can't you use the dataset_assoc.creating_job attribute? When is this None?
# TODO: this would be even better if outputs and inputs were the underlying datasets
job = None
@@ -348,6 +351,20 @@ def stop_creating_job( self, dataset_assoc ):
return True
return False

def is_composite( self, dataset_assoc ):
"""
Return True if this hda/ldda is a composite type dataset.
.. note:: see also (wherever we keep information on composite datatypes?)
"""
return dataset_assoc.extension in self.app.datatypes_registry.get_composite_extensions()

def extra_files( self, dataset_assoc ):
"""Return a list of file paths for composite files, an empty list otherwise."""
if not self.is_composite( dataset_assoc ):
return []
return glob.glob( os.path.join( dataset_assoc.dataset.extra_files_path, '*' ) )


class _UnflattenedMetadataDatasetAssociationSerializer( base.ModelSerializer,
deletable.PurgableSerializerMixin ):
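As a rough illustration of how these two new helpers can be combined (this sketch is not part of the commit; the hda_manager and hda objects are assumed to exist):

def collect_dataset_files( hda_manager, hda ):
    """Gather the primary file plus any composite 'extra files' for a dataset."""
    files = [ hda.file_name ]
    if hda_manager.is_composite( hda ):
        # extra_files() globs the dataset's extra_files_path and returns the
        # absolute paths of the composite datatype's secondary files
        files.extend( hda_manager.extra_files( hda ) )
    return files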
21 changes: 21 additions & 0 deletions lib/galaxy/managers/hdcas.py
@@ -42,6 +42,27 @@ def __init__( self, app ):
"""
super( HDCAManager, self ).__init__( app )

def map_datasets( self, content, fn, *parents ):
"""
Iterate over the datasets of the given collection (or collection-bearing content),
recursing into nested collections, and call fn on each dataset.
fn receives the dataset followed by its chain of parents (DCEs, DCs, and the HDCA).
"""
returned = []
# content may be an HDCA (which wraps a collection) or a bare DatasetCollection
collection = content.collection if hasattr( content, 'collection' ) else content
this_parents = ( content, ) + parents
for element in collection.elements:
next_parents = ( element, ) + this_parents
if element.is_collection:
processed_list = self.map_datasets( element.child_collection, fn, *next_parents )
returned.extend( processed_list )
else:
processed = fn( element.dataset_instance, *next_parents )
returned.append( processed )
return returned

# TODO: un-stub


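A brief sketch of the intended call pattern for map_datasets (not part of the commit; hdca_manager is an HDCAManager and hdca an HDCA, both assumed to exist already):

def record_with_parents( dataset, *parents ):
    # parents are ordered innermost-first: the wrapping DCE, then any enclosing
    # collections/elements, ending with the HDCA that was passed in
    return ( dataset, parents )

pairs = hdca_manager.map_datasets( hdca, record_with_parents )
for dataset, parents in pairs:
    print( dataset.id, [ getattr( p, 'element_identifier', getattr( p, 'name', '' ) ) for p in parents ] )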
22 changes: 21 additions & 1 deletion lib/galaxy/managers/history_contents.py
@@ -18,6 +18,7 @@
from galaxy.managers import deletable
from galaxy.managers import containers
from galaxy.managers import hdas
from galaxy.managers import hdcas

import logging
log = logging.getLogger( __name__ )
@@ -34,7 +35,7 @@ class HistoryContentsManager( containers.ContainerManagerMixin ):
contained_class_type_name = 'dataset'

subcontainer_class = model.HistoryDatasetCollectionAssociation
subcontainer_class_manager_class = None
subcontainer_class_manager_class = hdcas.HDCAManager
subcontainer_class_type_name = 'dataset_collection'

#: the columns which are common to both subcontainers and non-subcontainers.
@@ -61,6 +62,7 @@ class HistoryContentsManager( containers.ContainerManagerMixin ):
def __init__( self, app ):
self.app = app
self.contained_manager = self.contained_class_manager_class( app )
self.subcontainer_manager = self.subcontainer_class_manager_class( app )

# ---- interface
def contained( self, container, filters=None, limit=None, offset=None, order_by=None, **kwargs ):
@@ -176,6 +178,24 @@ def active_counts( self, history ):
returned[ 'active' ] += count
return returned

def map_datasets( self, history, fn, **kwargs ):
"""
Iterate over the datasets of a given history, recursing into collections, and
calling fn on each dataset.
Uses the same kwargs as `contents` above.
"""
returned = []
contents = self.contents( history, **kwargs )
for content in contents:
if isinstance( content, self.subcontainer_class ):
processed_list = self.subcontainer_manager.map_datasets( content, fn )
returned.extend( processed_list )
else:
processed = fn( content )
returned.append( processed )
return returned

# ---- private
def _session( self ):
return self.app.model.context
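Again as a sketch (not part of the commit), the archive endpoint below uses this method roughly like so; contents_manager, history, and parsed_filters are assumed to exist:

def dataset_file_path( dataset, *parents ):
    # called with just the dataset for plain HDAs, or with the dataset plus its
    # collection parents when recursing through an HDCA
    return dataset.file_name

paths = contents_manager.map_datasets( history, dataset_file_path, filters=parsed_filters )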
124 changes: 124 additions & 0 deletions lib/galaxy/webapps/galaxy/api/history_contents.py
@@ -1,11 +1,16 @@
"""
API operations on the contents of a history.
"""
import os
import re

from galaxy import exceptions
from galaxy import util
from galaxy.util.streamball import StreamBall
from galaxy.util.json import safe_dumps

from galaxy.web import _future_expose_api as expose_api
from galaxy.web import _future_expose_api_raw as expose_api_raw
from galaxy.web import _future_expose_api_anonymous as expose_api_anonymous

from galaxy.web.base.controller import BaseAPIController
@@ -597,3 +602,122 @@ def _parse_order_by( self, order_by_string ):
if ORDER_BY_SEP_CHAR in order_by_string:
return [ manager.parse_order_by( o ) for o in order_by_string.split( ORDER_BY_SEP_CHAR ) ]
return manager.parse_order_by( order_by_string )

@expose_api_raw
def archive( self, trans, history_id, filename='', format='tgz', dry_run=True, **kwd ):
"""
archive( self, trans, history_id, filename='', format='tgz', dry_run=True, **kwd )
* GET /api/histories/{history_id}/contents/archive
* GET /api/histories/{history_id}/contents/archive/{filename}.{format}
build and return a compressed archive of the selected history contents
:type filename: string
:param filename: (optional) archive name (defaults to history name)
:type dry_run: boolean
:param dry_run: (optional) if True, return the archive and file paths only
as JSON and not an archive file
:returns: archive file for download
.. note:: this is a volatile endpoint and settings and behavior may change.
"""
# roughly from: http://stackoverflow.com/a/31976060 (windows, linux)
invalid_filename_char_regex = re.compile( r'[:<>|\\\/\?\* "]' )
# path format string - dot separator between id and name
id_name_format = u'{}.{}'

def name_to_filename( name, max_length=150, replace_with=u'_' ):
# TODO: seems like shortening unicode with [:] would cause unpredictable display strings
return invalid_filename_char_regex.sub( replace_with, name )[0:max_length]

# given a set of parents for a dataset (HDCAs, DC, DCEs, etc.) - build a directory structure that
# (roughly) recreates the nesting in the contents using the parent names and ids
def build_path_from_parents( parents ):
parent_names = []
for parent in parents:
# an HDCA
if hasattr( parent, 'hid' ):
name = name_to_filename( parent.name )
parent_names.append( id_name_format.format( parent.hid, name ) )
# a DCE
elif hasattr( parent, 'element_index' ):
name = name_to_filename( parent.element_identifier )
parent_names.append( id_name_format.format( parent.element_index, name ) )
# NOTE: DCs are skipped and use the wrapping DCE info instead
return parent_names

# get the history used for the contents query and check for accessibility
history = self.history_manager.get_accessible( trans.security.decode_id( history_id ), trans.user )
archive_base_name = filename or name_to_filename( history.name )

# this is the fn applied to each dataset contained in the query
paths_and_files = []

def build_archive_files_and_paths( content, *parents ):
archive_path = archive_base_name
if not self.hda_manager.is_accessible( content, trans.user ):
# if the underlying dataset is not accessible, skip it silently
return

content_container_id = content.hid
content_name = name_to_filename( content.name )
if parents:
if hasattr( parents[0], 'element_index' ):
# if content is directly wrapped in a DCE, strip it from parents (and the resulting path)
# and instead replace the content id and name with the DCE index and identifier
parent_dce, parents = parents[0], parents[1:]
content_container_id = parent_dce.element_index
content_name = name_to_filename( parent_dce.element_identifier )
# reverse for path from parents: oldest parent first
archive_path = os.path.join( archive_path, *build_path_from_parents( parents )[::-1] )
# TODO: this is brute force - building the path each time instead of re-using it
# possibly cache

# add the name as the last element in the archive path
content_id_and_name = id_name_format.format( content_container_id, content_name )
archive_path = os.path.join( archive_path, content_id_and_name )

# ---- for composite files, we use id and name for a directory and, inside that, ...
if self.hda_manager.is_composite( content ):
# ...save the 'main' composite file (generally the html file)
paths_and_files.append( ( content.file_name, os.path.join( archive_path, content.name + '.html' ) ) )
for extra_file in self.hda_manager.extra_files( content ):
extra_file_basename = os.path.basename( extra_file )
archive_extra_file_path = os.path.join( archive_path, extra_file_basename )
# ...and one for each file in the composite
paths_and_files.append( ( extra_file, archive_extra_file_path ) )

# ---- for single files, we add the true extension to id and name and store that single filename
else:
# some dataset names already contain their original file extension; don't repeat it
if not archive_path.endswith( '.' + content.extension ):
archive_path += '.' + content.extension
paths_and_files.append( ( content.file_name, archive_path ) )

# filter the contents that contain datasets using any filters possible from index above and map the datasets
filter_params = self.parse_filter_params( kwd )
filters = self.history_contents_filters.parse_filters( filter_params )
self.history_contents_manager.map_datasets( history, build_archive_files_and_paths, filters=filters )

# if dry_run, return the structure as JSON for debugging
# NOTE: query string values arrive as strings, hence the comparison against the string 'True'
if dry_run == 'True':
trans.response.headers['Content-Type'] = 'application/json'
return safe_dumps( paths_and_files )

# create the archive, add the dataset files, then stream the archive as a download
archive_type_string = 'w|gz'
archive_ext = 'tgz'
if self.app.config.upstream_gzip:
archive_type_string = 'w|'
archive_ext = 'tar'
archive = StreamBall( archive_type_string )

for file_path, archive_path in paths_and_files:
archive.add( file_path, archive_path )

archive_name = '.'.join([ archive_base_name, archive_ext ])
trans.response.set_content_type( "application/x-tar" )
trans.response.headers[ "Content-Disposition" ] = 'attachment; filename="{}"'.format( archive_name )
archive.wsgi_status = trans.response.wsgi_status()
archive.wsgi_headeritems = trans.response.wsgi_headeritems()
return archive.stream
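To make the path building concrete, below is roughly what paths_and_files could contain for a hypothetical history named 'RNA-seq' holding one plain dataset, a flat list collection, and a nested list:list collection; every id, name, and path is invented for illustration, and this is also the structure serialized to JSON when a dry run is requested:

paths_and_files = [
    # plain dataset: <history>/<hid>.<name>.<extension>
    ( '/galaxy/files/000/dataset_101.dat', 'RNA-seq/1.reference_genome.fasta' ),
    # flat list collection: the HDCA contributes '<hid>.<name>' as a directory and the
    # wrapping DCE contributes '<element_index>.<element_identifier>' as the file name
    ( '/galaxy/files/000/dataset_102.dat', 'RNA-seq/2.fastq_singles/0.sample_A.fastqsanger' ),
    ( '/galaxy/files/000/dataset_103.dat', 'RNA-seq/2.fastq_singles/1.sample_B.fastqsanger' ),
    # nested list:list collection: outer DCEs become directories, the DCs themselves
    # are skipped, and the innermost DCE again names the file
    ( '/galaxy/files/000/dataset_104.dat', 'RNA-seq/3.paired_runs/0.run_1/0.forward.fastqsanger' ),
]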
4 changes: 4 additions & 0 deletions lib/galaxy/webapps/galaxy/buildapp.py
@@ -178,6 +178,10 @@ def populate_api_routes( webapp, app ):
parent_resources=dict( member_name='history', collection_name='histories' ),
)

contents_archive_mapper = webapp.mapper.submapper( action='archive', controller='history_contents' )
contents_archive_mapper.connect( '/api/histories/{history_id}/contents/archive' )
contents_archive_mapper.connect( '/api/histories/{history_id}/contents/archive/{filename}{.format}' )

# Legacy access to HDA details via histories/{history_id}/contents/{hda_id}
webapp.mapper.resource( 'content',
'contents',
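For reference, a sketch of the second, named route form (not part of the commit); with a URL like the one below, the routing should hand archive() filename='my-subset' and format='tgz', while the plain /contents/archive form falls back to the history name. All values are placeholders:

import requests

url = 'http://localhost:8080/api/histories/f2db41e1fa331b3e/contents/archive/my-subset.tgz'
response = requests.get( url, params={ 'key': 'YOUR_API_KEY' }, stream=True )
with open( 'my-subset.tgz', 'wb' ) as outfile:
    for chunk in response.iter_content( chunk_size=8192 ):
        outfile.write( chunk )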
