API, history contents: contents download endpoint
- Adds an API endpoint at 'histories/{id}/contents/archive' for
creating an archive from all history contents or a filtered subset
of contents
- Contents are currently returned in 'tgz' format
- Datasets are located in the archive in paths roughly matching the
nesting relationships of their parent collections
- Filters can be applied using the same syntax as the contents index
- Adding 'dry_run=True' will return a JSON list of archive paths and
dataset file paths for debugging
- Adds a map_datasets method for walking the datasets of histories and
HDCAs
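For orientation, a minimal client-side sketch of how the new endpoint might be exercised, assuming a Galaxy server at http://localhost:8080, a valid API key, the requests library, and the q/qv filter parameters used by the contents index; the history id and filter values are placeholders, not part of this commit:

import requests

GALAXY_URL = 'http://localhost:8080'
API_KEY = 'YOUR_API_KEY'           # placeholder
HISTORY_ID = 'f2db41e1fa331b3e'    # placeholder

url = '{}/api/histories/{}/contents/archive'.format( GALAXY_URL, HISTORY_ID )

# dry run: returns a JSON list of ( dataset file path, archive path ) pairs for debugging
dry = requests.get( url, params={ 'key': API_KEY, 'dry_run': 'True' } )
print( dry.json() )

# real run: stream the tgz archive to disk, restricting contents with index-style filters
params = { 'key': API_KEY, 'q': 'visible-eq', 'qv': 'True' }
response = requests.get( url, params=params, stream=True )
with open( 'history-archive.tgz', 'wb' ) as outfile:
    for chunk in response.iter_content( chunk_size=8192 ):
        outfile.write( chunk )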
carlfeberhard committed Sep 1, 2016
1 parent ba6a0a3 commit 21ccf76
Showing 5 changed files with 189 additions and 3 deletions.
21 changes: 19 additions & 2 deletions lib/galaxy/managers/datasets.py
@@ -1,6 +1,8 @@
"""
Manager and Serializer for Datasets.
"""
import glob
import os
from six import string_types

from galaxy import model
@@ -313,12 +315,13 @@ def purge( self, dataset_assoc, flush=True ):
return dataset_assoc

def by_user( self, user ):
"""
"""
raise galaxy.exceptions.NotImplemented( 'Abstract Method' )

# .... associated job
def creating_job( self, dataset_assoc ):
"""
Return the `Job` that created this dataset or None if not found.
"""
# TODO: is this needed? Can't you use the dataset_assoc.creating_job attribute? When is this None?
# TODO: this would be even better if outputs and inputs were the underlying datasets
job = None
@@ -348,6 +351,20 @@ def stop_creating_job( self, dataset_assoc ):
return True
return False

def is_composite( self, dataset_assoc ):
"""
Return True if this hda/ldda is a composite type dataset.
.. note:: see also (wherever we keep information on composite datatypes?)
"""
return dataset_assoc.extension in self.app.datatypes_registry.get_composite_extensions()

def extra_files( self, dataset_assoc ):
"""Return a list of file paths for composite files, an empty list otherwise."""
if not self.is_composite( dataset_assoc ):
return []
return glob.glob( os.path.join( dataset_assoc.dataset.extra_files_path, '*' ) )


class _UnflattenedMetadataDatasetAssociationSerializer( base.ModelSerializer,
deletable.PurgableSerializerMixin ):
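As a rough illustration of how these two new helpers can be combined (this sketch is not part of the commit; the hda_manager and hda objects are assumed to exist):

def collect_dataset_files( hda_manager, hda ):
    """Gather the primary file plus any composite 'extra files' for a dataset."""
    files = [ hda.file_name ]
    if hda_manager.is_composite( hda ):
        # extra_files() globs the dataset's extra_files_path and returns the
        # absolute paths of the composite datatype's secondary files
        files.extend( hda_manager.extra_files( hda ) )
    return files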
21 changes: 21 additions & 0 deletions lib/galaxy/managers/hdcas.py
@@ -42,6 +42,27 @@ def __init__( self, app ):
"""
super( HDCAManager, self ).__init__( app )

def map_datasets( self, content, fn, *parents ):
"""
Iterate over the datasets of the given collection (or collection-bearing content),
recursing into nested collections, and call fn on each dataset.
fn receives the dataset followed by its chain of parents (DCEs, DCs, and the HDCA).
"""
returned = []
# content may be an HDCA (which wraps a collection) or a bare DatasetCollection
collection = content.collection if hasattr( content, 'collection' ) else content
this_parents = ( content, ) + parents
for element in collection.elements:
next_parents = ( element, ) + this_parents
if element.is_collection:
processed_list = self.map_datasets( element.child_collection, fn, *next_parents )
returned.extend( processed_list )
else:
processed = fn( element.dataset_instance, *next_parents )
returned.append( processed )
return returned

# TODO: un-stub


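A brief sketch of the intended call pattern for map_datasets (not part of the commit; hdca_manager is an HDCAManager and hdca an HDCA, both assumed to exist already):

def record_with_parents( dataset, *parents ):
    # parents are ordered innermost-first: the wrapping DCE, then any enclosing
    # collections/elements, ending with the HDCA that was passed in
    return ( dataset, parents )

pairs = hdca_manager.map_datasets( hdca, record_with_parents )
for dataset, parents in pairs:
    print( dataset.id, [ getattr( p, 'element_identifier', getattr( p, 'name', '' ) ) for p in parents ] )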
22 changes: 21 additions & 1 deletion lib/galaxy/managers/history_contents.py
@@ -18,6 +18,7 @@
from galaxy.managers import deletable
from galaxy.managers import containers
from galaxy.managers import hdas
from galaxy.managers import hdcas

import logging
log = logging.getLogger( __name__ )
@@ -34,7 +35,7 @@ class HistoryContentsManager( containers.ContainerManagerMixin ):
contained_class_type_name = 'dataset'

subcontainer_class = model.HistoryDatasetCollectionAssociation
subcontainer_class_manager_class = None
subcontainer_class_manager_class = hdcas.HDCAManager
subcontainer_class_type_name = 'dataset_collection'

#: the columns which are common to both subcontainers and non-subcontainers.
@@ -61,6 +62,7 @@ class HistoryContentsManager( containers.ContainerManagerMixin ):
def __init__( self, app ):
self.app = app
self.contained_manager = self.contained_class_manager_class( app )
self.subcontainer_manager = self.subcontainer_class_manager_class( app )

# ---- interface
def contained( self, container, filters=None, limit=None, offset=None, order_by=None, **kwargs ):
@@ -176,6 +178,24 @@ def active_counts( self, history ):
returned[ 'active' ] += count
return returned

def map_datasets( self, history, fn, **kwargs ):
"""
Iterate over the datasets of a given history, recursing into collections, and
calling fn on each dataset.
Uses the same kwargs as `contents` above.
"""
returned = []
contents = self.contents( history, **kwargs )
for content in contents:
if isinstance( content, self.subcontainer_class ):
processed_list = self.subcontainer_manager.map_datasets( content, fn )
returned.extend( processed_list )
else:
processed = fn( content )
returned.append( processed )
return returned

# ---- private
def _session( self ):
return self.app.model.context
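Again as a sketch (not part of the commit), the archive endpoint below uses this method roughly like so; contents_manager, history, and parsed_filters are assumed to exist:

def dataset_file_path( dataset, *parents ):
    # called with just the dataset for plain HDAs, or with the dataset plus its
    # collection parents when recursing through an HDCA
    return dataset.file_name

paths = contents_manager.map_datasets( history, dataset_file_path, filters=parsed_filters )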
124 changes: 124 additions & 0 deletions lib/galaxy/webapps/galaxy/api/history_contents.py
@@ -1,11 +1,16 @@
"""
API operations on the contents of a history.
"""
import os
import re

from galaxy import exceptions
from galaxy import util
from galaxy.util.streamball import StreamBall
from galaxy.util.json import safe_dumps

from galaxy.web import _future_expose_api as expose_api
from galaxy.web import _future_expose_api_raw as expose_api_raw
from galaxy.web import _future_expose_api_anonymous as expose_api_anonymous

from galaxy.web.base.controller import BaseAPIController
@@ -597,3 +602,122 @@ def _parse_order_by( self, order_by_string ):
if ORDER_BY_SEP_CHAR in order_by_string:
return [ manager.parse_order_by( o ) for o in order_by_string.split( ORDER_BY_SEP_CHAR ) ]
return manager.parse_order_by( order_by_string )

@expose_api_raw
def archive( self, trans, history_id, filename='', format='tgz', dry_run=True, **kwd ):
"""
archive( self, trans, history_id, filename='', format='tgz', dry_run=True, **kwd )
* GET /api/histories/{history_id}/contents/archive
* GET /api/histories/{history_id}/contents/archive/{filename}.{format}
build and return a compressed archive of the selected history contents
:type filename: string
:param filename: (optional) archive name (defaults to history name)
:type dry_run: boolean
:param dry_run: (optional) if True, return the archive and file paths only
as JSON and not an archive file
:returns: archive file for download
.. note:: this is a volatile endpoint and settings and behavior may change.
"""
# roughly from: http://stackoverflow.com/a/31976060 (windows, linux)
invalid_filename_char_regex = re.compile( r'[:<>|\\\/\?\* "]' )
# path format string - dot separator between id and name
id_name_format = u'{}.{}'

def name_to_filename( name, max_length=150, replace_with=u'_' ):
# TODO: seems like shortening unicode with [:] would cause unpredictable display strings
return invalid_filename_char_regex.sub( replace_with, name )[0:max_length]

# given a set of parents for a dataset (HDCAs, DC, DCEs, etc.) - build a directory structure that
# (roughly) recreates the nesting in the contents using the parent names and ids
def build_path_from_parents( parents ):
parent_names = []
for parent in parents:
# an HDCA
if hasattr( parent, 'hid' ):
name = name_to_filename( parent.name )
parent_names.append( id_name_format.format( parent.hid, name ) )
# a DCE
elif hasattr( parent, 'element_index' ):
name = name_to_filename( parent.element_identifier )
parent_names.append( id_name_format.format( parent.element_index, name ) )
# NOTE: DCs are skipped and use the wrapping DCE info instead
return parent_names

# get the history used for the contents query and check for accessibility
history = self.history_manager.get_accessible( trans.security.decode_id( history_id ), trans.user )
archive_base_name = filename or name_to_filename( history.name )

# this is the fn applied to each dataset contained in the query
paths_and_files = []

def build_archive_files_and_paths( content, *parents ):
archive_path = archive_base_name
if not self.hda_manager.is_accessible( content, trans.user ):
# if the underlying dataset is not accessible, skip it silently
return

content_container_id = content.hid
content_name = name_to_filename( content.name )
if parents:
if hasattr( parents[0], 'element_index' ):
# if content is directly wrapped in a DCE, strip it from parents (and the resulting path)
# and instead replace the content id and name with the DCE index and identifier
parent_dce, parents = parents[0], parents[1:]
content_container_id = parent_dce.element_index
content_name = name_to_filename( parent_dce.element_identifier )
# reverse for path from parents: oldest parent first
archive_path = os.path.join( archive_path, *build_path_from_parents( parents )[::-1] )
# TODO: this is brute force - building the path each time instead of re-using it
# possibly cache

# add the name as the last element in the archive path
content_id_and_name = id_name_format.format( content_container_id, content_name )
archive_path = os.path.join( archive_path, content_id_and_name )

# ---- for composite files, we use id and name for a directory and, inside that, ...
if self.hda_manager.is_composite( content ):
# ...save the 'main' composite file (generally the html file)
paths_and_files.append( ( content.file_name, os.path.join( archive_path, content.name + '.html' ) ) )
for extra_file in self.hda_manager.extra_files( content ):
extra_file_basename = os.path.basename( extra_file )
archive_extra_file_path = os.path.join( archive_path, extra_file_basename )
# ...and one for each file in the composite
paths_and_files.append( ( extra_file, archive_extra_file_path ) )

# ---- for single files, we add the true extension to id and name and store that single filename
else:
# some dataset names already contain their original file extension; don't repeat it
if not archive_path.endswith( '.' + content.extension ):
archive_path += '.' + content.extension
paths_and_files.append( ( content.file_name, archive_path ) )

# filter the contents that contain datasets using any filters possible from index above and map the datasets
filter_params = self.parse_filter_params( kwd )
filters = self.history_contents_filters.parse_filters( filter_params )
self.history_contents_manager.map_datasets( history, build_archive_files_and_paths, filters=filters )

# if dry_run, return the structure as JSON for debugging
# NOTE: query string values arrive as strings, hence the comparison against the string 'True'
if dry_run == 'True':
trans.response.headers['Content-Type'] = 'application/json'
return safe_dumps( paths_and_files )

# create the archive, add the dataset files, then stream the archive as a download
archive_type_string = 'w|gz'
archive_ext = 'tgz'
if self.app.config.upstream_gzip:
archive_type_string = 'w|'
archive_ext = 'tar'
archive = StreamBall( archive_type_string )

for file_path, archive_path in paths_and_files:
archive.add( file_path, archive_path )

archive_name = '.'.join([ archive_base_name, archive_ext ])
trans.response.set_content_type( "application/x-tar" )
trans.response.headers[ "Content-Disposition" ] = 'attachment; filename="{}"'.format( archive_name )
archive.wsgi_status = trans.response.wsgi_status()
archive.wsgi_headeritems = trans.response.wsgi_headeritems()
return archive.stream
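To make the path building concrete, below is roughly what paths_and_files could contain for a hypothetical history named 'RNA-seq' holding one plain dataset, a flat list collection, and a nested list:list collection; every id, name, and path is invented for illustration, and this is also the structure serialized to JSON when a dry run is requested:

paths_and_files = [
    # plain dataset: <history>/<hid>.<name>.<extension>
    ( '/galaxy/files/000/dataset_101.dat', 'RNA-seq/1.reference_genome.fasta' ),
    # flat list collection: the HDCA contributes '<hid>.<name>' as a directory and the
    # wrapping DCE contributes '<element_index>.<element_identifier>' as the file name
    ( '/galaxy/files/000/dataset_102.dat', 'RNA-seq/2.fastq_singles/0.sample_A.fastqsanger' ),
    ( '/galaxy/files/000/dataset_103.dat', 'RNA-seq/2.fastq_singles/1.sample_B.fastqsanger' ),
    # nested list:list collection: outer DCEs become directories, the DCs themselves
    # are skipped, and the innermost DCE again names the file
    ( '/galaxy/files/000/dataset_104.dat', 'RNA-seq/3.paired_runs/0.run_1/0.forward.fastqsanger' ),
]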
4 changes: 4 additions & 0 deletions lib/galaxy/webapps/galaxy/buildapp.py
@@ -178,6 +178,10 @@ def populate_api_routes( webapp, app ):
parent_resources=dict( member_name='history', collection_name='histories' ),
)

contents_archive_mapper = webapp.mapper.submapper( action='archive', controller='history_contents' )
contents_archive_mapper.connect( '/api/histories/{history_id}/contents/archive' )
contents_archive_mapper.connect( '/api/histories/{history_id}/contents/archive/{filename}{.format}' )

# Legacy access to HDA details via histories/{history_id}/contents/{hda_id}
webapp.mapper.resource( 'content',
'contents',
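For reference, a sketch of the second, named route form (not part of the commit); with a URL like the one below, the routing should hand archive() filename='my-subset' and format='tgz', while the plain /contents/archive form falls back to the history name. All values are placeholders:

import requests

url = 'http://localhost:8080/api/histories/f2db41e1fa331b3e/contents/archive/my-subset.tgz'
response = requests.get( url, params={ 'key': 'YOUR_API_KEY' }, stream=True )
with open( 'my-subset.tgz', 'wb' ) as outfile:
    for chunk in response.iter_content( chunk_size=8192 ):
        outfile.write( chunk )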
