diff --git a/.ci/pep8_sources.txt b/.ci/pep8_sources.txt
index ce8e808d5e2c..41d9b18dad13 100644
--- a/.ci/pep8_sources.txt
+++ b/.ci/pep8_sources.txt
@@ -1,5 +1,6 @@
 lib/galaxy/auth
 lib/galaxy/config.py
+lib/galaxy/datatypes/data.py
 lib/galaxy/exceptions/error_codes.py
 lib/galaxy/jobs/{__init__,error_level,manager,stock_rules}.py
 lib/galaxy/main.py
diff --git a/lib/galaxy/datatypes/data.py b/lib/galaxy/datatypes/data.py
index 0977523b8db9..59e1384e816f 100644
--- a/lib/galaxy/datatypes/data.py
+++ b/lib/galaxy/datatypes/data.py
@@ -1,15 +1,15 @@
 import logging
-import metadata
 import mimetypes
 import os
 import shutil
-import sys
 import tempfile
 import zipfile
 from cgi import escape
 from inspect import isclass
+
+import metadata
 from galaxy import util
-from galaxy.datatypes.metadata import MetadataElement #import directly to maintain ease of use in Datatype class definitions
+from galaxy.datatypes.metadata import MetadataElement  # import directly to maintain ease of use in Datatype class definitions
 from galaxy.util import inflector
 from galaxy.util.bunch import Bunch
 from galaxy.util.odict import odict
@@ -29,29 +29,23 @@
 log = logging.getLogger(__name__)
 
-comptypes=[] # Is this being used anywhere, why was this here? -JohnC
-try:
-    import zlib
-    comptypes.append( 'zip' )
-except ImportError:
-    pass
-
-
 # Valid first column and strand column values vor bed, other formats
 col1_startswith = ['chr', 'chl', 'groupun', 'reftig_', 'scaffold', 'super_', 'vcho']
 valid_strand = ['+', '-', '.']
 
+
 class DataMeta( type ):
     """
     Metaclass for Data class. Sets up metadata spec.
     """
     def __init__( cls, name, bases, dict_ ):
         cls.metadata_spec = metadata.MetadataSpecCollection()
-        for base in bases: #loop through bases (class/types) of cls
-            if hasattr( base, "metadata_spec" ): #base of class Data (object) has no metadata
-                cls.metadata_spec.update( base.metadata_spec ) #add contents of metadata spec of base class to cls
+        for base in bases:  # loop through bases (class/types) of cls
+            if hasattr( base, "metadata_spec" ):  # base of class Data (object) has no metadata
+                cls.metadata_spec.update( base.metadata_spec )  # add contents of metadata spec of base class to cls
         metadata.Statement.process( cls )
 
+
 @dataproviders.decorators.has_dataproviders
 class Data( object ):
     """
@@ -89,11 +83,11 @@ class Data( object ):
     # Allow user to change between this datatype and others. If False, this datatype
     # cannot be changed from or into.
     allow_datatype_change = True
-    #Composite datatypes
+    # Composite datatypes
    composite_type = None
     composite_files = odict()
     primary_file_name = 'index'
-    #A per datatype setting (inherited): max file size (in bytes) for setting optional metadata
+    # A per datatype setting (inherited): max file size (in bytes) for setting optional metadata
     _max_optional_metadata_filesize = None
 
     # Trackster track type.
@@ -108,6 +102,7 @@ def __init__(self, **kwd):
         self.supported_display_apps = self.supported_display_apps.copy()
         self.composite_files = self.composite_files.copy()
         self.display_applications = odict()
+
     def write_from_stream(self, dataset, stream):
         """Writes data from a stream"""
         fd = open(dataset.file_name, 'wb')
@@ -117,24 +112,29 @@ def write_from_stream(self, dataset, stream):
                 break
             os.write(fd, chunk)
         os.close(fd)
+
     def set_raw_data(self, dataset, data):
         """Saves the data on the disc"""
         fd = open(dataset.file_name, 'wb')
         os.write(fd, data)
         os.close(fd)
+
     def get_raw_data( self, dataset ):
         """Returns the full data. To stream it open the file_name and read/write as needed"""
         try:
             return file(dataset.file_name, 'rb').read(-1)
-        except OSError, e:
+        except OSError:
             log.exception('%s reading a file that does not exist %s' % (self.__class__.__name__, dataset.file_name))
             return ''
+
     def dataset_content_needs_grooming( self, file_name ):
         """This function is called on an output dataset file after the content is initially generated."""
         return False
+
     def groom_dataset_content( self, file_name ):
         """This function is called on an output dataset file if dataset_content_needs_grooming returns True."""
         pass
+
     def init_meta( self, dataset, copy_from=None ):
         # Metadata should be left mostly uninitialized. Dataset will
         # handle returning default values when metadata is not set.
@@ -144,10 +144,12 @@ def init_meta( self, dataset, copy_from=None ):
         # flag the object as modified for SQLAlchemy.
         if copy_from:
             dataset.metadata = copy_from.metadata
-    def set_meta( self, dataset, overwrite = True, **kwd ):
+
+    def set_meta( self, dataset, overwrite=True, **kwd ):
         """Unimplemented method, allows guessing of metadata from contents of file"""
         return True
-    def missing_meta( self, dataset, check = [], skip = [] ):
+
+    def missing_meta( self, dataset, check=[], skip=[] ):
         """
         Checks for empty metadata values, Returns True if non-optional metadata is missing
         Specifying a list of 'check' values will only check those names provided; when used, optionality is ignored
@@ -159,22 +161,26 @@ def missing_meta( self, dataset, check = [], skip = [] ):
             to_check = dataset.metadata.items()
         for key, value in to_check:
             if key in skip or ( not check and dataset.metadata.spec[key].get( "optional" ) ):
-                continue #we skip check for optional and nonrequested values here
+                continue  # we skip check for optional and nonrequested values here
             if not value:
                 return True
         return False
+
     def set_max_optional_metadata_filesize( self, max_value ):
         try:
             max_value = int( max_value )
         except:
             return
         self.__class__._max_optional_metadata_filesize = max_value
+
     def get_max_optional_metadata_filesize( self ):
         rval = self.__class__._max_optional_metadata_filesize
         if rval is None:
             return -1
         return rval
+
     max_optional_metadata_filesize = property( get_max_optional_metadata_filesize, set_max_optional_metadata_filesize )
+
     def set_peek( self, dataset, is_multi_byte=False ):
         """Set the peek and blurb text"""
         if not dataset.dataset.purged:
@@ -191,7 +197,7 @@ def display_peek(self, dataset ):
             if not dataset.peek:
                 dataset.set_peek()
             data = dataset.peek
-            lines = data.splitlines()
+            lines = data.splitlines()
             for line in lines:
                 line = line.strip()
                 if not line:
@@ -232,17 +238,15 @@ def _archive_composite_dataset( self, trans, data=None, **kwd ):
         valid_chars = '.,^_-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
         outfname = data.name[0:150]
         outfname = ''.join(c in valid_chars and c or '_' for c in outfname)
-        if (params.do_action == None):
-            params.do_action = 'zip' # default
+        if params.do_action is None:
+            params.do_action = 'zip'  # default
         msg = util.restore_text( params.get( 'msg', '' ) )
-        messagetype = params.get( 'messagetype', 'done' )
         if not data:
             msg = "You must select at least one dataset"
-            messagetype = 'error'
         else:
             error = False
             try:
-                if (params.do_action == 'zip'):
+                if params.do_action == 'zip':
                     # Can't use mkstemp - the file must not exist first
                     tmpd = tempfile.mkdtemp()
                     util.umask_fix_perms( tmpd, trans.app.config.umask, 0777, trans.app.config.gid )
@@ -257,33 +261,30 @@ def _archive_composite_dataset( self, trans, data=None, **kwd ):
                 error = True
                 log.exception( "Unable to create archive for download" )
                 msg = "Unable to create archive for %s for download, please report this error" % outfname
-                messagetype = 'error'
             if not error:
-                current_user_roles = trans.get_current_user_roles()
                 ext = data.extension
                 path = data.file_name
                 fname = os.path.split(path)[-1]
                 efp = data.extra_files_path
-                #Add any central file to the archive,
+                # Add any central file to the archive,
                 display_name = os.path.splitext(outfname)[0]
                 if not display_name.endswith(ext):
                     display_name = '%s_%s' % (display_name, ext)
-                error, msg, messagetype = self._archive_main_file(archive, display_name, path)
+                error, msg = self._archive_main_file(archive, display_name, path)[:2]
                 if not error:
-                    #Add any child files to the archive,
+                    # Add any child files to the archive,
                     for root, dirs, files in os.walk(efp):
                         for fname in files:
-                            fpath = os.path.join(root,fname)
-                            rpath = os.path.relpath(fpath,efp)
+                            fpath = os.path.join(root, fname)
+                            rpath = os.path.relpath(fpath, efp)
                             try:
-                                archive.add( fpath,rpath )
+                                archive.add( fpath, rpath )
                             except IOError:
                                 error = True
                                 log.exception( "Unable to add %s to temporary library download archive" % rpath)
                                 msg = "Unable to create archive for download, please report this error"
-                                messagetype = 'error'
                                 continue
             if not error:
                 if params.do_action == 'zip':
@@ -298,7 +299,7 @@ def _archive_composite_dataset( self, trans, data=None, **kwd ):
                     outext = 'tgz'
                 if params.do_action == 'tbz':
                     outext = 'tbz'
-                trans.response.headers[ "Content-Disposition" ] = 'attachment; filename="%s.%s"' % (outfname,outext)
+                trans.response.headers[ "Content-Disposition" ] = 'attachment; filename="%s.%s"' % (outfname, outext)
                 archive.wsgi_status = trans.response.wsgi_status()
                 archive.wsgi_headeritems = trans.response.wsgi_headeritems()
                 return archive.stream
@@ -308,7 +309,7 @@ def _serve_raw(self, trans, dataset, to_ext):
         trans.response.headers['Content-Length'] = int( os.stat( dataset.file_name ).st_size )
         valid_chars = '.,^_-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
         fname = ''.join(c in valid_chars and c or '_' for c in dataset.name)[0:150]
-        trans.response.set_content_type( "application/octet-stream" ) #force octet-stream so Safari doesn't append mime extensions to filename
+        trans.response.set_content_type( "application/octet-stream" )  # force octet-stream so Safari doesn't append mime extensions to filename
         trans.response.headers["Content-Disposition"] = 'attachment; filename="Galaxy%s-[%s].%s"' % (dataset.hid, fname, to_ext)
         return open( dataset.file_name )
 
@@ -321,11 +322,11 @@ def display_data(self, trans, data, preview=False, filename=None, to_ext=None, s
         TOOD: Document alternatives to overridding this method (data providers?).
         """
-        #Relocate all composite datatype display to a common location.
+        # Relocate all composite datatype display to a common location.
         composite_extensions = trans.app.datatypes_registry.get_composite_extensions( )
-        composite_extensions.append('html') # for archiving composite datatypes
-        #Prevent IE8 from sniffing content type since we're explicit about it. This prevents intentionally text/plain
-        #content from being rendered in the browser
+        composite_extensions.append('html')  # for archiving composite datatypes
+        # Prevent IE8 from sniffing content type since we're explicit about it. This prevents intentionally text/plain
+        # content from being rendered in the browser
         trans.response.headers['X-Content-Type-Options'] = 'nosniff'
         if isinstance( data, basestring ):
             return data
@@ -334,8 +335,8 @@ def display_data(self, trans, data, preview=False, filename=None, to_ext=None, s
             file_path = trans.app.object_store.get_filename(data.dataset, extra_dir='dataset_%s_files' % data.dataset.id, alt_name=filename)
             if os.path.exists( file_path ):
                 if os.path.isdir( file_path ):
-                    return trans.show_error_message( "Directory listing is not allowed." ) #TODO: Reconsider allowing listing of directories?
-                mime, encoding = mimetypes.guess_type( file_path )
+                    return trans.show_error_message( "Directory listing is not allowed." )  # TODO: Reconsider allowing listing of directories?
+                mime = mimetypes.guess_type( file_path )[0]
                 if not mime:
                     try:
                         mime = trans.app.datatypes_registry.get_mimetype_by_extension( ".".split( file_path )[-1] )
@@ -348,8 +349,8 @@ def display_data(self, trans, data, preview=False, filename=None, to_ext=None, s
 
         self._clean_and_set_mime_type( trans, data.get_mime() )
         trans.log_event( "Display dataset id: %s" % str( data.id ) )
-        from galaxy import datatypes #DBTODO REMOVE THIS AT REFACTOR
-        if to_ext or isinstance(data.datatype, datatypes.binary.Binary): # Saving the file, or binary file
+        from galaxy import datatypes  # DBTODO REMOVE THIS AT REFACTOR
+        if to_ext or isinstance(data.datatype, datatypes.binary.Binary):  # Saving the file, or binary file
             if data.extension in composite_extensions:
                 return self._archive_composite_dataset( trans, data, **kwd )
             else:
@@ -358,14 +359,14 @@ def display_data(self, trans, data, preview=False, filename=None, to_ext=None, s
                 to_ext = data.extension
             valid_chars = '.,^_-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
             fname = ''.join(c in valid_chars and c or '_' for c in data.name)[0:150]
-            trans.response.set_content_type( "application/octet-stream" ) #force octet-stream so Safari doesn't append mime extensions to filename
+            trans.response.set_content_type( "application/octet-stream" )  # force octet-stream so Safari doesn't append mime extensions to filename
             trans.response.headers["Content-Disposition"] = 'attachment; filename="Galaxy%s-[%s].%s"' % (data.hid, fname, to_ext)
             return open( data.file_name )
         if not os.path.exists( data.file_name ):
             raise paste.httpexceptions.HTTPNotFound( "File Not Found (%s)." % data.file_name )
-        max_peek_size = 1000000 # 1 MB
+        max_peek_size = 1000000  # 1 MB
         if isinstance(data.datatype, datatypes.images.Html):
-            max_peek_size = 10000000 # 10 MB for html
+            max_peek_size = 10000000  # 10 MB for html
         preview = util.string_as_bool( preview )
         if not preview or isinstance(data.datatype, datatypes.images.Image) or os.stat( data.file_name ).st_size < max_peek_size:
             if trans.app.config.sanitize_all_html and trans.response.get_content_type() == "text/html":
@@ -375,18 +376,19 @@ def display_data(self, trans, data, preview=False, filename=None, to_ext=None, s
             else:
                 trans.response.set_content_type( "text/html" )
                 return trans.stream_template_mako( "/dataset/large_file.mako",
-                                                   truncated_data = open( data.file_name ).read(max_peek_size),
-                                                   data = data)
+                                                   truncated_data=open( data.file_name ).read(max_peek_size),
+                                                   data=data)
 
     def display_name(self, dataset):
         """Returns formatted html of dataset name"""
         try:
-            if type ( dataset.name ) is unicode:
+            if type( dataset.name ) is unicode:
                 return escape( dataset.name )
             else:
                 return escape( unicode( dataset.name, 'utf-8 ') )
         except:
             return "name unavailable"
+
     def display_info(self, dataset):
         """Returns formatted html of dataset info"""
         try:
@@ -406,16 +408,20 @@ def display_info(self, dataset):
             return info
         except:
             return "info unavailable"
+
     def validate(self, dataset):
         """Unimplemented validate, return no exceptions"""
         return list()
+
     def repair_methods(self, dataset):
         """Unimplemented method, returns dict with method/option for repairing errors"""
         return None
+
     def get_mime(self):
         """Returns the mime type of the datatype"""
         return 'application/octet-stream'
-    def add_display_app ( self, app_id, label, file_function, links_function ):
+
+    def add_display_app( self, app_id, label, file_function, links_function ):
         """
         Adds a display app to the datatype.
         app_id is a unique id
@@ -424,22 +430,27 @@ def add_display_app ( self, app_id, label, file_function, links_function ):
         links_function is a string containing the name of the function that returns a list of (link_name,link)
         """
         self.supported_display_apps = self.supported_display_apps.copy()
-        self.supported_display_apps[app_id] = {'label':label,'file_function':file_function,'links_function':links_function}
-    def remove_display_app (self, app_id):
+        self.supported_display_apps[app_id] = {'label': label, 'file_function': file_function, 'links_function': links_function}
+
+    def remove_display_app(self, app_id):
         """Removes a display app from the datatype"""
         self.supported_display_apps = self.supported_display_apps.copy()
         try:
             del self.supported_display_apps[app_id]
         except:
             log.exception('Tried to remove display app %s from datatype %s, but this display app is not declared.' % ( type, self.__class__.__name__ ) )
+
     def clear_display_apps( self ):
         self.supported_display_apps = {}
+
     def add_display_application( self, display_application ):
         """New style display applications"""
         assert display_application.id not in self.display_applications, 'Attempted to add a display application twice'
         self.display_applications[ display_application.id ] = display_application
-    def get_display_application( self, key, default = None ):
+
+    def get_display_application( self, key, default=None ):
         return self.display_applications.get( key, default )
+
     def get_display_applications_by_dataset( self, dataset, trans ):
         rval = odict()
         for key, value in self.display_applications.iteritems():
@@ -447,23 +458,27 @@ def get_display_applications_by_dataset( self, dataset, trans ):
             if value.links:
                 rval[key] = value
         return rval
+
     def get_display_types(self):
         """Returns display types available"""
         return self.supported_display_apps.keys()
+
     def get_display_label(self, type):
         """Returns primary label for display app"""
         try:
             return self.supported_display_apps[type]['label']
         except:
             return 'unknown'
+
     def as_display_type(self, dataset, type, **kwd):
         """Returns modified file contents for a particular display type """
         try:
             if type in self.get_display_types():
-                return getattr (self, self.supported_display_apps[type]['file_function']) (dataset, **kwd)
+                return getattr(self, self.supported_display_apps[type]['file_function'])(dataset, **kwd)
         except:
             log.exception('Function %s is referred to in datatype %s for displaying as type %s, but is not accessible' % (self.supported_display_apps[type]['file_function'], self.__class__.__name__, type) )
         return "This display type (%s) is not implemented for this datatype (%s)." % ( type, dataset.ext)
+
     def get_display_links( self, dataset, type, app, base_url, target_frame='_blank', **kwd ):
         """
         Returns a list of tuples of (name, link) for a particular display type. No check on
@@ -473,26 +488,29 @@ def get_display_links( self, dataset, type, app, base_url, target_frame='_blank'
         """
         try:
             if app.config.enable_old_display_applications and type in self.get_display_types():
-                return target_frame, getattr ( self, self.supported_display_apps[type]['links_function'] ) ( dataset, type, app, base_url, **kwd )
+                return target_frame, getattr( self, self.supported_display_apps[type]['links_function'] )( dataset, type, app, base_url, **kwd )
         except:
-            log.exception( 'Function %s is referred to in datatype %s for generating links for type %s, but is not accessible' \
+            log.exception( 'Function %s is referred to in datatype %s for generating links for type %s, but is not accessible'
                            % ( self.supported_display_apps[type]['links_function'], self.__class__.__name__, type ) )
         return target_frame, []
+
     def get_converter_types(self, original_dataset, datatypes_registry):
         """Returns available converters by type for this dataset"""
         return datatypes_registry.get_converters_by_datatype(original_dataset.ext)
+
     def find_conversion_destination( self, dataset, accepted_formats, datatypes_registry, **kwd ):
         """Returns ( target_ext, existing converted dataset )"""
         return datatypes_registry.find_conversion_destination_for_dataset_by_extensions( dataset, accepted_formats, **kwd )
+
     def convert_dataset(self, trans, original_dataset, target_type, return_output=False, visible=True, deps=None, set_output_history=True):
         """This function adds a job to the queue to convert a dataset to another type. Returns a message about success/failure."""
         converter = trans.app.datatypes_registry.get_converter_by_target_type( original_dataset.ext, target_type )
         if converter is None:
             raise Exception( "A converter does not exist for %s to %s." % ( original_dataset.ext, target_type ) )
-        #Generate parameter dictionary
+        # Generate parameter dictionary
         params = {}
-        #determine input parameter name and add to params
+        # determine input parameter name and add to params
         input_name = 'input1'
         for key, value in converter.inputs.items():
             if deps and value.name in deps:
@@ -501,26 +519,29 @@ def convert_dataset(self, trans, original_dataset, target_type, return_output=Fa
                 input_name = key
         params[input_name] = original_dataset
-        #Run converter, job is dispatched through Queue
+        # Run converter, job is dispatched through Queue
         converted_dataset = converter.execute( trans, incoming=params, set_output_hid=visible, set_output_history=set_output_history)[1]
         if len(params) > 0:
             trans.log_event( "Converter params: %s" % (str(params)), tool_id=converter.id )
         if not visible:
-            for name, value in converted_dataset.iteritems():
+            for value in converted_dataset.itervalues():
                 value.visible = False
         if return_output:
             return converted_dataset
         return "The file conversion of %s on data %s has been added to the Queue." % (converter.name, original_dataset.hid)
-    #We need to clear associated files before we set metadata
-    #so that as soon as metadata starts to be set, e.g. implicitly converted datasets are deleted and no longer available 'while' metadata is being set, not just after
-    #We'll also clear after setting metadata, for backwards compatibility
+
+    # We need to clear associated files before we set metadata
+    # so that as soon as metadata starts to be set, e.g. implicitly converted datasets are deleted and no longer available 'while' metadata is being set, not just after
+    # We'll also clear after setting metadata, for backwards compatibility
     def after_setting_metadata( self, dataset ):
         """This function is called on the dataset after metadata is set."""
-        dataset.clear_associated_files( metadata_safe = True )
+        dataset.clear_associated_files( metadata_safe=True )
+
     def before_setting_metadata( self, dataset ):
         """This function is called on the dataset before metadata is set."""
-        dataset.clear_associated_files( metadata_safe = True )
-    def __new_composite_file( self, name, optional = False, mimetype = None, description = None, substitute_name_with_metadata = None, is_binary = False, to_posix_lines = True, space_to_tab = False, **kwds ):
+        dataset.clear_associated_files( metadata_safe=True )
+
+    def __new_composite_file( self, name, optional=False, mimetype=None, description=None, substitute_name_with_metadata=None, is_binary=False, to_posix_lines=True, space_to_tab=False, **kwds ):
         kwds[ 'name' ] = name
         kwds[ 'optional' ] = optional
         kwds[ 'mimetype' ] = mimetype
@@ -530,10 +551,12 @@ def __new_composite_file( self, name, optional = False, mimetype = None, descrip
         kwds[ 'to_posix_lines' ] = to_posix_lines
         kwds[ 'space_to_tab' ] = space_to_tab
         return Bunch( **kwds )
+
     def add_composite_file( self, name, **kwds ):
-        #self.composite_files = self.composite_files.copy()
+        # self.composite_files = self.composite_files.copy()
         self.composite_files[ name ] = self.__new_composite_file( name, **kwds )
-    def __substitute_composite_key( self, key, composite_file, dataset = None ):
+
+    def __substitute_composite_key( self, key, composite_file, dataset=None ):
         if composite_file.substitute_name_with_metadata:
             if dataset:
                 meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
@@ -541,15 +564,17 @@ def __substitute_composite_key( self, key, composite_file, dataset = None ):
                 meta_value = self.spec[composite_file.substitute_name_with_metadata].default
             return key % meta_value
         return key
+
     @property
-    def writable_files( self, dataset = None ):
+    def writable_files( self, dataset=None ):
         files = odict()
         if self.composite_type != 'auto_primary_file':
             files[ self.primary_file_name ] = self.__new_composite_file( self.primary_file_name )
-        for key, value in self.get_composite_files( dataset = dataset ).iteritems():
+        for key, value in self.get_composite_files( dataset=dataset ).iteritems():
             files[ key ] = value
         return files
-    def get_composite_files( self, dataset = None ):
+
+    def get_composite_files( self, dataset=None ):
         def substitute_composite_key( key, composite_file ):
             if composite_file.substitute_name_with_metadata:
                 if dataset:
@@ -562,8 +587,10 @@ def substitute_composite_key( key, composite_file ):
         for key, value in self.composite_files.iteritems():
             files[ substitute_composite_key( key, value ) ] = value
         return files
-    def generate_auto_primary_file( self, dataset = None ):
+
+    def generate_auto_primary_file( self, dataset=None ):
         raise Exception( "generate_auto_primary_file is not implemented for this datatype." )
+
     @property
     def has_resolution(self):
         return False
@@ -575,6 +602,7 @@ def matches_any( self, target_datatypes ):
         """
         datatype_classes = tuple( [ datatype if isclass( datatype ) else datatype.__class__ for datatype in target_datatypes ] )
         return isinstance( self, datatype_classes )
+
     def merge( split_files, output_file):
         """
         Merge files with copy.copyfileobj() will not hit the
@@ -606,7 +634,7 @@ def has_dataprovider( self, data_format ):
         """
         Returns True if `data_format` is available in `dataproviders`.
         """
-        return ( data_format in self.dataproviders )
+        return data_format in self.dataproviders
 
     def dataprovider( self, dataset, data_format, **settings ):
         """
@@ -645,7 +673,7 @@ class Text( Data ):
     file_ext = 'txt'
     line_class = 'line'
 
-    """Add metadata elements"""
+    # Add metadata elements
     MetadataElement( name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0 )
 
     def write_from_stream(self, dataset, stream):
@@ -659,31 +687,35 @@ def write_from_stream(self, dataset, stream):
             os.write(fd, chunk)
         os.close(fd)
         # rewrite the file with unix newlines
-        fp = open(dataset.file_name, 'wt')
+        fp = open(dataset.file_name, 'w')
        for line in file(temp_name, "U"):
             line = line.strip() + '\n'
             fp.write(line)
         fp.close()
+
     def set_raw_data(self, dataset, data):
         """Saves the data on the disc"""
         fd, temp_name = tempfile.mkstemp()
         os.write(fd, data)
         os.close(fd)
         # rewrite the file with unix newlines
-        fp = open(dataset.file_name, 'wt')
+        fp = open(dataset.file_name, 'w')
         for line in file(temp_name, "U"):
             line = line.strip() + '\n'
             fp.write(line)
         fp.close()
         os.remove( temp_name )
+
     def get_mime(self):
         """Returns the mime type of the datatype"""
         return 'text/plain'
+
     def set_meta( self, dataset, **kwd ):
         """
         Set the number of lines of data in dataset.
        """
         dataset.metadata.data_lines = self.count_data_lines(dataset)
+
     def estimate_file_lines( self, dataset ):
         """
         Perform a rough estimate by extrapolating number of lines from a small read.
@@ -695,6 +727,7 @@ def estimate_file_lines( self, dataset ):
         sample_lines = dataset_read.count('\n')
         est_lines = int(sample_lines * (float(dataset.get_size()) / float(sample_size)))
         return est_lines
+
     def count_data_lines(self, dataset):
         """
         Count the number of lines of data in dataset,
@@ -706,6 +739,7 @@ def count_data_lines(self, dataset):
             if line and not line.startswith( '#' ):
                 data_lines += 1
         return data_lines
+
     def set_peek( self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, skipchars=[] ):
         """
         Set the peek. This method is used by various subclasses of Text.
@@ -722,7 +756,7 @@ def set_peek( self, dataset, line_count=None, is_multi_byte=False, WIDTH=256, sk
                     # needed to set metadata
                    # This can happen when the file is larger than max_optional_metadata_filesize.
                     if int(dataset.get_size()) <= 1048576:
-                        #Small dataset, recount all lines and reset peek afterward.
+                        # Small dataset, recount all lines and reset peek afterward.
                         lc = self.count_data_lines(dataset)
                         dataset.metadata.data_lines = lc
                         dataset.blurb = "%s %s" % ( util.commaify( str(lc) ), inflector.cond_plural(lc, self.line_class) )
@@ -750,11 +784,12 @@ def split( cls, input_datasets, subdir_generator_function, split_params):
             chunk_size = None
             if split_params['split_mode'] == 'number_of_parts':
                 lines_per_file = []
+
                 # Computing the length is expensive!
                 def _file_len(fname):
                     i = 0
                     f = open(fname)
-                    for i, l in enumerate(f):
+                    for i, _ in enumerate(f):
                         pass
                     f.close()
                     return i + 1
@@ -775,7 +810,7 @@ def _file_len(fname):
             else:
                 raise Exception('Unsupported split mode %s' % split_params['split_mode'])
 
-            f = open(input_files[0], 'rt')
+            f = open(input_files[0], 'r')
             try:
                 chunk_idx = 0
                 file_done = False
@@ -801,7 +836,7 @@ def _file_len(fname):
                         lines_remaining -= 1
                 if part_file is not None:
                     part_file.close()
-            except Exception, e:
+            except Exception, e:
                 log.error('Unable to split files: %s' % str(e))
                 f.close()
                 if part_file is not None:
@@ -905,12 +940,14 @@ def get_visualizations( self, dataset ):
 # datatypes.
 nice_size = util.nice_size
 
+
 def get_test_fname( fname ):
     """Returns test data filename"""
-    path, name = os.path.split(__file__)
+    path = os.path.dirname(__file__)
     full_path = os.path.join( path, 'test', fname )
     return full_path
 
+
 def get_file_peek( file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5, skipchars=[] ):
     """
     Returns the first LINE_COUNT lines wrapped to WIDTH