'''A directory of file formats and their properties'''
def get_icon(format_):
    '''Return the icon name for a format, or None if no icon is mapped.

    The lookup is an exact, case-sensitive match on the format name. The
    lookup table is built on the first call and cached in the module-level
    global ``ICON_MAP``.
    '''
    global ICON_MAP
    # Look the global up via globals() so that the first call also works
    # when the module has not initialised ICON_MAP (to None) beforehand.
    if globals().get('ICON_MAP') is None:
        ICON_MAP = {
            # Format: icon
            # Where:
            # * format is canonical format from resource_formats.json
            # * icon is in ckanext/dgu/theme/src/images/fugue/
            'HTML': 'globe--arrow',
            'JPEG': 'image',
            'TIFF': 'image',
            'Database': 'database-sql',
            'API': 'server-cloud',
            'TXT': 'document-text',
            'PDF': 'document-pdf',
            'RTF': 'document-word',
            'Zip': 'folder-zipper',
            'DOC': 'document-word',
            'ODT': 'document-word',
            'PPT': 'document-powerpoint',
            'ODP': 'document-powerpoint',
            'XLS': 'document-excel',
            'SHP': 'globe-model',
            'CSV': 'document-invoice',
            'PSV': 'document-invoice',
            'TSV': 'document-invoice',
            'JSON': 'document-node',
            'XML': 'document-code',
            'RSS': 'feed-document',
            'ODS': 'document-excel',
            'WMS': 'globe-model',
            'KML': 'globe-model',
            'IATI': 'document-code',
            'iCalendar': 'calendar-day',
            'RDF': 'document-rdf',
            'RDFa': 'document-rdf',
            'SPARQL': 'document-rdf',
            'SPARQL web form': 'document-rdf',
            'OWL': 'document-rdf',
        }
    return ICON_MAP.get(format_)
import re
class Formats(object):
    '''Directory of known file formats with lazily built lookup tables.

    Every method is class-level. The format table and each index derived
    from it are built on first use and cached as attributes on the class.
    '''

    @classmethod
    def by_display_name(cls):
        '''Returns the formats data as a dict keyed by the display name'''
        if not hasattr(cls, '_by_display_name'):
            # Build locally and assign once, so that a failure part-way
            # through never leaves a half-filled cache behind.
            index = {}
            for format_dict in cls.get_data():
                index[format_dict['display_name']] = format_dict
            cls._by_display_name = index
        return cls._by_display_name

    @classmethod
    def by_mime_type(cls):
        '''Returns the formats data as a dict keyed by mime type

        If several formats list the same mime type, the format that comes
        last in the table wins.
        '''
        if not hasattr(cls, '_by_mime_type'):
            index = {}
            for format_dict in cls.get_data():
                for mime_type in format_dict['mime_types']:
                    index[mime_type] = format_dict
            cls._by_mime_type = index
        return cls._by_mime_type

    @classmethod
    def by_extension(cls):
        '''Returns the formats data as a dict keyed by filename extension

        Extensions are stored lower case and without the leading dot.
        '''
        if not hasattr(cls, '_by_extension'):
            index = {}
            for format_dict in cls.get_data():
                for extension in format_dict['extensions']:
                    index[extension] = format_dict
            cls._by_extension = index
        return cls._by_extension

    @classmethod
    def by_reduced_name(cls):
        '''Returns the formats data as a dict keyed by "reduced" names for
        each format. This is helpful for matching against user-inputted formats.
        e.g. "TXT / .Zip" is "txt/zip"

        Each format is indexed under the reduced form of its display name,
        of every extension and of every alternative name. If two names
        reduce to the same key, the later one wins.
        '''
        if not hasattr(cls, '_by_reduced'):
            index = {}
            for format_dict in cls.get_data():
                names = [format_dict['display_name']]
                names.extend(format_dict['extensions'])
                names.extend(format_dict['alternative_names'])
                for name in names:
                    index[cls.reduce(name)] = format_dict
            cls._by_reduced = index
        return cls._by_reduced

    @staticmethod
    def reduce(format_name):
        '''Returns the canonical "reduced" form of a format name.

        The name is stripped of surrounding whitespace, lower-cased and
        relieved of one leading dot. Every character other than a-z, "/"
        and "+" is then removed (digits included).
        '''
        format_name = format_name.strip().lower()
        if format_name.startswith('.'):
            format_name = format_name[1:]
        return re.sub('[^a-z/+]', '', format_name)

    @classmethod
    def match(cls, raw_resource_format):
        '''Given a format that may be badly formatted, try and match it to
        a known format and return that.

        If no match is found, returns None. None or an empty string is
        treated as "no match" rather than raising.
        '''
        if not raw_resource_format:
            return None
        # Try exact match
        exact = cls.by_display_name().get(raw_resource_format)
        if exact is not None:
            return exact
        # Try canonised match
        return cls.by_reduced_name().get(cls.reduce(raw_resource_format))

    @classmethod
    def get_data(cls):
        '''Returns the list of data formats, each one as a dict

        e.g. [{'display_name': 'TXT', 'extensions': ('txt',), 'extension': 'txt',
               'mime_types': ('text/plain',), 'openness': 1}, ...]

        Each dict also carries 'alternative_names' and 'icon'. 'extension'
        is the first entry of 'extensions', or '' when there are none.
        '''
        if not hasattr(cls, '_data'):
            # store the data here so it only loads when first used, rather
            # than on module load
            # NOTE(review): several entries below list a bare 'application/'
            # mime type, which looks truncated - confirm the intended values.
            data_flat = (
                # Display name, alternative names, extensions (lower case), mime-types, openness, icon-name
                ('HTML', ('web page', 'website'), ('html', 'htm', 'asp', 'php'), ('text/html',), 0, 'globe--arrow'),
                ('JPEG', (), ('jpg','jpeg'), ('image/jpg',), 0, 'image'),
                ('TIFF', (), ('tifflzw','tiff'), ('image/tiff',), 0, 'image'),
                ('Database', ('database','sql'), (), (), 0, 'database-sql'),
                ('API', ('api',), (), (), 0, 'server-cloud'),
                ('TXT', (), ('txt',), ('text/plain',), 1, 'document-text'),
                ('PDF', (), ('pdf',), ('application/pdf',), 1, 'document-pdf'),
                ('RTF', (), ('rtf',), ('application/rtf',), 1, 'document-word'),
                ('Zip', (), ('zip',), ('application/x-zip', 'application/x-compressed', 'application/x-zip-compressed', 'application/zip', 'multipart/x-zip', 'application/x-gzip'), 1, 'folder-zipper'),
                ('Torrent', (), ('torrent',), ('application/x-bittorrent',), 1, ''),
                ('DOC', ('word',), ('doc', 'docx', 'mcw'), ('application/msword', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/'), 1, 'document-word'),
                ('ODT', (), ('odt',), ('application/vnd.oasis.opendocument.text', 'application/x-vnd.oasis.opendocument.text'), 1, 'document-word'),
                ('PPT', ('powerpoint',), ('ppt', 'pptx', 'ppz'), ('application/mspowerpoint', 'application/', 'application/'), 1, 'document-powerpoint'),
                ('ODP', (), ('odp',), ('application/vnd.oasis.opendocument.presentation', 'application/x-vnd.oasis.opendocument.presentation'), 1, 'document-powerpoint'),
                ('XLS', ('excel',), ('xls', 'xlsx', 'xlb'), ('application/excel', 'application/x-excel', 'application/x-msexcel', 'application/', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/', 'application/', 'application/vnd.msexcel'), 2, 'document-excel'),
                ('SHP', ('shapefile', 'esri shapefile',), ('shp',), (), 2, 'globe-model'),
                ('CSV', ('csvfile',), ('csv',), ('text/csv','text/comma-separated-values'), 3, 'document-invoice'),
                ('PSV', (), ('psv',), ('text/psv','text/pipe-separated-values'), 3, 'document-invoice'),
                ('JSON', (), ('json',), ('application/json', 'text/x-json'), 3, 'document-node'),
                ('XML', (), ('xml',), ('text/xml','application/xml'), 3, 'document-code'),
                ('RSS', (), ('rss',), ('text/rss+xml',), 3, 'feed-document'),
                ('ODS', (), ('ods',), ('application/vnd.oasis.opendocument.spreadsheet',), 3, 'document-excel'),
                ('WMS', (), ('wms',), ('application/vnd.ogc.wms_xml',), 3, 'globe-model'),
                ('KML', (), ('kml',), ('application/',), 3, 'globe-model'),
                ('NetCDF', (), ('cdf', 'netcdf'), ('application/x-netcdf',), 3, ''),
                ('IATI', (), ('iati',), ('application/x-iati+xml',), 3, 'document-code'),
                ('iCalendar', ('iCal', 'ICS'), ('ics', 'ical'), ('text/calendar',), 3, 'calendar-day'),
                ('RDF', ('rdf/xml','Turtle','N-Triples'), ('rdf','ttl','nt'), ('application/rdf+xml','text/turtle'), 5, 'document-rdf'),
                ('RDFa', ('html+rdfa',), (), (), 5, 'document-rdf'),
                ('SPARQL', (), (), ('application/sparql-results+xml', 'application/sparql-query'), 5, 'document-rdf'),
                ('SPARQL web form', (), (), (), 5, 'document-rdf'),
            )
            field_names = ('display_name', 'alternative_names', 'extensions',
                           'mime_types', 'openness', 'icon')
            formats = []
            for line in data_flat:
                format_dict = dict(zip(field_names, line))
                extensions = format_dict['extensions']
                # Convenience key: the primary (first listed) extension.
                format_dict['extension'] = extensions[0] if extensions else ''
                formats.append(format_dict)
            cls._data = formats
        return cls._data
# Mime types that say little or nothing about the actual format of the data
VAGUE_MIME_TYPES = set(['application/octet-stream'])