Merge pull request #1350 from ckan/1350-resource-format-change

[#1350] Resource format not guessed automatically
ckan · Mar 26, 2014 · 7587c6e · 7587c6e
2 parents aebc52c + e18a24d
commit 7587c6e
Show file tree

Hide file tree

Showing 14 changed files with 229 additions and 48 deletions.
diff --git a/ckan/config/resource_formats.json b/ckan/config/resource_formats.json
@@ -0,0 +1,72 @@
+[
+  ["_comment",
+   "JSON field order as follows:",
+   ["Format", "Description", "Mimetype", ["List of alternative representations"]],
+   "where:",
+   " * Format - the short name for it, usually the file extension, because it will be displayed in many places, such as in the search results.",
+   " * Description - the name, human-friendly, to be displayed on the resource page. ",
+   " * Mimetype - canonical mimetype for the format. It must be unique to this resource format. It should be listed here: https://www.iana.org/assignments/media-types/media-types.xhtml or here: http://hg.python.org/cpython/file/2.7/Lib/mimetypes.py#l403",
+   " * List of alternative representations - these are other names that the user might type when they mean this format, or alternative mime-types or any other identifier. (They must be unique to this resource format.)"
+  ],
+  ["PPTX", "Powerpoint OOXML Presentation", "application/vnd.openxmlformats-officedocument.presentationml.presentation", []],
+  ["EXE", "Windows Executable Program", "application/x-msdownload", []],
+  ["DOC", "Word Document", "application/msword", ["application/ms-word"]],
+  ["KML", "KML File", "application/vnd.google-earth.kml+xml", []],
+  ["XLS", "Excel Document", "application/vnd.ms-excel", []],
+  ["WCS", "Web Coverage Service", "wcs", []],
+  ["JS", "JavaScript", "application/x-javascript", []],
+  ["MDB", "Access Database", "application/x-msaccess", []],
+  ["NetCDF", "NetCDF File", "application/netcdf", []],
+  ["ArcGIS Map Service", "ArcGIS Map Service", "ArcGIS Map Service", ["arcgis map service"]],
+  ["TSV", "Tab Separated Values File", "text/tab-separated-values", ["text/tsv"]],
+  ["WFS", "Web Feature Service", null, []],
+  ["ArcGIS Online Map", "ArcGIS Online Map", "ArcGIS Online Map", ["web map application"]],
+  ["Perl", "Perl Script", "text/x-perl", []],
+  ["KMZ", "KMZ File", "application/vnd.google-earth.kmz+xml", ["application/vnd.google-earth.kmz"]],
+  ["OWL", "Web Ontology Language", "application/owl+xml", []],
+  ["N3", "N3 Triples", "application/x-n3", []],
+  ["ZIP", "Zip File", "application/zip", ["zip", "http://purl.org/NET/mediatypes/application/zip"]],
+  ["GZ", "Gzip File", "application/gzip", ["application/x-gzip"]],
+  ["QGIS", "QGIS File", "application/x-qgis", []],
+  ["ODS", "OpenDocument Spreadsheet", "application/vnd.oasis.opendocument.spreadsheet", []],
+  ["ODT", "OpenDocument Text", "application/vnd.oasis.opendocument.text", []],
+  ["JSON", "JavaScript Object Notation", "application/json", []],
+  ["BMP", "Bitmap Image File", "image/x-ms-bmp", []],
+  ["HTML", "Web Page", "text/html", ["htm", "http://purl.org/net/mediatypes/text/html"]],
+  ["RAR", "RAR Compressed File", "application/rar", []],
+  ["TIFF", "TIFF Image File", "image/tiff", []],
+  ["ODB", "OpenDocument Database", "application/vnd.oasis.opendocument.database", []],
+  ["TXT", "Text File", "text/plain", []],
+  ["DCR", "Adobe Shockwave format", "application/x-director", []],
+  ["ODF", "OpenDocument Math Formula", "application/vnd.oasis.opendocument.formula", []],
+  ["ODG", "OpenDocument Image", "application/vnd.oasis.opendocument.graphics", []],
+  ["XML", "XML File", "application/xml", ["text/xml", "http://purl.org/net/mediatypes/application/xml"]],
+  ["XLSX", "Excel OOXML Spreadsheet", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", []],
+  ["DOCX", "Word OOXML Document", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", []],
+  ["BIN", "Binary Data", "application/octet-stream", ["bin"]],
+  ["XSLT", "Extensible Stylesheet Language Transformations", "application/xslt+xml", []],
+  ["WMS", "Web Mapping Service", "WMS", ["wms"]],
+  ["SVG", "SVG vector image", "image/svg+xml", ["svg"]],
+  ["PPT", "Powerpoint Presentation", "application/vnd.ms-powerpoint", []],
+  ["JPEG", "JPG Image File", "image/jpeg", ["jpeg", "jpg"]],
+  ["SPARQL", "SPARQL end-point", "application/sparql-results+xml", []],
+  ["GIF", "GIF Image File", "image/gif", []],
+  ["RDF", "RDF", "application/rdf+xml", ["rdf/xml"]],
+  ["E00", "ARC/INFO interchange file format", "application/x-e00", []],
+  ["PDF", "PDF File", "application/pdf", []],
+  ["CSV", "Comma Separated Values File", "text/csv", ["text/comma-separated-values"]],
+  ["ODC", "OpenDocument Chart", "application/vnd.oasis.opendocument.chart", []],
+  ["Atom Feed", "Atom Feed", "application/atom+xml", []],
+  ["MrSID", "MrSID", "image/x-mrsid", []],
+  ["ArcGIS Map Preview", "ArcGIS Map Preview", "ArcGIS Map Preview", ["arcgis map preview"]],
+  ["XYZ", "XYZ Chemical File", "chemical/x-xyz", []],
+  ["MOP", "MOPAC Input format", "chemical/x-mopac-input", []],
+  ["Esri REST", "Esri Rest API Endpoint", "Esri REST", ["arcgis_rest"]],
+  ["dBase", "dBase Database", "application/x-dbf", ["dbf"]],
+  ["MXD", "ESRI ArcGIS project file", "application/x-mxd", []],
+  ["TAR", "TAR Compressed File", "application/x-tar", []],
+  ["PNG", "PNG Image File", "image/png", []],
+  ["RSS", "RSS feed", "application/rss+xml", []],
+  ["GeoJSON", "Geographic JavaScript Object Notation", null, []],
+  ["SHP", "Shapefile", null, ["esri shapefile"]]
+]
diff --git a/ckan/lib/create_test_data.py b/ckan/lib/create_test_data.py
@@ -432,7 +432,7 @@ def create(cls, auth_profile="", package_type=None):
             )
         pr2 = model.Resource(
             url=u'http://www.annakarenina.com/index.json',
-            format=u'json',
+            format=u'JSON',
             description=u'Index of the novel',
             hash=u'def456',
             extras={'size_extra': u'345'},

diff --git a/ckan/lib/dictization/model_dictize.py b/ckan/lib/dictization/model_dictize.py
@@ -118,30 +118,6 @@ def extras_list_dictize(extras_list, context):
 
     return sorted(result_list, key=lambda x: x["key"])
 
-def _unified_resource_format(format_):
-    ''' Convert resource formats into a more uniform set.
-    eg .json, json, JSON, text/json all converted to JSON.'''
-
-    format_clean = format_.lower().split('/')[-1].replace('.', '')
-    formats = {
-        'csv' : 'CSV',
-        'zip' : 'ZIP',
-        'pdf' : 'PDF',
-        'xls' : 'XLS',
-        'json' : 'JSON',
-        'kml' : 'KML',
-        'xml' : 'XML',
-        'shape' : 'SHAPE',
-        'rdf' : 'RDF',
-        'txt' : 'TXT',
-        'text' : 'TEXT',
-        'html' : 'HTML',
-    }
-    if format_clean in formats:
-        format_new = formats[format_clean]
-    else:
-        format_new = format_.lower()
-    return format_new
 
 def resource_dictize(res, context):
     model = context['model']
@@ -150,7 +126,6 @@ def resource_dictize(res, context):
     extras = resource.pop("extras", None)
     if extras:
         resource.update(extras)
-    resource['format'] = _unified_resource_format(res.format)
     # some urls do not have the protocol this adds http:// to these
     url = resource['url']
     ## for_edit is only called at the times when the dataset is to be edited

diff --git a/ckan/lib/dictization/model_save.py b/ckan/lib/dictization/model_save.py
@@ -38,12 +38,6 @@ def resource_dict_save(res_dict, context):
                     continue
             if key == 'url' and not new and obj.url <> value:
                 obj.url_changed = True
-            # this is an internal field so ignore
-            # FIXME This helps get the tests to pass but is a hack and should
-            # be fixed properly. basically don't update the format if not needed
-            if (key == 'format' and (value == obj.format
-                    or value == d.model_dictize._unified_resource_format(obj.format))):
-                continue
             setattr(obj, key, value)
         else:
             # resources save extras directly onto the object, instead

diff --git a/ckan/lib/helpers.py b/ckan/lib/helpers.py
@@ -9,6 +9,7 @@
 import datetime
 import logging
 import re
+import os
 import urllib
 import pprint
 import copy
@@ -1754,6 +1755,57 @@ def get_site_statistics():
 
     return stats
 
+# Cache for resource_formats(); filled on the first successful load.
+_RESOURCE_FORMATS = None
+
+def resource_formats():
+    ''' Returns the resource formats as a dict, sourced from the resource
+    format JSON file.
+
+    key:  potential user input value, lowercased (the mimetype, the format
+          name, the description or any of the alternative representations)
+    value:  [canonical mimetype, canonical format, human readable form],
+            with the same casing as in the JSON file
+
+    The file is taken from the `ckan.resource_formats` config option or,
+    if that is not set, from `resource_formats.json` in the directory of
+    the `ckan.config` package. Fuller description of the fields are
+    described in `ckan/config/resource_formats.json`.
+
+    :raises ValueError: if the file is not valid JSON, or if the same
+        identifier is used by two different formats
+    '''
+    global _RESOURCE_FORMATS
+    if _RESOURCE_FORMATS:
+        return _RESOURCE_FORMATS
+
+    format_file_path = config.get('ckan.resource_formats')
+    if not format_file_path:
+        format_file_path = os.path.join(
+            os.path.dirname(os.path.realpath(ckan.config.__file__)),
+            'resource_formats.json'
+        )
+    with open(format_file_path) as format_file:
+        try:
+            file_resource_formats = json.loads(format_file.read())
+        except ValueError as e:  # includes simplejson.decoder.JSONDecodeError
+            raise ValueError('Invalid JSON syntax in %s: %s' %
+                             (format_file_path, e))
+
+    # Build into a local dict and only publish it once the whole file has
+    # been processed, so that an error part-way through cannot leave a
+    # half-populated cache behind for later callers.
+    formats = {}
+    for format_line in file_resource_formats:
+        if format_line[0] == '_comment':
+            continue
+        line = [format_line[2], format_line[0], format_line[1]]
+        alternatives = format_line[3] if len(format_line) == 4 else []
+        for item in line + alternatives:
+            if not item:
+                # e.g. a null mimetype
+                continue
+            item = item.lower()
+            if item in formats and formats[item] != line:
+                raise ValueError('Duplicate resource format '
+                                 'identifier in %s: %s' %
+                                 (format_file_path, item))
+            formats[item] = line
+
+    _RESOURCE_FORMATS = formats
+    return _RESOURCE_FORMATS
+
+
+def unified_resource_format(format):
+    ''' Convert a resource format into its canonical format name.
+
+    The lookup in resource_formats() is case-insensitive. A value that is
+    not a known identifier is returned unchanged.
+    '''
+    known_formats = resource_formats()
+    lookup_key = format.lower()
+    if lookup_key not in known_formats:
+        return format
+    # index 1 of each entry is the canonical format name
+    return known_formats[lookup_key][1]
+
 def check_config_permission(permission):
     return new_authz.check_config_permission(permission)
 

diff --git a/ckan/logic/schema.py b/ckan/logic/schema.py
@@ -51,6 +51,8 @@
                                    url_validator,
                                    datasets_with_no_organization_cannot_be_private,
                                    list_of_strings,
+                                   if_empty_guess_format,
+                                   clean_format,
                                    no_loops_in_hierarchy,
                                    )
 from ckan.logic.converters import (convert_user_name_or_id_to_id,
@@ -71,7 +73,7 @@ def default_resource_schema():
         'package_id': [ignore],
         'url': [not_empty, unicode],#, URL(add_http=False)],
         'description': [ignore_missing, unicode],
-        'format': [ignore_missing, unicode],
+        'format': [if_empty_guess_format, ignore_missing, clean_format, unicode],
         'hash': [ignore_missing, unicode],
         'state': [ignore],
         'position': [ignore],
@@ -169,6 +171,8 @@ def default_create_package_schema():
 def default_update_package_schema():
     schema = default_create_package_schema()
 
+    schema['resources'] = default_update_resource_schema()
+
     # Users can (optionally) supply the package id when updating a package, but
     # only to identify the package to be updated, they cannot change the id.
     schema['id'] = [ignore_missing, package_id_not_changed]
@@ -198,6 +202,7 @@ def default_show_package_schema():
     # Add several keys to the 'resources' subschema so they don't get stripped
     # from the resource dicts by validation.
     schema['resources'].update({
+        'format': [ignore_missing, clean_format, unicode],
         'created': [ckan.lib.navl.validators.ignore_missing],
         'position': [not_empty],
         'last_modified': [ckan.lib.navl.validators.ignore_missing],

diff --git a/ckan/logic/validators.py b/ckan/logic/validators.py
@@ -1,6 +1,7 @@
 import datetime
 from itertools import count
 import re
+import mimetypes
 
 import ckan.lib.navl.dictization_functions as df
 import ckan.logic as logic
@@ -675,7 +676,6 @@ def url_validator(key, data, errors, context):
     errors[key].append(_('Please provide a valid URL'))
 
 
-
 def user_name_exists(user_name, context):
     model = context['model']
     session = context['session']
@@ -727,6 +727,20 @@ def list_of_strings(key, data, errors, context):
         if not isinstance(x, basestring):
             raise Invalid('%s: %s' % (_('Not a string'), x))
 
+def if_empty_guess_format(key, data, errors, context):
+    '''Guess the format of a new resource from its URL if none was given.
+
+    Only acts when the value at ``data[key]`` is empty or ``Missing`` and
+    the resource has no ``id`` (a resource with an id is being updated, so
+    its format is left as it is). The guessed mimetype is written to
+    ``data[key]``; if nothing can be guessed ``data`` is not changed.
+    '''
+    value = data[key]
+    resource_id = data.get(key[:-1] + ('id',))
+
+    # if resource_id then an update
+    if (not value or value is Missing) and not resource_id:
+        url = data.get(key[:-1] + ('url',), '')
+        if not url:
+            # Nothing to guess from. This also keeps a url of None away
+            # from mimetypes.guess_type(), which only accepts strings.
+            return
+        mimetype, encoding = mimetypes.guess_type(url)
+        if mimetype:
+            data[key] = mimetype
+
+def clean_format(format):
+    '''Return the format as converted by h.unified_resource_format().'''
+    unified = h.unified_resource_format(format)
+    return unified
+
 def no_loops_in_hierarchy(key, data, errors, context):
     '''Checks that the parent groups specified in the data would not cause
     a loop in the group hierarchy, and therefore cause the recursion up/down

diff --git a/ckan/model/resource.py b/ckan/model/resource.py
@@ -119,9 +119,6 @@ def as_dict(self, core_columns_only=False):
             _dict[k] = v
         if self.resource_group and not core_columns_only:
             _dict["package_id"] = self.resource_group.package_id
-        # FIXME format unification needs doing better
-        import ckan.lib.dictization.model_dictize as model_dictize
-        _dict[u'format'] = model_dictize._unified_resource_format(self.format)
         return _dict
 
     def get_package_id(self):

diff --git a/ckan/new_tests/logic/test_validators.py b/ckan/new_tests/logic/test_validators.py
@@ -318,6 +318,60 @@ def call_validator(*args, **kwargs):
     # TODO: Test user_name_validator()'s behavior when there's a 'user_obj' in
     # the context dict.
 
+    def test_if_empty_guess_format(self):
+
+        import ckan.logic.validators as validators
+        import ckan.lib.navl.dictization_functions as dictization_functions
+
+        # Each resource is paired with the format expected after validation.
+        cases = [
+            # empty format, no id: guessed from the url
+            ({'url': 'http://fakedomain/my.csv', 'format': ''},
+             'text/csv'),
+            # Missing format, no id: guessed from the url
+            ({'url': 'http://fakedomain/my.pdf',
+              'format': dictization_functions.Missing},
+             'application/pdf'),
+            # format supplied: left alone
+            ({'url': 'http://fakedomain/my.pdf', 'format': 'pdf'},
+             'pdf'),
+            # resource has an id (an update): left alone
+            ({'url': 'http://fakedomain/my.pdf',
+              'id': 'fake_resource_id', 'format': ''},
+             ''),
+        ]
+        data = dictization_functions.flatten_dict({
+            'name': 'package_name',
+            'resources': [resource for resource, _expected in cases],
+        })
+
+        @t.does_not_modify_errors_dict
+        def call_validator(*args, **kwargs):
+            return validators.if_empty_guess_format(*args, **kwargs)
+
+        for index, (_resource, expected) in enumerate(cases):
+            format_key = ('resources', index, 'format')
+            new_data = copy.deepcopy(data)
+            call_validator(key=format_key, data=new_data,
+                           errors={}, context={})
+            assert new_data[format_key] == expected
+
+    def test_clean_format(self):
+        import ckan.logic.validators as validators
+
+        # (input, expected): known identifiers become the canonical format
+        # name, anything else comes back unchanged.
+        cases = [
+            ('csv', 'CSV'),
+            ('text/csv', 'CSV'),
+            ('not a format', 'not a format'),
+            ('', ''),
+        ]
+        for raw, expected in cases:
+            assert validators.clean_format(raw) == expected
+
     def test_datasets_with_org_can_be_private_when_creating(self):
 
         import ckan.logic.validators as validators

diff --git a/ckan/templates/package/snippets/resource_form.html b/ckan/templates/package/snippets/resource_form.html
@@ -35,6 +35,10 @@
     {% block basic_fields_format %}
       {% set format_attrs = {'data-module': 'autocomplete', 'data-module-source': '/api/2/util/resource/format_autocomplete?incomplete=?'} %}
       {% call form.input('format', id='field-format', label=_('Format'), placeholder=_('eg. CSV, XML or JSON'), value=data.format, error=errors.format, classes=['control-medium'], attrs=format_attrs) %}
+        <span class="info-block info-block-small">
+          <i class="icon-info-sign"></i>
+          {{ _('This will be guessed automatically. Leave blank if you wish') }}
+        </span>
       {% endcall %}
     {% endblock %}
 

diff --git a/ckan/tests/functional/api/base.py b/ckan/tests/functional/api/base.py
@@ -326,14 +326,14 @@ class BaseModelApiTestCase(ApiTestCase, ControllerTestCase):
         'url': u'http://blahblahblah.mydomain',
         'resources': [{
             u'url':u'http://blah.com/file.xml',
-            u'format':u'xml',
+            u'format':u'XML',
             u'description':u'Main file',
             u'hash':u'abc123',
             u'alt_url':u'alt_url',
             u'size_extra':u'200',
         }, {
             u'url':u'http://blah.com/file2.xml',
-            u'format':u'xml',
+            u'format':u'XML',
             u'description':u'Second file',
             u'hash':u'def123',
             u'alt_url':u'alt_url',