Skip to content

Commit

Permalink
Merge pull request #1350 from ckan/1350-resource-format-change
Browse files Browse the repository at this point in the history
[#1350] Resource format not guessed automatically
  • Loading branch information
David Read committed Mar 26, 2014
2 parents aebc52c + e18a24d commit 7587c6e
Show file tree
Hide file tree
Showing 14 changed files with 229 additions and 48 deletions.
72 changes: 72 additions & 0 deletions ckan/config/resource_formats.json
@@ -0,0 +1,72 @@
[
["_comment",
"JSON field order as follows:",
["Format", "Description", "Mimetype", ["List of alternative representations"]],
"where:",
" * Format - the short name for it, usually the file extension, because it will be displayed in many places, such as in the search results.",
" * Description - the name, human-friendly, to be displayed on the resource page. ",
" * Mimetype - canonical mimetype for the format. It must be unique to this resource format. It should be listed here: https://www.iana.org/assignments/media-types/media-types.xhtml or here: http://hg.python.org/cpython/file/2.7/Lib/mimetypes.py#l403",
" * List of alternative representations - these are other names that the user might type when they mean this format, or alternative mime-types or any other identifier. (They must be unique to this resource format.)"
],
["PPTX", "Powerpoint OOXML Presentation", "application/vnd.openxmlformats-officedocument.presentationml.presentation", []],
["EXE", "Windows Executable Program", "application/x-msdownload", []],
["DOC", "Word Document", "application/ms-word", []],
["KML", "KML File", "application/vnd.google-earth.kml+xml", []],
["XLS", "Excel Document", "application/vnd.ms-excel", []],
["WCS", "Web Coverage Service", "wcs", []],
["JS", "JavaScript", "application/x-javascript", []],
["MDB", "Access Database", "application/x-msaccess", []],
["NetCDF", "NetCDF File", "application/netcdf", []],
["ArcGIS Map Service", "ArcGIS Map Service", "ArcGIS Map Service", ["arcgis map service"]],
["TSV", "Tab Separated Values File", "text/tsv", []],
["WFS", "Web Feature Service", null, []],
["ArcGIS Online Map", "ArcGIS Online Map", "ArcGIS Online Map", ["web map application"]],
["Perl", "Perl Script", "text/x-perl", []],
["KMZ", "KMZ File", "application/vnd.google-earth.kmz+xml", ["application/vnd.google-earth.kmz"]],
["OWL", "Web Ontology Language", "application/owl+xml", []],
["N3", "N3 Triples", "application/x-n3", []],
["ZIP", "Zip File", "application/zip", ["zip", "http://purl.org/NET/mediatypes/application/zip"]],
["GZ", "Gzip File", "application/gzip", ["application/x-gzip"]],
["QGIS", "QGIS File", "application/x-qgis", []],
["ODS", "OpenDocument Spreadsheet", "application/vnd.oasis.opendocument.spreadsheet", []],
["ODT", "OpenDocument Text", "application/vnd.oasis.opendocument.text", []],
["JSON", "JavaScript Object Notation", "application/json", []],
["BMP", "Bitmap Image File", "image/x-ms-bmp", []],
["HTML", "Web Page", "text/html", ["htm", "http://purl.org/net/mediatypes/text/html"]],
["RAR", "RAR Compressed File", "application/rar", []],
["TIFF", "TIFF Image File", "image/tiff", []],
["ODB", "OpenDocument Database", "application/vnd.oasis.opendocument.database", []],
["TXT", "Text File", "text/plain", []],
["DCR", "Adobe Shockwave format", "application/x-director", []],
["ODF", "OpenDocument Math Formula", "application/vnd.oasis.opendocument.formula", []],
["ODG", "OpenDocument Image", "application/vnd.oasis.opendocument.graphics", []],
["XML", "XML File", "application/xml", ["text/xml", "http://purl.org/net/mediatypes/application/xml"]],
["XLSX", "Excel OOXML Spreadsheet", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", []],
["DOCX", "Word OOXML Document", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", []],
["BIN", "Binary Data", "application/octet-stream", ["bin"]],
["XSLT", "Extensible Stylesheet Language Transformations", "application/xslt+xml", []],
["WMS", "Web Mapping Service", "WMS", ["wms"]],
["SVG", "SVG vector image", "image/svg+xml", ["svg"]],
["PPT", "Powerpoint Presentation", "application/vnd.ms-powerpoint", []],
["JPEG", "JPG Image File", "image/jpeg", ["jpeg", "jpg"]],
["SPARQL", "SPARQL end-point", "application/sparql-results+xml", []],
["GIF", "GIF Image File", "image/gif", []],
["RDF", "RDF", "application/rdf+xml", ["rdf/xml"]],
["E00", "ARC/INFO interchange file format", "application/x-e00", []],
["PDF", "PDF File", "application/pdf", []],
["CSV", "Comma Separated Values File", "text/csv", ["text/comma-separated-values"]],
["ODC", "OpenDocument Chart", "application/vnd.oasis.opendocument.chart", []],
["Atom Feed", "Atom Feed", "application/atom+xml", []],
["MrSID", "MrSID", "image/x-mrsid", []],
["ArcGIS Map Preview", "ArcGIS Map Preview", "ArcGIS Map Preview", ["arcgis map preview"]],
["XYZ", "XYZ Chemical File", "chemical/x-xyz", []],
["MOP", "MOPAC Input format", "chemical/x-mopac-input", []],
["Esri REST", "Esri Rest API Endpoint", "Esri REST", ["arcgis_rest"]],
["dBase", "dBase Database", "application/x-dbf", ["dbf"]],
["MXD", "ESRI ArcGIS project file", "application/x-mxd", []],
["TAR", "TAR Compressed File", "application/x-tar", []],
["PNG", "PNG Image File", "image/png", []],
["RSS", "RSS feed", "application/rss+xml", []],
["GeoJSON", "Geographic JavaScript Object Notation", null, []],
["SHP", "Shapefile", null, ["esri shapefile"]]
]
2 changes: 1 addition & 1 deletion ckan/lib/create_test_data.py
Expand Up @@ -432,7 +432,7 @@ def create(cls, auth_profile="", package_type=None):
)
pr2 = model.Resource(
url=u'http://www.annakarenina.com/index.json',
format=u'json',
format=u'JSON',
description=u'Index of the novel',
hash=u'def456',
extras={'size_extra': u'345'},
Expand Down
25 changes: 0 additions & 25 deletions ckan/lib/dictization/model_dictize.py
Expand Up @@ -118,30 +118,6 @@ def extras_list_dictize(extras_list, context):

return sorted(result_list, key=lambda x: x["key"])

def _unified_resource_format(format_):
''' Convert resource formats into a more uniform set.
eg .json, json, JSON, text/json all converted to JSON.'''

format_clean = format_.lower().split('/')[-1].replace('.', '')
formats = {
'csv' : 'CSV',
'zip' : 'ZIP',
'pdf' : 'PDF',
'xls' : 'XLS',
'json' : 'JSON',
'kml' : 'KML',
'xml' : 'XML',
'shape' : 'SHAPE',
'rdf' : 'RDF',
'txt' : 'TXT',
'text' : 'TEXT',
'html' : 'HTML',
}
if format_clean in formats:
format_new = formats[format_clean]
else:
format_new = format_.lower()
return format_new

def resource_dictize(res, context):
model = context['model']
Expand All @@ -150,7 +126,6 @@ def resource_dictize(res, context):
extras = resource.pop("extras", None)
if extras:
resource.update(extras)
resource['format'] = _unified_resource_format(res.format)
# some urls do not have the protocol this adds http:// to these
url = resource['url']
## for_edit is only called at the times when the dataset is to be edited
Expand Down
6 changes: 0 additions & 6 deletions ckan/lib/dictization/model_save.py
Expand Up @@ -38,12 +38,6 @@ def resource_dict_save(res_dict, context):
continue
if key == 'url' and not new and obj.url <> value:
obj.url_changed = True
# this is an internal field so ignore
# FIXME This helps get the tests to pass but is a hack and should
# be fixed properly. basically don't update the format if not needed
if (key == 'format' and (value == obj.format
or value == d.model_dictize._unified_resource_format(obj.format))):
continue
setattr(obj, key, value)
else:
# resources save extras directly onto the object, instead
Expand Down
52 changes: 52 additions & 0 deletions ckan/lib/helpers.py
Expand Up @@ -9,6 +9,7 @@
import datetime
import logging
import re
import os
import urllib
import pprint
import copy
Expand Down Expand Up @@ -1754,6 +1755,57 @@ def get_site_statistics():

return stats

_RESOURCE_FORMATS = None


def resource_formats():
    '''Return the resource formats as a dict, sourced from the resource
    format JSON file.

    The file path is taken from the ``ckan.resource_formats`` config option;
    when that is not set, ``resource_formats.json`` next to the
    ``ckan.config`` package is used. The parsed table is cached in the
    module-level ``_RESOURCE_FORMATS`` after the first successful load.

    :returns: dict whose keys are the lowercased identifiers a user might
        supply (mimetype, format, description and every alternative
        representation) and whose values are the list
        ``[mimetype, format, description]`` exactly as written in the file.
        The fields are described more fully in
        ``ckan/config/resource_formats.json``.
    :raises ValueError: if the file is not valid JSON, or if one identifier
        is claimed by two different formats.
    '''
    global _RESOURCE_FORMATS
    if _RESOURCE_FORMATS:
        return _RESOURCE_FORMATS

    format_file_path = config.get('ckan.resource_formats')
    if not format_file_path:
        format_file_path = os.path.join(
            os.path.dirname(os.path.realpath(ckan.config.__file__)),
            'resource_formats.json'
        )
    with open(format_file_path) as format_file:
        try:
            file_resource_formats = json.loads(format_file.read())
        except ValueError as e:  # includes simplejson.decoder.JSONDecodeError
            raise ValueError('Invalid JSON syntax in %s: %s' %
                             (format_file_path, e))

    # Fill a local dict and publish it only once it is complete, so that a
    # failure part-way through cannot leave a partial table in the cache
    # for later calls to return.
    formats = {}
    for format_line in file_resource_formats:
        if format_line[0] == '_comment':
            continue
        line = [format_line[2], format_line[0], format_line[1]]
        alternatives = format_line[3] if len(format_line) == 4 else []
        for item in line + alternatives:
            if not item:
                # e.g. a null mimetype - nothing to index
                continue
            item = item.lower()
            if item in formats and formats[item] != line:
                raise ValueError('Duplicate resource format '
                                 'identifier in %s: %s' %
                                 (format_file_path, item))
            formats[item] = line

    _RESOURCE_FORMATS = formats
    return _RESOURCE_FORMATS


def unified_resource_format(format):
    '''Return the canonical format name for `format`.

    `format` is lowercased and looked up in the table returned by
    resource_formats(). On a match the second element of the matching entry
    is returned; otherwise `format` is returned exactly as given.
    '''
    lookup = resource_formats()
    key = format.lower()
    return lookup[key][1] if key in lookup else format

def check_config_permission(permission):
    '''Return the result of new_authz.check_config_permission() for
    `permission`.'''
    return new_authz.check_config_permission(permission)

Expand Down
7 changes: 6 additions & 1 deletion ckan/logic/schema.py
Expand Up @@ -51,6 +51,8 @@
url_validator,
datasets_with_no_organization_cannot_be_private,
list_of_strings,
if_empty_guess_format,
clean_format,
no_loops_in_hierarchy,
)
from ckan.logic.converters import (convert_user_name_or_id_to_id,
Expand All @@ -71,7 +73,7 @@ def default_resource_schema():
'package_id': [ignore],
'url': [not_empty, unicode],#, URL(add_http=False)],
'description': [ignore_missing, unicode],
'format': [ignore_missing, unicode],
'format': [if_empty_guess_format, ignore_missing, clean_format, unicode],
'hash': [ignore_missing, unicode],
'state': [ignore],
'position': [ignore],
Expand Down Expand Up @@ -169,6 +171,8 @@ def default_create_package_schema():
def default_update_package_schema():
schema = default_create_package_schema()

schema['resources'] = default_update_resource_schema()

# Users can (optionally) supply the package id when updating a package, but
# only to identify the package to be updated, they cannot change the id.
schema['id'] = [ignore_missing, package_id_not_changed]
Expand Down Expand Up @@ -198,6 +202,7 @@ def default_show_package_schema():
# Add several keys to the 'resources' subschema so they don't get stripped
# from the resource dicts by validation.
schema['resources'].update({
'format': [ignore_missing, clean_format, unicode],
'created': [ckan.lib.navl.validators.ignore_missing],
'position': [not_empty],
'last_modified': [ckan.lib.navl.validators.ignore_missing],
Expand Down
16 changes: 15 additions & 1 deletion ckan/logic/validators.py
@@ -1,6 +1,7 @@
import datetime
from itertools import count
import re
import mimetypes

import ckan.lib.navl.dictization_functions as df
import ckan.logic as logic
Expand Down Expand Up @@ -675,7 +676,6 @@ def url_validator(key, data, errors, context):
errors[key].append(_('Please provide a valid URL'))



def user_name_exists(user_name, context):
model = context['model']
session = context['session']
Expand Down Expand Up @@ -727,6 +727,20 @@ def list_of_strings(key, data, errors, context):
if not isinstance(x, basestring):
raise Invalid('%s: %s' % (_('Not a string'), x))

def if_empty_guess_format(key, data, errors, context):
    '''Fill in an empty resource format by guessing it from the resource url.

    The guess is only made when the format at `key` is empty or Missing and
    the sibling ``id`` entry is empty, i.e. the resource is being created
    rather than updated. The guessed mimetype is written to ``data[key]``;
    when nothing can be guessed `data` is left untouched.

    :param key: flattened key of the format field; the sibling ``id`` and
        ``url`` entries are found by replacing its last element
    :param data: flattened data dict, modified in place
    :param errors: not used by this validator
    :param context: not used by this validator
    '''
    value = data[key]
    resource_id = data.get(key[:-1] + ('id',))

    # if resource_id then an update
    if resource_id:
        return
    if value and value is not Missing:
        # a format was supplied, so there is nothing to guess
        return

    url = data.get(key[:-1] + ('url',), '')
    if not url:
        # No url to guess from. Handing None to mimetypes.guess_type()
        # can raise, so stop here and leave the format empty.
        # NOTE(review): presumably a missing url placeholder is falsy as
        # well - confirm against the Missing class.
        return
    mimetype = mimetypes.guess_type(url)[0]
    if mimetype:
        data[key] = mimetype

def clean_format(format):
    '''Return `format` as converted by h.unified_resource_format().'''
    return h.unified_resource_format(format)

def no_loops_in_hierarchy(key, data, errors, context):
'''Checks that the parent groups specified in the data would not cause
a loop in the group hierarchy, and therefore cause the recursion up/down
Expand Down
3 changes: 0 additions & 3 deletions ckan/model/resource.py
Expand Up @@ -119,9 +119,6 @@ def as_dict(self, core_columns_only=False):
_dict[k] = v
if self.resource_group and not core_columns_only:
_dict["package_id"] = self.resource_group.package_id
# FIXME format unification needs doing better
import ckan.lib.dictization.model_dictize as model_dictize
_dict[u'format'] = model_dictize._unified_resource_format(self.format)
return _dict

def get_package_id(self):
Expand Down
54 changes: 54 additions & 0 deletions ckan/new_tests/logic/test_validators.py
Expand Up @@ -318,6 +318,60 @@ def call_validator(*args, **kwargs):
# TODO: Test user_name_validator()'s behavior when there's a 'user_obj' in
# the context dict.

def test_if_empty_guess_format(self):
    '''if_empty_guess_format() fills an empty or Missing format from the
    url, but leaves a supplied format and resources with an id alone.'''

    import ckan.logic.validators as validators
    import ckan.lib.navl.dictization_functions as dictization_functions

    resources = [
        {'url': 'http://fakedomain/my.csv', 'format': ''},
        {'url': 'http://fakedomain/my.pdf',
         'format': dictization_functions.Missing},
        {'url': 'http://fakedomain/my.pdf', 'format': 'pdf'},
        {'url': 'http://fakedomain/my.pdf',
         'id': 'fake_resource_id', 'format': ''},
    ]
    flat_data = dictization_functions.flatten_dict(
        {'name': 'package_name', 'resources': resources})

    @t.does_not_modify_errors_dict
    def call_validator(*args, **kwargs):
        return validators.if_empty_guess_format(*args, **kwargs)

    # Expected format of each resource after validation, in list order.
    expected_formats = ['text/csv', 'application/pdf', 'pdf', '']
    for index, expected in enumerate(expected_formats):
        format_key = ('resources', index, 'format')
        working_copy = copy.deepcopy(flat_data)
        call_validator(key=format_key, data=working_copy,
                       errors={}, context={})
        assert working_copy[format_key] == expected

def test_clean_format(self):
    '''clean_format() returns the canonical name for known identifiers
    and passes any other value through unchanged.'''
    import ckan.logic.validators as validators

    cases = [
        ('csv', 'CSV'),
        ('text/csv', 'CSV'),
        ('not a format', 'not a format'),
        ('', ''),
    ]
    for supplied, expected in cases:
        assert validators.clean_format(supplied) == expected

def test_datasets_with_org_can_be_private_when_creating(self):

import ckan.logic.validators as validators
Expand Down
4 changes: 4 additions & 0 deletions ckan/templates/package/snippets/resource_form.html
Expand Up @@ -35,6 +35,10 @@
{% block basic_fields_format %}
{% set format_attrs = {'data-module': 'autocomplete', 'data-module-source': '/api/2/util/resource/format_autocomplete?incomplete=?'} %}
{% call form.input('format', id='field-format', label=_('Format'), placeholder=_('eg. CSV, XML or JSON'), value=data.format, error=errors.format, classes=['control-medium'], attrs=format_attrs) %}
<span class="info-block info-block-small">
<i class="icon-info-sign"></i>
{{ _('This will be guessed automatically. Leave blank if you wish') }}
</span>
{% endcall %}
{% endblock %}

Expand Down
4 changes: 2 additions & 2 deletions ckan/tests/functional/api/base.py
Expand Up @@ -326,14 +326,14 @@ class BaseModelApiTestCase(ApiTestCase, ControllerTestCase):
'url': u'http://blahblahblah.mydomain',
'resources': [{
u'url':u'http://blah.com/file.xml',
u'format':u'xml',
u'format':u'XML',
u'description':u'Main file',
u'hash':u'abc123',
u'alt_url':u'alt_url',
u'size_extra':u'200',
}, {
u'url':u'http://blah.com/file2.xml',
u'format':u'xml',
u'format':u'XML',
u'description':u'Second file',
u'hash':u'def123',
u'alt_url':u'alt_url',
Expand Down

0 comments on commit 7587c6e

Please sign in to comment.