diff --git a/ckan/config/middleware.py b/ckan/config/middleware.py index af7b1707bf4..9e004a5806a 100644 --- a/ckan/config/middleware.py +++ b/ckan/config/middleware.py @@ -69,7 +69,7 @@ def make_app(global_conf, full_stack=True, static_files=True, **app_conf): #app = QueueLogMiddleware(app) # Fanstatic - if config.get('ckan.include_support', '').lower()[:3] == 'dev': + if asbool(config.get('debug', False)): fanstatic_config = { 'versioning' : True, 'recompute_hashes' : True, diff --git a/ckan/html_resources/.gitignore b/ckan/html_resources/.gitignore deleted file mode 100644 index cc64990231b..00000000000 --- a/ckan/html_resources/.gitignore +++ /dev/null @@ -1 +0,0 @@ -**.min.* diff --git a/ckan/html_resources/__init__.py b/ckan/html_resources/__init__.py deleted file mode 100644 index cb3abf7f65a..00000000000 --- a/ckan/html_resources/__init__.py +++ /dev/null @@ -1,285 +0,0 @@ -''' This file creates fanstatic resources from the sub directories. The -directory can contain a config.ini to specify how the resources should -be treated. minified copies of the resources are created if the resource -has a later modification time than existing minified versions. - -NOTE :currently each library requires its entry point adding to the main -ckan setup.py file. - - -config.ini (example) -========== -# all resources are named without their file extension -[main] -# dont_bundle prevents the resources from being bundled -dont_bundle = test1 -# order can be used to prevent dependency errors to ensure that the -# needed resources are created first -order = test1 test2 -[depends] -# resource dependencies can be specified here by listing dependent -# resources -test2 = test1 -[groups] -# a group containing several resources can be specified here -test3 = test2 test1 - - -''' -import os.path -import sys -import ConfigParser - -from fanstatic import Library, Resource, Group, get_library_registry -import fanstatic.core as core -from ckan.include.rjsmin import jsmin -from ckan.include.rcssmin import cssmin - -# TODO -# loop through dirs to setup -# warn on no entry point provided for fanstatic - -class IEConditionalRenderer(object): - ''' Allows for IE conditionals. ''' - def __init__(self, condition, renderer, other_browsers=False): - self.condition = condition - self.renderer = renderer - self.other_browsers = other_browsers - if other_browsers: - self.other_browsers_start= '' - self.other_browsers_end= '' % (self.condition, - self.other_browsers_start, - self.renderer(url), - self.other_browsers_end) - -# Fanstatic Patch # -# FIXME add full license info & push upstream -def __init__(self, library, relpath, - depends=None, - supersedes=None, - bottom=False, - renderer=None, - debug=None, - dont_bundle=False, - minified=None): - self.library = library - fullpath = os.path.normpath(os.path.join(library.path, relpath)) - if core._resource_file_existence_checking and not os.path.exists(fullpath): - raise core.UnknownResourceError("Resource file does not exist: %s" % - fullpath) - self.relpath = relpath - self.dirname, self.filename = os.path.split(relpath) - if self.dirname and not self.dirname.endswith('/'): - self.dirname += '/' - self.bottom = bottom - self.dont_bundle = dont_bundle - - self.ext = os.path.splitext(self.relpath)[1] - if renderer is None: - # No custom, ad-hoc renderer for this Resource, so lookup - # the default renderer by resource filename extension. 
- if self.ext not in core.inclusion_renderers: - raise core.UnknownResourceExtensionError( - "Unknown resource extension %s for resource: %s" % - (self.ext, repr(self))) - self.order, self.renderer = core.inclusion_renderers[self.ext] - else: - # Use the custom renderer. - self.renderer = renderer - # If we do not know about the filename extension inclusion - # order, we render the resource after all others. - self.order, _ = core.inclusion_renderers.get( - self.ext, (sys.maxint, None)) - - assert not isinstance(depends, basestring) - self.depends = set() - if depends is not None: - # Normalize groups into the underlying resources... - depends = core.normalize_groups(depends) - # ...before updating the set of dependencies of this resource. - self.depends.update(depends) - - self.resources = set([self]) - for depend in self.depends: - self.resources.update(depend.resources) - - # Check for library dependency cycles. - self.library.check_dependency_cycle(self) - - # generate an internal number for sorting the resource - # on dependency within the library - self.init_dependency_nr() - - self.modes = {} - for mode_name, argument in [(core.DEBUG, debug), (core.MINIFIED, minified)]: - if argument is None: - continue - elif isinstance(argument, basestring): - mode_resource = Resource(library, argument, bottom=bottom, renderer=renderer) - else: - # The dependencies of a mode resource should be the same - # or a subset of the dependencies this mode replaces. - if len(argument.depends - self.depends) > 0: - raise core.ModeResourceDependencyError - mode_resource = argument - - mode_resource.dependency_nr = self.dependency_nr - self.modes[mode_name] = mode_resource - - assert not isinstance(supersedes, basestring) - self.supersedes = supersedes or [] - - self.rollups = [] - # create a reference to the superseder in the superseded resource - for resource in self.supersedes: - resource.rollups.append(self) - # also create a reference to the superseding mode in the superseded - # mode - # XXX what if mode is full-fledged resource which lists - # supersedes itself? - for mode_name, mode in self.modes.items(): - for resource in self.supersedes: - superseded_mode = resource.mode(mode_name) - # if there is no such mode, let's skip it - if superseded_mode is resource: - continue - mode.supersedes.append(superseded_mode) - superseded_mode.rollups.append(mode) - - - # Register ourself with the Library. - self.library.register(self) - -core.Resource.__init__ = __init__ -# Fanstatic Patch # - - -def create_library(name, path): - ''' Creates a fanstatic library `name` with the contents of a - directory `path` using config.ini if found. Files are minified - if needed. ''' - - def min_path(path): - ''' return the .min filename eg moo.js -> moo.min.js ''' - if f.endswith('.js'): - return path[:-3] + '.min.js' - if f.endswith('.css'): - return path[:-4] + '.min.css' - - def minify(filename, min_function): - ''' Minify file path using min_function. 
''' - # if the minified file was modified after the source file we can - # assume that it is up-to-date - path = os.path.join(resource_path, filename) - path_min = min_path(path) - op = os.path - if op.exists(path_min) and op.getmtime(path) < op.getmtime(path_min): - return - source = open(path, 'r').read() - f = open(path_min, 'w') - f.write(min_function(source)) - f.close() - print 'minified %s' % path - - def create_resource(filename, path, filepath): - ''' create the fanstatic Resource ''' - # resource_name is name of the file without the .js/.css - resource_name = '.'.join(filename.split('.')[:-1]) - kw = {} - path_min = min_path(os.path.join(resource_path, filename)) - if os.path.exists(path_min): - kw['minified'] = min_path(filename) - if filename.endswith('.js'): - kw['bottom'] = True - renderer = core.render_js - if filename.endswith('.css'): - renderer = core.render_css - if resource_name in depends: - dependencies = [] - for dependency in depends[resource_name]: - dependencies.append(getattr(module, dependency)) - kw['depends'] = dependencies - if resource_name in dont_bundle: - kw['dont_bundle'] = True - # FIXME needs config.ini options enabled - if False: - other_browsers = False - condition = '' - kw['renderer'] = IEConditionalRenderer( - condition=condition, - renderer=renderer, - other_browsers=other_browsers) - - resource = Resource(library, filename, **kw) - # add the resource to this module - fanstatic_name = '%s/%s' % (filepath, resource_name) - setattr(module, fanstatic_name, resource) - - order = [] - dont_bundle = [] - depends = {} - groups = {} - - # parse the config.ini file if it exists - resource_path = os.path.dirname(__file__) - resource_path = os.path.join(resource_path, path) - config_path = os.path.join(resource_path, 'config.ini') - if os.path.exists(config_path): - config = ConfigParser.RawConfigParser() - config.read(config_path) - if config.has_option('main', 'order'): - order = config.get('main', 'order').split() - if config.has_option('main', 'dont_bundle'): - dont_bundle = config.get('main', 'dont_bundle').split() - if config.has_section('depends'): - items = config.items('depends') - depends = dict((n, v.split()) for (n, v) in items) - if config.has_section('groups'): - items = config.items('groups') - groups = dict((n, v.split()) for (n, v) in items) - - library = Library(name, path) - module = sys.modules[__name__] - - # process each .js/.css file found - for dirname, dirnames, filenames in os.walk(resource_path): - for x in reversed(order): - if x in filenames: - filenames.remove(x) - filenames.insert(0, x) - for f in filenames: - if f.endswith('.js') and not f.endswith('.min.js'): - minify(f, jsmin) - create_resource(f, resource_path, path) - if f.endswith('.css') and not f.endswith('.min.css'): - minify(f, cssmin) - create_resource(f, resource_path, path) - - # add groups - for group_name in groups: - members = [] - for member in groups[group_name]: - fanstatic_name = '%s/%s' % (path, member) - members.append(getattr(module, fanstatic_name)) - group = Group(members) - fanstatic_name = '%s/%s' % (path, group_name) - setattr(module, fanstatic_name, group) - # finally add the library to this module - setattr(module, name, library) - # add to fanstatic - registry = get_library_registry() - registry.add(library) - - -# create our libraries here from any subdirectories -for dirname, dirnames, filenames in os.walk(os.path.dirname(__file__)): - if dirname == os.path.dirname(__file__): - continue - lib_name = os.path.basename(dirname) - 
create_library(lib_name, lib_name) diff --git a/ckan/lib/app_globals.py b/ckan/lib/app_globals.py index 5ada3e690b1..92fd5c9469e 100644 --- a/ckan/lib/app_globals.py +++ b/ckan/lib/app_globals.py @@ -70,7 +70,10 @@ def get_globals_key(key): def reset(): ''' set updatable values from config ''' def get_config_value(key, default=''): - value = model.get_system_info(key) + if model.meta.engine.has_table('system_info'): + value = model.get_system_info(key) + else: + value = None # we want to store the config the first time we get here so we can # reset them if needed config_value = config.get(key) diff --git a/ckan/lib/fanstatic_extensions.py b/ckan/lib/fanstatic_extensions.py new file mode 100644 index 00000000000..5bf5f016071 --- /dev/null +++ b/ckan/lib/fanstatic_extensions.py @@ -0,0 +1,114 @@ +import fanstatic.core as core + + +class CkanCustomRenderer(object): + ''' Allows for in-line js and IE conditionals via fanstatic. ''' + def __init__(self, script=None, renderer=None, condition=None, + other_browsers=False): + self.script = script + self.other_browsers = other_browsers + self.renderer = renderer + start = '' + end = '' + # IE conditionals + if condition: + start = '' + if other_browsers: + start += '' + end = ' {%- endblock -%} +{% resource g.main_css[6:] %} {# Allows custom attributes to be added to the tag #} - {# Append a .js class to the html element #} - - {# Add custom meta tags to the page. Call super() to get the default tags such as charset, viewport and generator. @@ -49,15 +47,6 @@ {%- endblock -%} - {# - The html5shim allows you to modify/override the html5 shim. This is useful - for modifying it's behaviour or removing it completely. - #} - {%- block html5shim -%} - {# Temporary link until we get resource helper working #} - - {%- endblock -%} - {# The links block allows you to add additonal content before the stylesheets such as rss feeds and favicons in the same way as the meta block. @@ -81,8 +70,6 @@ {% endblock %} #} {%- block styles %} - - {% endblock %} {# defined in the config.ini under "ckan.template_head_end" #} diff --git a/ckan/templates/package/resource_read.html b/ckan/templates/package/resource_read.html index 521c130bfaa..ecc077a4688 100644 --- a/ckan/templates/package/resource_read.html +++ b/ckan/templates/package/resource_read.html @@ -118,6 +118,7 @@

Resource I {% block styles %} {{ super() }} + {# - + #} {% endblock %} {% block scripts %} {{ super() }} + {# @@ -145,7 +147,8 @@

Resource I - + #} + {% resource 'datapreview/datapreview' %} diff --git a/ckan/templates/snippets/scripts.html b/ckan/templates/snippets/scripts.html deleted file mode 100644 index 46b2809beee..00000000000 --- a/ckan/templates/snippets/scripts.html +++ /dev/null @@ -1,35 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -{% if config.get('ckan.tracking_enabled', 'false') %} - {% snippet 'snippets/internal_tracking.html' %} -{% endif %} diff --git a/ckanext/datastore/__init__.py b/ckanext/datastore/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ckanext/datastore/db.py b/ckanext/datastore/db.py new file mode 100644 index 00000000000..ad5c68e2e7c --- /dev/null +++ b/ckanext/datastore/db.py @@ -0,0 +1,577 @@ +import sqlalchemy +import ckan.plugins as p +import psycopg2.extras +import json +import datetime +import shlex + +_pg_types = {} +_type_names = set() +_engines = {} + +_date_formats = ['%Y-%m-%d', + '%Y-%m-%d %H:%M:%S', + '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%SZ', + '%d/%m/%Y', + '%m/%d/%Y', + '%d-%m-%Y', + '%m-%d-%Y', + ] + +def _is_valid_field_name(name): + ''' + Check that field name is valid: + * can't start with underscore + * can't contain double quote (") + ''' + if name.startswith('_') or '"' in name: + return False + return True + + +def _validate_int(i, field_name): + try: + int(i) + except ValueError: + raise p.toolkit.ValidationError({ + 'field_name': ['{} is not an integer'.format(i)] + }) + + +def _get_engine(context, data_dict): + 'Get either read or write engine.' + connection_url = data_dict['connection_url'] + engine = _engines.get(connection_url) + + if not engine: + engine = sqlalchemy.create_engine(connection_url) + _engines[connection_url] = engine + return engine + + +def _cache_types(context): + if not _pg_types: + connection = context['connection'] + results = connection.execute( + 'select oid, typname from pg_type;' + ) + for result in results: + _pg_types[result[0]] = result[1] + _type_names.add(result[1]) + if '_json' not in _type_names: + connection.execute('create type "_json" as (json text, extra text)') + _pg_types.clear() + ## redo cache types with json now availiable. 
+ return _cache_types(context) + + psycopg2.extras.register_composite('_json', connection.connection, + True) + + +def _get_type(context, oid): + _cache_types(context) + return _pg_types[oid] + + +def _guess_type(field): + 'Simple guess type of field, only allowed are integer, numeric and text' + data_types = set([int, float]) + if isinstance(field, (dict, list)): + return '_json' + if isinstance(field, int): + return 'int' + if isinstance(field, float): + return 'float' + for data_type in list(data_types): + try: + data_type(field) + except (TypeError, ValueError): + data_types.discard(data_type) + if not data_types: + break + if int in data_types: + return 'integer' + elif float in data_types: + return 'numeric' + + ##try iso dates + for format in _date_formats: + try: + datetime.datetime.strptime(field, format) + return 'timestamp' + except ValueError: + continue + return 'text' + + +def _get_fields(context, data_dict): + fields = [] + all_fields = context['connection'].execute( + 'select * from "{0}" limit 1'.format(data_dict['resource_id']) + ) + for field in all_fields.cursor.description: + if not field[0].startswith('_'): + fields.append({ + 'id': field[0].decode('utf-8'), + 'type': _get_type(context, field[1]) + }) + return fields + + +def json_get_values(obj, current_list=None): + if current_list is None: + current_list = [] + if isinstance(obj, basestring): + current_list.append(obj) + if isinstance(obj, list): + for item in obj: + json_get_values(item, current_list) + if isinstance(obj, dict): + for item in obj.values(): + json_get_values(item, current_list) + return current_list + + +def check_fields(context, fields): + 'Check if field types are valid.' + for field in fields: + if field.get('type') and not field['type'] in _type_names: + raise p.toolkit.ValidationError({ + 'fields': ['{0} is not a valid field type'.format(field['type'])] + }) + elif not _is_valid_field_name(field['id']): + raise p.toolkit.ValidationError({ + 'fields': ['{0} is not a valid field name'.format(field['id'])] + }) + +def convert(data, type): + if data is None: + return None + if type == '_json': + return json.loads(data[0]) + if isinstance(data, datetime.datetime): + return data.isoformat() + if isinstance(data, (int, float)): + return data + return unicode(data) + +def create_table(context, data_dict): + 'Create table from combination of fields and first row of data.' 
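## Illustrative sketch: with supplied fields such as
##   [{'id': 'book', 'type': 'text'}]
## the statement assembled below comes out roughly as
##   create table "<resource-id>" ("_id" serial primary key,
##                                 "_full_text" tsvector, "book" text);
## where "<resource-id>" stands in for the real resource id.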
+ + datastore_fields = [ + {'id': '_id', 'type': 'serial primary key'}, + {'id': '_full_text', 'type': 'tsvector'}, + ] + + # check first row of data for additional fields + extra_fields = [] + supplied_fields = data_dict.get('fields', []) + check_fields(context, supplied_fields) + field_ids = [field['id'] for field in data_dict.get('fields', [])] + records = data_dict.get('records') + + # if type is field is not given try and guess or throw an error + for field in supplied_fields: + if 'type' not in field: + if not records or field['id'] not in records[0]: + raise p.toolkit.ValidationError({ + 'fields': ['{} type not guessable'.format(field['id'])] + }) + field['type'] = _guess_type(records[0][field['id']]) + + if records: + # check record for sanity + if not isinstance(records[0], dict): + raise p.toolkit.ValidationError({ + 'records': ['The first row is not a json object'] + }) + supplied_field_ids = records[0].keys() + for field_id in supplied_field_ids: + if not field_id in field_ids: + extra_fields.append({ + 'id': field_id, + 'type': _guess_type(records[0][field_id]) + }) + + fields = datastore_fields + supplied_fields + extra_fields + sql_fields = u", ".join([u'"{0}" {1}'.format(f['id'], f['type']) + for f in fields]) + + sql_string = u'create table "{0}" ({1});'.format( + data_dict['resource_id'], + sql_fields + ) + + context['connection'].execute(sql_string) + + +def alter_table(context, data_dict): + '''alter table from combination of fields and first row of data''' + supplied_fields = data_dict.get('fields', []) + current_fields = _get_fields(context, data_dict) + if not supplied_fields: + supplied_fields = current_fields + check_fields(context, supplied_fields) + field_ids = [field['id'] for field in supplied_fields] + records = data_dict.get('records') + new_fields = [] + + for num, field in enumerate(supplied_fields): + # check to see if field definition is the same or an + # extension of current fields + if num < len(current_fields): + if field['id'] != current_fields[num]['id']: + raise p.toolkit.ValidationError({ + 'fields': [('Supplied field "{}" not ' + 'present or in wrong order').format(field['id'])] + }) + ## no need to check type as field already defined. 
+ continue + + if 'type' not in field: + if not records or field['id'] not in records[0]: + raise p.toolkit.ValidationError({ + 'fields': ['{} type not guessable'.format(field['id'])] + }) + field['type'] = _guess_type(records[0][field['id']]) + new_fields.append(field) + + if records: + # check record for sanity + if not isinstance(records[0], dict): + raise p.toolkit.ValidationError({ + 'records': ['The first row is not a json object'] + }) + supplied_field_ids = records[0].keys() + for field_id in supplied_field_ids: + if not field_id in field_ids: + new_fields.append({ + 'id': field_id, + 'type': _guess_type(records[0][field_id]) + }) + + for field in new_fields: + sql = 'alter table "{}" add "{}" {}'.format( + data_dict['resource_id'], + field['id'], + field['type']) + context['connection'].execute(sql) + + +def insert_data(context, data_dict): + '''insert all data from records''' + if not data_dict.get('records'): + return + + fields = _get_fields(context, data_dict) + field_names = [field['id'] for field in fields] + sql_columns = ", ".join(['"%s"' % name for name in field_names] + + ['_full_text']) + + rows = [] + ## clean up and validate data + + for num, record in enumerate(data_dict['records']): + # check record for sanity + if not isinstance(record, dict): + raise p.toolkit.ValidationError({ + 'records': [u'row {} is not a json object'.format(num)] + }) + ## check for extra fields in data + extra_keys = set(record.keys()) - set(field_names) + + if extra_keys: + raise p.toolkit.ValidationError({ + 'records': [u'row {} has extra keys "{}"'.format( + num + 1, + ', '.join(list(extra_keys)) + )] + }) + + full_text = [] + row = [] + for field in fields: + value = record.get(field['id']) + if field['type'].lower() == '_json' and value: + full_text.extend(json_get_values(value)) + ## a tuple with an empty second value + value = (json.dumps(value), '') + elif field['type'].lower() == 'text' and value: + full_text.append(value) + row.append(value) + + row.append(' '.join(full_text)) + rows.append(row) + + sql_string = u'insert into "{0}" ({1}) values ({2}, to_tsvector(%s));'.format( + data_dict['resource_id'], + sql_columns, + ', '.join(['%s' for field in field_names]) + ) + + context['connection'].execute(sql_string, rows) + + +def _where(field_ids, data_dict): + 'Return a SQL WHERE clause from data_dict filters and q' + filters = data_dict.get('filters', {}) + + if not isinstance(filters, dict): + raise p.toolkit.ValidationError({ + 'filters': ['Not a json object']} + ) + + where_clauses = [] + values = [] + + for field, value in filters.iteritems(): + if field not in field_ids: + raise p.toolkit.ValidationError({ + 'filters': ['field "{}" not in table']} + ) + where_clauses.append(u'"{}" = %s'.format(field)) + values.append(value) + + q = data_dict.get('q') + if q: + where_clauses.append('_full_text @@ to_tsquery(%s)') + values.append(q) + + where_clause = ' and '.join(where_clauses) + if where_clause: + where_clause = 'where ' + where_clause + return where_clause, values + +def _sort(context, sort, field_ids): + + if not sort: + return '' + + if isinstance(sort, basestring): + clauses = sort.split(',') + elif isinstance(sort, list): + clauses = sort + else: + raise p.toolkit.ValidationError({ + 'sort': ['sort is not a list or a string'] + }) + + clause_parsed = [] + + for clause in clauses: + clause = clause.encode('utf-8') + clause_parts = shlex.split(clause) + if len(clause_parts) == 1: + field, sort = clause_parts[0], 'asc' + elif len(clause_parts) == 2: + field, sort = 
clause_parts + else: + raise p.toolkit.ValidationError({ + 'sort': ['not valid syntax for sort clause'] + }) + field, sort = unicode(field, 'utf-8'), unicode(sort, 'utf-8') + + if field not in field_ids: + raise p.toolkit.ValidationError({ + 'sort': [u'field {} not it table'.format( + unicode(field, 'utf-8'))] + }) + if sort.lower() not in ('asc', 'desc'): + raise p.toolkit.ValidationError({ + 'sort': ['sorting can only be asc or desc'] + }) + clause_parsed.append(u'"{}" {}'.format( + field, sort) + ) + + if clause_parsed: + return "order by " + ", ".join(clause_parsed) + + +def delete_data(context, data_dict): + fields = _get_fields(context, data_dict) + field_ids = set([field['id'] for field in fields]) + where_clause, where_values = _where(field_ids, data_dict) + + context['connection'].execute( + u'delete from "{}" {}'.format( + data_dict['resource_id'], + where_clause + ), + where_values + ) + + +def search_data(context, data_dict): + all_fields = _get_fields(context, data_dict) + all_field_ids = [field['id'] for field in all_fields] + all_field_ids.insert(0,'_id') + + fields = data_dict.get('fields') + + if fields: + field_ids = fields + + for field in field_ids: + if not field in all_field_ids: + raise p.toolkit.ValidationError({ + 'fields': [u'field "{}" not in table'.format(field)]} + ) + else: + field_ids = all_field_ids + + select_columns = ', '.join([u'"{}"'.format(field_id) + for field_id in field_ids]) + where_clause, where_values = _where(all_field_ids, data_dict) + limit = data_dict.get('limit', 100) + offset = data_dict.get('offset', 0) + + _validate_int(limit, 'limit') + _validate_int(offset, 'offset') + + sort = _sort(context, data_dict.get('sort'), field_ids) + + sql_string = u'''select {}, count(*) over() as "_full_count" + from "{}" {} {} limit {} offset {}'''\ + .format(select_columns, data_dict['resource_id'], where_clause, + sort, limit, offset) + results = context['connection'].execute(sql_string, where_values) + + result_fields = [] + for field in results.cursor.description: + result_fields.append({ + 'id': field[0].decode('utf-8'), + 'type': _get_type(context, field[1]) + }) + result_fields.pop() # remove _full_count + + data_dict['total'] = 0 + + records = [] + for row in results: + converted_row = {} + if not data_dict['total']: + data_dict['total'] = row['_full_count'] + for field in result_fields: + converted_row[field['id']] = convert(row[field['id']], + field['type']) + records.append(converted_row) + data_dict['records'] = records + data_dict['fields'] = result_fields + return data_dict + +def create(context, data_dict): + ''' + The first row will be used to guess types not in the fields and the + guessed types will be added to the headers permanently. + Consecutive rows have to conform to the field definitions. + + rows can be empty so that you can just set the fields. + + fields are optional but needed if you want to do type hinting or + add extra information for certain columns or to explicitly + define ordering. + + eg: [{"id": "dob", "type": "timestamp"}, + {"id": "name", "type": "text"}] + + A header items values can not be changed after it has been defined + nor can the ordering of them be changed. They can be extended though. + + Any error results in total failure! For now pass back the actual error. + Should be transactional. + ''' + engine = _get_engine(context, data_dict) + context['connection'] = engine.connect() + timeout = context.get('query_timeout', 60000) + _cache_types(context) + + # close connection at all cost. 
+ try: + # check if table already existes + trans = context['connection'].begin() + context['connection'].execute( + u'set local statement_timeout to {}'.format(timeout)) + result = context['connection'].execute( + 'select * from pg_tables where tablename = %s', + data_dict['resource_id'] + ).fetchone() + if not result: + create_table(context, data_dict) + else: + alter_table(context, data_dict) + insert_data(context, data_dict) + trans.commit() + return data_dict + except Exception, e: + if 'due to statement timeout' in str(e): + raise p.toolkit.ValidationError({ + 'query': ['Query took too long'] + }) + raise + finally: + context['connection'].close() + + +def delete(context, data_dict): + engine = _get_engine(context, data_dict) + context['connection'] = engine.connect() + _cache_types(context) + + try: + # check if table existes + trans = context['connection'].begin() + result = context['connection'].execute( + 'select * from pg_tables where tablename = %s', + data_dict['resource_id'] + ).fetchone() + if not result: + raise p.toolkit.ValidationError({ + 'resource_id': [u'table for resource {0} does not exist'.format( + data_dict['resource_id'])] + }) + if not 'filters' in data_dict: + context['connection'].execute( + u'drop table "{}"'.format(data_dict['resource_id']) + ) + else: + delete_data(context, data_dict) + + trans.commit() + return data_dict + except Exception, e: + trans.rollback() + raise + finally: + context['connection'].close() + + +def search(context, data_dict): + engine = _get_engine(context, data_dict) + context['connection'] = engine.connect() + timeout = context.get('query_timeout', 60000) + _cache_types(context) + + try: + # check if table existes + context['connection'].execute( + u'set local statement_timeout to {}'.format(timeout)) + result = context['connection'].execute( + 'select * from pg_tables where tablename = %s', + data_dict['resource_id'] + ).fetchone() + if not result: + raise p.toolkit.ValidationError({ + 'resource_id': [u'table for resource {0} does not exist'.format( + data_dict['resource_id'])] + }) + return search_data(context, data_dict) + except Exception, e: + if 'due to statement timeout' in str(e): + raise p.toolkit.ValidationError({ + 'query': ['Search took too long'] + }) + raise + finally: + context['connection'].close() diff --git a/ckanext/datastore/logic/__init__.py b/ckanext/datastore/logic/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ckanext/datastore/logic/action.py b/ckanext/datastore/logic/action.py new file mode 100644 index 00000000000..489a954e8ec --- /dev/null +++ b/ckanext/datastore/logic/action.py @@ -0,0 +1,120 @@ +import logging +import pylons +import ckan.logic as logic +import ckan.plugins as p +import ckanext.datastore.db as db + +log = logging.getLogger(__name__) +_get_or_bust = logic.get_or_bust + + +def datastore_create(context, data_dict): + '''Adds a new table to the datastore. + + :param resource_id: resource id that the data is going to be stored under. + :type resource_id: string + :param fields: fields/columns and their extra metadata. + :type fields: list of dictionaries + :param records: the data, eg: [{"dob": "2005", "some_stuff": ['a', b']}] + :type records: list of dictionaries + + :returns: the newly created data object. 
+    :rtype: dictionary
+
+    '''
+    model = _get_or_bust(context, 'model')
+    id = _get_or_bust(data_dict, 'resource_id')
+
+    if not model.Resource.get(id):
+        raise p.toolkit.ObjectNotFound(p.toolkit._(
+            'Resource "{}" was not found.'.format(id)
+        ))
+
+    p.toolkit.check_access('datastore_create', context, data_dict)
+
+    data_dict['connection_url'] = pylons.config['ckan.datastore_write_url']
+
+    result = db.create(context, data_dict)
+    result.pop('id')
+    result.pop('connection_url')
+    return result
+
+
+def datastore_delete(context, data_dict):
+    '''Deletes a table from the datastore.
+
+    :param resource_id: resource id that the data will be deleted from.
+    :type resource_id: string
+    :param filters: filters to apply before deleting (eg {'name': 'fred'}).
+                    If missing, the whole table is deleted.
+    :type filters: dictionary
+
+    :returns: original filters sent.
+    :rtype: dictionary
+
+    '''
+    model = _get_or_bust(context, 'model')
+    id = _get_or_bust(data_dict, 'resource_id')
+
+    if not model.Resource.get(id):
+        raise p.toolkit.ObjectNotFound(p.toolkit._(
+            'Resource "{}" was not found.'.format(id)
+        ))
+
+    p.toolkit.check_access('datastore_delete', context, data_dict)
+
+    data_dict['connection_url'] = pylons.config['ckan.datastore_write_url']
+
+    result = db.delete(context, data_dict)
+    result.pop('id')
+    result.pop('connection_url')
+    return result
+
+
+@logic.side_effect_free
+def datastore_search(context, data_dict):
+    '''Search a datastore table.
+
+    :param resource_id: id of the data that is going to be selected.
+    :type resource_id: string
+    :param filters: matching conditions to select.
+    :type filters: dictionary
+    :param q: full text query
+    :type q: string
+    :param limit: maximum number of rows to return (default: 100)
+    :type limit: int
+    :param offset: offset this number of rows before returning results
+    :type offset: int
+    :param fields: ordered list of fields to return
+                   (default: all fields in original order)
+    :type fields: list of dictionaries
+    :param sort: comma separated field names with ordering
+                 eg: "fieldname1, fieldname2 desc"
+    :type sort: string
+
+    :returns: a dictionary containing the search parameters and the
+              search results.
+ keys: fields: same as datastore_create accepts + offset: query offset value + limit: query limit value + filters: query filters + total: number of total matching records + records: list of matching results + :rtype: dictionary + + ''' + model = _get_or_bust(context, 'model') + id = _get_or_bust(data_dict, 'resource_id') + + if not model.Resource.get(id): + raise p.toolkit.ObjectNotFound(p.toolkit._( + 'Resource "{}" was not found.'.format(id) + )) + + p.toolkit.check_access('datastore_search', context, data_dict) + + data_dict['connection_url'] = pylons.config['ckan.datastore_write_url'] + + result = db.search(context, data_dict) + result.pop('id', None) + result.pop('connection_url') + return result diff --git a/ckanext/datastore/logic/auth.py b/ckanext/datastore/logic/auth.py new file mode 100644 index 00000000000..0cac0f18f31 --- /dev/null +++ b/ckanext/datastore/logic/auth.py @@ -0,0 +1,29 @@ +import ckan.plugins as p + + +def _datastore_auth(context, data_dict): + data_dict['id'] = data_dict.get('resource_id') + user = context.get('user') + + authorized = p.toolkit.check_access('resource_update', context, data_dict) + + if not authorized: + return { + 'success': False, + 'msg': p.toolkit._('User {} not authorized to update resource {}'\ + .format(str(user), data_dict['id'])) + } + else: + return {'success': True} + + +def datastore_create(context, data_dict): + return _datastore_auth(context, data_dict) + + +def datastore_delete(context, data_dict): + return _datastore_auth(context, data_dict) + + +def datastore_search(context, data_dict): + return {'success': True} diff --git a/ckanext/datastore/plugin.py b/ckanext/datastore/plugin.py new file mode 100644 index 00000000000..2a9e4f32e6a --- /dev/null +++ b/ckanext/datastore/plugin.py @@ -0,0 +1,68 @@ +import ckan.plugins as p +import ckanext.datastore.logic.action as action +import ckanext.datastore.logic.auth as auth +import ckanext.datastore.db as db +import ckan.logic as logic + + +class DatastoreException(Exception): + pass + + +class DatastorePlugin(p.SingletonPlugin): + ''' + Datastore plugin. + ''' + p.implements(p.IConfigurable, inherit=True) + p.implements(p.IActions) + p.implements(p.IAuthFunctions) + + def configure(self, config): + # check for ckan.datastore_write_url + if (not 'ckan.datastore_write_url' in config): + error_msg = 'ckan.datastore_write_url not found in config' + raise DatastoreException(error_msg) + + ## Do light wrapping around action function to add datastore_active + ## to resource dict. Not using IAction extension as this prevents other plugins + ## from having a custom resource_read. + + # Make sure actions are cached + resource_show = p.toolkit.get_action('resource_show') + + def new_resource_show(context, data_dict): + engine = db._get_engine( + context, + {'connection_url': config['ckan.datastore_write_url']} + ) + new_data_dict = resource_show(context, data_dict) + try: + connection = engine.connect() + result = connection.execute( + 'select 1 from pg_tables where tablename = %s', + new_data_dict['id'] + ).fetchone() + if result: + new_data_dict['datastore_active'] = True + else: + new_data_dict['datastore_active'] = False + finally: + connection.close() + return new_data_dict + + ## Make sure do not run many times if configure is called repeatedly + ## as in tests. 
+ if not hasattr(resource_show, '_datastore_wrapped'): + new_resource_show._datastore_wrapped = True + logic._actions['resource_show'] = new_resource_show + + + def get_actions(self): + return {'datastore_create': action.datastore_create, + 'datastore_delete': action.datastore_delete, + 'datastore_search': action.datastore_search} + + def get_auth_functions(self): + return {'datastore_create': auth.datastore_create, + 'datastore_delete': auth.datastore_delete, + 'datastore_search': auth.datastore_search} diff --git a/ckanext/datastore/tests/__init__.py b/ckanext/datastore/tests/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ckanext/datastore/tests/test_datastore.py b/ckanext/datastore/tests/test_datastore.py new file mode 100644 index 00000000000..e237f5509f3 --- /dev/null +++ b/ckanext/datastore/tests/test_datastore.py @@ -0,0 +1,709 @@ + +import json +import sqlalchemy +import ckan.plugins as p +import ckan.lib.create_test_data as ctd +import ckan.model as model +import ckan.tests as tests +import ckanext.datastore.db as db + + +class TestDatastoreCreate(tests.WsgiAppCase): + sysadmin_user = None + normal_user = None + p.load('datastore') + + @classmethod + def setup_class(cls): + ctd.CreateTestData.create() + cls.sysadmin_user = model.User.get('testsysadmin') + cls.normal_user = model.User.get('annafan') + + @classmethod + def teardown_class(cls): + model.repo.rebuild_db() + + def test_create_requires_auth(self): + resource = model.Package.get('annakarenina').resources[0] + data = { + 'resource_id': resource.id + } + postparams = '%s=1' % json.dumps(data) + res = self.app.post('/api/action/datastore_create', params=postparams, + status=403) + res_dict = json.loads(res.body) + assert res_dict['success'] is False + + def test_create_empty_fails(self): + postparams = '%s=1' % json.dumps({}) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth, status=409) + res_dict = json.loads(res.body) + assert res_dict['success'] is False + + def test_create_invalid_field_type(self): + resource = model.Package.get('annakarenina').resources[0] + data = { + 'resource_id': resource.id, + 'fields': [{'id': 'book', 'type': 'INVALID'}, + {'id': 'author', 'type': 'INVALID'}] + } + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth, status=409) + res_dict = json.loads(res.body) + assert res_dict['success'] is False + + def test_create_invalid_field_name(self): + resource = model.Package.get('annakarenina').resources[0] + data = { + 'resource_id': resource.id, + 'fields': [{'id': '_book', 'type': 'text'}, + {'id': '_author', 'type': 'text'}] + } + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth, status=409) + res_dict = json.loads(res.body) + assert res_dict['success'] is False + + data = { + 'resource_id': resource.id, + 'fields': [{'id': '"book"', 'type': 'text'}, + {'id': '"author', 'type': 'text'}] + } + postparams = '%s=1' % json.dumps(data) + res = self.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth, status=409) + res_dict = json.loads(res.body) + assert res_dict['success'] is False + + def test_create_invalid_record_field(self): + resource = 
model.Package.get('annakarenina').resources[0] + data = { + 'resource_id': resource.id, + 'fields': [{'id': 'book', 'type': 'text'}, + {'id': 'author', 'type': 'text'}], + 'records': [{'book': 'annakarenina', 'author': 'tolstoy'}, + {'book': 'warandpeace', 'published': '1869'}] + } + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth, status=409) + res_dict = json.loads(res.body) + assert res_dict['success'] is False + + def test_bad_records(self): + resource = model.Package.get('annakarenina').resources[0] + data = { + 'resource_id': resource.id, + 'fields': [{'id': 'book', 'type': 'text'}, + {'id': 'author', 'type': 'text'}], + 'records': ['bad'] # treat author as null + } + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth, status=409) + res_dict = json.loads(res.body) + + assert res_dict['success'] is False + + resource = model.Package.get('annakarenina').resources[0] + data = { + 'resource_id': resource.id, + 'fields': [{'id': 'book', 'type': 'text'}, + {'id': 'author', 'type': 'text'}], + 'records': [{'book': 'annakarenina', 'author': 'tolstoy'}, + [], + {'book': 'warandpeace'}] # treat author as null + } + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth, status=409) + res_dict = json.loads(res.body) + + assert res_dict['success'] is False + + def test_create_basic(self): + resource = model.Package.get('annakarenina').resources[0] + data = { + 'resource_id': resource.id, + 'fields': [{'id': 'book', 'type': 'text'}, + {'id': 'author', 'type': '_json'}], + 'records': [ + {'book': 'crime', 'author': ['tolstoy', 'dostoevsky']}, + {'book': 'annakarenina', 'author': ['tolstoy', 'putin']}, + {'book': 'warandpeace'}] # treat author as null + } + ### Firstly test to see if resource things it has datastore table + postparams = '%s=1' % json.dumps({'id': resource.id}) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/resource_show', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['result']['datastore_active'] == False + + + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + + assert res_dict['success'] is True + assert res_dict['result']['resource_id'] == data['resource_id'] + assert res_dict['result']['fields'] == data['fields'] + assert res_dict['result']['records'] == data['records'] + + c = model.Session.connection() + results = c.execute('select * from "{0}"'.format(resource.id)) + + assert results.rowcount == 3 + for i, row in enumerate(results): + assert data['records'][i].get('book') == row['book'] + assert data['records'][i].get('author') == (json.loads(row['author'][0]) if row['author'] else None) + + results = c.execute('''select * from "{0}" where _full_text @@ to_tsquery('warandpeace') '''.format(resource.id)) + assert results.rowcount == 1, results.rowcount + + results = c.execute('''select * from "{0}" where _full_text @@ to_tsquery('tolstoy') '''.format(resource.id)) + assert results.rowcount == 2 
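## Illustrative note: insert_data() fills the _full_text tsvector column using
## to_tsvector() over every text and _json value, which is why the plain term
## 'tolstoy' above matches the two records whose author lists contain it.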
+ model.Session.remove() + + # check to test to see if resource now has a datastore table + postparams = '%s=1' % json.dumps({'id': resource.id}) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/resource_show', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['result']['datastore_active'] == True + + ####### insert again simple + data2 = { + 'resource_id': resource.id, + 'records': [{'book': 'hagji murat', 'author': ['tolstoy']}] + } + + postparams = '%s=1' % json.dumps(data2) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + + assert res_dict['success'] is True + + c = model.Session.connection() + results = c.execute('select * from "{0}"'.format(resource.id)) + + assert results.rowcount == 4 + + all_data = data['records'] + data2['records'] + for i, row in enumerate(results): + assert all_data[i].get('book') == row['book'] + assert all_data[i].get('author') == (json.loads(row['author'][0]) if row['author'] else None) + + results = c.execute('''select * from "{0}" where _full_text @@ 'tolstoy' '''.format(resource.id)) + assert results.rowcount == 3 + model.Session.remove() + + ####### insert again extra field + data3 = { + 'resource_id': resource.id, + 'records': [{'book': 'crime and punsihment', + 'author': ['dostoevsky'], 'rating': 'good'}] + } + + postparams = '%s=1' % json.dumps(data3) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + + assert res_dict['success'] is True + + c = model.Session.connection() + results = c.execute('select * from "{0}"'.format(resource.id)) + + assert results.rowcount == 5 + + all_data = data['records'] + data2['records'] + data3['records'] + for i, row in enumerate(results): + assert all_data[i].get('book') == row['book'], (i, all_data[i].get('book'), row['book']) + assert all_data[i].get('author') == (json.loads(row['author'][0]) if row['author'] else None) + + results = c.execute('''select * from "{0}" where _full_text @@ to_tsquery('dostoevsky') '''.format(resource.id)) + assert results.rowcount == 2 + model.Session.remove() + + def test_guess_types(self): + resource = model.Package.get('annakarenina').resources[1] + data = { + 'resource_id': resource.id, + 'fields': [{'id': 'author', 'type': '_json'}, + {'id': 'count'}, + {'id': 'book'}, + {'id': 'date'}], + 'records': [{'book': 'annakarenina', 'author': 'tolstoy', + 'count': 1, 'date': '2005-12-01', 'count2': 2}, + {'book': 'crime', 'author': ['tolstoy', 'dostoevsky']}, + {'book': 'warandpeace'}] # treat author as null + } + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + + c = model.Session.connection() + results = c.execute('''select * from "{0}" '''.format(resource.id)) + + types = [db._pg_types[field[1]] for field in results.cursor.description] + + assert types == [u'int4', u'tsvector', u'_json', u'int4', + u'text', u'timestamp', u'int4'], types + + assert results.rowcount == 3 + for i, row in enumerate(results): + assert data['records'][i].get('book') == row['book'] + assert data['records'][i].get('author') == (json.loads(row['author'][0]) if row['author'] 
else None) + model.Session.remove() + + ### extend types + + data = { + 'resource_id': resource.id, + 'fields': [{'id': 'author', 'type': 'text'}, + {'id': 'count'}, + {'id': 'book'}, + {'id': 'date'}, + {'id': 'count2'}, + {'id': 'extra', 'type':'text'}, + {'id': 'date2'}, + ], + 'records': [{'book': 'annakarenina', 'author': 'tolstoy', + 'count': 1, 'date': '2005-12-01', 'count2': 2, + 'nested': [1,2], 'date2': '2005-12-01'}] + } + + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + + c = model.Session.connection() + results = c.execute('''select * from "{0}" '''.format(resource.id)) + + types = [db._pg_types[field[1]] for field in results.cursor.description] + + assert types == [u'int4', # id + u'tsvector', # fulltext + u'_json', # author + u'int4', # count + u'text', # book + u'timestamp', # date + u'int4', # count2 + u'text', # extra + u'timestamp', # date2 + u'_json', # count3 + ], types + + ### fields resupplied in wrong order + + data = { + 'resource_id': resource.id, + 'fields': [{'id': 'author', 'type': 'text'}, + {'id': 'count'}, + {'id': 'date'}, # date and book in wrong order + {'id': 'book'}, + {'id': 'count2'}, + {'id': 'extra', 'type':'text'}, + {'id': 'date2'}, + ], + 'records': [{'book': 'annakarenina', 'author': 'tolstoy', + 'count': 1, 'date': '2005-12-01', 'count2': 2, + 'count3': 432, 'date2': '2005-12-01'}] + } + + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth, status=409) + res_dict = json.loads(res.body) + + assert res_dict['success'] is False + + +class TestDatastoreDelete(tests.WsgiAppCase): + sysadmin_user = None + normal_user = None + + @classmethod + def setup_class(cls): + p.load('datastore') + ctd.CreateTestData.create() + cls.sysadmin_user = model.User.get('testsysadmin') + cls.normal_user = model.User.get('annafan') + resource = model.Package.get('annakarenina').resources[0] + cls.data = { + 'resource_id': resource.id, + 'fields': [{'id': 'book', 'type': 'text'}, + {'id': 'author', 'type': 'text'}], + 'records': [{'book': 'annakarenina', 'author': 'tolstoy'}, + {'book': 'warandpeace', 'author': 'tolstoy'}] + } + + @classmethod + def teardown_class(cls): + model.repo.rebuild_db() + + def _create(self): + postparams = '%s=1' % json.dumps(self.data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + return res_dict + + def _delete(self): + data = {'resource_id': self.data['resource_id']} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_delete', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + assert res_dict['result'] == data + return res_dict + + def test_delete_basic(self): + self._create() + self._delete() + resource_id = self.data['resource_id'] + c = model.Session.connection() + + try: + # check that data was actually deleted: this should raise a + # ProgrammingError as the table should not exist any more + c.execute('select * from "{0}";'.format(resource_id)) + raise Exception("Data not deleted") + 
except sqlalchemy.exc.ProgrammingError as e: + expected_msg = 'relation "{}" does not exist'.format(resource_id) + assert expected_msg in str(e) + + model.Session.remove() + + def test_delete_invalid_resource_id(self): + postparams = '%s=1' % json.dumps({'resource_id': 'bad'}) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_delete', params=postparams, + extra_environ=auth, status=404) + res_dict = json.loads(res.body) + assert res_dict['success'] is False + + def test_delete_filters(self): + self._create() + resource_id = self.data['resource_id'] + + # try and delete just the 'warandpeace' row + data = {'resource_id': resource_id, + 'filters': {'book': 'warandpeace'}} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_delete', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + + c = model.Session.connection() + result = c.execute('select * from "{0}";'.format(resource_id)) + results = [r for r in result] + assert len(results) == 1 + assert results[0].book == 'annakarenina' + model.Session.remove() + + # shouldn't delete anything + data = {'resource_id': resource_id, + 'filters': {'book': 'annakarenina', 'author': 'bad'}} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_delete', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + + c = model.Session.connection() + result = c.execute('select * from "{0}";'.format(resource_id)) + results = [r for r in result] + assert len(results) == 1 + assert results[0].book == 'annakarenina' + model.Session.remove() + + # delete the 'annakarenina' row + data = {'resource_id': resource_id, + 'filters': {'book': 'annakarenina', 'author': 'tolstoy'}} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_delete', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + + c = model.Session.connection() + result = c.execute('select * from "{0}";'.format(resource_id)) + results = [r for r in result] + assert len(results) == 0 + model.Session.remove() + + self._delete() + + +class TestDatastoreSearch(tests.WsgiAppCase): + sysadmin_user = None + normal_user = None + + @classmethod + def setup_class(cls): + p.load('datastore') + ctd.CreateTestData.create() + cls.sysadmin_user = model.User.get('testsysadmin') + cls.normal_user = model.User.get('annafan') + resource = model.Package.get('annakarenina').resources[0] + cls.data = { + 'resource_id': resource.id, + 'fields': [{'id': u'b\xfck', 'type': 'text'}, + {'id': 'author', 'type': 'text'}, + {'id': 'published'}], + 'records': [{u'b\xfck': 'annakarenina', 'author': 'tolstoy', 'published': '2005-03-01', 'nested': ['b', {'moo': 'moo'}]}, + {u'b\xfck': 'warandpeace', 'author': 'tolstoy', 'nested': {'a':'b'}} + ] + } + postparams = '%s=1' % json.dumps(cls.data) + auth = {'Authorization': str(cls.sysadmin_user.apikey)} + res = cls.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + + cls.expected_records = [{u'published': u'2005-03-01T00:00:00', + u'_id': 1, + u'nested': [u'b', {u'moo': u'moo'}], u'b\xfck': 
u'annakarenina', u'author': u'tolstoy'}, + {u'published': None, + u'_id': 2, + u'nested': {u'a': u'b'}, u'b\xfck': u'warandpeace', u'author': u'tolstoy'}] + + + @classmethod + def teardown_class(cls): + model.repo.rebuild_db() + + def test_search_basic(self): + data = {'resource_id': self.data['resource_id']} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_search', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + result = res_dict['result'] + assert result['total'] == len(self.data['records']) + assert result['records'] == self.expected_records + + def test_search_invalid_field(self): + data = {'resource_id': self.data['resource_id'], + 'fields': [{'id': 'bad'}]} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_search', params=postparams, + extra_environ=auth, status=409) + res_dict = json.loads(res.body) + assert res_dict['success'] is False + + def test_search_fields(self): + data = {'resource_id': self.data['resource_id'], + 'fields': [u'b\xfck']} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_search', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + result = res_dict['result'] + assert result['total'] == len(self.data['records']) + assert result['records'] == [{u'b\xfck': 'annakarenina'}, + {u'b\xfck': 'warandpeace'}], result['records'] + + def test_search_filters(self): + data = {'resource_id': self.data['resource_id'], + 'filters': {u'b\xfck': 'annakarenina'}} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_search', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + result = res_dict['result'] + assert result['total'] == 1 + assert result['records'] == [self.expected_records[0]] + + def test_search_sort(self): + data = {'resource_id': self.data['resource_id'], + 'sort': u'b\xfck asc, author desc'} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_search', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + result = res_dict['result'] + assert result['total'] == 2 + + assert result['records'] == self.expected_records, result['records'] + + data = {'resource_id': self.data['resource_id'], + 'sort': [u'b\xfck desc', '"author" asc']} + postparams = '%s=1' % json.dumps(data) + res = self.app.post('/api/action/datastore_search', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + result = res_dict['result'] + assert result['total'] == 2 + + assert result['records'] == self.expected_records[::-1] + + def test_search_limit(self): + data = {'resource_id': self.data['resource_id'], + 'limit': 1} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_search', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + result = res_dict['result'] + assert 
result['total'] == 2 + assert result['records'] == [self.expected_records[0]] + + def test_search_invalid_limit(self): + data = {'resource_id': self.data['resource_id'], + 'limit': 'bad'} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_search', params=postparams, + extra_environ=auth, status=409) + res_dict = json.loads(res.body) + assert res_dict['success'] is False + + def test_search_offset(self): + data = {'resource_id': self.data['resource_id'], + 'limit': 1, + 'offset': 1} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_search', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + result = res_dict['result'] + assert result['total'] == 2 + assert result['records'] == [self.expected_records[1]] + + def test_search_invalid_offset(self): + data = {'resource_id': self.data['resource_id'], + 'offset': 'bad'} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_search', params=postparams, + extra_environ=auth, status=409) + res_dict = json.loads(res.body) + assert res_dict['success'] is False + + def test_search_full_text(self): + data = {'resource_id': self.data['resource_id'], + 'q': 'annakarenina'} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_search', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + result = res_dict['result'] + assert result['total'] == 1 + assert result['records'] == [self.expected_records[0]] + + data = {'resource_id': self.data['resource_id'], + 'q': 'tolstoy'} + postparams = '%s=1' % json.dumps(data) + res = self.app.post('/api/action/datastore_search', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + result = res_dict['result'] + assert result['total'] == 2 + assert result['records'] == self.expected_records, result['records'] + + assert result['fields'] == [{u'type': u'int4', u'id': u'_id'}, {u'type': u'text', u'id': u'b\xfck'}, {u'type': u'text', u'id': u'author'}, {u'type': u'timestamp', u'id': u'published'}, {u'type': u'_json', u'id': u'nested'}], result['fields'] + + +class TestDatastoreFullTextSearch(tests.WsgiAppCase): + @classmethod + def setup_class(cls): + p.load('datastore') + ctd.CreateTestData.create() + cls.sysadmin_user = model.User.get('testsysadmin') + cls.normal_user = model.User.get('annafan') + resource = model.Package.get('annakarenina').resources[0] + cls.data = dict( + resource_id = resource.id, + fields = [ + {'id': 'id'}, + {'id': 'date', 'type':'date'}, + {'id': 'x'}, + {'id': 'y'}, + {'id': 'z'}, + {'id': 'country'}, + {'id': 'title'}, + {'id': 'lat'}, + {'id': 'lon'} + ], + records = [ + {'id': 0, 'date': '2011-01-01', 'x': 1, 'y': 2, 'z': 3, 'country': 'DE', 'title': 'first', 'lat':52.56, 'lon':13.40}, + {'id': 1, 'date': '2011-02-02', 'x': 2, 'y': 4, 'z': 24, 'country': 'UK', 'title': 'second', 'lat':54.97, 'lon':-1.60}, + {'id': 2, 'date': '2011-03-03', 'x': 3, 'y': 6, 'z': 9, 'country': 'US', 'title': 'third', 'lat':40.00, 'lon':-75.5}, + {'id': 3, 'date': '2011-04-04', 'x': 4, 'y': 8, 'z': 6, 'country': 'UK', 'title': 'fourth', 'lat':57.27, 'lon':-6.20}, + {'id': 4, 
'date': '2011-05-04', 'x': 5, 'y': 10, 'z': 15, 'country': 'UK', 'title': 'fifth', 'lat':51.58, 'lon':0}, + {'id': 5, 'date': '2011-06-02', 'x': 6, 'y': 12, 'z': 18, 'country': 'DE', 'title': 'sixth', 'lat':51.04, 'lon':7.9} + ] + ) + postparams = '%s=1' % json.dumps(cls.data) + auth = {'Authorization': str(cls.sysadmin_user.apikey)} + res = cls.app.post('/api/action/datastore_create', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + assert res_dict['success'] is True + + @classmethod + def teardown_class(cls): + model.repo.rebuild_db() + + def test_search_full_text(self): + data = {'resource_id': self.data['resource_id'], + 'q': 'DE'} + postparams = '%s=1' % json.dumps(data) + auth = {'Authorization': str(self.sysadmin_user.apikey)} + res = self.app.post('/api/action/datastore_search', params=postparams, + extra_environ=auth) + res_dict = json.loads(res.body) + import pprint + + assert res_dict['result']['total'] == 2, pprint.pformat(res_dict) + + diff --git a/ckanext/stats/plugin.py b/ckanext/stats/plugin.py index 909f0ad79fc..11caf7a42d4 100644 --- a/ckanext/stats/plugin.py +++ b/ckanext/stats/plugin.py @@ -24,3 +24,4 @@ def update_config(self, config): templates = 'templates_legacy' p.toolkit.add_template_directory(config, templates) p.toolkit.add_public_directory(config, 'public') + p.toolkit.add_resource('public/ckanext/stats', 'ckanext_stats') diff --git a/ckanext/stats/public/.gitignore b/ckanext/stats/public/.gitignore new file mode 100644 index 00000000000..6ac02e79926 --- /dev/null +++ b/ckanext/stats/public/.gitignore @@ -0,0 +1,2 @@ +**.min.js +**.min.css diff --git a/ckanext/stats/public/ckanext/stats/resource.config b/ckanext/stats/public/ckanext/stats/resource.config new file mode 100644 index 00000000000..e651a3f1b9e --- /dev/null +++ b/ckanext/stats/public/ckanext/stats/resource.config @@ -0,0 +1,12 @@ +[IE conditional] + +lte IE 8 = vendor/excanvas.js + +[groups] + +stats = + css/stats.css + vendor/excanvas.js + vendor/jquery.flot.js + javascript/modules/plot.js + javascript/modules/stats-nav.js diff --git a/ckanext/stats/templates/ckanext/stats/index.html b/ckanext/stats/templates/ckanext/stats/index.html index 1ca83cafd0d..947454034e3 100644 --- a/ckanext/stats/templates/ckanext/stats/index.html +++ b/ckanext/stats/templates/ckanext/stats/index.html @@ -1,20 +1,5 @@ {% extends "page.html" %} -{%- block html5shim -%} - {# - Hellish hack to get excanvas to work in IE8. We disable html5shiv from - overriding the createElement() method on this page. - See: http://stackoverflow.com/questions/10208062/using-flot-with-bootstrap-ie8-incompatibility - #} - - {{ super() }} -{%- endblock -%} - -{% block styles %} - {{ super() }} - -{% endblock %} - {% block breadcrumb_content %}
  • {{ 'Statistics' }}
  • {% endblock %} @@ -204,8 +189,11 @@

    {{ _('Stat {% block scripts %} {{ super() }} - - - - +{# +Hellish hack to get excanvas to work in IE8. We disable html5shiv from +overriding the createElement() method on this page. +See: http://stackoverflow.com/questions/10208062/using-flot-with-bootstrap-ie8-incompatibility +#} +{% resource "vendor/block_html5_shim" %} +{% resource "ckanext_stats/stats" %} {% endblock %} diff --git a/doc/datastore.rst b/doc/datastore.rst index cb31e00df5c..a9d0eb6c7f5 100644 --- a/doc/datastore.rst +++ b/doc/datastore.rst @@ -6,16 +6,6 @@ The CKAN DataStore provides a database for structured storage of data together with a powerful Web-accessible Data API, all seamlessly integrated into the CKAN interface and authorization system. -Overview -======== - -The following short set of slides provide a brief overview and introduction to -the DataStore and the Data API. - -.. raw:: html - - - Relationship to FileStore ========================= @@ -33,87 +23,40 @@ queries over the spreadsheet contents. The DataStore Data API ====================== -The DataStore's Data API, which derives from the underlying ElasticSearch -data-table, is RESTful and JSON-based with extensive query capabilities. - -Each resource in a CKAN instance has an associated DataStore 'table'. This -table will be accessible via a web interface at:: - - /api/data/{resource-id} - -This interface to this data is *exactly* the same as that provided by -ElasticSearch to documents of a specific type in one of its indices. +The DataStore's Data API, which derives from the underlying data-table, +is RESTful and JSON-based with extensive query capabilities. -For a detailed tutorial on using this API see :doc:`using-data-api`. +Each resource in a CKAN instance can have an associated DataStore 'table'. The +basic API for accessing the DataStore is detailed below. For a detailed +tutorial on using this API see :doc:`using-data-api`. Installation and Configuration ============================== -The DataStore uses ElasticSearch_ as the persistence and query layer with CKAN -wrapping this with a thin authorization and authentication layer. - -It also requires the use of Nginx as your webserver as its XSendfile_ feature -is used to transparently hand off data requests to ElasticSeach internally. - -.. _ElasticSearch: http://www.elasticsearch.org/ -.. _XSendfile: http://wiki.nginx.org/XSendfile - -1. Install ElasticSearch_ ------------------------- - -Please see the ElasticSearch_ documentation. - -2. Configure Nginx ------------------ - -As previously mentioned, Nginx will be used on top of CKAN to forward -requests to Elastic Search. CKAN will still be served by Apache or the -development server (Paster), but all requests will be forwarded to it -by Ngnix. - -This is an example of an Nginx configuration file. Note the two locations -defined, `/` will point to the server running CKAN (Apache or Paster), and -`/elastic/` to the Elastic Search instance:: +Earlier versions of the DataStore required a custom setup of ElasticSearch and Nginx, +but this is no longer the case: the DataStore can now use any relational database +management system (PostgreSQL, for example). 
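Assuming PostgreSQL is used, the database and user that the write URL below points at could be created with the standard PostgreSQL command line tools. This is a sketch only; the ``ckanuser`` and ``ckantest`` names are illustrative and simply match the example ``ckan.datastore_write_url`` value shown below::

    # create a regular (non-superuser) role, prompting for its password
    sudo -u postgres createuser -S -D -R -P ckanuser
    # create the database the DataStore will write to, owned by that role
    sudo -u postgres createdb -O ckanuser ckantest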
- server { - listen 80 default; - server_name localhost; +In your config file ensure that the datastore extension is enabled:: - access_log /var/log/nginx/localhost.access.log; + ckan.plugins = datastore + +Also ensure that the ckan.datastore_write_url variable is set:: - location / { - # location of apache or ckan under paster - proxy_pass http://127.0.0.1:8080; - proxy_set_header Host $host; - } - location /elastic/ { - internal; - # location of elastic search - proxy_pass http://127.0.0.1:9200/; - proxy_set_header Host $host; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - } - } + ckan.datastore_write_url = postgresql://ckanuser:pass@localhost/ckantest + +To test the setup you can create a new DataStore table; for example, from the Linux command line:: -.. note:: update the proxy_pass field value to point to your ElasticSearch - instance (if it is not localhost and default port). + curl -X POST http://127.0.0.1:5000/api/3/action/datastore_create -H "Authorization: {YOUR-API-KEY}" + -d '{"resource_id": "{RESOURCE-ID}", "fields": [ {"id": "a"}, {"id": "b"} ], + "records": [ { "a": 1, "b": "xyz"}, {"a": 2, "b": "zzz"} ]}' -Remember that after setting up Nginx, you need to access CKAN via its port -(80), not the Apache or Paster (5000) one, otherwise the DataStore won't work. - -3. Enable datastore features in CKAN ------------------------------------- - -In your config file set:: - - ckan.datastore.enabled = 1 - -.. _datastorer: DataStorer: Automatically Add Data to the DataStore =================================================== -Often, one wants data that is added to CKAN (whether it is linked to or uploaded to the :doc:`FileStore `) to be automatically added to the +Often, one wants data that is added to CKAN (whether it is linked to or +uploaded to the :doc:`FileStore `) to be automatically added to the DataStore. This requires some processing, to extract the data from your files and to add it to the DataStore in the format the DataStore can handle. @@ -122,19 +65,52 @@ performed by a DataStorer, a queue process that runs asynchronously and can be triggered by uploads or other activities. The DataStorer is an extension and can be found, along with installation instructions, at: -https://github.com/okfn/ckanext-datastorer +.. _datastorer: https://github.com/okfn/ckanext-datastorer -How It Works (Technically) -========================== +API Reference +------------- + +datastore_create +~~~~~~~~~~~~~~~~ + +The datastore_create API endpoint allows a user to post JSON data to +be stored against a resource; the JSON must be in the following form:: + + { + resource_id: resource_id, # the resource the data is going to be stored against. + fields: [], # a list of dictionaries of fields/columns and their extra metadata. + records: [], # a list of dictionaries of the data, e.g. [{"dob": "2005", "some_stuff": ['a', 'b']}, ..] + } + + +datastore_search +~~~~~~~~~~~~~~~~ + +The datastore_search API endpoint allows a user to search the data in a resource; +the JSON for searching must be in the following form:: + + { + resource_id: # the resource id to be searched against + filters : # dictionary of matching conditions to select, e.g. {'key1': 'a', 'key2': 'b'} + # this will be equivalent to "select * from table where key1 = 'a' and key2 = 'b' " + q: # full text query + limit: # maximum number of rows to return (defaults to 20) + offset: # offset into the rows, for paging + fields: # list of fields to return, in that order; defaults (empty or not present) to all fields in the table's field order. + sort: + } + +datastore_delete +~~~~~~~~~~~~~~~~ -1. 
Request arrives at e.g. /dataset/{id}/resource/{resource-id}/data -2. CKAN checks authentication and authorization. -3. (Assuming OK) CKAN hands (internally) to ElasticSearch which handles the - request +The datastore_delete API endpoint allows a user to delete records from a resource; +the JSON for deleting must be in the following form:: - * To do this we use Nginx's Sendfile / Accel-Redirect feature. This allows - us to hand off a user request *directly* to ElasticSearch after the - authentication and authorization. This avoids the need to proxy the - request and results through CKAN code. + { + resource_id: resource_id # the resource whose data is going to be deleted. + filters: # dictionary of matching conditions to delete + # e.g. {'key1': 'a', 'key2': 'b'} + # this will be equivalent to "delete from table where key1 = 'a' and key2 = 'b' " + } diff --git a/doc/resources.rst b/doc/resources.rst new file mode 100644 index 00000000000..ccb1e9c726c --- /dev/null +++ b/doc/resources.rst @@ -0,0 +1,162 @@ +Resources +========= + +.. Note:: + Resources are only supported in the new Jinja2 style templates in CKAN 2.0 + and above. + +Resources are .css and .js files that may be included in an HTML page. +Resources are included in the page by using the +``{% resource [full resource name] %}`` tag. Resources are grouped into libraries +and the full resource name consists of ``<library name>/<resource name>``. It is +important to note that these resources will be added to the page as defined by +the resources themselves, not at the location of the ``{% resource %}`` tag. Duplicate +resources will not be added, any dependencies will be included, and +the resources will be added in the correct order (see below for details). + +Libraries can be added to CKAN from extensions using the toolkit helper +function ``add_resource(path, name)``, where path is the path to the resource +directory relative to the file calling the function, and name is the name of the +library. Resources will then be created in the library for any .js and .css +files found in the directory or its subfolders. The resource name is the +name of the file, including any path needed to reach it from the resource +directory. For greater control over this creation, a resource.config file can be +created and placed in the resource directory (see below for details). + +In debug mode resources are served un-minified and unbundled (each resource is +served separately). In non-debug mode the files are served minified and bundled +(where allowed). + +.. Important:: + .js and .css resources must be supplied as un-minified files. Minified + files will be created automatically. It is advised to include a .gitignore file to + prevent minified files being added to the repository. + +resource.config +--------------- + +This file is used to define the resources in a directory and its subfolders. +Here is an example file. The general layout of the file and the allowed syntax are +the same as for a standard .ini config file. 
+ +:: + + # Example resource.config file + + [main] + + dont_bundle = jquery.js + force_top = html5.js + order = jquery.js jed.js + + [IE conditional] + + lte IE 8 = html5.js block_html5_shim + IE 7 = font-awesome/css/font-awesome-ie7.css + others = html5.js + + [custom render order] + + block_html5_shim = 1 + html5.js = 2 + select2/select2.css = 9 + + [inline scripts] + + block_html5_shim = + var html5 = {shivMethods: false}; + + [depends] + + vendor = jquery.js + + [groups] + + vendor = + jed.js + html5.js + select2/select2.js + select2/select2.css + bootstrap/js/bootstrap-transition.js + bootstrap/js/bootstrap-modal.js + bootstrap/js/bootstrap-alert.js + bootstrap/js/bootstrap-tab.js + bootstrap/js/bootstrap-button.js + font-awesome/css/font-awesome-ie7.css + + +[main] ~~~~~~ This can contain the following values: **force_top** The resources listed will be placed in the head of the page. This is only relevant to .js files, which by default are added to the bottom of the page. **dont_bundle** Bundling resources causes them to be served to the browser as a single resource, preventing multiple requests to the server. The resources listed will not be bundled. By default, items will be bundled where possible. Note that .css files can only be bundled if they are in the same directory. **order** This is used to make sure that resources are created in the order specified. It should not generally be needed, but is available if there are problems. [IE conditional] ~~~~~~~~~~~~~~~~ This allows IE conditional comments to be wrapped around resources, e.g. ``<!--[if lte IE 8]> ... <![endif]-->``. The condition is supplied, followed by a list of resources that need that condition. **others** This is a special condition that means that the resource will also be served to non-IE browsers. [custom render order] ~~~~~~~~~~~~~~~~~~~~~ By default resources have a render order: 10 for .css and 20 for .js resources. Sometimes we need to include a resource earlier or later than this default would place it; an example is html5shim.js, which needs to be included before .css resources. By providing a custom render order for the resource, its placement can be altered. Lower-numbered resources are rendered earlier. Note that resources rendered in the head will still be placed before ones rendered in the body. [inline scripts] ~~~~~~~~~~~~~~~~ It is possible to define inline scripts in the resource.config file. This can be helpful in some situations, but is probably best avoided if possible. [depends] ~~~~~~~~~ Sometimes one resource depends on another, e.g. many scripts need jquery.js included in the page before them. External resource libraries will automatically depend on the core CKAN JavaScript modules, so they do not need to specify this dependency. [groups] ~~~~~~~~ Groups of resources can be specified; this allows the group to be included by just using its name, rather than having to specify each resource individually when requesting them. The order in which items are added to a group will be used to order the resources when added to the page, but other factors such as dependencies, custom render order and resource type can affect the final order used. Groups can be referred to in many places in the resource.config file, e.g. 
[depends] diff --git a/doc/using-data-api.rst b/doc/using-data-api.rst index f0a96172d08..ac5b2cc97e6 100644 --- a/doc/using-data-api.rst +++ b/doc/using-data-api.rst @@ -8,599 +8,183 @@ The following provides an introduction to using the CKAN :doc:`DataStore Introduction ============ -Each 'table' in the DataStore is an ElasticSearch_ index type ('table'). As -such the Data API for each CKAN resource is directly equivalent to a single -index 'type' in ElasticSearch (we tend to refer to it as a 'table'). +The DataStore API allows tabular data to be stored inside CKAN quickly and easily. It is exposed as an HTTP interface and is interacted with using JSON (the JavaScript Object Notation). -This means you can (usually) directly re-use `ElasticSearch client libraries`_ -when connecting to a Data API endpoint. It also means that what follows is, in -essence, a tutorial in using the ElasticSearch_ API. - -The following short set of slides provide a brief overview and introduction to -the DataStore and the Data API. - -.. raw:: html - - - -.. _ElasticSearch: http://elasticsearch.org/ -.. _ElasticSearch client libraries: http://www.elasticsearch.org/guide/appendix/clients.html Quickstart ========== -``{{endpoint}}`` refers to the data API endpoint (or ElasticSearch index / -table). For example, on the DataHub_ this gold prices data resource -http://datahub.io/dataset/gold-prices/resource/b9aae52b-b082-4159-b46f-7bb9c158d013 -would have its Data API endpoint at: -http://datahub.io/api/data/b9aae52b-b082-4159-b46f-7bb9c158d013. If you were -just using ElasticSearch standalone an example of an endpoint would be: -http://localhost:9200/gold-prices/monthly-price-table. +There are several endpoints into the DataStore API: + +* datastore_create: ``http://{YOUR-CKAN-INSTALLATION}/api/3/action/datastore_create`` +* datastore_search: ``http://{YOUR-CKAN-INSTALLATION}/api/3/action/datastore_search`` +* datastore_delete: ``http://{YOUR-CKAN-INSTALLATION}/api/3/action/datastore_delete`` + +Before going into detail about the API and the examples, it is useful to create a resource first so that you can store data against it in the DataStore. This can be done either through the CKAN graphical user interface or through the Data API, as described below. -.. note:: every resource on a CKAN instance for which a DataStore table is - enabled provides links to its Data API endpoint via the Data API - button at the top right of the resource page. +Using the data API, we need to send JSON with this structure:: -Key urls: + { + id: Uuid, + name: Name-String, + title: String, + version: String, + url: String, + resources: [ { url: String, format: String, description: String, hash: String }, ...], + author: String, + author_email: String, + maintainer: String, + maintainer_email: String, + license_id: String, + tags: Tag-List, + notes: String, + extras: { Name-String: String, ... } + } -* Query: ``{{endpoint}}/_search`` (in ElasticSearch < 0.19 this will return an - error if visited without a query parameter) +To the following endpoint: - * Query example: ``{{endpoint}}/_search?size=5&pretty=true`` +* Schema (Mapping): ``{{endpoint}}/_mapping`` +* Dataset Model Endpoint: ``http://{YOUR-CKAN-INSTALLATION}/api/rest/dataset`` (a minimal example of this request is sketched under Examples below) +More details about creating a resource through the Data API are available on the :ref:`CKAN API page `. More information about the Datastore API can be found on the :doc:`datastore page `. -.. _DataHub: http://datahub.io/ Examples -------- +Some of the following commands require obtaining an :ref:`API Key `. 
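As a preliminary step, a dataset with one resource could be created by POSTing a subset of the structure above to the dataset endpoint. This is a sketch only: the field values are illustrative, and the new resource's id (needed as ``{RESOURCE-ID}`` in the DataStore calls below) appears in the JSON response::

    # create a dataset containing a single resource
    curl -X POST http://{YOUR-CKAN-INSTALLATION}/api/rest/dataset \
         -H "Authorization: {YOUR-API-KEY}" \
         -d '{"name": "my-dataset", "title": "My Dataset",
              "resources": [{"url": "http://example.com/data.csv", "format": "csv", "description": "Example data"}]}'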
+ cURL (or Browser) ~~~~~~~~~~~~~~~~~ The following examples utilize the cURL_ command line utility. If you prefer, you can just open the relevant URLs in your browser:: - // query for documents / rows with title field containing 'jones' - // added pretty=true to get the json results pretty printed - curl {{endpoint}}/_search?q=title:jones&size=5&pretty=true -Adding some data (requires an :ref:`API Key `):: + # This creates a datastore + curl -X POST {ENDPOINT:datastore_create} -H "Authorization: {YOUR-API-KEY}" -d " + {\"resource_id\": \"{RESOURCE-ID}\", \"fields\": [ {\"id\": \"a\"}, {\"id\": \"b\"} ], + \"records\": [ { \"a\": 1, \"b\": \"xyz\"}, {\"a\": 2, \"b\": \"zzz\"} ]}" - // requires an API key - // Data (argument to -d) should be a JSON document - curl -X POST -H "Authorization: {{YOUR-API-KEY}}" {{endpoint}} -d '{ - "title": "jones", - "amount": 5.7 - }' + # This queries a datastore + curl {ENDPOINT:datastore_search}?resource_id={RESOURCE-ID} -H "Authorization: {YOUR-API-KEY}" .. _cURL: http://curl.haxx.se/ Javascript ~~~~~~~~~~ -A simple ajax (JSONP) request to the data API using jQuery::
- -Basic queries use the `q` query string parameter which supports the `Lucene -query parser syntax`_ and hence filters on specific fields (e.g. `fieldname:value`), wildcards (e.g. `abc*`) and more. - -.. _Lucene query parser syntax: http://lucene.apache.org/core/old_versioned_docs/versions/3_0_0/queryparsersyntax.html - -There are a variety of other options (e.g. size, from etc) that you can also -specify to customize the query and its results. Full details can be found in -the `ElasticSearch URI request docs`_. - -.. _ElasticSearch URI request docs: http://www.elasticsearch.org/guide/reference/api/search/uri-request.html - -Full Query API --------------- - -More powerful and complex queries, including those that involve faceting and -statistical operations, should use the full ElasticSearch query language and API. - -In the query language queries are written as a JSON structure and is then sent -to the query endpoint (details of the query langague below). There are two -options for how a query is sent to the search endpoint: - -1. Either as the value of a source query parameter e.g.:: - - {{endpoint}}/_search?source={Query-as-JSON} - -2. Or in the request body, e.g.:: - - curl -XGET {{endpoint}}/_search -d 'Query-as-JSON' - - For example:: - - curl -XGET {{endpoint}}/_search -d '{ - "query" : { - "term" : { "user": "kimchy" } + var data = { + size: 5 // get 5 results + q: 'title:jones' // query on the title field for 'jones' + }; + $.ajax({ + url: {{endpoint}}/_search, + dataType: 'jsonp', + success: function(data) { + alert('Total results found: ' + data.hits.total) } - }' - - -Query Language -============== - -Queries are JSON objects with the following structure (each of the main -sections has more detail below):: - - { - size: # number of results to return (defaults to 10) - from: # offset into results (defaults to 0) - fields: # list of document fields that should be returned - http://elasticsearch.org/guide/reference/api/search/fields.html - sort: # define sort order - see http://elasticsearch.org/guide/reference/api/search/sort.html - - query: { - # "query" object following the Query DSL: http://elasticsearch.org/guide/reference/query-dsl/ - # details below - }, - - facets: { - # facets specifications - # Facets provide summary information about a particular field or fields in the data + }); + + The Data API supports CORs so you can also write to it (this requires the json2_ library for ``JSON.stringify``):: + + var data = { + title: 'jones', + amount: 5.7 + }; + $.ajax({ + url: {{endpoint}}, + type: 'POST', + data: JSON.stringify(data), + success: function(data) { + alert('Uploaded ok') } + }); - # special case for situations where you want to apply filter/query to results but *not* to facets - filter: { - # filter objects - # a filter is a simple "filter" (query) on a specific field. - # Simple means e.g. checking against a specific value or range of values - }, - } - -Query results look like:: - - { - # some info about the query (which shards it used, how long it took etc) - ... - # the results - hits: { - total: # total number of matching documents - hits: [ - # list of "hits" returned - { - _id: # id of document - score: # the search index score - _source: { - # document 'source' (i.e. the original JSON document you sent to the index - } - } - ] - } - # facets if these were requested - facets: { - ... - } - } - -Query DSL: Overview -------------------- - -Query objects are built up of sub-components. These sub-components are either -basic or compound. 
Compound sub-components may contains other sub-components -while basic may not. Example:: - - { - "query": { - # compound component - "bool": { - # compound component - "must": { - # basic component - "term": { - "user": "jones" - } - } - # compound component - "must_not": { - # basic component - "range" : { - "age" : { - "from" : 10, - "to" : 20 - } - } - } - } - } - } - -In addition, and somewhat confusingly, ElasticSearch distinguishes between -sub-components that are "queries" and those that are "filters". Filters, are -really special kind of queries that are: mostly basic (though boolean -compounding is alllowed); limited to one field or operation and which, as such, -are especially performant. - -Examples, of filters are (full list on RHS at the bottom of the query-dsl_ page): - - * term: filter on a value for a field - * range: filter for a field having a range of values (>=, <= etc) - * geo_bbox: geo bounding box - * geo_distance: geo distance - -.. _query-dsl: http://elasticsearch.org/guide/reference/query-dsl/ - -Rather than attempting to set out all the constraints and options of the -query-dsl we now offer a variety of examples. - -Examples --------- - -Match all / Find Everything -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -:: - - { - "query": { - "match_all": {} - } - } - -Classic Search-Box Style Full-Text Query -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This will perform a full-text style query across all fields. The query string -supports the `Lucene query parser syntax`_ and hence filters on specific fields -(e.g. `fieldname:value`), wildcards (e.g. `abc*`) as well as a variety of -options. For full details see the query-string_ documentation. - -:: - - { - "query": { - "query_string": { - "query": {query string} - } - } - } - -.. _query-string: http://elasticsearch.org/guide/reference/query-dsl/query-string-query.html + .. _json2: https://github.com/douglascrockford/JSON-js/blob/master/json2.js -Filter on One Field -~~~~~~~~~~~~~~~~~~~ - -:: - - { - "query": { - "term": { - {field-name}: {value} - } - } - } - -High performance equivalent using filters:: - - { - "query": { - "constant_score": { - "filter": { - "term": { - # note that value should be *lower-cased* - {field-name}: {value} - } - } - } - } - -Find all documents with value in a range -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This can be used both for text ranges (e.g. A to Z), numeric ranges (10-20) and -for dates (ElasticSearch will converts dates to ISO 8601 format so you can -search as 1900-01-01 to 1920-02-03). - -:: - - { - "query": { - "constant_score": { - "filter": { - "range": { - {field-name}: { - "from": {lower-value} - "to": {upper-value} - } - } - } - } - } - } - -For more details see `range filters`_. - -.. _range filters: http://elasticsearch.org/guide/reference/query-dsl/range-filter.html - -Full-Text Query plus Filter on a Field -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -:: - - { - "query": { - "query_string": { - "query": {query string} - }, - "term": { - {field}: {value} - } - } - } - - -Filter on two fields -~~~~~~~~~~~~~~~~~~~~ - -Note that you cannot, unfortunately, have a simple and query by adding two -filters inside the query element. Instead you need an 'and' clause in a filter -(which in turn requires nesting in 'filtered'). You could also achieve the same -result here using a `bool query`_. - -.. 
_bool query: http://elasticsearch.org/guide/reference/query-dsl/bool-query.html - -:: - - { - "query": { - "filtered": { - "query": { - "match_all": {} - }, - "filter": { - "and": [ - { - "range" : { - "b" : { - "from" : 4, - "to" : "8" - } - }, - }, - { - "term": { - "a": "john" - } - } - ] - } - } - } - } - -Geospatial Query to find results near a given point -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This uses the `Geo Distance filter`_. It requires that indexed documents have a field of `geo point type`_. - -.. _Geo Distance filter: http://www.elasticsearch.org/guide/reference/query-dsl/geo-distance-filter.html -.. _geo point type: http://www.elasticsearch.org/guide/reference/mapping/geo-point-type.html - -Source data (a point in San Francisco!):: - - # This should be in lat,lon order - { - ... - "Location": "37.7809035011582, -122.412119695795" - } - -There are alternative formats to provide lon/lat locations e.g. (see ElasticSearch documentation for more):: - - # Note this must have lon,lat order (opposite of previous example!) - { - "Location":[-122.414753390488, 37.7762147914147] - } - - # or ... - { - "Location": { - "lon": -122.414753390488, - "lat": 37.7762147914147 - } - } - -We also need a mapping to specify that Location field is of type geo_point as this will not usually get guessed from the data (see below for more on mappings):: - - "properties": { - "Location": { - "type": "geo_point" - } - ... - } - -Now the actual query:: - - { - "query": { - "filtered" : { - "query" : { - "match_all" : {} - }, - "filter" : { - "geo_distance" : { - "distance" : "20km", - "Location" : { - "lat" : 37.776, - "lon" : -122.41 - } - } - } - } - } - } - -Note that you can specify the query using specific lat, lon attributes even -though original data did not have this structure (you can also use a query -similar to the original structure if you wish - see `Geo distance filter`_ for -more information). - - -Facets ------- - -Facets provide a way to get summary information about then data in an -elasticsearch table, for example counts of distinct values. - -ElasticSearch (and hence the Data API) provides rich faceting capabilities: -http://www.elasticsearch.org/guide/reference/api/search/facets/ +Python +~~~~~~ -There are various kinds of facets available, for example (full list on the facets page): +A Python URLLib2 datastore_create and datastore_search would look like:: -* Terms_ - counts by distinct terms (values) in a field -* Range_ - counts for a given set of ranges in a field -* Histogram_ and `Date Histogram`_ - counts by constant interval ranges -* Statistical_ - statistical summary of a field (mean, sum etc) -* `Terms Stats`_ - statistical summary on one field (stats field) for distinct - terms in another field. For example, spending stats per department or per - region. -* `Geo Distance`_: counts by distance ranges from a given point + #! /usr/bin/env python + import urllib + import urllib2 + import json -Note that you can apply multiple facets per query. + auth_key = '{YOUR-AUTH-KEY}' -.. _Terms: http://www.elasticsearch.org/guide/reference/api/search/facets/terms-facet.html -.. _Range: http://www.elasticsearch.org/guide/reference/api/search/facets/range-facet.html -.. _Histogram: http://www.elasticsearch.org/guide/reference/api/search/facets/histogram-facet.html -.. _Date Histogram: http://www.elasticsearch.org/guide/reference/api/search/facets/date-histogram-facet.html -.. 
_Statistical: http://www.elasticsearch.org/guide/reference/api/search/facets/statistical-facet.html -.. _Terms Stats: http://www.elasticsearch.org/guide/reference/api/search/facets/terms-stats-facet.html -.. _Geo Distance: http://www.elasticsearch.org/guide/reference/api/search/facets/geo-distance-facet.html + # In python using urllib2 for datastore_create it is... + url = "http://127.0.0.1:5000/api/3/action/" -Adding, Updating and Deleting Data -================================== + datastore_structure = { + 'resource_id': '{RESOURCE-ID}', + 'fields': [ {"id": "a"}, {"id": "b"} ], + "records": [ { "a": 12, "b": "abc"}, {"a": 2, "b": "zzz"} ] + } + headers = {'content-type': 'application/json', 'Authorization': auth_key} -ElasticSeach, and hence the Data API, have a standard RESTful API. Thus:: - POST {{endpoint}} : INSERT - PUT/POST {{endpoint}}/{{id}} : UPDATE (or INSERT) - DELETE {{endpoint}}/{{id}} : DELETE -For more on INSERT and UPDATE see the `Index API`_ documentation. + req = urllib2.Request(url + 'datastore_create', data=json.dumps(datastore_structure), headers=headers) + response = urllib2.urlopen(req) -.. _Index API: http://elasticsearch.org/guide/reference/api/index_.html -There is also support bulk insert and updates via the `Bulk API`_. + # in python for datastore_search using urllib2.... -.. _Bulk API: http://elasticsearch.org/guide/reference/api/bulk.html + datastore_structure = { + 'resource_id': '{RESOURCE-ID}' + } -.. note:: The `DataStore Python client library`_ has support for inserting, - updating (in bulk) and deleting. There is also support for these - operations in the ReclineJS javascript library. + url_values = urllib.urlencode(datastore_structure) + req = urllib2.Request(url + 'datastore_search?' + url_values, headers=headers) + response = urllib2.urlopen(req) + print response.read() -Schema Mapping -============== + print "done\n" -As the ElasticSearch documentation states: - Mapping is the process of defining how a document should be mapped to the - Search Engine, including its searchable characteristics such as which fields - are searchable and if/how they are tokenized. In ElasticSearch, an index may - store documents of different “mapping types”. ElasticSearch allows one to - associate multiple mapping definitions for each mapping type. +Using the Python Requests_ library we can create a datastore like this:: - Explicit mapping is defined on an index/type level. By default, there isn't a - need to define an explicit mapping, since one is automatically created and - registered when a new type or new field is introduced (with no performance - overhead) and have sensible defaults. Only when the defaults need to be - overridden must a mapping definition be provided. + #! /usr/bin/env python + + import requests + import json + + auth_key = '' + + url = "http://127.0.0.1:5000/api/3/action/" # An example "action" endpoint + + datastore_structure = { + 'resource_id': '', + 'fields': [ {"id": "a"}, {"id": "b"} ], + "records": [ { "a": 1, "b": "xyz"}, {"a": 2, "b": "zzz"} ] + } + headers = {'content-type': 'application/json', 'Authorization': auth_key} + r = requests.post(url + 'datastore_create', data=json.dumps(datastore_structure), headers=headers) + print "done, and now for a quick search\n" -Relevant docs: http://elasticsearch.org/guide/reference/mapping/. 
+ datastore_structure = { + 'resource_id': '' + } + headers = {'content-type': 'application/json', 'Authorization': auth_key} + r = requests.post(url + 'datastore_search', data=json.dumps(datastore_structure), headers=headers) + + print r.text + + print "done\n" -JSONP support -============= +.. _Requests: http://docs.python-requests.org/ -JSONP support is available on any request via a simple callback query string parameter:: +PHP +~~~~~~ - ?callback=my_callback_name +Coming soon... diff --git a/setup.py b/setup.py index 962a4f89b46..a14cb2b62e1 100644 --- a/setup.py +++ b/setup.py @@ -115,6 +115,7 @@ multilingual_tag=ckanext.multilingual.plugin:MultilingualTag organizations=ckanext.organizations.forms:OrganizationForm organizations_dataset=ckanext.organizations.forms:OrganizationDatasetForm + datastore=ckanext.datastore.plugin:DatastorePlugin test_tag_vocab_plugin=ckanext.test_tag_vocab_plugin:MockVocabTagsPlugin example_iorganizationform=ckanext.example_iorganizationform.plugin:ExampleIOrganizationFormPlugin