diff --git a/ckan/cli/__init__.py b/ckan/cli/__init__.py new file mode 100644 index 00000000000..281f52b6ac4 --- /dev/null +++ b/ckan/cli/__init__.py @@ -0,0 +1,52 @@ +# encoding: utf-8 + +import os + +import click +import logging +from logging.config import fileConfig as loggingFileConfig + +log = logging.getLogger(__name__) + + +def error_shout(exception): + click.secho(str(exception), fg=u'red', err=True) + + +click_config_option = click.option( + u'-c', + u'--config', + default=None, + metavar=u'CONFIG', + help=u'Config file to use (default: development.ini)' +) + + +def load_config(config=None): + from paste.deploy import appconfig + if config: + filename = os.path.abspath(config) + config_source = u'-c parameter' + elif os.environ.get(u'CKAN_INI'): + filename = os.environ.get(u'CKAN_INI') + config_source = u'$CKAN_INI' + else: + default_filename = u'development.ini' + filename = os.path.join(os.getcwd(), default_filename) + if not os.path.exists(filename): + # give really clear error message for this common situation + msg = u'ERROR: You need to specify the CKAN config (.ini) '\ + u'file path.'\ + u'\nUse the --config parameter or set environment ' \ + u'variable CKAN_INI or have {}\nin the current directory.' \ + .format(default_filename) + exit(msg) + + if not os.path.exists(filename): + msg = u'Config file not found: %s' % filename + msg += u'\n(Given by: %s)' % config_source + exit(msg) + + loggingFileConfig(filename) + log.info(u'Using configuration file {}'.format(filename)) + return appconfig(u'config:' + filename) diff --git a/ckan/cli/cli.py b/ckan/cli/cli.py new file mode 100644 index 00000000000..236196c88c6 --- /dev/null +++ b/ckan/cli/cli.py @@ -0,0 +1,30 @@ +# encoding: utf-8 + +import logging + +import click + +from ckan.cli import click_config_option, db, load_config, search_index, server +from ckan.config.middleware import make_app + +log = logging.getLogger(__name__) + + +class CkanCommand(object): + + def __init__(self, conf=None): + self.config = load_config(conf) + self.app = make_app(self.config.global_conf, **self.config.local_conf) + + +@click.group() +@click.help_option(u'-h', u'--help') +@click_config_option +@click.pass_context +def ckan(ctx, config, *args, **kwargs): + ctx.obj = CkanCommand(config) + + +ckan.add_command(server.run) +ckan.add_command(db.db) +ckan.add_command(search_index.search_index) diff --git a/ckan/cli/db.py b/ckan/cli/db.py new file mode 100644 index 00000000000..860668cfd64 --- /dev/null +++ b/ckan/cli/db.py @@ -0,0 +1,72 @@ +# encoding: utf-8 + +import logging + +import click + +from ckan.cli import error_shout + +log = logging.getLogger(__name__) + + +@click.group(name=u'db', short_help=u'Database commands') +def db(): + pass + + +@db.command(u'init', short_help=u'Initialize the database') +def initdb(): + u'''Initialising the database''' + log.info(u"Initialize the Database") + try: + import ckan.model as model + model.repo.init_db() + except Exception as e: + error_shout(e) + else: + click.secho(u'Initialising DB: SUCCESS', fg=u'green', bold=True) + + +PROMPT_MSG = u'This will delete all of your data!\nDo you want to continue?' 
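(A minimal sketch, not part of this diff, of driving the new click entry point from a test via click's CliRunner; the ini path is a placeholder.)

    from click.testing import CliRunner

    from ckan.cli.cli import ckan

    runner = CliRunner()
    # Equivalent to running `ckan -c /etc/ckan/default/development.ini db version`
    result = runner.invoke(ckan, [u'-c', u'/etc/ckan/default/development.ini',
                                  u'db', u'version'])
    print(result.exit_code, result.output)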
+
+
+@db.command(u'clean', short_help=u'Clean the database')
+@click.confirmation_option(prompt=PROMPT_MSG)
+def cleandb():
+    u'''Cleaning the database'''
+    try:
+        import ckan.model as model
+        model.repo.clean_db()
+    except Exception as e:
+        error_shout(e)
+    else:
+        click.secho(u'Cleaning DB: SUCCESS', fg=u'green', bold=True)
+
+
+@db.command(u'upgrade', short_help=u'Upgrade the database')
+@click.option(u'-v', u'--version', help=u'Migration version')
+def updatedb(version=None):
+    u'''Upgrading the database'''
+    try:
+        import ckan.model as model
+        model.repo.upgrade_db(version)
+    except Exception as e:
+        error_shout(e)
+    else:
+        click.secho(u'Upgrading DB: SUCCESS', fg=u'green', bold=True)
+
+
+@db.command(u'version', short_help=u'Returns current version of data schema')
+def version():
+    u'''Return current version'''
+    log.info(u"Returning current DB version")
+    try:
+        from ckan.model import Session
+        ver = Session.execute(u'select version from '
+                              u'migrate_version;').fetchall()
+        click.secho(
+            u"Latest data schema version: {0}".format(ver[0][0]),
+            bold=True
+        )
+    except Exception as e:
+        error_shout(e)
diff --git a/ckan/cli/search_index.py b/ckan/cli/search_index.py
new file mode 100644
index 00000000000..c8e809cbd30
--- /dev/null
+++ b/ckan/cli/search_index.py
@@ -0,0 +1,112 @@
+# encoding: utf-8
+
+import multiprocessing as mp
+
+import click
+import sqlalchemy as sa
+
+from ckan.cli import error_shout
+
+
+@click.group(name=u'search-index', short_help=u'Search index commands')
+@click.help_option(u'-h', u'--help')
+def search_index():
+    pass
+
+
+@search_index.command(name=u'rebuild', short_help=u'Rebuild search index')
+@click.option(u'-v', u'--verbose', is_flag=True)
+@click.option(u'-i', u'--force', is_flag=True,
+              help=u'Ignore exceptions when rebuilding the index')
+@click.option(u'-r', u'--refresh', help=u'Refresh current index', is_flag=True)
+@click.option(u'-o', u'--only-missing',
+              help=u'Index non-indexed datasets only', is_flag=True)
+@click.option(u'-q', u'--quiet', help=u'Do not output index rebuild progress',
+              is_flag=True)
+@click.option(u'-e', u'--commit-each', is_flag=True,
+              help=u'Perform a commit after indexing each dataset. This '
+                   u'ensures that changes are immediately available on the '
+                   u'search, but significantly slows the process. Default '
+                   u'is false.')
+@click.pass_context
+def rebuild(ctx, verbose, force, refresh, only_missing, quiet, commit_each):
+    u''' Rebuild search index '''
+    flask_app = ctx.obj.app.apps['flask_app']._wsgi_app
+    from ckan.lib.search import rebuild, commit
+    try:
+        with flask_app.test_request_context():
+            rebuild(only_missing=only_missing,
+                    force=force,
+                    refresh=refresh,
+                    defer_commit=(not commit_each),
+                    quiet=quiet)
+    except Exception as e:
+        error_shout(e)
+    if not commit_each:
+        commit()
+
+
+@search_index.command(name=u'check', short_help=u'Check search index')
+def check():
+    from ckan.lib.search import check
+    check()
+
+
+@search_index.command(name=u'show', short_help=u'Show index of a dataset')
+@click.argument(u'dataset_name')
+def show(dataset_name):
+    from ckan.lib.search import show
+
+    index = show(dataset_name)
+    click.echo(index)
+
+
+@search_index.command(name=u'clear', short_help=u'Clear the search index')
+@click.argument(u'dataset_name', required=False)
+def clear(dataset_name):
+    from ckan.lib.search import clear, clear_all
+
+    if dataset_name:
+        clear(dataset_name)
+    else:
+        clear_all()
+
+
+@search_index.command(name=u'rebuild-fast',
+                      short_help=u'Reindex with multiprocessing')
+@click.pass_context
+def rebuild_fast(ctx):
+    conf = ctx.obj.config
+    flask_app = ctx.obj.app.apps['flask_app']._wsgi_app
+    db_url = conf['sqlalchemy.url']
+    engine = sa.create_engine(db_url)
+    package_ids = []
+    result = engine.execute(u"select id from package where state = 'active';")
+    for row in result:
+        package_ids.append(row[0])
+
+    def start(ids):
+        from ckan.lib.search import rebuild, commit
+        rebuild(package_ids=ids)
+        commit()
+
+    def chunks(l, n):
+        u""" Yield n successive chunks from l."""
+        newn = int(len(l) / n)
+        for i in range(0, n-1):
+            yield l[i*newn:i*newn+newn]
+        yield l[n*newn-newn:]
+
+    processes = []
+    with flask_app.test_request_context():
+        try:
+            for chunk in chunks(package_ids, mp.cpu_count()):
+                process = mp.Process(target=start, args=(chunk,))
+                processes.append(process)
+                process.daemon = True
+                process.start()
+
+            for process in processes:
+                process.join()
+        except Exception as e:
+            error_shout(e)
diff --git a/ckan/cli/server.py b/ckan/cli/server.py
new file mode 100644
index 00000000000..ef38c75fa07
--- /dev/null
+++ b/ckan/cli/server.py
@@ -0,0 +1,19 @@
+# encoding: utf-8
+
+import logging
+
+import click
+from werkzeug.serving import run_simple
+
+log = logging.getLogger(__name__)
+
+
+@click.command(u'run', short_help=u'Start development server')
+@click.option(u'-H', u'--host', default=u'localhost', help=u'Set host')
+@click.option(u'-p', u'--port', default=5000, help=u'Set port')
+@click.option(u'-r', u'--reloader', default=True, help=u'Use reloader')
+@click.pass_context
+def run(ctx, host, port, reloader):
+    u'''Runs development server'''
+    log.info(u"Running server {0} on port {1}".format(host, port))
+    run_simple(host, port, ctx.obj.app, use_reloader=reloader, use_evalex=True)
diff --git a/ckan/config/middleware/flask_app.py b/ckan/config/middleware/flask_app.py
index 6bfbde423ae..3593ff1c937 100644
--- a/ckan/config/middleware/flask_app.py
+++ b/ckan/config/middleware/flask_app.py
@@ -2,6 +2,7 @@
 import os
 import re
+import time
 import inspect
 import itertools
 import pkgutil
@@ -107,6 +108,13 @@ def make_flask_stack(conf, **app_conf):
         app.config['DEBUG_TB_INTERCEPT_REDIRECTS'] = False
         DebugToolbarExtension(app)
 
+        from werkzeug.debug import DebuggedApplication
+        app = DebuggedApplication(app, True)
+        app = app.app
+
+        log = logging.getLogger('werkzeug')
+        log.setLevel(logging.DEBUG)
+
     # Use Beaker as the Flask session interface
     class BeakerSessionInterface(SessionInterface):
         def open_session(self, app, request):
@@ -298,6 +306,8 @@ def ckan_before_request():
     # with extensions
     set_controller_and_action()
 
+    g.__timer = time.time()
+
 
 def ckan_after_request(response):
     u'''Common handler executed after all Flask requests'''
@@ -311,6 +321,11 @@ def ckan_after_request(response):
     # Set CORS headers if necessary
     response = set_cors_headers_for_response(response)
 
+    r_time = time.time() - g.__timer
+    url = request.environ['CKAN_CURRENT_URL'].split('?')[0]
+
+    log.info(' %s render time %.3f seconds' % (url, r_time))
+
     return response
diff --git a/ckan/config/solr/schema.xml b/ckan/config/solr/schema.xml
index 8e5018a2e2d..97559299a37 100644
--- a/ckan/config/solr/schema.xml
+++ b/ckan/config/solr/schema.xml
[The XML markup of these hunks was lost in extraction. Judging from the rest of
the diff, they bump the schema version from 2.8 to 2.9 (matching the new
SUPPORTED_SCHEMA_VERSIONS below) and add the ngram field type and the
name_ngram/title_ngram fields queried by the rewritten package_autocomplete.]
diff --git a/ckan/controllers/api.py b/ckan/controllers/api.py
index 464d209e8b5..826a9b6d279 100644
--- a/ckan/controllers/api.py
+++ b/ckan/controllers/api.py
@@ -3,12 +3,8 @@
 import os.path
 import logging
 import cgi
-import datetime
-import glob
 import urllib
 
-from webob.multidict import UnicodeMultiDict
-from paste.util.multidict import MultiDict
 from six import text_type
 
 import ckan.model as model
@@ -22,7 +18,7 @@
 
 from ckan.views import identify_user
 
-from ckan.common import _, c, request, response, config
+from ckan.common import _, c, request, response
 
 log = logging.getLogger(__name__)
diff --git a/ckan/lib/dictization/model_dictize.py b/ckan/lib/dictization/model_dictize.py
index 18017352134..ffb7dcb8aad 100644
--- a/ckan/lib/dictization/model_dictize.py
+++ b/ckan/lib/dictization/model_dictize.py
@@ -108,6 +108,7 @@ def resource_dictize(res, context):
     ## for_edit is only called at the times when the dataset is to be edited
     ## in the frontend. Without for_edit the whole qualified url is returned.
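    ## A quick sanity check of the stripping applied below (illustrative URL,
    ## not part of the diff): rsplit keeps only the last path segment, e.g.
    ##     u'http://demo.ckan.org/data/my.csv'.rsplit('/')[-1] == u'my.csv'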
if resource.get('url_type') == 'upload' and not context.get('for_edit'): + url = url.rsplit('/')[-1] cleaned_name = munge.munge_filename(url) resource['url'] = h.url_for('resource.download', id=resource['package_id'], diff --git a/ckan/lib/dictization/model_save.py b/ckan/lib/dictization/model_save.py index b0abf1abcea..454159f41df 100644 --- a/ckan/lib/dictization/model_save.py +++ b/ckan/lib/dictization/model_save.py @@ -13,7 +13,9 @@ log = logging.getLogger(__name__) + def resource_dict_save(res_dict, context): + model = context["model"] session = context["session"] @@ -30,6 +32,10 @@ def resource_dict_save(res_dict, context): table = class_mapper(model.Resource).mapped_table fields = [field.name for field in table.c] + # Strip the full url for resources of type 'upload' + if res_dict.get('url') and res_dict.get('url_type') == u'upload': + res_dict['url'] = res_dict['url'].rsplit('/')[-1] + # Resource extras not submitted will be removed from the existing extras # dict new_extras = {} diff --git a/ckan/lib/helpers.py b/ckan/lib/helpers.py index 969374ca1b6..910b31fc827 100644 --- a/ckan/lib/helpers.py +++ b/ckan/lib/helpers.py @@ -48,7 +48,7 @@ import ckan.plugins as p import ckan -from ckan.common import _, ungettext, c, request, session, json +from ckan.common import _, ungettext, c, g, request, session, json from markupsafe import Markup, escape @@ -731,7 +731,7 @@ def _link_active_pylons(kwargs): def _link_active_flask(kwargs): - blueprint, endpoint = request.url_rule.endpoint.split('.') + blueprint, endpoint = p.toolkit.get_endpoint() return(kwargs.get('controller') == blueprint and kwargs.get('action') == endpoint) @@ -785,7 +785,7 @@ def nav_link(text, *args, **kwargs): def nav_link_flask(text, *args, **kwargs): if len(args) > 1: raise Exception('Too many unnamed parameters supplied') - blueprint, endpoint = request.url_rule.endpoint.split('.') + blueprint, endpoint = p.toolkit.get_endpoint() if args: kwargs['controller'] = blueprint or None kwargs['action'] = endpoint or None @@ -1154,8 +1154,10 @@ def sorted_extras(package_extras, auto_clean=False, subs=None, exclude=None): @core_helper def check_access(action, data_dict=None): + if not getattr(g, u'user', None): + g.user = '' context = {'model': model, - 'user': c.user} + 'user': g.user} if not data_dict: data_dict = {} try: @@ -1805,7 +1807,7 @@ def _create_url_with_params(params=None, controller=None, action=None, if not controller: controller = getattr(c, 'controller', False) or request.blueprint if not action: - action = getattr(c, 'action', False) or request.endpoint.split('.')[1] + action = getattr(c, 'action', False) or p.toolkit.get_endpoint()[1] if not extras: extras = {} diff --git a/ckan/lib/search/__init__.py b/ckan/lib/search/__init__.py index dbc8ac12cc5..fe34325396a 100644 --- a/ckan/lib/search/__init__.py +++ b/ckan/lib/search/__init__.py @@ -31,7 +31,7 @@ def text_traceback(): return res -SUPPORTED_SCHEMA_VERSIONS = ['2.8'] +SUPPORTED_SCHEMA_VERSIONS = ['2.8', '2.9'] DEFAULT_OPTIONS = { 'limit': 20, diff --git a/ckan/lib/search/query.py b/ckan/lib/search/query.py index 68877959982..c469838c7df 100644 --- a/ckan/lib/search/query.py +++ b/ckan/lib/search/query.py @@ -2,17 +2,20 @@ import re import logging - -from ckan.common import config +import six import pysolr + from paste.deploy.converters import asbool -from paste.util.multidict import MultiDict -import six +from werkzeug.datastructures import MultiDict -from ckan.lib.search.common import make_connection, SearchError, SearchQueryError import ckan.logic as 
logic import ckan.model as model +from ckan.common import config +from ckan.lib.search.common import ( + make_connection, SearchError, SearchQueryError +) + log = logging.getLogger(__name__) _open_licenses = None @@ -211,7 +214,7 @@ def run(self, fields={}, options=None, **kwargs): options.update(kwargs) context = { - 'model':model, + 'model': model, 'session': model.Session, 'search_query': True, } @@ -219,6 +222,7 @@ def run(self, fields={}, options=None, **kwargs): # Transform fields into structure required by the resource_search # action. query = [] + for field, terms in fields.items(): if isinstance(terms, six.string_types): terms = terms.split() diff --git a/ckan/logic/action/get.py b/ckan/logic/action/get.py index b5244a88719..302e6fa3951 100644 --- a/ckan/logic/action/get.py +++ b/ckan/logic/action/get.py @@ -1515,34 +1515,46 @@ def package_autocomplete(context, data_dict): :rtype: list of dictionaries ''' - model = context['model'] - _check_access('package_autocomplete', context, data_dict) + user = context.get('user') limit = data_dict.get('limit', 10) q = data_dict['q'] - like_q = u"%s%%" % q + # enforce permission filter based on user + if context.get('ignore_auth') or (user and authz.is_sysadmin(user)): + labels = None + else: + labels = lib_plugins.get_permission_labels().get_user_dataset_labels( + context['auth_user_obj'] + ) + + data_dict = { + 'q': ' OR '.join([ + 'name_ngram:{0}', + 'title_ngram:{0}', + 'name:{0}', + 'title:{0}', + ]).format(search.query.solr_literal(q)), + 'fl': 'name,title', + 'rows': limit + } + query = search.query_for(model.Package) - query = model.Session.query(model.Package) - query = query.filter(model.Package.state == 'active') - query = query.filter(model.Package.private == False) - query = query.filter(_or_(model.Package.name.ilike(like_q), - model.Package.title.ilike(like_q))) - query = query.limit(limit) + results = query.run(data_dict, permission_labels=labels)['results'] q_lower = q.lower() pkg_list = [] - for package in query: - if package.name.startswith(q_lower): + for package in results: + if q_lower in package['name']: match_field = 'name' - match_displayed = package.name + match_displayed = package['name'] else: match_field = 'title' - match_displayed = '%s (%s)' % (package.title, package.name) + match_displayed = '%s (%s)' % (package['title'], package['name']) result_dict = { - 'name': package.name, - 'title': package.title, + 'name': package['name'], + 'title': package['title'], 'match_field': match_field, 'match_displayed': match_displayed} pkg_list.append(result_dict) diff --git a/ckan/logic/action/patch.py b/ckan/logic/action/patch.py index db377f81ad6..7e1fecb892f 100644 --- a/ckan/logic/action/patch.py +++ b/ckan/logic/action/patch.py @@ -19,7 +19,14 @@ def package_patch(context, data_dict): The difference between the update and patch methods is that the patch will perform an update of the provided parameters, while leaving all other parameters unchanged, whereas the update methods deletes all parameters - not explicitly provided in the data_dict + not explicitly provided in the data_dict. + + You are able to partially update and/or create resources with + package_patch. If you are updating existing resources be sure to provide + the resource id. Existing resources excluded from the package_patch + data_dict will be removed. Resources in the package data_dict without + an id will be treated as new resources and will be added. New resources + added with the patch method do not create the default views. 
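An illustrative example of what this enables (not part of the diff; the dataset
and resource ids and the URL are placeholders): patch one existing resource and
append a new one, leaving the dataset's other fields untouched.

    patched = toolkit.get_action(u'package_patch')(context, {
        u'id': u'my-dataset',
        u'resources': [
            # keep + update: the id marks an existing resource
            {u'id': u'existing-resource-id', u'description': u'Updated'},
            # no id: treated as a new resource and appended
            {u'url': u'http://example.com/new.csv'},
        ],
    })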
You must be authorized to edit the dataset and the groups that it belongs to.
diff --git a/ckan/tests/legacy/lib/test_resource_search.py b/ckan/tests/legacy/lib/test_resource_search.py
index 92862ac33a5..4e60c4ae4ce 100644
--- a/ckan/tests/legacy/lib/test_resource_search.py
+++ b/ckan/tests/legacy/lib/test_resource_search.py
@@ -1,7 +1,6 @@
 # encoding: utf-8
 
-from webob.multidict import UnicodeMultiDict, MultiDict
-from nose.tools import assert_raises, assert_equal
+from nose.tools import assert_raises, assert_equal, assert_set_equal
 
 from ckan.tests.legacy import *
 from ckan.tests.legacy import is_search_supported
@@ -76,10 +75,9 @@ def test_02_search_url_2(self):
         assert set([self.ab]) == urls, urls
 
     def test_03_search_url_multiple_words(self):
-        fields = UnicodeMultiDict(MultiDict(url='e'))
-        fields.add('url', 'f')
+        fields = dict([['url', 'e f']])
         urls = self.res_search(fields=fields)
-        assert set([self.ef]) == urls, urls
+        assert_set_equal({self.ef}, urls)
 
     def test_04_search_url_none(self):
         urls = self.res_search(fields={'url':'nothing'})
diff --git a/ckan/tests/legacy/logic/test_action.py b/ckan/tests/legacy/logic/test_action.py
index 6a7d418cc95..9714da797ac 100644
--- a/ckan/tests/legacy/logic/test_action.py
+++ b/ckan/tests/legacy/logic/test_action.py
@@ -130,7 +130,7 @@ def test_02_package_autocomplete_match_name(self):
         assert_equal(res_obj['result'][0]['match_displayed'], 'warandpeace')
 
     def test_02_package_autocomplete_match_title(self):
-        postparams = '%s=1' % json.dumps({'q':'a%20w', 'limit': 5})
+        postparams = '%s=1' % json.dumps({'q': 'won', 'limit': 5})
         res = self.app.post('/api/action/package_autocomplete', params=postparams)
         res_obj = json.loads(res.body)
         assert_equal(res_obj['success'], True)
diff --git a/ckan/tests/lib/dictization/test_model_dictize.py b/ckan/tests/lib/dictization/test_model_dictize.py
index 011391e8db9..c26c0590990 100644
--- a/ckan/tests/lib/dictization/test_model_dictize.py
+++ b/ckan/tests/lib/dictization/test_model_dictize.py
@@ -5,7 +5,7 @@
 
 from nose.tools import assert_equal
 
-from ckan.lib.dictization import model_dictize
+from ckan.lib.dictization import model_dictize, model_save
 from ckan import model
 from ckan.lib import search
 
@@ -414,8 +414,7 @@ def test_package_dictize_resource(self):
 
         result = model_dictize.package_dictize(dataset_obj, context)
 
-        assert_equal_for_keys(result['resources'][0], resource,
-                              'name', 'url')
+        assert_equal_for_keys(result['resources'][0], resource, 'name', 'url')
         expected_dict = {
             u'cache_last_updated': None,
             u'cache_url': None,
@@ -435,6 +434,40 @@ def test_package_dictize_resource(self):
         }
         self.assert_equals_expected(expected_dict, result['resources'][0])
 
+    def test_package_dictize_resource_upload_and_stripped(self):
+        dataset = factories.Dataset()
+        resource = factories.Resource(package=dataset['id'],
+                                      name='test_pkg_dictize',
+                                      url_type='upload',
+                                      url='some_filename.csv')
+
+        context = {'model': model, 'session': model.Session}
+
+        result = model_save.resource_dict_save(resource, context)
+
+        expected_dict = {
+            u'url': u'some_filename.csv',
+            u'url_type': u'upload'
+        }
+        assert expected_dict['url'] == result.url
+
+    def test_package_dictize_resource_upload_with_url_and_stripped(self):
+        dataset = factories.Dataset()
+        resource = factories.Resource(package=dataset['id'],
+                                      name='test_pkg_dictize',
+                                      url_type='upload',
+                                      url='http://some_filename.csv')
+
+        context = {'model': model, 'session': model.Session}
+
+        result = model_save.resource_dict_save(resource, context)
+
+        expected_dict = {
+            u'url':
u'some_filename.csv', + u'url_type': u'upload' + } + assert expected_dict['url'] == result.url + def test_package_dictize_tags(self): dataset = factories.Dataset(tags=[{'name': 'fish'}]) dataset_obj = model.Package.get(dataset['id']) diff --git a/ckan/tests/logic/action/test_get.py b/ckan/tests/logic/action/test_get.py index 514b5cc8363..344f2026e4d 100644 --- a/ckan/tests/logic/action/test_get.py +++ b/ckan/tests/logic/action/test_get.py @@ -923,10 +923,39 @@ def test_package_autocomplete_does_not_return_private_datasets(self): dataset2 = factories.Dataset(user=user, owner_org=org['name'], private=True, title='Some private stuff') - package_list = helpers.call_action('package_autocomplete', - q='some') + package_list = helpers.call_action( + 'package_autocomplete', context={'ignore_auth': False}, q='some' + ) eq(len(package_list), 1) + def test_package_autocomplete_does_return_private_datasets_from_my_org(self): + user = factories.User() + org = factories.Organization( + users=[{'name': user['name'], 'capacity': 'member'}] + ) + factories.Dataset( + user=user, owner_org=org['id'], title='Some public stuff' + ) + factories.Dataset( + user=user, owner_org=org['id'], private=True, + title='Some private stuff' + ) + package_list = helpers.call_action( + 'package_autocomplete', + context={'user': user['name'], 'ignore_auth': False}, + q='some' + ) + eq(len(package_list), 2) + + def test_package_autocomplete_works_for_the_middle_part_of_title(self): + factories.Dataset(title='Some public stuff') + factories.Dataset(title='Some random stuff') + + package_list = helpers.call_action('package_autocomplete', q='bli') + eq(len(package_list), 1) + package_list = helpers.call_action('package_autocomplete', q='tuf') + eq(len(package_list), 2) + class TestPackageSearch(helpers.FunctionalTestBase): diff --git a/ckan/views/__init__.py b/ckan/views/__init__.py index d0be1468aec..58cc0e1536b 100644 --- a/ckan/views/__init__.py +++ b/ckan/views/__init__.py @@ -204,11 +204,4 @@ def _get_user_for_apikey(): def set_controller_and_action(): - try: - controller, action = request.endpoint.split(u'.') - except ValueError: - log.debug( - u'Endpoint does not contain dot: {}'.format(request.endpoint) - ) - controller = action = request.endpoint - g.controller, g.action = controller, action + g.controller, g.action = p.toolkit.get_endpoint() diff --git a/ckan/views/user.py b/ckan/views/user.py index 00efe36b76e..d1fc9eabddc 100644 --- a/ckan/views/user.py +++ b/ckan/views/user.py @@ -78,7 +78,7 @@ def before_request(): context = dict(model=model, user=g.user, auth_user_obj=g.userobj) logic.check_access(u'site_read', context) except logic.NotAuthorized: - _, action = request.url_rule.endpoint.split(u'.') + _, action = plugins.toolkit.get_endpoint() if action not in ( u'login', u'request_reset', diff --git a/ckanext/datastore/backend/postgres.py b/ckanext/datastore/backend/postgres.py index 4fb81bf3b7a..2c23d9841c2 100644 --- a/ckanext/datastore/backend/postgres.py +++ b/ckanext/datastore/backend/postgres.py @@ -1189,7 +1189,9 @@ def validate(context, data_dict): data_dict_copy.pop('id', None) data_dict_copy.pop('include_total', None) + data_dict_copy.pop('total_estimation_threshold', None) data_dict_copy.pop('records_format', None) + data_dict_copy.pop('calculate_record_count', None) for key, values in data_dict_copy.iteritems(): if not values: @@ -1312,17 +1314,53 @@ def search_data(context, data_dict): _insert_links(data_dict, limit, offset) if data_dict.get('include_total', True): - count_sql_string = u'''SELECT 
count(*) FROM ( - SELECT {distinct} {select} - FROM "{resource}" {ts_query} {where}) as t;'''.format( - distinct=distinct, - select=select_columns, - resource=resource_id, - ts_query=ts_query, - where=where_clause) - count_result = _execute_single_statement( - context, count_sql_string, where_values) - data_dict['total'] = count_result.fetchall()[0][0] + total_estimation_threshold = \ + data_dict.get('total_estimation_threshold') + estimated_total = None + if total_estimation_threshold is not None and \ + not (where_clause or distinct): + # there are no filters, so we can try to use the estimated table + # row count from pg stats + # See: https://wiki.postgresql.org/wiki/Count_estimate + # (We also tried using the EXPLAIN to estimate filtered queries but + # it didn't estimate well in tests) + analyze_count_sql = sqlalchemy.text(''' + SELECT reltuples::BIGINT AS approximate_row_count + FROM pg_class + WHERE relname=:resource; + ''') + count_result = context['connection'].execute(analyze_count_sql, + resource=resource_id) + try: + estimated_total = count_result.fetchall()[0][0] + except ValueError: + # the table doesn't have the stats calculated yet. (This should + # be done by xloader/datapusher at the end of loading.) + # We could provoke their creation with an ANALYZE, but that + # takes 10x the time to run, compared to SELECT COUNT(*) so + # we'll just revert to the latter. At some point the autovacuum + # will run and create the stats so we can use an estimate in + # future. + pass + + if estimated_total is not None \ + and estimated_total >= total_estimation_threshold: + data_dict['total'] = estimated_total + data_dict['total_was_estimated'] = True + else: + # this is slow for large results + count_sql_string = u'''SELECT count(*) FROM ( + SELECT {distinct} {select} + FROM "{resource}" {ts_query} {where}) as t;'''.format( + distinct=distinct, + select=select_columns, + resource=resource_id, + ts_query=ts_query, + where=where_clause) + count_result = _execute_single_statement( + context, count_sql_string, where_values) + data_dict['total'] = count_result.fetchall()[0][0] + data_dict['total_was_estimated'] = False return data_dict @@ -1964,6 +2002,19 @@ def before_fork(self): # to avoid sharing them between parent and child processes. _dispose_engines() + def calculate_record_count(self, resource_id): + ''' + Calculate an estimate of the record/row count and store it in + Postgresql's pg_stat_user_tables. This number will be used when + specifying `total_estimation_threshold` + ''' + connection = get_write_engine().connect() + sql = 'ANALYZE "{}"'.format(resource_id) + try: + connection.execute(sql) + except sqlalchemy.exc.DatabaseError as err: + raise DatastoreException(err) + def create_function(name, arguments, rettype, definition, or_replace): sql = u''' diff --git a/ckanext/datastore/logic/action.py b/ckanext/datastore/logic/action.py index f7309f210cd..75a2a6e757f 100644 --- a/ckanext/datastore/logic/action.py +++ b/ckanext/datastore/logic/action.py @@ -70,6 +70,12 @@ def datastore_create(context, data_dict): {"function": "trigger_clean_reference"}, {"function": "trigger_check_codes"}] :type triggers: list of dictionaries + :param calculate_record_count: updates the stored count of records, used to + optimize datastore_search in combination with the + `total_estimation_threshold` parameter. If doing a series of requests + to change a resource, you only need to set this to True on the last + request. 
+    :type calculate_record_count: bool (optional, default: False)
 
     Please note that setting the ``aliases``, ``indexes`` or ``primary_key``
     replaces the existing aliases or constraints. Setting ``records`` appends
@@ -152,6 +158,9 @@
     except InvalidDataError as err:
         raise p.toolkit.ValidationError(text_type(err))
 
+    if data_dict.get('calculate_record_count', False):
+        backend.calculate_record_count(data_dict['resource_id'])
+
     # Set the datastore_active flag on the resource if necessary
     model = _get_or_bust(context, 'model')
     resobj = model.Resource.get(data_dict['resource_id'])
@@ -229,6 +238,12 @@
         Possible options are: upsert, insert, update
         (optional, default: upsert)
     :type method: string
+    :param calculate_record_count: updates the stored count of records, used to
+        optimize datastore_search in combination with the
+        `total_estimation_threshold` parameter. If doing a series of requests
+        to change a resource, you only need to set this to True on the last
+        request.
+    :type calculate_record_count: bool (optional, default: False)
     :param dry_run: set to True to abort transaction instead of committing,
         e.g. to check for validation or type errors.
     :type dry_run: bool (optional, default: False)
@@ -264,6 +279,10 @@
     result = backend.upsert(context, data_dict)
     result.pop('id', None)
     result.pop('connection_url', None)
+
+    if data_dict.get('calculate_record_count', False):
+        backend.calculate_record_count(data_dict['resource_id'])
+
     return result
@@ -306,6 +325,12 @@
        If missing delete whole table and all dependent views.
        (optional)
     :type filters: dictionary
+    :param calculate_record_count: updates the stored count of records, used to
+        optimize datastore_search in combination with the
+        `total_estimation_threshold` parameter. If doing a series of requests
+        to change a resource, you only need to set this to True on the last
+        request.
+    :type calculate_record_count: bool (optional, default: False)
 
     **Results:**
 
     :rtype: dictionary
 
     '''
-    schema = context.get('schema', dsschema.datastore_upsert_schema())
+    schema = context.get('schema', dsschema.datastore_delete_schema())
     backend = DatastoreBackend.get_active_backend()
 
     # Remove any applied filters before running validation.
@@ -349,6 +374,9 @@
     result = backend.delete(context, data_dict)
 
+    if data_dict.get('calculate_record_count', False):
+        backend.calculate_record_count(data_dict['resource_id'])
+
     # Set the datastore_active flag on the resource if necessary
     model = _get_or_bust(context, 'model')
     resource = model.Resource.get(data_dict['resource_id'])
@@ -407,6 +435,17 @@
     :param include_total: True to return total matching record count
        (optional, default: true)
     :type include_total: bool
+    :param total_estimation_threshold: If "include_total" is True and
+        "total_estimation_threshold" is not None and the estimated total
+        (matching record count) is above the "total_estimation_threshold" then
+        this datastore_search will return an *estimate* of the total, rather
+        than a precise one. This is often good enough, and saves
+        computationally expensive row counting for larger results
+        (e.g. >100000 rows). The estimated total comes from the PostgreSQL
+        table statistics, generated when Express Loader or DataPusher finishes
+        a load, or by autovacuum.
NB Currently estimation can't be done if the user specifies + 'filters' or 'distinct' options. (optional, default: None) + :type total_estimation_threshold: int or None :param records_format: the format for the records return value: 'objects' (default) list of {fieldname1: value1, ...} dicts, 'lists' list of [value1, value2, ...] lists, @@ -441,6 +480,8 @@ def datastore_search(context, data_dict): :type filters: list of dictionaries :param total: number of total matching records :type total: int + :param total_was_estimated: whether or not the total was estimated + :type total_was_estimated: bool :param records: list of matching results :type records: depends on records_format value passed :param records_truncated: indicates whether the number of records returned diff --git a/ckanext/datastore/logic/auth.py b/ckanext/datastore/logic/auth.py index 06aad380f9b..5a20c20a728 100644 --- a/ckanext/datastore/logic/auth.py +++ b/ckanext/datastore/logic/auth.py @@ -83,3 +83,7 @@ def datastore_function_delete(context, data_dict): def datastore_run_triggers(context, data_dict): '''sysadmin-only: functions can be used to skip access checks''' return {'success': False} + + +def datastore_analyze(context, data_dict): + return {'success': False} diff --git a/ckanext/datastore/logic/schema.py b/ckanext/datastore/logic/schema.py index 6254619d7cb..160ff3f5fab 100644 --- a/ckanext/datastore/logic/schema.py +++ b/ckanext/datastore/logic/schema.py @@ -124,6 +124,8 @@ def datastore_create_schema(): OneOf([u'row'])], 'function': [not_empty, unicode_only], }, + 'calculate_record_count': [ignore_missing, default(False), + boolean_validator], '__junk': [empty], '__before': [rename('id', 'resource_id')] } @@ -137,6 +139,8 @@ def datastore_upsert_schema(): 'id': [ignore_missing], 'method': [ignore_missing, text_type, OneOf( ['upsert', 'insert', 'update'])], + 'calculate_record_count': [ignore_missing, default(False), + boolean_validator], 'dry_run': [ignore_missing, boolean_validator], '__junk': [empty], '__before': [rename('id', 'resource_id')] @@ -149,6 +153,8 @@ def datastore_delete_schema(): 'resource_id': [not_missing, not_empty, text_type], 'force': [ignore_missing, boolean_validator], 'id': [ignore_missing], + 'calculate_record_count': [ignore_missing, default(False), + boolean_validator], '__junk': [empty], '__before': [rename('id', 'resource_id')] } @@ -169,6 +175,7 @@ def datastore_search_schema(): 'sort': [ignore_missing, list_of_strings_or_string], 'distinct': [ignore_missing, boolean_validator], 'include_total': [default(True), boolean_validator], + 'total_estimation_threshold': [default(None), int_validator], 'records_format': [ default(u'objects'), OneOf([u'objects', u'lists', u'csv', u'tsv'])], @@ -197,3 +204,9 @@ def datastore_function_delete_schema(): 'name': [unicode_only, not_empty], 'if_exists': [default(False), boolean_validator], } + + +def datastore_analyze_schema(): + return { + 'resource_id': [text_type, resource_id_exists], + } diff --git a/ckanext/datastore/tests/helpers.py b/ckanext/datastore/tests/helpers.py index c94aacd3a65..2d6e5728736 100644 --- a/ckanext/datastore/tests/helpers.py +++ b/ckanext/datastore/tests/helpers.py @@ -60,6 +60,21 @@ def set_url_type(resources, user): p.toolkit.get_action('resource_update')(context, resource) +def execute_sql(sql, *args): + engine = db.get_write_engine() + session = orm.scoped_session(orm.sessionmaker(bind=engine)) + return session.connection().execute(sql, *args) + + +def when_was_last_analyze(resource_id): + results = execute_sql( + 
'''SELECT last_analyze + FROM pg_stat_user_tables + WHERE relname=%s; + ''', resource_id).fetchall() + return results[0][0] + + class DatastoreFunctionalTestBase(FunctionalTestBase): _load_plugins = (u'datastore', ) diff --git a/ckanext/datastore/tests/test_create.py b/ckanext/datastore/tests/test_create.py index 9533769b326..27c0650e1b2 100644 --- a/ckanext/datastore/tests/test_create.py +++ b/ckanext/datastore/tests/test_create.py @@ -1,8 +1,7 @@ # encoding: utf-8 import json -import nose -from nose.tools import assert_equal, raises +from nose.tools import assert_equal, assert_not_equal, raises import sqlalchemy.orm as orm from ckan.tests.helpers import _get_test_app @@ -11,13 +10,13 @@ import ckan.plugins as p import ckan.lib.create_test_data as ctd import ckan.model as model -import ckan.tests.legacy as tests import ckan.tests.helpers as helpers import ckan.tests.factories as factories import ckanext.datastore.backend.postgres as db from ckanext.datastore.tests.helpers import ( - set_url_type, DatastoreFunctionalTestBase, DatastoreLegacyTestBase) + set_url_type, DatastoreFunctionalTestBase, DatastoreLegacyTestBase, + execute_sql, when_was_last_analyze) from ckan.plugins.toolkit import ValidationError @@ -163,7 +162,7 @@ def _has_index_on_field(self, resource_id, field): pg_class.relname = %s """ index_name = db._generate_index_name(resource_id, field) - results = self._execute_sql(sql, index_name).fetchone() + results = execute_sql(sql, index_name).fetchone() return bool(results) def _get_index_names(self, resource_id): @@ -180,14 +179,9 @@ def _get_index_names(self, resource_id): AND t.relkind = 'r' AND t.relname = %s """ - results = self._execute_sql(sql, resource_id).fetchall() + results = execute_sql(sql, resource_id).fetchall() return [result[0] for result in results] - def _execute_sql(self, sql, *args): - engine = db.get_write_engine() - session = orm.scoped_session(orm.sessionmaker(bind=engine)) - return session.connection().execute(sql, *args) - def test_sets_datastore_active_on_resource_on_create(self): resource = factories.Resource() @@ -244,6 +238,36 @@ def test_create_exceeds_column_name_limit(self): } result = helpers.call_action('datastore_create', **data) + def test_calculate_record_count_is_false(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'fields': [{'id': 'name', 'type': 'text'}, + {'id': 'age', 'type': 'text'}], + 'records': [{"name": "Sunita", "age": "51"}, + {"name": "Bowan", "age": "68"}], + 'force': True, + } + helpers.call_action('datastore_create', **data) + last_analyze = when_was_last_analyze(resource['id']) + assert_equal(last_analyze, None) + + def test_calculate_record_count(self): + # how datapusher loads data (send_resource_to_datastore) + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'fields': [{'id': 'name', 'type': 'text'}, + {'id': 'age', 'type': 'text'}], + 'records': [{"name": "Sunita", "age": "51"}, + {"name": "Bowan", "age": "68"}], + 'calculate_record_count': True, + 'force': True, + } + helpers.call_action('datastore_create', **data) + last_analyze = when_was_last_analyze(resource['id']) + assert_not_equal(last_analyze, None) + class TestDatastoreCreate(DatastoreLegacyTestBase): sysadmin_user = None diff --git a/ckanext/datastore/tests/test_delete.py b/ckanext/datastore/tests/test_delete.py index 50dfcb3e4ef..2ce59dd630c 100644 --- a/ckanext/datastore/tests/test_delete.py +++ b/ckanext/datastore/tests/test_delete.py @@ -1,29 +1,98 @@ # encoding: utf-8 import json 
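(A sketch of the workflow these tests pin down; illustrative only, the resource
id is a placeholder: trigger the ANALYZE on the final write, then let reads
take the estimated-count path.)

    import ckan.tests.helpers as helpers

    # Final write of a batch: refresh the row estimate in pg_stat_user_tables.
    helpers.call_action(u'datastore_upsert',
                        resource_id=u'RESOURCE-ID', force=True,
                        method=u'insert',
                        records=[{u'name': u'Sunita', u'age': u'51'}],
                        calculate_record_count=True)

    # Subsequent searches can answer `total` from the statistics instead of
    # a COUNT(*), flagging the shortcut via `total_was_estimated`.
    result = helpers.call_action(u'datastore_search',
                                 resource_id=u'RESOURCE-ID',
                                 total_estimation_threshold=1000)
    print(result[u'total'], result[u'total_was_estimated'])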
-import nose -from nose.tools import assert_equal +from nose.tools import assert_equal, assert_not_equal, assert_raises -import sqlalchemy import sqlalchemy.orm as orm -import ckan.plugins as p import ckan.lib.create_test_data as ctd import ckan.model as model -import ckan.tests.legacy as tests from ckan.tests import helpers from ckan.plugins.toolkit import ValidationError import ckan.tests.factories as factories from ckan.logic import NotFound import ckanext.datastore.backend.postgres as db from ckanext.datastore.tests.helpers import ( - rebuild_all_dbs, set_url_type, + set_url_type, when_was_last_analyze, execute_sql, DatastoreFunctionalTestBase, DatastoreLegacyTestBase) -assert_raises = nose.tools.assert_raises +class TestDatastoreDelete(DatastoreFunctionalTestBase): + def test_delete_basic(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'aliases': u'b\xfck2', + 'fields': [{'id': 'book', 'type': 'text'}, + {'id': 'author', 'type': 'text'}, + {'id': 'rating with %', 'type': 'text'}], + 'records': [{'book': 'annakarenina', 'author': 'tolstoy', + 'rating with %': '90%'}, + {'book': 'warandpeace', 'author': 'tolstoy', + 'rating with %': '42%'}] + } + helpers.call_action('datastore_create', **data) + data = { + 'resource_id': resource['id'], + 'force': True, + } + helpers.call_action('datastore_delete', **data) + + results = execute_sql(u'select 1 from pg_views where viewname = %s', u'b\xfck2') + assert results.rowcount == 0 + + # check the table is gone + results = execute_sql( + u'''SELECT table_name + FROM information_schema.tables + WHERE table_name=%s;''', + resource['id']) + assert results.rowcount == 0 -class TestDatastoreDelete(DatastoreLegacyTestBase): + def test_calculate_record_count_is_false(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'fields': [{'id': 'name', 'type': 'text'}, + {'id': 'age', 'type': 'text'}], + 'records': [{"name": "Sunita", "age": "51"}, + {"name": "Bowan", "age": "68"}], + } + helpers.call_action('datastore_create', **data) + data = { + 'resource_id': resource['id'], + 'filters': {'name': 'Bowan'}, + 'force': True, + } + helpers.call_action('datastore_delete', **data) + last_analyze = when_was_last_analyze(resource['id']) + assert_equal(last_analyze, None) + + def test_calculate_record_count(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'fields': [{'id': 'name', 'type': 'text'}, + {'id': 'age', 'type': 'text'}], + 'records': [{"name": "Sunita", "age": "51"}, + {"name": "Bowan", "age": "68"}], + } + helpers.call_action('datastore_create', **data) + data = { + 'resource_id': resource['id'], + 'filters': {'name': 'Bowan'}, + 'calculate_record_count': True, + 'force': True, + } + helpers.call_action('datastore_delete', **data) + last_analyze = when_was_last_analyze(resource['id']) + assert_not_equal(last_analyze, None) + + +class TestDatastoreDeleteLegacy(DatastoreLegacyTestBase): sysadmin_user = None normal_user = None Session = None @@ -31,7 +100,7 @@ class TestDatastoreDelete(DatastoreLegacyTestBase): @classmethod def setup_class(cls): cls.app = helpers._get_test_app() - super(TestDatastoreDelete, cls).setup_class() + super(TestDatastoreDeleteLegacy, cls).setup_class() ctd.CreateTestData.create() cls.sysadmin_user = model.User.get('testsysadmin') cls.normal_user = model.User.get('annafan') @@ -74,32 +143,6 @@ def _delete(self): assert res_dict['result'] == data return res_dict - def 
test_delete_basic(self): - self._create() - self._delete() - resource_id = self.data['resource_id'] - c = self.Session.connection() - - # It's dangerous to build queries as someone could inject sql. - # It's okay here as it is a test but don't use it anyhwere else! - results = c.execute( - u"select 1 from pg_views where viewname = '{0}'".format( - self.data['aliases'] - ) - ) - assert results.rowcount == 0 - - try: - # check that data was actually deleted: this should raise a - # ProgrammingError as the table should not exist any more - c.execute(u'select * from "{0}";'.format(resource_id)) - raise Exception("Data not deleted") - except sqlalchemy.exc.ProgrammingError as e: - expected_msg = 'relation "{0}" does not exist'.format(resource_id) - assert expected_msg in str(e) - - self.Session.remove() - def test_datastore_deleted_during_resource_deletion(self): package = factories.Dataset() data = { @@ -320,7 +363,7 @@ def test_delete_nonexistant(self): else: assert 0, u'no validation error' - def test_delete_if_exitst(self): + def test_delete_if_exists(self): helpers.call_action( u'datastore_function_delete', name=u'test_not_there_either', diff --git a/ckanext/datastore/tests/test_dump.py b/ckanext/datastore/tests/test_dump.py index 30c010cf68d..4f3167a727b 100644 --- a/ckanext/datastore/tests/test_dump.py +++ b/ckanext/datastore/tests/test_dump.py @@ -1,6 +1,7 @@ # encoding: utf-8 from nose.tools import assert_equals, assert_in +import mock from ckanext.datastore.tests.helpers import DatastoreFunctionalTestBase import ckan.tests.helpers as helpers @@ -446,3 +447,59 @@ def test_dump_with_low_rows_max(self): '1,annakarenina\n', response.body) assert response.headers['X-Records-Truncated'] == 'true' + + @mock.patch('ckanext.datastore.controller.PAGINATE_BY', 5) + def test_dump_pagination(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'records': [{u'record': str(num)} for num in range(12)], + } + helpers.call_action('datastore_create', **data) + + app = self._get_test_app() + response = app.get('/datastore/dump/{0}'.format(str(resource['id']))) + assert_equals( + '_id,record\r\n' + '1,0\n2,1\n3,2\n4,3\n5,4\n6,5\n7,6\n8,7\n9,8\n10,9\n' + '11,10\n12,11\n', + response.body) + + @mock.patch('ckanext.datastore.controller.PAGINATE_BY', 5) + def test_dump_pagination_csv_with_limit(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'records': [{u'record': str(num)} for num in range(12)], + } + helpers.call_action('datastore_create', **data) + + app = self._get_test_app() + response = app.get('/datastore/dump/{0}?limit=6'.format( + str(resource['id']))) + assert_equals( + '_id,record\r\n' + '1,0\n2,1\n3,2\n4,3\n5,4\n6,5\n', + response.body) + + @mock.patch('ckanext.datastore.controller.PAGINATE_BY', 5) + def test_dump_pagination_json_with_limit(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'records': [{u'record': str(num)} for num in range(12)], + } + helpers.call_action('datastore_create', **data) + + app = self._get_test_app() + response = app.get('/datastore/dump/{0}?limit=6&format=json'.format( + str(resource['id']))) + assert_equals( + '{\n "fields": [{"type":"int","id":"_id"},' + '{"type":"int4","id":"record"}],\n' + ' "records": [\n [1,0],\n [2,1],\n [3,2],\n [4,3],\n' + ' [5,4],\n [6,5]\n]}\n', + response.body) diff --git a/ckanext/datastore/tests/test_search.py b/ckanext/datastore/tests/test_search.py index 
626b5fcc98d..c0116ed8b7e 100644 --- a/ckanext/datastore/tests/test_search.py +++ b/ckanext/datastore/tests/test_search.py @@ -101,6 +101,25 @@ def test_all_params_work_with_fields_with_whitespaces(self): result_years = [r['the year'] for r in result['records']] assert_equals(result_years, [2013]) + def test_search_total(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'records': [ + {'the year': 2014}, + {'the year': 2013}, + ], + } + result = helpers.call_action('datastore_create', **data) + search_data = { + 'resource_id': resource['id'], + 'include_total': True, + } + result = helpers.call_action('datastore_search', **search_data) + assert_equals(result['total'], 2) + assert_equals(result.get('total_was_estimated'), False) + def test_search_without_total(self): resource = factories.Resource() data = { @@ -118,6 +137,151 @@ def test_search_without_total(self): } result = helpers.call_action('datastore_search', **search_data) assert 'total' not in result + assert 'total_was_estimated' not in result + + def test_estimate_total(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'records': [{'the year': 1900 + i} for i in range(100)], + } + result = helpers.call_action('datastore_create', **data) + analyze_sql = ''' + ANALYZE "{resource}"; + '''.format(resource=resource['id']) + db.get_write_engine().execute(analyze_sql) + search_data = { + 'resource_id': resource['id'], + 'total_estimation_threshold': 50, + } + result = helpers.call_action('datastore_search', **search_data) + assert_equals(result.get('total_was_estimated'), True) + assert 95 < result['total'] < 105, result['total'] + + def test_estimate_total_with_filters(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'records': [{'the year': 1900 + i} for i in range(3)] * 10, + } + result = helpers.call_action('datastore_create', **data) + analyze_sql = ''' + ANALYZE "{resource}"; + '''.format(resource=resource['id']) + db.get_write_engine().execute(analyze_sql) + search_data = { + 'resource_id': resource['id'], + 'filters': {u'the year': 1901}, + 'total_estimation_threshold': 5, + } + result = helpers.call_action('datastore_search', **search_data) + assert_equals(result['total'], 10) + # estimation is not compatible with filters + assert_equals(result.get('total_was_estimated'), False) + + def test_estimate_total_with_distinct(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'records': [{'the year': 1900 + i} for i in range(3)] * 10, + } + result = helpers.call_action('datastore_create', **data) + analyze_sql = ''' + ANALYZE "{resource}"; + '''.format(resource=resource['id']) + db.get_write_engine().execute(analyze_sql) + search_data = { + 'resource_id': resource['id'], + 'fields': ['the year'], + 'distinct': True, + 'total_estimation_threshold': 1, + } + result = helpers.call_action('datastore_search', **search_data) + assert_equals(result['total'], 3) + # estimation is not compatible with distinct + assert_equals(result.get('total_was_estimated'), False) + + def test_estimate_total_where_analyze_is_not_already_done(self): + # ANALYSE is done by latest datapusher/xloader, but need to cope in + # if tables created in other ways which may not have had an ANALYSE + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'records': [{'the year': 1900 + i} for i in range(100)], + } + 
result = helpers.call_action('datastore_create', **data) + search_data = { + 'resource_id': resource['id'], + 'total_estimation_threshold': 50, + } + result = helpers.call_action('datastore_search', **search_data) + assert_equals(result.get('total_was_estimated'), True) + assert 95 < result['total'] < 105, result['total'] + + def test_estimate_total_with_zero_threshold(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'records': [{'the year': 1900 + i} for i in range(100)], + } + result = helpers.call_action('datastore_create', **data) + analyze_sql = ''' + ANALYZE "{resource}"; + '''.format(resource=resource['id']) + db.get_write_engine().execute(analyze_sql) + search_data = { + 'resource_id': resource['id'], + 'total_estimation_threshold': 0, + } + result = helpers.call_action('datastore_search', **search_data) + # threshold of 0 means always estimate + assert_equals(result.get('total_was_estimated'), True) + assert 95 < result['total'] < 105, result['total'] + + def test_estimate_total_off(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'records': [{'the year': 1900 + i} for i in range(100)], + } + result = helpers.call_action('datastore_create', **data) + analyze_sql = ''' + ANALYZE "{resource}"; + '''.format(resource=resource['id']) + db.get_write_engine().execute(analyze_sql) + search_data = { + 'resource_id': resource['id'], + 'total_estimation_threshold': None, + } + result = helpers.call_action('datastore_search', **search_data) + # threshold of None means don't estimate + assert_equals(result.get('total_was_estimated'), False) + + def test_estimate_total_default_off(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'records': [{'the year': 1900 + i} for i in range(100)], + } + result = helpers.call_action('datastore_create', **data) + analyze_sql = ''' + ANALYZE "{resource}"; + '''.format(resource=resource['id']) + db.get_write_engine().execute(analyze_sql) + search_data = { + 'resource_id': resource['id'], + # don't specify total_estimation_threshold + } + result = helpers.call_action('datastore_search', **search_data) + # default threshold is None, meaning don't estimate + assert_equals(result.get('total_was_estimated'), False) def test_search_limit(self): resource = factories.Resource() diff --git a/ckanext/datastore/tests/test_upsert.py b/ckanext/datastore/tests/test_upsert.py index 57e56160587..58d88e5166e 100644 --- a/ckanext/datastore/tests/test_upsert.py +++ b/ckanext/datastore/tests/test_upsert.py @@ -1,26 +1,21 @@ # encoding: utf-8 import json -import nose import datetime +from nose.tools import assert_equal, assert_not_equal import sqlalchemy.orm as orm -import ckan.plugins as p import ckan.lib.create_test_data as ctd import ckan.model as model -import ckan.tests.legacy as tests import ckan.tests.helpers as helpers import ckan.tests.factories as factories from ckan.plugins.toolkit import ValidationError -from ckan.common import config - import ckanext.datastore.backend.postgres as db from ckanext.datastore.tests.helpers import ( - set_url_type, DatastoreFunctionalTestBase, DatastoreLegacyTestBase) - -assert_equal = nose.tools.assert_equal + set_url_type, DatastoreFunctionalTestBase, DatastoreLegacyTestBase, + when_was_last_analyze) class TestDatastoreUpsert(DatastoreFunctionalTestBase): @@ -135,6 +130,47 @@ def test_dry_run_trigger_error(self): else: assert 0, 'error not raised' + def 
test_calculate_record_count_is_false(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'fields': [{'id': 'name', 'type': 'text'}, + {'id': 'age', 'type': 'text'}], + } + helpers.call_action('datastore_create', **data) + data = { + 'resource_id': resource['id'], + 'force': True, + 'method': 'insert', + 'records': [{"name": "Sunita", "age": "51"}, + {"name": "Bowan", "age": "68"}], + } + helpers.call_action('datastore_upsert', **data) + last_analyze = when_was_last_analyze(resource['id']) + assert_equal(last_analyze, None) + + def test_calculate_record_count(self): + resource = factories.Resource() + data = { + 'resource_id': resource['id'], + 'force': True, + 'fields': [{'id': 'name', 'type': 'text'}, + {'id': 'age', 'type': 'text'}], + } + helpers.call_action('datastore_create', **data) + data = { + 'resource_id': resource['id'], + 'force': True, + 'method': 'insert', + 'records': [{"name": "Sunita", "age": "51"}, + {"name": "Bowan", "age": "68"}], + 'calculate_record_count': True + } + helpers.call_action('datastore_upsert', **data) + last_analyze = when_was_last_analyze(resource['id']) + assert_not_equal(last_analyze, None) + class TestDatastoreUpsertLegacyTests(DatastoreLegacyTestBase): sysadmin_user = None diff --git a/ckanext/reclineview/theme/public/vendor/ckan.js/ckan.js b/ckanext/reclineview/theme/public/vendor/ckan.js/ckan.js index 82a6472a5dc..184ec4b5248 100644 --- a/ckanext/reclineview/theme/public/vendor/ckan.js/ckan.js +++ b/ckanext/reclineview/theme/public/vendor/ckan.js/ckan.js @@ -10,7 +10,7 @@ if (isNodeModule) { } (function(my) { - my.Client = function(endpoint, apiKey) { + my.Client = function(endpoint, apiKey) { this.endpoint = _getEndpoint(endpoint); this.apiKey = apiKey; }; @@ -51,7 +51,8 @@ if (isNodeModule) { var out = { total: results.result.total, fields: fields, - hits: results.result.records + hits: results.result.records, + total_was_estimated: results.result.total_was_estimated }; cb(null, out); }); @@ -67,7 +68,7 @@ if (isNodeModule) { 'bool': 'boolean', }; - // + // my.jsonTableSchema2CkanTypes = { 'string': 'text', 'number': 'float', @@ -128,7 +129,7 @@ if (isNodeModule) { code: obj.status, message: obj.responseText } - cb(err); + cb(err); } if (options.headers) { options.beforeSend = function(req) { @@ -147,7 +148,8 @@ if (isNodeModule) { q: queryObj.q, filters: {}, limit: queryObj.size || 10, - offset: queryObj.from || 0 + offset: queryObj.from || 0, + total_estimation_threshold: 1000 }; if (queryObj.sort && queryObj.sort.length > 0) { @@ -188,7 +190,7 @@ if (isNodeModule) { // This provides connection to the CKAN DataStore (v2) // // General notes -// +// // We need 2 things to make most requests: // // 1. CKAN API endpoint @@ -196,13 +198,13 @@ if (isNodeModule) { // // There are 2 ways to specify this information. // -// EITHER (checked in order): +// EITHER (checked in order): // // * Every dataset must have an id equal to its resource id on the CKAN instance // * The dataset has an endpoint attribute pointing to the CKAN API endpoint // // OR: -// +// // Set the url attribute of the dataset to point to the Resource on the CKAN instance. The endpoint and id will then be automatically computed. 
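// A worked illustration of the new estimation plumbing (numbers are made up):
// the query above now sends total_estimation_threshold: 1000, so a search over
// a ~100,000-row resource may come back with an estimated total taken from
// PostgreSQL's statistics, and total_was_estimated tells the caller (recline,
// below) to present it as approximate.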
 var recline = recline || {};
 recline.Backend = recline.Backend || {};
diff --git a/ckanext/reclineview/theme/public/vendor/recline/recline.js b/ckanext/reclineview/theme/public/vendor/recline/recline.js
index 25caa0520c6..5f0954b411d 100755
--- a/ckanext/reclineview/theme/public/vendor/recline/recline.js
+++ b/ckanext/reclineview/theme/public/vendor/recline/recline.js
@@ -422,6 +422,7 @@ my.Dataset = Backbone.Model.extend({
     };
     this.facets = new my.FacetList();
     this.recordCount = null;
+    this.recordCountWasEstimated = null;
     this.queryState = new my.Query();
     this.queryState.bind('change facet:add', function () {
       self.query(); // We want to call query() without any arguments.
@@ -602,6 +603,10 @@ my.Dataset = Backbone.Model.extend({
   _handleQueryResult: function(queryResult) {
     var self = this;
     self.recordCount = queryResult.total;
+    self.recordCountWasEstimated = queryResult.total_was_estimated;
+    if (self.recordCountWasEstimated) {
+      self.recordCount = Math.floor((self.recordCount + 500)/1000) + '000';
+    }
     var docs = _.map(queryResult.hits, function(hit) {
       var _doc = new my.Record(hit);
       _doc.fields = self.fields;
@@ -626,6 +631,7 @@ my.Dataset = Backbone.Model.extend({
   toTemplateJSON: function() {
     var data = this.toJSON();
     data.recordCount = this.recordCount;
+    data.recordCountWasEstimated = this.recordCountWasEstimated;
     data.fields = this.fields.toJSON();
     return data;
   },
@@ -2619,7 +2625,10 @@ my.MultiView = Backbone.View.extend({
[The surrounding HTML of this template hunk was lost in extraction; the
surviving fragment shows the record-count label gaining an "about" prefix when
the total was estimated:]
-        {{recordCount}} records\
+        {{#recordCountWasEstimated}} \
+          about \
+        {{/recordCountWasEstimated}} \
+        {{recordCount}} records \
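(The display rounding added to _handleQueryResult above keeps only whole
thousands; a quick check of the arithmetic, as a Python sketch:)

    # Python equivalent of Math.floor((recordCount + 500)/1000) + '000'
    for estimate in (99499, 99500, 100423):
        print(str((estimate + 500) // 1000) + '000')
    # -> 99000, 100000, 100000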