Skip to content

Commit

Permalink
Backport of 4561-limit-datastore_search 74d8fb2
Browse files Browse the repository at this point in the history
  • Loading branch information
David Read committed Dec 7, 2018
1 parent 70bd900 commit 1534ae0
Show file tree
Hide file tree
Showing 8 changed files with 600 additions and 143 deletions.
17 changes: 16 additions & 1 deletion ckan/lib/navl/validators.py
Expand Up @@ -2,7 +2,7 @@

import ckan.lib.navl.dictization_functions as df

from ckan.common import _
from ckan.common import _, config

missing = df.missing
StopOnError = df.StopOnError
Expand Down Expand Up @@ -123,3 +123,18 @@ def unicode_only(value):
if not isinstance(value, unicode):
raise Invalid(_('Must be a Unicode string value'))
return value

def limit_to_configured_maximum(config_option, default_limit):
'''
If the value is over a limit, it changes it to the limit. The limit is
defined by a configuration option, or if that is not set, a given int
default_limit.
'''
def callable(key, data, errors, context):

value = data.get(key)
limit = int(config.get(config_option, default_limit))
if value > limit:
data[key] = limit

return callable
20 changes: 18 additions & 2 deletions ckanext/datastore/backend/postgres.py
Expand Up @@ -1342,7 +1342,7 @@ def _execute_single_statement_copy_to(context, sql_string, where_values, buf):
cursor.close()


def format_results(context, results, data_dict):
def format_results(context, results, data_dict, rows_max):
result_fields = []
for field in results.cursor.description:
result_fields.append({
Expand All @@ -1358,6 +1358,8 @@ def format_results(context, results, data_dict):
field['type'])
records.append(converted_row)
data_dict['records'] = records
if data_dict.get('records_truncated', False):
data_dict['records'] = data_dict['records'][:rows_max]
data_dict['fields'] = result_fields

return _unrename_json_field(data_dict)
Expand Down Expand Up @@ -1504,6 +1506,11 @@ def search_sql(context, data_dict):

sql = data_dict['sql'].replace('%', '%%')

# limit the number of results to ckan.datastore.search.rows_max + 1
# (the +1 is so that we know if the results went over the limit or not)
rows_max = int(config.get('ckan.datastore.search.rows_max', 32000))
sql = 'SELECT * FROM ({0}) AS blah LIMIT {1} ;'.format(sql, rows_max + 1)

try:

context['connection'].execute(
Expand All @@ -1520,7 +1527,10 @@ def search_sql(context, data_dict):

results = context['connection'].execute(sql)

return format_results(context, results, data_dict)
if results.rowcount == rows_max + 1:
data_dict['records_truncated'] = True

return format_results(context, results, data_dict, rows_max)

except ProgrammingError, e:
if e.orig.pgcode == _PG_ERR_CODE['permission_denied']:
Expand Down Expand Up @@ -1695,6 +1705,11 @@ def configure(self, config):
else:
self._check_urls_and_permissions()

# check rows_max is valid on CKAN start-up
rows_max = config.get('ckan.datastore.search.rows_max')
if rows_max is not None:
int(rows_max)

def datastore_delete(self, context, data_dict, fields_types, query_dict):
query_dict['where'] += _where_clauses(data_dict, fields_types)
return query_dict
Expand All @@ -1709,6 +1724,7 @@ def datastore_search(self, context, data_dict, fields_types, query_dict):
field_ids = fields_types.keys()

ts_query, rank_column = _textsearch_query(data_dict)
# add default limit here just in case - already defaulted in the schema
limit = data_dict.get('limit', 100)
offset = data_dict.get('offset', 0)

Expand Down
16 changes: 13 additions & 3 deletions ckanext/datastore/controller.py
Expand Up @@ -14,6 +14,7 @@
render,
c,
h,
config,
)
from ckanext.datastore.writer import (
csv_writer,
Expand Down Expand Up @@ -141,6 +142,15 @@ def result_page(offs, lim):

result = result_page(offset, limit)

if result['limit'] != limit:
# `limit` (from PAGINATE_BY) must have been more than
# ckan.datastore.search.rows_max, so datastore_search responded with a
# limit matching ckan.datastore.search.rows_max. So we need to paginate
# by that amount instead, otherwise we'll have gaps in the records.
paginate_by = result['limit']
else:
paginate_by = PAGINATE_BY

with start_writer(result['fields']) as wr:
while True:
if limit is not None and limit <= 0:
Expand All @@ -151,14 +161,14 @@ def result_page(offs, lim):
wr.write_records(records)

if records_format == 'objects' or records_format == 'lists':
if len(records) < PAGINATE_BY:
if len(records) < paginate_by:
break
elif not records:
break

offset += PAGINATE_BY
offset += paginate_by
if limit is not None:
limit -= PAGINATE_BY
limit -= paginate_by
if limit <= 0:
break

Expand Down
16 changes: 14 additions & 2 deletions ckanext/datastore/logic/action.py
Expand Up @@ -392,7 +392,9 @@ def datastore_search(context, data_dict):
:param language: language of the full text query
(optional, default: english)
:type language: string
:param limit: maximum number of rows to return (optional, default: 100)
:param limit: maximum number of rows to return
(optional, default: ``100``, upper limit: ``32000`` unless set in
site's configuration ``ckan.datastore.search.rows_max``)
:type limit: int
:param offset: offset this number of rows (optional)
:type offset: int
Expand Down Expand Up @@ -432,14 +434,22 @@ def datastore_search(context, data_dict):
:type fields: list of dictionaries
:param offset: query offset value
:type offset: int
:param limit: query limit value
:param limit: queried limit value (if the requested ``limit`` was above the
``ckan.datastore.search.rows_max`` value then this response ``limit``
will be set to the value of ``ckan.datastore.search.rows_max``)
:type limit: int
:param filters: query filters
:type filters: list of dictionaries
:param total: number of total matching records
:type total: int
:param records: list of matching results
:type records: depends on records_format value passed
:param records_truncated: indicates whether the number of records returned
was limited by the internal limit, which is 32000 records (or other
value set in the site's configuration
``ckan.datastore.search.rows_max``). If records are truncated by this,
this key has value True, otherwise the key is not returned at all.
:type records_truncated: bool
'''
backend = DatastoreBackend.get_active_backend()
Expand Down Expand Up @@ -481,6 +491,8 @@ def datastore_search_sql(context, data_dict):
engine is the
`PostgreSQL engine <http://www.postgresql.org/docs/9.1/interactive/>`_.
There is an enforced timeout on SQL queries to avoid an unintended DOS.
The number of results returned is limited to 32000, unless set in the
site's configuration ``ckan.datastore.search.rows_max``
DataStore resource that belong to a private CKAN resource cannot be
searched with this action. Use
:meth:`~ckanext.datastore.logic.action.datastore_search` instead.
Expand Down
6 changes: 5 additions & 1 deletion ckanext/datastore/logic/schema.py
Expand Up @@ -18,6 +18,8 @@
OneOf = get_validator('OneOf')
unicode_only = get_validator('unicode_only')
default = get_validator('default')
natural_number_validator = get_validator('natural_number_validator')
limit_to_configured_maximum = get_validator('limit_to_configured_maximum')


def rename(old, new):
Expand Down Expand Up @@ -157,7 +159,9 @@ def datastore_search_schema():
'plain': [ignore_missing, boolean_validator],
'filters': [ignore_missing, json_validator],
'language': [ignore_missing, unicode],
'limit': [ignore_missing, int_validator],
'limit': [default(100), natural_number_validator,
limit_to_configured_maximum('ckan.datastore.search.rows_max',
32000)],
'offset': [ignore_missing, int_validator],
'fields': [ignore_missing, list_of_strings_or_string],
'sort': [ignore_missing, list_of_strings_or_string],
Expand Down

0 comments on commit 1534ae0

Please sign in to comment.