Skip to content

Commit

Permalink
datastore dump is NOT limited by rows_max.
Browse files Browse the repository at this point in the history
  • Loading branch information
David Read committed Dec 7, 2018
1 parent fc0b465 commit 497795c
Show file tree
Hide file tree
Showing 7 changed files with 23 additions and 63 deletions.
27 changes: 10 additions & 17 deletions ckanext/datastore/controller.py
Expand Up @@ -174,41 +174,34 @@ def result_page(offs, lim):
'include_total': False,
}, **search_params))

# number of records the user can get is bounded by rows_max
rows_max = int(config.get('ckan.datastore.search.rows_max', 32000))
if limit > rows_max:
limit = rows_max

def set_header(num_records_written):
records_are_up_to_the_limit = num_records_written >= rows_max
response.headers['X-Records-Up-To-Rows-Max'] = \
str(records_are_up_to_the_limit).lower()

result = result_page(offset, limit)

if result['limit'] != limit:
# The limit has been reduced to ckan.datastore.search.rows_max
# so let's page over that amount instead
paginate_by = result['limit']
else:
paginate_by = PAGINATE_BY

with start_writer(result['fields']) as wr:
while True:
if limit is not None and limit <= 0:
set_header(num_records_written=offset)
break

records = result['records']

wr.write_records(records)

if records_format == 'objects' or records_format == 'lists':
if len(records) < PAGINATE_BY:
set_header(num_records_written=offset + len(records))
if len(records) < paginate_by:
break
elif not records:
set_header(num_records_written=offset)
break

offset += PAGINATE_BY
offset += paginate_by
if limit is not None:
limit -= PAGINATE_BY
limit -= paginate_by
if limit <= 0:
set_header(num_records_written=offset + limit)
break

result = result_page(offset, limit)
24 changes: 3 additions & 21 deletions ckanext/datastore/logic/action.py
Expand Up @@ -473,8 +473,9 @@ def datastore_search(context, data_dict):
:type fields: list of dictionaries
:param offset: query offset value
:type offset: int
:param limit: query limit value (which is bounded to site configuration
value ``ckan.datastore.search.rows_max`` which has default of 32000)
:param limit: queried limit value (if the requested ``limit`` was above the
``ckan.datastore.search.rows_max`` value then this response ``limit``
will be set to the value of ``ckan.datastore.search.rows_max``)
:type limit: int
:param filters: query filters
:type filters: list of dictionaries
Expand All @@ -484,12 +485,6 @@ def datastore_search(context, data_dict):
:type total_was_estimated: bool
:param records: list of matching results
:type records: depends on records_format value passed
:param records_truncated: indicates whether the number of records returned
was limited by the internal limit, which is 32000 records (or other
value set in the site's configuration
``ckan.datastore.search.rows_max``). If records are truncated by this,
this key has value True, otherwise the key is not returned at all.
:type records_truncated: bool
'''
backend = DatastoreBackend.get_active_backend()
Expand All @@ -498,13 +493,6 @@ def datastore_search(context, data_dict):
if errors:
raise p.toolkit.ValidationError(errors)

rows_max = int(config.get('ckan.datastore.search.rows_max', 32000))
if data_dict['limit'] > rows_max:
limit_reduced_by_rows_max = True
data_dict['limit'] = rows_max
else:
limit_reduced_by_rows_max = False

res_id = data_dict['resource_id']

if data_dict['resource_id'] not in WHITELISTED_RESOURCES:
Expand All @@ -527,12 +515,6 @@ def datastore_search(context, data_dict):
result.pop('id', None)
result.pop('connection_url', None)

# NB this is broken as result['records'] is a string - need total
if limit_reduced_by_rows_max and len(result['records']) == rows_max:
result['records'] = result['records'][:-1]
# result['total'] == rows_max
assert result['total'] == rows_max + 1
data_dict['records_truncated'] = True
return result


Expand Down
4 changes: 3 additions & 1 deletion ckanext/datastore/logic/schema.py
Expand Up @@ -169,7 +169,9 @@ def datastore_search_schema():
'plain': [ignore_missing, boolean_validator],
'filters': [ignore_missing, json_validator],
'language': [ignore_missing, text_type],
'limit': [default(100), natural_number_validator],
'limit': [default(100), natural_number_validator,
limit_to_configured_maximum('ckan.datastore.search.rows_max',
32000)],
'offset': [ignore_missing, int_validator],
'fields': [ignore_missing, list_of_strings_or_string],
'sort': [ignore_missing, list_of_strings_or_string],
Expand Down
17 changes: 5 additions & 12 deletions ckanext/datastore/tests/test_dump.py
Expand Up @@ -444,9 +444,9 @@ def test_dump_with_low_rows_max(self):
app = self._get_test_app()
response = app.get('/datastore/dump/{0}'.format(str(resource['id'])))
assert_equals('_id,book\r\n'
'1,annakarenina\n',
'1,annakarenina\n'
'2,warandpeace\n',
response.body)
assert response.headers['X-Records-Up-To-Rows-Max'] == 'true'

@mock.patch('ckanext.datastore.controller.PAGINATE_BY', 5)
def test_dump_pagination(self):
Expand All @@ -465,7 +465,6 @@ def test_dump_pagination(self):
'1,0\n2,1\n3,2\n4,3\n5,4\n6,5\n7,6\n8,7\n9,8\n10,9\n'
'11,10\n12,11\n',
response.body)
assert response.headers['X-Records-Up-To-Rows-Max'] == 'false'

@helpers.change_config('ckan.datastore.search.rows_max', '7')
@mock.patch('ckanext.datastore.controller.PAGINATE_BY', 5)
Expand All @@ -485,7 +484,6 @@ def test_dump_pagination_csv_with_limit(self):
'_id,record\r\n'
'1,0\n2,1\n3,2\n4,3\n5,4\n6,5\n',
response.body)
assert response.headers['X-Records-Up-To-Rows-Max'] == 'false'

@helpers.change_config('ckan.datastore.search.rows_max', '7')
@mock.patch('ckanext.datastore.controller.PAGINATE_BY', 6)
Expand All @@ -505,7 +503,6 @@ def test_dump_pagination_csv_with_limit_same_as_paginate(self):
'_id,record\r\n'
'1,0\n2,1\n3,2\n4,3\n5,4\n6,5\n',
response.body)
assert response.headers['X-Records-Up-To-Rows-Max'] == 'false'

@helpers.change_config('ckan.datastore.search.rows_max', '6')
@mock.patch('ckanext.datastore.controller.PAGINATE_BY', 5)
Expand All @@ -522,9 +519,8 @@ def test_dump_pagination_with_rows_max(self):
response = app.get('/datastore/dump/{0}?limit=7'.format(str(resource['id'])))
assert_equals(
'_id,record\r\n'
'1,0\n2,1\n3,2\n4,3\n5,4\n6,5\n',
'1,0\n2,1\n3,2\n4,3\n5,4\n6,5\n7,6\n',
response.body)
assert response.headers['X-Records-Up-To-Rows-Max'] == 'true'

@helpers.change_config('ckan.datastore.search.rows_max', '6')
@mock.patch('ckanext.datastore.controller.PAGINATE_BY', 6)
Expand All @@ -541,9 +537,8 @@ def test_dump_pagination_with_rows_max_same_as_paginate(self):
response = app.get('/datastore/dump/{0}?limit=7'.format(str(resource['id'])))
assert_equals(
'_id,record\r\n'
'1,0\n2,1\n3,2\n4,3\n5,4\n6,5\n',
'1,0\n2,1\n3,2\n4,3\n5,4\n6,5\n7,6\n',
response.body)
assert response.headers['X-Records-Up-To-Rows-Max'] == 'true'

@helpers.change_config('ckan.datastore.search.rows_max', '7')
@mock.patch('ckanext.datastore.controller.PAGINATE_BY', 5)
Expand All @@ -565,7 +560,6 @@ def test_dump_pagination_json_with_limit(self):
' "records": [\n [1,0],\n [2,1],\n [3,2],\n [4,3],\n'
' [5,4],\n [6,5]\n]}\n',
response.body)
assert response.headers['X-Records-Up-To-Rows-Max'] == 'false'

@helpers.change_config('ckan.datastore.search.rows_max', '6')
@mock.patch('ckanext.datastore.controller.PAGINATE_BY', 5)
Expand All @@ -585,6 +579,5 @@ def test_dump_pagination_json_with_rows_max(self):
'{\n "fields": [{"type":"int","id":"_id"},'
'{"type":"int4","id":"record"}],\n'
' "records": [\n [1,0],\n [2,1],\n [3,2],\n [4,3],\n'
' [5,4],\n [6,5]\n]}\n',
' [5,4],\n [6,5],\n [7,6]\n]}\n',
response.body)
assert response.headers['X-Records-Up-To-Rows-Max'] == 'true'
6 changes: 2 additions & 4 deletions ckanext/datastore/tests/test_search.py
Expand Up @@ -301,9 +301,7 @@ def test_search_limit(self):
result = helpers.call_action('datastore_search', **search_data)
assert_equals(result['total'], 2)
assert_equals(result['records'], [{u'the year': 2014, u'_id': 1}])
# it's limited by the user-given limit, rather than the rows_max
# setting, so it is not records_truncated
assert 'records_truncated' not in result
assert_equals(result['limit'], 1)

def test_search_limit_invalid(self):
resource = factories.Resource()
Expand Down Expand Up @@ -363,7 +361,7 @@ def test_search_limit_config_default(self):
result = helpers.call_action('datastore_search', **search_data)
assert_equals(result['total'], 2)
assert_equals(result['records'], [{u'the year': 2014, u'_id': 1}])
assert_equals(result[u'records_truncated'], True)
assert_equals(result['limit'], 1)


class TestDatastoreSearchLegacyTests(DatastoreLegacyTestBase):
Expand Down
1 change: 0 additions & 1 deletion doc/maintaining/configuration.rst
Expand Up @@ -301,7 +301,6 @@ Maximum allowed value for the number of rows returned by the datastore.
Specifically this limits:
* ``datastore_search``'s ``limit`` parameter.
* ``datastore_search_sql`` queries have this limit inserted.
* Datastore 'dump' at /datastore/dump/{RESOURCE-ID}

Site Settings
-------------
Expand Down
7 changes: 0 additions & 7 deletions doc/maintaining/datastore.rst
Expand Up @@ -252,13 +252,6 @@ tab-separated file use
A number of parameters from :meth:`~ckanext.datastore.logic.action.datastore_search` can be used:
``offset``, ``limit``, ``filters``, ``q``, ``distinct``, ``plain``, ``language``, ``fields``, ``sort``

The number of records returned is limited by site configuration
``ckan.datastore.search.rows_max``, which has a default of 32000.
Whether or not the number of records returned reaches this limit is indicated
by a HTTP header:

X-Records-Up-To-Rows-Max: true

.. _CSV: https://en.wikipedia.org/wiki/Comma-separated_values


Expand Down

0 comments on commit 497795c

Please sign in to comment.