Skip to content

Commit

Permalink
[#1792] Optimize FTS queries on non-indexed fields
Browse files Browse the repository at this point in the history
As per the commit aa51630, we can't create full-text search indexes on all
field types. Specifically, we're only indexing textual and numeric fields. Even
though we don't index other fields, the users are still able to do FTS queries
on them, and we also use them on the autocomplete, so we'd like them to be
relatively fast.

When creating a datastore table, we add a "_full_text" column with the content
of all the other columns. This is used when the user wants to do a FTS on the
entire resource, regardless of columns.

The idea behind the optimization in this patch is to use that "_full_text"
column to limit the number of rows Postgres will search. While testing, I've
used a sample table with 1 million rows and 540 MB. Among others, it has a
column named "timestamp" with (obviously) a timestamp. There're no indexes on
this column. I've done 2 tests: (a) do a FTS only on the timestamp column, and;
(b) do a FTS on both the timestamp and _full_text columns.

The results below are the average of running the tests 5 times.

(a) Querying for '06' just on the timestamp field took 5682ms;
(b) Querying for '06' both on the _full_text and timestamp fields took 90ms

For this test, querying for both _full_text and timestamp made the query ~63
times faster. Unfortunately, the result depends a lot on the query terms and
the data. On our test data, every timestamp is from 2014. If instead of
searching for '06' we search for '2014', this is what we get (again, repeated 5
times and averaged out):

(a) Querying for '2014' just on the timestamp field took 12455ms;
(a) Querying for '2014' just on the _full_text and timestamp fields took 14534ms;

Using the _full_text field made this query ~17% slower. The problem here is
that this optimization only makes sense if quering the _full_text reduces the
number of rows that we have to query on the timestamp field. When that's not
the case, it makes the query slower.

In my opinion, this patch is still worthy.
  • Loading branch information
vitorbaptista committed Aug 7, 2014
1 parent aa51630 commit c033efb
Show file tree
Hide file tree
Showing 7 changed files with 178 additions and 27 deletions.
34 changes: 16 additions & 18 deletions ckanext/datastore/db.py
Expand Up @@ -233,14 +233,13 @@ def _get_fields_types(context, data_dict):
def json_get_values(obj, current_list=None):
if current_list is None:
current_list = []
if isinstance(obj, basestring):
current_list.append(obj)
if isinstance(obj, list):
if isinstance(obj, list) or isinstance(obj, tuple):
for item in obj:
json_get_values(item, current_list)
if isinstance(obj, dict):
for item in obj.values():
json_get_values(item, current_list)
elif isinstance(obj, dict):
json_get_values(obj.items(), current_list)
elif obj:
current_list.append(str(obj))
return current_list


Expand Down Expand Up @@ -454,17 +453,17 @@ def _build_fts_indexes(connection, data_dict, sql_index_str_method, fields):
# create full-text search indexes
to_tsvector = lambda x: u"to_tsvector('{0}', {1})".format(fts_lang, x)
cast_as_text = lambda x: u'cast("{0}" AS text)'.format(x)
full_text_field = {'type': 'text', 'id': '_full_text'}
full_text_field = {'type': 'tsvector', 'id': '_full_text'}
for field in [full_text_field] + fields:
if not _should_index_field(field):
if not datastore_helpers.should_fts_index_field_type(field['type']):
continue

field_str = field['id']
if field['type'] != 'text':
if field['type'] not in ['text', 'tsvector']:
field_str = cast_as_text(field_str)
else:
field_str = u'"{0}"'.format(field_str)
if field['id'] != '_full_text':
if field['type'] != 'tsvector':
field_str = to_tsvector(field_str)
fts_indexes.append(sql_index_str_method.format(
res_id=resource_id,
Expand All @@ -475,10 +474,6 @@ def _build_fts_indexes(connection, data_dict, sql_index_str_method, fields):
return fts_indexes


def _should_index_field(field):
return field['type'] in ['text', 'number']


def _generate_index_name(resource_id, field):
value = (resource_id + field).encode('utf-8')
return hashlib.sha1(value).hexdigest()
Expand Down Expand Up @@ -765,11 +760,14 @@ def _to_full_text(fields, record):
full_text = []
for field in fields:
value = record.get(field['id'])
if field['type'].lower() == 'nested' and value:
full_text.extend(json_get_values(value))
elif field['type'].lower() == 'text' and value:
if not value:
continue

if field['type'].lower() == 'text':
full_text.append(value)
return ' '.join(full_text)
else:
full_text.extend(json_get_values(value))
return ' '.join(set(full_text))


def _where(where_clauses_and_values):
Expand Down
4 changes: 4 additions & 0 deletions ckanext/datastore/helpers.py
Expand Up @@ -33,3 +33,7 @@ def _strip(input):
if isinstance(input, basestring) and len(input) and input[0] == input[-1]:
return input.strip().strip('"')
return input


def should_fts_index_field_type(field_type):
return field_type.lower() in ['tsvector', 'text', 'number']
27 changes: 20 additions & 7 deletions ckanext/datastore/plugin.py
Expand Up @@ -402,14 +402,23 @@ def _where(self, data_dict, fields_types):
q = data_dict.get('q')
if q:
if isinstance(q, basestring):
clause_str = u'_full_text @@ {0}'.format(self._ts_query_alias())
ts_query_alias = self._ts_query_alias()
clause_str = u'_full_text @@ {0}'.format(ts_query_alias)
clauses.append((clause_str,))
elif isinstance(q, dict):
lang = self._fts_lang(data_dict.get('lang'))
for field, value in q.iteritems():
if field not in fields_types:
continue
query_field = self._ts_query_alias(field)
clause_str = u'cast("{0}" as text) @@ {1}'.format(field, query_field)

ftyp = fields_types[field]
if not datastore_helpers.should_fts_index_field_type(ftyp):
clause_str = u'_full_text @@ {0}'.format(query_field)
clauses.append((clause_str,))

clause_str = (u'to_tsvector(\'{0}\', cast("{1}" as text)) '
u'@@ {2}').format(lang, field, query_field)
clauses.append((clause_str,))

return clauses
Expand Down Expand Up @@ -450,10 +459,7 @@ def _sort(self, data_dict, fields_types):

def _textsearch_query(self, data_dict):
q = data_dict.get('q')
default_fts_lang = pylons.config.get('ckan.datastore.default_fts_lang')
if default_fts_lang is None:
default_fts_lang = u'english'
lang = data_dict.get(u'lang', default_fts_lang)
lang = self._fts_lang(data_dict.get('lang'))

if not q:
return '', ''
Expand All @@ -480,6 +486,12 @@ def _textsearch_query(self, data_dict):
rank_columns_str = ', ' + ', '.join(rank_columns)
return statements_str, rank_columns_str

def _fts_lang(self, lang=None):
default_fts_lang = pylons.config.get('ckan.datastore.default_fts_lang')
if default_fts_lang is None:
default_fts_lang = u'english'
return lang or default_fts_lang

def _build_query_and_rank_statements(self, lang, query, plain, field=None):
query_alias = self._ts_query_alias(field)
rank_alias = self._ts_rank_alias(field)
Expand All @@ -490,7 +502,8 @@ def _build_query_and_rank_statements(self, lang, query, plain, field=None):
if field is None:
rank_field = '_full_text'
else:
rank_field = u'to_tsvector(cast("{0}" as text))'.format(field)
rank_field = u'to_tsvector(\'{lang}\', cast("{field}" as text))'
rank_field = rank_field.format(lang=lang, field=field)
rank_statement = u'ts_rank({rank_field}, {query_alias}, 32) AS {alias}'
statement = statement.format(lang=lang, query=query, alias=query_alias)
rank_statement = rank_statement.format(rank_field=rank_field,
Expand Down
20 changes: 20 additions & 0 deletions ckanext/datastore/tests/test_db.py
Expand Up @@ -102,3 +102,23 @@ def _assert_created_index_on(self, field, connection, resource_id, lang=None, ca

assert was_called, ("Expected 'connection.execute' to have been "
"called with a string containing '%s'" % sql_str)


class TestJsonGetValues(object):
def test_returns_empty_list_if_called_with_none(self):
assert_equal(db.json_get_values(None), [])

def test_returns_list_with_value_if_called_with_string(self):
assert_equal(db.json_get_values('foo'), ['foo'])

def test_returns_list_with_only_the_original_truthy_values_if_called(self):
data = [None, 'foo', 42, 'bar', {}, []]
assert_equal(db.json_get_values(data), ['foo', '42', 'bar'])

def test_returns_flattened_list(self):
data = ['foo', ['bar', ('baz', 42)]]
assert_equal(db.json_get_values(data), ['foo', 'bar', 'baz', '42'])

def test_returns_only_truthy_values_from_dict(self):
data = {'foo': 'bar', 'baz': [42, None, {}, [], 'hey']}
assert_equal(db.json_get_values(data), ['foo', 'bar', 'baz', '42', 'hey'])
17 changes: 17 additions & 0 deletions ckanext/datastore/tests/test_helpers.py
Expand Up @@ -32,3 +32,20 @@ def test_is_single_statement(self):

for multiple in multiples:
assert helpers.is_single_statement(multiple) is False

def test_should_fts_index_field_type(self):
indexable_field_types = ['tsvector',
'text',
'number']

non_indexable_field_types = ['nested',
'timestamp',
'date',
'_text',
'text[]']

for indexable in indexable_field_types:
assert helpers.should_fts_index_field_type(indexable) is True

for non_indexable in non_indexable_field_types:
assert helpers.should_fts_index_field_type(non_indexable) is False
99 changes: 99 additions & 0 deletions ckanext/datastore/tests/test_plugin.py
@@ -1,4 +1,5 @@
import nose
import mock

import ckan.new_tests.helpers as helpers
import ckan.plugins as p
Expand Down Expand Up @@ -92,6 +93,104 @@ def test_lang_parameter_overwrites_default_fts_lang(self):

assert_equal(result['ts_query'], expected_ts_query)

def test_fts_rank_column_uses_lang_when_casting_to_tsvector(self):
expected_select_content = u'to_tsvector(\'french\', cast("country" as text))'
data_dict = {
'q': {'country': 'Brazil'},
'lang': 'french',
}

result = self._datastore_search(data_dict=data_dict)

assert expected_select_content in result['select'][0], result['select']

def test_adds_fts_on_full_text_field_when_q_is_a_string(self):
expected_where = [(u'_full_text @@ "query"',)]
data_dict = {
'q': 'foo',
}

result = self._datastore_search(data_dict=data_dict)

assert_equal(result['where'], expected_where)

def test_ignores_fts_searches_on_inexistent_fields(self):
data_dict = {
'q': {'inexistent-field': 'value'},
}

result = self._datastore_search(data_dict=data_dict, fields_types={})

assert_equal(result['where'], [])

@helpers.change_config('ckan.datastore.default_fts_lang', None)
def test_fts_where_clause_lang_uses_english_by_default(self):
expected_where = [(u'to_tsvector(\'english\', cast("country" as text))'
u' @@ "query country"',)]
data_dict = {
'q': {'country': 'Brazil'},
}
fields_types = {
'country': 'text',
}

result = self._datastore_search(data_dict=data_dict,
fields_types=fields_types)

assert_equal(result['where'], expected_where)

@helpers.change_config('ckan.datastore.default_fts_lang', 'simple')
def test_fts_where_clause_lang_can_be_overwritten_by_config(self):
expected_where = [(u'to_tsvector(\'simple\', cast("country" as text))'
u' @@ "query country"',)]
data_dict = {
'q': {'country': 'Brazil'},
}
fields_types = {
'country': 'text',
}

result = self._datastore_search(data_dict=data_dict,
fields_types=fields_types)

assert_equal(result['where'], expected_where)

@helpers.change_config('ckan.datastore.default_fts_lang', 'simple')
def test_fts_where_clause_lang_can_be_overwritten_using_lang_param(self):
expected_where = [(u'to_tsvector(\'french\', cast("country" as text))'
u' @@ "query country"',)]
data_dict = {
'q': {'country': 'Brazil'},
'lang': 'french',
}
fields_types = {
'country': 'text',
}

result = self._datastore_search(data_dict=data_dict,
fields_types=fields_types)

assert_equal(result['where'], expected_where)

@mock.patch('ckanext.datastore.helpers.should_fts_index_field_type')
def test_fts_adds_where_clause_on_full_text_when_querying_non_indexed_fields(self, should_fts_index_field_type):
should_fts_index_field_type.return_value = False
expected_where = [('_full_text @@ "query country"',),
(u'to_tsvector(\'english\', cast("country" as text))'
u' @@ "query country"',)]
data_dict = {
'q': {'country': 'Brazil'},
'lang': 'english',
}
fields_types = {
'country': 'non-indexed field type',
}

result = self._datastore_search(data_dict=data_dict,
fields_types=fields_types)

assert_equal(result['where'], expected_where)

def _datastore_search(self, context={}, data_dict={}, fields_types={}, query_dict={}):
_query_dict = {
'select': [],
Expand Down
4 changes: 2 additions & 2 deletions ckanext/datastore/tests/test_search.py
Expand Up @@ -732,13 +732,13 @@ def setup_class(cls):
name='test_org',
apikey=cls.sysadmin_user.apikey)

cls.expected_records = [{u'_full_text': u"'annakarenina':1 'b':3 'moo':4 'tolstoy':2",
cls.expected_records = [{u'_full_text': u"'-01':3 '-03':2 '2005':1 'annakarenina':7 'b':6 'moo':4 'tolstoy':5",
u'_id': 1,
u'author': u'tolstoy',
u'b\xfck': u'annakarenina',
u'nested': [u'b', {u'moo': u'moo'}],
u'published': u'2005-03-01T00:00:00'},
{u'_full_text': u"'b':3 'tolstoy':2 'warandpeac':1",
{u'_full_text': u"'b':4 'tolstoy':3 'warandpeac':2",
u'_id': 2,
u'author': u'tolstoy',
u'b\xfck': u'warandpeace',
Expand Down

0 comments on commit c033efb

Please sign in to comment.