[#1792] Optimize FTS queries on non-indexed fields

As per the commit aa51630, we can't create full-text search indexes on all field types. Specifically, we're only indexing textual and numeric fields. Even though we don't index other fields, the users are still able to do FTS queries on them, and we also use them on the autocomplete, so we'd like them to be relatively fast. When creating a datastore table, we add a "_full_text" column with the content of all the other columns. This is used when the user wants to do a FTS on the entire resource, regardless of columns. The idea behind the optimization in this patch is to use that "_full_text" column to limit the number of rows Postgres will search. While testing, I've used a sample table with 1 million rows and 540 MB. Among others, it has a column named "timestamp" with (obviously) a timestamp. There're no indexes on this column. I've done 2 tests: (a) do a FTS only on the timestamp column, and; (b) do a FTS on both the timestamp and _full_text columns. The results below are the average of running the tests 5 times. (a) Querying for '06' just on the timestamp field took 5682ms; (b) Querying for '06' both on the _full_text and timestamp fields took 90ms For this test, querying for both _full_text and timestamp made the query ~63 times faster. Unfortunately, the result depends a lot on the query terms and the data. On our test data, every timestamp is from 2014. If instead of searching for '06' we search for '2014', this is what we get (again, repeated 5 times and averaged out): (a) Querying for '2014' just on the timestamp field took 12455ms; (a) Querying for '2014' just on the _full_text and timestamp fields took 14534ms; Using the _full_text field made this query ~17% slower. The problem here is that this optimization only makes sense if quering the _full_text reduces the number of rows that we have to query on the timestamp field. When that's not the case, it makes the query slower. In my opinion, this patch is still worthy.
ckan · Aug 7, 2014 · c033efb · c033efb
1 parent aa51630
commit c033efb
Show file tree

Hide file tree

Showing 7 changed files with 178 additions and 27 deletions.
diff --git a/ckanext/datastore/db.py b/ckanext/datastore/db.py
@@ -233,14 +233,13 @@ def _get_fields_types(context, data_dict):
 def json_get_values(obj, current_list=None):
     if current_list is None:
         current_list = []
-    if isinstance(obj, basestring):
-        current_list.append(obj)
-    if isinstance(obj, list):
+    if isinstance(obj, list) or isinstance(obj, tuple):
         for item in obj:
             json_get_values(item, current_list)
-    if isinstance(obj, dict):
-        for item in obj.values():
-            json_get_values(item, current_list)
+    elif isinstance(obj, dict):
+        json_get_values(obj.items(), current_list)
+    elif obj:
+        current_list.append(str(obj))
     return current_list
 
 
@@ -454,17 +453,17 @@ def _build_fts_indexes(connection, data_dict, sql_index_str_method, fields):
     # create full-text search indexes
     to_tsvector = lambda x: u"to_tsvector('{0}', {1})".format(fts_lang, x)
     cast_as_text = lambda x: u'cast("{0}" AS text)'.format(x)
-    full_text_field = {'type': 'text', 'id': '_full_text'}
+    full_text_field = {'type': 'tsvector', 'id': '_full_text'}
     for field in [full_text_field] + fields:
-        if not _should_index_field(field):
+        if not datastore_helpers.should_fts_index_field_type(field['type']):
             continue
 
         field_str = field['id']
-        if field['type'] != 'text':
+        if field['type'] not in ['text', 'tsvector']:
             field_str = cast_as_text(field_str)
         else:
             field_str = u'"{0}"'.format(field_str)
-        if field['id'] != '_full_text':
+        if field['type'] != 'tsvector':
             field_str = to_tsvector(field_str)
         fts_indexes.append(sql_index_str_method.format(
             res_id=resource_id,
@@ -475,10 +474,6 @@ def _build_fts_indexes(connection, data_dict, sql_index_str_method, fields):
     return fts_indexes
 
 
-def _should_index_field(field):
-    return field['type'] in ['text', 'number']
-
-
 def _generate_index_name(resource_id, field):
     value = (resource_id + field).encode('utf-8')
     return hashlib.sha1(value).hexdigest()
@@ -765,11 +760,14 @@ def _to_full_text(fields, record):
     full_text = []
     for field in fields:
         value = record.get(field['id'])
-        if field['type'].lower() == 'nested' and value:
-            full_text.extend(json_get_values(value))
-        elif field['type'].lower() == 'text' and value:
+        if not value:
+            continue
+
+        if field['type'].lower() == 'text':
             full_text.append(value)
-    return ' '.join(full_text)
+        else:
+            full_text.extend(json_get_values(value))
+    return ' '.join(set(full_text))
 
 
 def _where(where_clauses_and_values):

diff --git a/ckanext/datastore/helpers.py b/ckanext/datastore/helpers.py
@@ -33,3 +33,7 @@ def _strip(input):
     if isinstance(input, basestring) and len(input) and input[0] == input[-1]:
         return input.strip().strip('"')
     return input
+
+
+def should_fts_index_field_type(field_type):
+    return field_type.lower() in ['tsvector', 'text', 'number']
diff --git a/ckanext/datastore/plugin.py b/ckanext/datastore/plugin.py
@@ -402,14 +402,23 @@ def _where(self, data_dict, fields_types):
         q = data_dict.get('q')
         if q:
             if isinstance(q, basestring):
-                clause_str = u'_full_text @@ {0}'.format(self._ts_query_alias())
+                ts_query_alias = self._ts_query_alias()
+                clause_str = u'_full_text @@ {0}'.format(ts_query_alias)
                 clauses.append((clause_str,))
             elif isinstance(q, dict):
+                lang = self._fts_lang(data_dict.get('lang'))
                 for field, value in q.iteritems():
                     if field not in fields_types:
                         continue
                     query_field = self._ts_query_alias(field)
-                    clause_str = u'cast("{0}" as text) @@ {1}'.format(field, query_field)
+
+                    ftyp = fields_types[field]
+                    if not datastore_helpers.should_fts_index_field_type(ftyp):
+                        clause_str = u'_full_text @@ {0}'.format(query_field)
+                        clauses.append((clause_str,))
+
+                    clause_str = (u'to_tsvector(\'{0}\', cast("{1}" as text)) '
+                                  u'@@ {2}').format(lang, field, query_field)
                     clauses.append((clause_str,))
 
         return clauses
@@ -450,10 +459,7 @@ def _sort(self, data_dict, fields_types):
 
     def _textsearch_query(self, data_dict):
         q = data_dict.get('q')
-        default_fts_lang = pylons.config.get('ckan.datastore.default_fts_lang')
-        if default_fts_lang is None:
-            default_fts_lang = u'english'
-        lang = data_dict.get(u'lang', default_fts_lang)
+        lang = self._fts_lang(data_dict.get('lang'))
 
         if not q:
             return '', ''
@@ -480,6 +486,12 @@ def _textsearch_query(self, data_dict):
         rank_columns_str = ', ' + ', '.join(rank_columns)
         return statements_str, rank_columns_str
 
+    def _fts_lang(self, lang=None):
+        default_fts_lang = pylons.config.get('ckan.datastore.default_fts_lang')
+        if default_fts_lang is None:
+            default_fts_lang = u'english'
+        return lang or default_fts_lang
+
     def _build_query_and_rank_statements(self, lang, query, plain, field=None):
         query_alias = self._ts_query_alias(field)
         rank_alias = self._ts_rank_alias(field)
@@ -490,7 +502,8 @@ def _build_query_and_rank_statements(self, lang, query, plain, field=None):
         if field is None:
             rank_field = '_full_text'
         else:
-            rank_field = u'to_tsvector(cast("{0}" as text))'.format(field)
+            rank_field = u'to_tsvector(\'{lang}\', cast("{field}" as text))'
+            rank_field = rank_field.format(lang=lang, field=field)
         rank_statement = u'ts_rank({rank_field}, {query_alias}, 32) AS {alias}'
         statement = statement.format(lang=lang, query=query, alias=query_alias)
         rank_statement = rank_statement.format(rank_field=rank_field,

diff --git a/ckanext/datastore/tests/test_db.py b/ckanext/datastore/tests/test_db.py
@@ -102,3 +102,23 @@ def _assert_created_index_on(self, field, connection, resource_id, lang=None, ca
 
         assert was_called, ("Expected 'connection.execute' to have been "
                             "called with a string containing '%s'" % sql_str)
+
+
+class TestJsonGetValues(object):
+    def test_returns_empty_list_if_called_with_none(self):
+        assert_equal(db.json_get_values(None), [])
+
+    def test_returns_list_with_value_if_called_with_string(self):
+        assert_equal(db.json_get_values('foo'), ['foo'])
+
+    def test_returns_list_with_only_the_original_truthy_values_if_called(self):
+        data = [None, 'foo', 42, 'bar', {}, []]
+        assert_equal(db.json_get_values(data), ['foo', '42', 'bar'])
+
+    def test_returns_flattened_list(self):
+        data = ['foo', ['bar', ('baz', 42)]]
+        assert_equal(db.json_get_values(data), ['foo', 'bar', 'baz', '42'])
+
+    def test_returns_only_truthy_values_from_dict(self):
+        data = {'foo': 'bar', 'baz': [42, None, {}, [], 'hey']}
+        assert_equal(db.json_get_values(data), ['foo', 'bar', 'baz', '42', 'hey'])
diff --git a/ckanext/datastore/tests/test_helpers.py b/ckanext/datastore/tests/test_helpers.py
@@ -32,3 +32,20 @@ def test_is_single_statement(self):
 
         for multiple in multiples:
             assert helpers.is_single_statement(multiple) is False
+
+    def test_should_fts_index_field_type(self):
+        indexable_field_types = ['tsvector',
+                                 'text',
+                                 'number']
+
+        non_indexable_field_types = ['nested',
+                                     'timestamp',
+                                     'date',
+                                     '_text',
+                                     'text[]']
+
+        for indexable in indexable_field_types:
+            assert helpers.should_fts_index_field_type(indexable) is True
+
+        for non_indexable in non_indexable_field_types:
+            assert helpers.should_fts_index_field_type(non_indexable) is False
diff --git a/ckanext/datastore/tests/test_plugin.py b/ckanext/datastore/tests/test_plugin.py
@@ -1,4 +1,5 @@
 import nose
+import mock
 
 import ckan.new_tests.helpers as helpers
 import ckan.plugins as p
@@ -92,6 +93,104 @@ def test_lang_parameter_overwrites_default_fts_lang(self):
 
         assert_equal(result['ts_query'], expected_ts_query)
 
+    def test_fts_rank_column_uses_lang_when_casting_to_tsvector(self):
+        expected_select_content = u'to_tsvector(\'french\', cast("country" as text))'
+        data_dict = {
+            'q': {'country': 'Brazil'},
+            'lang': 'french',
+        }
+
+        result = self._datastore_search(data_dict=data_dict)
+
+        assert expected_select_content in result['select'][0], result['select']
+
+    def test_adds_fts_on_full_text_field_when_q_is_a_string(self):
+        expected_where = [(u'_full_text @@ "query"',)]
+        data_dict = {
+            'q': 'foo',
+        }
+
+        result = self._datastore_search(data_dict=data_dict)
+
+        assert_equal(result['where'], expected_where)
+
+    def test_ignores_fts_searches_on_inexistent_fields(self):
+        data_dict = {
+            'q': {'inexistent-field': 'value'},
+        }
+
+        result = self._datastore_search(data_dict=data_dict, fields_types={})
+
+        assert_equal(result['where'], [])
+
+    @helpers.change_config('ckan.datastore.default_fts_lang', None)
+    def test_fts_where_clause_lang_uses_english_by_default(self):
+        expected_where = [(u'to_tsvector(\'english\', cast("country" as text))'
+                           u' @@ "query country"',)]
+        data_dict = {
+            'q': {'country': 'Brazil'},
+        }
+        fields_types = {
+            'country': 'text',
+        }
+
+        result = self._datastore_search(data_dict=data_dict,
+                                        fields_types=fields_types)
+
+        assert_equal(result['where'], expected_where)
+
+    @helpers.change_config('ckan.datastore.default_fts_lang', 'simple')
+    def test_fts_where_clause_lang_can_be_overwritten_by_config(self):
+        expected_where = [(u'to_tsvector(\'simple\', cast("country" as text))'
+                           u' @@ "query country"',)]
+        data_dict = {
+            'q': {'country': 'Brazil'},
+        }
+        fields_types = {
+            'country': 'text',
+        }
+
+        result = self._datastore_search(data_dict=data_dict,
+                                        fields_types=fields_types)
+
+        assert_equal(result['where'], expected_where)
+
+    @helpers.change_config('ckan.datastore.default_fts_lang', 'simple')
+    def test_fts_where_clause_lang_can_be_overwritten_using_lang_param(self):
+        expected_where = [(u'to_tsvector(\'french\', cast("country" as text))'
+                           u' @@ "query country"',)]
+        data_dict = {
+            'q': {'country': 'Brazil'},
+            'lang': 'french',
+        }
+        fields_types = {
+            'country': 'text',
+        }
+
+        result = self._datastore_search(data_dict=data_dict,
+                                        fields_types=fields_types)
+
+        assert_equal(result['where'], expected_where)
+
+    @mock.patch('ckanext.datastore.helpers.should_fts_index_field_type')
+    def test_fts_adds_where_clause_on_full_text_when_querying_non_indexed_fields(self, should_fts_index_field_type):
+        should_fts_index_field_type.return_value = False
+        expected_where = [('_full_text @@ "query country"',),
+                          (u'to_tsvector(\'english\', cast("country" as text))'
+                           u' @@ "query country"',)]
+        data_dict = {
+            'q': {'country': 'Brazil'},
+            'lang': 'english',
+        }
+        fields_types = {
+            'country': 'non-indexed field type',
+        }
+
+        result = self._datastore_search(data_dict=data_dict,
+                                        fields_types=fields_types)
+
+        assert_equal(result['where'], expected_where)
+
     def _datastore_search(self, context={}, data_dict={}, fields_types={}, query_dict={}):
         _query_dict = {
             'select': [],

diff --git a/ckanext/datastore/tests/test_search.py b/ckanext/datastore/tests/test_search.py
@@ -732,13 +732,13 @@ def setup_class(cls):
             name='test_org',
             apikey=cls.sysadmin_user.apikey)
 
-        cls.expected_records = [{u'_full_text': u"'annakarenina':1 'b':3 'moo':4 'tolstoy':2",
+        cls.expected_records = [{u'_full_text': u"'-01':3 '-03':2 '2005':1 'annakarenina':7 'b':6 'moo':4 'tolstoy':5",
                                  u'_id': 1,
                                  u'author': u'tolstoy',
                                  u'b\xfck': u'annakarenina',
                                  u'nested': [u'b', {u'moo': u'moo'}],
                                  u'published': u'2005-03-01T00:00:00'},
-                                {u'_full_text': u"'b':3 'tolstoy':2 'warandpeac':1",
+                                {u'_full_text': u"'b':4 'tolstoy':3 'warandpeac':2",
                                  u'_id': 2,
                                  u'author': u'tolstoy',
                                  u'b\xfck': u'warandpeace',