Skip to content

Commit

Permalink
full text search with ranking based on query
Browse files Browse the repository at this point in the history
  • Loading branch information
domoritz committed Sep 3, 2012
1 parent 96d5e44 commit 676c417
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 21 deletions.
45 changes: 33 additions & 12 deletions ckanext/datastore/db.py
Expand Up @@ -342,21 +342,35 @@ def _where(field_ids, data_dict):
where_clauses.append(u'"{}" = %s'.format(field))
values.append(value)

q = data_dict.get('q')
if q:
if (not data_dict.get('plain')
or str(data_dict.get('plain').lower()) in ['true', '1']):
where_clauses.append('_full_text @@ plainto_tsquery(%s)')
else:
where_clauses.append('_full_text @@ to_tsquery(%s)')
values.append(q)
# add full-text search where clause
if data_dict.get('q'):
where_clauses.append('_full_text @@ query')

where_clause = ' and '.join(where_clauses)
if where_clause:
where_clause = 'where ' + where_clause
return where_clause, values


def _textsearch_query(data_dict):
q = data_dict.get('q')
if q:
if (not data_dict.get('plain')
or str(data_dict.get('plain')).lower() in ['true', '1']):
statement = ", plainto_tsquery('{lang}', '{query}') query"
else:
statement = ", to_tsquery('{lang}', '{query}') query"

rank_column = ', ts_rank_cd(_full_text, query, 32) AS rank'
return statement.format(lang='english', query=q), rank_column
return '', ''


def _rank_column(data_dict):
if data_dict.get('plain'):
return ', ts_rank_cd(_full_text, query, 32) AS rank'


def _sort(context, sort, field_ids):

if not sort:
Expand Down Expand Up @@ -437,6 +451,7 @@ def search_data(context, data_dict):

select_columns = ', '.join([u'"{}"'.format(field_id)
for field_id in field_ids])
ts_query, rank_column = _textsearch_query(data_dict)
where_clause, where_values = _where(all_field_ids, data_dict)
limit = data_dict.get('limit', 100)
offset = data_dict.get('offset', 0)
Expand All @@ -446,10 +461,16 @@ def search_data(context, data_dict):

sort = _sort(context, data_dict.get('sort'), field_ids)

sql_string = u'''select {}, count(*) over() as "_full_count"
from "{}" {} {} limit {} offset {}'''\
.format(select_columns, data_dict['resource_id'], where_clause,
sort, limit, offset)
sql_string = u'''select {select}, count(*) over() as "_full_count" {rank}
from "{resource}" {ts_query}
{where} {sort} limit {limit} offset {offset}'''\
.format(
select=select_columns,
rank=rank_column,
resource=data_dict['resource_id'],
ts_query=ts_query,
where=where_clause,
sort=sort, limit=limit, offset=offset)
results = context['connection'].execute(sql_string, where_values)
return format_results(context, results, data_dict)

Expand Down
35 changes: 26 additions & 9 deletions ckanext/datastore/tests/test_datastore.py
Expand Up @@ -9,6 +9,10 @@
import pprint


def extract(d, keys):
    """Return a new dict holding only the entries of ``d`` named in ``keys``.

    Keys that are not present in ``d`` are silently skipped.
    """
    subset = {}
    for key in keys:
        if key in d:
            subset[key] = d[key]
    return subset


class TestDatastoreCreate(tests.WsgiAppCase):
sysadmin_user = None
normal_user = None
Expand Down Expand Up @@ -658,6 +662,7 @@ def test_search_invalid_offset(self):
def test_search_full_text(self):
data = {'resource_id': self.data['resource_id'],
'q': 'annakarenina'}

postparams = '%s=1' % json.dumps(data)
auth = {'Authorization': str(self.sysadmin_user.apikey)}
res = self.app.post('/api/action/datastore_search', params=postparams,
Expand All @@ -666,7 +671,9 @@ def test_search_full_text(self):
assert res_dict['success'] is True
result = res_dict['result']
assert result['total'] == 1
assert result['records'] == [self.expected_records[0]]

results = [extract(result['records'][0], [u'_id', u'author', u'b\xfck', u'nested', u'published'])]
assert results == [self.expected_records[0]]

data = {'resource_id': self.data['resource_id'],
'q': 'tolstoy'}
Expand All @@ -677,9 +684,17 @@ def test_search_full_text(self):
assert res_dict['success'] is True
result = res_dict['result']
assert result['total'] == 2
assert result['records'] == self.expected_records, result['records']

assert result['fields'] == [{u'type': u'int4', u'id': u'_id'}, {u'type': u'text', u'id': u'b\xfck'}, {u'type': u'text', u'id': u'author'}, {u'type': u'timestamp', u'id': u'published'}, {u'type': u'_json', u'id': u'nested'}], result['fields']
results = [extract(record, [u'_id', u'author', u'b\xfck', u'nested', u'published']) for record in result['records']]
assert results == self.expected_records, result['records']

expected_fields = [{u'type': u'int4', u'id': u'_id'},
{u'type': u'text', u'id': u'b\xfck'},
{u'type': u'text', u'id': u'author'},
{u'type': u'timestamp', u'id': u'published'},
{u'type': u'_json', u'id': u'nested'},
{u'type': u'float4', u'id': u'rank'}]
for field in expected_fields:
assert field in result['fields'], field

# test multiple word queries (connected with and)
data = {'resource_id': self.data['resource_id'],
Expand All @@ -692,9 +707,11 @@ def test_search_full_text(self):
assert res_dict['success'] is True
result = res_dict['result']
assert result['total'] == 1
assert result['records'] == [self.expected_records[0]], result['records']
results = [extract(result['records'][0], [u'_id', u'author', u'b\xfck', u'nested', u'published'])]
assert results == [self.expected_records[0]], result['records']

assert result['fields'] == [{u'type': u'int4', u'id': u'_id'}, {u'type': u'text', u'id': u'b\xfck'}, {u'type': u'text', u'id': u'author'}, {u'type': u'timestamp', u'id': u'published'}, {u'type': u'_json', u'id': u'nested'}], result['fields']
for field in expected_fields:
assert field in result['fields'], field


class TestDatastoreFullTextSearch(tests.WsgiAppCase):
Expand All @@ -706,8 +723,8 @@ def setup_class(cls):
cls.normal_user = model.User.get('annafan')
resource = model.Package.get('annakarenina').resources[0]
cls.data = dict(
resource_id = resource.id,
fields = [
resource_id=resource.id,
fields=[
{'id': 'id'},
{'id': 'date', 'type':'date'},
{'id': 'x'},
Expand All @@ -718,7 +735,7 @@ def setup_class(cls):
{'id': 'lat'},
{'id': 'lon'}
],
records = [
records=[
{'id': 0, 'date': '2011-01-01', 'x': 1, 'y': 2, 'z': 3, 'country': 'DE', 'title': 'first', 'lat':52.56, 'lon':13.40},
{'id': 1, 'date': '2011-02-02', 'x': 2, 'y': 4, 'z': 24, 'country': 'UK', 'title': 'second', 'lat':54.97, 'lon':-1.60},
{'id': 2, 'date': '2011-03-03', 'x': 3, 'y': 6, 'z': 9, 'country': 'US', 'title': 'third', 'lat':40.00, 'lon':-75.5},
Expand Down

0 comments on commit 676c417

Please sign in to comment.