diff --git a/ckan/config/solr/CHANGELOG.txt b/ckan/config/solr/CHANGELOG.txt index eb600e042cc..769c6b53735 100644 --- a/ckan/config/solr/CHANGELOG.txt +++ b/ckan/config/solr/CHANGELOG.txt @@ -1,6 +1,13 @@ CKAN SOLR schemas changelog =========================== +v2.0 - (ckan>=2.0) +-------------------- +* Add _version_ field to make it compatible with solr 4.0 +* Remove stopwords +* Add dataset_type field. +* Add *_date autofield. + v1.4 - (ckan>=1.7) -------------------- * Add Ascii folding filter to text fields. diff --git a/ckan/config/solr/schema-2.0.xml b/ckan/config/solr/schema-2.0.xml new file mode 100644 index 00000000000..7e681738c5b --- /dev/null +++ b/ckan/config/solr/schema-2.0.xml @@ -0,0 +1,164 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +index_id +text + + + + + + + + + + + + + + + + + + + + + diff --git a/ckan/controllers/home.py b/ckan/controllers/home.py index dc90e8ed4de..010e8073e86 100644 --- a/ckan/controllers/home.py +++ b/ckan/controllers/home.py @@ -68,7 +68,7 @@ def index(self): 'res_format': _('Formats'), 'license': _('Licence'), } - data_dict = {'order_by': 'packages', 'all_fields': 1} + data_dict = {'sort': 'packages', 'all_fields': 1} # only give the terms to group dictize that are returned in the # facets as full results take a lot longer if 'groups' in c.search_facets: diff --git a/ckan/lib/search/__init__.py b/ckan/lib/search/__init__.py index 0bd607a835b..053f7178d36 100644 --- a/ckan/lib/search/__init__.py +++ b/ckan/lib/search/__init__.py @@ -30,7 +30,7 @@ def text_traceback(): SIMPLE_SEARCH = asbool(config.get('ckan.simple_search', False)) -SUPPORTED_SCHEMA_VERSIONS = ['1.4'] +SUPPORTED_SCHEMA_VERSIONS = ['2.0'] DEFAULT_OPTIONS = { 'limit': 20, diff --git a/ckan/lib/search/index.py b/ckan/lib/search/index.py index 
dee7e27c7e8..22a22827769 100644 --- a/ckan/lib/search/index.py +++ b/ckan/lib/search/index.py @@ -7,6 +7,7 @@ import re from pylons import config +from paste.deploy.converters import asbool from common import SearchIndexError, make_connection from ckan.model import PackageRelationship @@ -223,6 +224,8 @@ def index_package(self, pkg_dict, defer_commit=False): try: conn = make_connection() commit = not defer_commit + if not asbool(config.get('ckan.search.solr_commit', 'true')): + commit = False conn.add_many([pkg_dict], _commit=commit) except Exception, e: log.exception(e) @@ -236,7 +239,7 @@ def index_package(self, pkg_dict, defer_commit=False): def commit(self): try: conn = make_connection() - conn.commit(wait_flush=False, wait_searcher=False) + conn.commit(wait_searcher=False) except Exception, e: log.exception(e) raise SearchIndexError(e) @@ -251,7 +254,8 @@ def delete_package(self, pkg_dict): config.get('ckan.site_id')) try: conn.delete_query(query) - conn.commit() + if asbool(config.get('ckan.search.solr_commit', 'true')): + conn.commit() except Exception, e: log.exception(e) raise SearchIndexError(e) diff --git a/ckan/lib/search/query.py b/ckan/lib/search/query.py index 5f1db0fca1a..73207468017 100644 --- a/ckan/lib/search/query.py +++ b/ckan/lib/search/query.py @@ -338,7 +338,9 @@ def run(self, query): if ':' not in query['q']: query['defType'] = 'dismax' query['tie'] = '0.1' - query['mm'] = '1' + # this minimum match is explained + # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29 + query['mm'] = '2<-1 5<80%' query['qf'] = query.get('qf', QUERY_FIELDS) conn = make_connection() diff --git a/ckan/tests/functional/test_search.py b/ckan/tests/functional/test_search.py index f4aaed71e1b..829a92c471b 100644 --- a/ckan/tests/functional/test_search.py +++ b/ckan/tests/functional/test_search.py @@ -62,12 +62,12 @@ def test_1_name(self): def test_2_title(self): # exact title, one word - res = self.app.get('/dataset?q=Opengov.se') + 
res = self.app.get('/dataset?q=Opengov') result = self._check_results(res, 1, 'se-opengov') # multiple words res = self.app.get('/dataset?q=Government%20Expenditure') - result = self._check_results(res, 5, 'uk-government-expenditure') + result = self._check_results(res, 1, 'uk-government-expenditure') class TestSearch2(FunctionalTestCase, PylonsTestCase):#, TestPackageForm): @@ -158,7 +158,7 @@ def test_search(self): res = self.app.get(offset) assert 'Search - ' in res form = res.forms['dataset-search'] - form['q'] = str(self.non_active_name) + form['q'] = 'name:' + str(self.non_active_name) results_page = form.submit() assert 'Search - ' in results_page, results_page assert '0 datasets found' in results_page, (self.non_active_name, results_page) diff --git a/ckan/tests/lib/test_solr_package_search.py b/ckan/tests/lib/test_solr_package_search.py index 75d54c0dafc..5f65db2fb62 100644 --- a/ckan/tests/lib/test_solr_package_search.py +++ b/ckan/tests/lib/test_solr_package_search.py @@ -88,21 +88,20 @@ def test_1_name_token(self): def test_2_title(self): # exact title, one word - result = search.query_for(model.Package).run({'q': u'Opengov.se'}) + result = search.query_for(model.Package).run({'q': u'Opengov'}) + assert self._pkg_names(result) == 'se-opengov', self._pkg_names(result) # multiple words result = search.query_for(model.Package).run({'q': u'Government Expenditure'}) # uk-government-expenditure is the best match but all other results should be retured assert self._pkg_names(result).startswith('uk-government-expenditure'), self._pkg_names(result) - # se-opengov has only government in tags, all others hav it in title. 
- assert self._pkg_names(result).endswith('se-opengov'), self._pkg_names(result) # multiple words wrong order result = search.query_for(model.Package).run({'q': u'Expenditure Government'}) assert self._pkg_names(result).startswith('uk-government-expenditure'), self._pkg_names(result) - assert self._pkg_names(result).endswith('se-opengov'), self._pkg_names(result) # multiple words all should match government + result = search.query_for(model.Package).run({'q': u'Expenditure Government China'}) - assert len(result['results']) == 5, self._pkg_names(result) + assert len(result['results']) == 1, self._pkg_names(result) def test_3_licence(self): # this should result, but it is here to check that at least it does not error @@ -136,7 +135,7 @@ def test_tags_field_with_capitals(self): def dont_test_tags_field_with_basic_unicode(self): result = search.query_for(model.Package).run({'q': u'greek omega \u03a9'}) assert self._check_entity_names(result, ['se-publications']), self._pkg_names(result) - + def test_tags_token_simple(self): result = search.query_for(model.Package).run({'q': u'tags:country-sweden'}) assert self._check_entity_names(result, ['se-publications', 'se-opengov']), self._pkg_names(result) @@ -146,7 +145,7 @@ def test_tags_token_simple(self): def test_tags_token_with_multi_word_tag(self): result = search.query_for(model.Package).run({'q': u'tags:"todo split"'}) assert self._check_entity_names(result, ['us-gov-images']), self._pkg_names(result) - + def test_tags_token_simple_with_deleted_tag(self): # registry has been deleted result = search.query_for(model.Package).run({'q': u'tags:registry'}) @@ -287,7 +286,7 @@ def test_search_notes_on(self): pkgs = result['results'] count = result['count'] assert len(pkgs) == 2, pkgs - + def test_search_foreign_chars(self): result = search.query_for(model.Package).run({'q': 'umlaut'}) assert result['results'] == ['gils'], result['results'] @@ -319,8 +318,8 @@ def test_overall(self): check_search_results('annakarenina', 1, 
['annakarenina']) check_search_results('warandpeace', 1, ['warandpeace']) check_search_results('', 2) - - check_search_results('A Novel By Tolstoy', 1, ['annakarenina']) + + check_search_results('Tolstoy', 1, ['annakarenina']) check_search_results('title:Novel', 1, ['annakarenina']) check_search_results('title:peace', 0) check_search_results('name:warandpeace', 1) @@ -332,7 +331,7 @@ def test_overall(self): check_search_results(u'Flexible \u30a1', 2) check_search_results(u'Flexible', 2) check_search_results(u'flexible', 2) - + class TestGeographicCoverage(TestController): @classmethod @@ -356,7 +355,7 @@ def setup_class(cls): def teardown_class(self): model.repo.rebuild_db() search.clear() - + def _do_search(self, q, expected_pkgs, count=None): query = { 'q': q, @@ -390,7 +389,7 @@ def test_0_basic(self): self._do_search(u'great britain', ['gb'], 1) def test_1_filtered(self): - # TODO: solr is not currently set up to allow partial matches + # TODO: solr is not currently set up to allow partial matches # and extras are not saved as multivalued so this # test will fail. Make multivalued or remove? 
from ckan.tests import SkipTest @@ -420,7 +419,7 @@ def setup_class(cls): def teardown_class(self): model.repo.rebuild_db() search.clear() - + def _do_search(self, department, expected_pkgs, count=None): result = search.query_for(model.Package).run({'q': 'department: %s' % department}) pkgs = result['results'] @@ -465,7 +464,7 @@ def setup_class(cls): def teardown_class(self): model.repo.rebuild_db() search.clear() - + def _do_search(self, q, wanted_results): query = { 'q': q, diff --git a/ckan/tests/lib/test_solr_package_search_synchronous_update.py b/ckan/tests/lib/test_solr_package_search_synchronous_update.py index 13d7a05e2a3..0e0f14e54fb 100644 --- a/ckan/tests/lib/test_solr_package_search_synchronous_update.py +++ b/ckan/tests/lib/test_solr_package_search_synchronous_update.py @@ -71,7 +71,7 @@ def _remove_package(self, name=None): def test_02_add_package_from_dict(self): check_search_results('', 3) - check_search_results('test-spatial', 1, ['council-owned-litter-bins']) + check_search_results('spatial', 1, ['council-owned-litter-bins']) def test_03_update_package_from_dict(self): package = model.Package.by_name('council-owned-litter-bins') @@ -93,7 +93,7 @@ def test_03_update_package_from_dict(self): model.repo.commit_and_remove() check_search_results('', 3) - check_search_results('test-spatial', 1, ['council-owned-litter-bins']) + check_search_results('spatial', 1, ['council-owned-litter-bins']) def test_04_delete_package_from_dict(self): package = model.Package.by_name('council-owned-litter-bins') diff --git a/ckanext/multilingual/solr/schema.xml b/ckanext/multilingual/solr/schema.xml index fb957d36fae..82f3e40769b 100644 --- a/ckanext/multilingual/solr/schema.xml +++ b/ckanext/multilingual/solr/schema.xml @@ -16,7 +16,7 @@ limitations under the License. 
--> - + @@ -354,6 +354,7 @@ + @@ -436,7 +437,12 @@ + + + + + diff --git a/doc/configuration.rst b/doc/configuration.rst index 1cab244f685..f513bb11d9c 100644 --- a/doc/configuration.rst +++ b/doc/configuration.rst @@ -525,6 +525,16 @@ to occur asynchronously, set this option to 0. Note, this is equivalent to explicitly load the `synchronous_search` plugin. +ckan.search.solr_commit +^^^^^^^^^^^^^^^^^^^^^^^ + +Example:: + + ckan.search.solr_commit = false + +Default value: ``true`` + +Make CKAN commit changes to Solr after every dataset update. Set this to false if you are on Solr 4.0 and have automatic (soft) commits enabled, in order to improve dataset update/create speed (however, there may be a slight delay before an updated dataset appears in search results). simple_search ^^^^^^^^^^^^^