diff --git a/ckan/config/solr/CHANGELOG.txt b/ckan/config/solr/CHANGELOG.txt
index eb600e042cc..769c6b53735 100644
--- a/ckan/config/solr/CHANGELOG.txt
+++ b/ckan/config/solr/CHANGELOG.txt
@@ -1,6 +1,13 @@
CKAN SOLR schemas changelog
===========================
+v2.0 - (ckan>=2.0)
+--------------------
+* Add _version_ field to make it compatible with solr 4.0
+* Remove stopwords
+* Add dataset_type field.
+* Add *_date autofield.
+
v1.4 - (ckan>=1.7)
--------------------
* Add Ascii folding filter to text fields.
diff --git a/ckan/config/solr/schema-2.0.xml b/ckan/config/solr/schema-2.0.xml
new file mode 100644
index 00000000000..7e681738c5b
--- /dev/null
+++ b/ckan/config/solr/schema-2.0.xml
@@ -0,0 +1,164 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+index_id
+text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/ckan/controllers/home.py b/ckan/controllers/home.py
index dc90e8ed4de..010e8073e86 100644
--- a/ckan/controllers/home.py
+++ b/ckan/controllers/home.py
@@ -68,7 +68,7 @@ def index(self):
'res_format': _('Formats'),
'license': _('Licence'), }
- data_dict = {'order_by': 'packages', 'all_fields': 1}
+ data_dict = {'sort': 'packages', 'all_fields': 1}
# only give the terms to group dictize that are returned in the
# facets as full results take a lot longer
if 'groups' in c.search_facets:
diff --git a/ckan/lib/search/__init__.py b/ckan/lib/search/__init__.py
index 0bd607a835b..053f7178d36 100644
--- a/ckan/lib/search/__init__.py
+++ b/ckan/lib/search/__init__.py
@@ -30,7 +30,7 @@ def text_traceback():
SIMPLE_SEARCH = asbool(config.get('ckan.simple_search', False))
-SUPPORTED_SCHEMA_VERSIONS = ['1.4']
+SUPPORTED_SCHEMA_VERSIONS = ['2.0']
DEFAULT_OPTIONS = {
'limit': 20,
diff --git a/ckan/lib/search/index.py b/ckan/lib/search/index.py
index dee7e27c7e8..22a22827769 100644
--- a/ckan/lib/search/index.py
+++ b/ckan/lib/search/index.py
@@ -7,6 +7,7 @@
import re
from pylons import config
+from paste.deploy.converters import asbool
from common import SearchIndexError, make_connection
from ckan.model import PackageRelationship
@@ -223,6 +224,8 @@ def index_package(self, pkg_dict, defer_commit=False):
try:
conn = make_connection()
commit = not defer_commit
+ if not asbool(config.get('ckan.search.solr_commit', 'true')):
+ commit = False
conn.add_many([pkg_dict], _commit=commit)
except Exception, e:
log.exception(e)
@@ -236,7 +239,7 @@ def index_package(self, pkg_dict, defer_commit=False):
def commit(self):
try:
conn = make_connection()
- conn.commit(wait_flush=False, wait_searcher=False)
+ conn.commit(wait_searcher=False)
except Exception, e:
log.exception(e)
raise SearchIndexError(e)
@@ -251,7 +254,8 @@ def delete_package(self, pkg_dict):
config.get('ckan.site_id'))
try:
conn.delete_query(query)
- conn.commit()
+ if asbool(config.get('ckan.search.solr_commit', 'true')):
+ conn.commit()
except Exception, e:
log.exception(e)
raise SearchIndexError(e)
diff --git a/ckan/lib/search/query.py b/ckan/lib/search/query.py
index 5f1db0fca1a..73207468017 100644
--- a/ckan/lib/search/query.py
+++ b/ckan/lib/search/query.py
@@ -338,7 +338,9 @@ def run(self, query):
if ':' not in query['q']:
query['defType'] = 'dismax'
query['tie'] = '0.1'
- query['mm'] = '1'
+ # minimum match: 1-2 terms must all match, 3-5 terms all but one, more than 5 terms 80%; explained at
+ # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
+ query['mm'] = '2<-1 5<80%'
query['qf'] = query.get('qf', QUERY_FIELDS)
conn = make_connection()
diff --git a/ckan/tests/functional/test_search.py b/ckan/tests/functional/test_search.py
index f4aaed71e1b..829a92c471b 100644
--- a/ckan/tests/functional/test_search.py
+++ b/ckan/tests/functional/test_search.py
@@ -62,12 +62,12 @@ def test_1_name(self):
def test_2_title(self):
# exact title, one word
- res = self.app.get('/dataset?q=Opengov.se')
+ res = self.app.get('/dataset?q=Opengov')
result = self._check_results(res, 1, 'se-opengov')
# multiple words
res = self.app.get('/dataset?q=Government%20Expenditure')
- result = self._check_results(res, 5, 'uk-government-expenditure')
+ result = self._check_results(res, 1, 'uk-government-expenditure')
class TestSearch2(FunctionalTestCase, PylonsTestCase):#, TestPackageForm):
@@ -158,7 +158,7 @@ def test_search(self):
res = self.app.get(offset)
assert 'Search - ' in res
form = res.forms['dataset-search']
- form['q'] = str(self.non_active_name)
+ form['q'] = 'name:' + str(self.non_active_name)
results_page = form.submit()
assert 'Search - ' in results_page, results_page
assert '0 datasets found' in results_page, (self.non_active_name, results_page)
diff --git a/ckan/tests/lib/test_solr_package_search.py b/ckan/tests/lib/test_solr_package_search.py
index 75d54c0dafc..5f65db2fb62 100644
--- a/ckan/tests/lib/test_solr_package_search.py
+++ b/ckan/tests/lib/test_solr_package_search.py
@@ -88,21 +88,20 @@ def test_1_name_token(self):
def test_2_title(self):
# exact title, one word
- result = search.query_for(model.Package).run({'q': u'Opengov.se'})
+ result = search.query_for(model.Package).run({'q': u'Opengov'})
+
assert self._pkg_names(result) == 'se-opengov', self._pkg_names(result)
# multiple words
result = search.query_for(model.Package).run({'q': u'Government Expenditure'})
# uk-government-expenditure is the best match but all other results should be retured
assert self._pkg_names(result).startswith('uk-government-expenditure'), self._pkg_names(result)
- # se-opengov has only government in tags, all others hav it in title.
- assert self._pkg_names(result).endswith('se-opengov'), self._pkg_names(result)
# multiple words wrong order
result = search.query_for(model.Package).run({'q': u'Expenditure Government'})
assert self._pkg_names(result).startswith('uk-government-expenditure'), self._pkg_names(result)
- assert self._pkg_names(result).endswith('se-opengov'), self._pkg_names(result)
# multiple words all should match government
+
result = search.query_for(model.Package).run({'q': u'Expenditure Government China'})
- assert len(result['results']) == 5, self._pkg_names(result)
+ assert len(result['results']) == 1, self._pkg_names(result)
def test_3_licence(self):
# this should result, but it is here to check that at least it does not error
@@ -136,7 +135,7 @@ def test_tags_field_with_capitals(self):
def dont_test_tags_field_with_basic_unicode(self):
result = search.query_for(model.Package).run({'q': u'greek omega \u03a9'})
assert self._check_entity_names(result, ['se-publications']), self._pkg_names(result)
-
+
def test_tags_token_simple(self):
result = search.query_for(model.Package).run({'q': u'tags:country-sweden'})
assert self._check_entity_names(result, ['se-publications', 'se-opengov']), self._pkg_names(result)
@@ -146,7 +145,7 @@ def test_tags_token_simple(self):
def test_tags_token_with_multi_word_tag(self):
result = search.query_for(model.Package).run({'q': u'tags:"todo split"'})
assert self._check_entity_names(result, ['us-gov-images']), self._pkg_names(result)
-
+
def test_tags_token_simple_with_deleted_tag(self):
# registry has been deleted
result = search.query_for(model.Package).run({'q': u'tags:registry'})
@@ -287,7 +286,7 @@ def test_search_notes_on(self):
pkgs = result['results']
count = result['count']
assert len(pkgs) == 2, pkgs
-
+
def test_search_foreign_chars(self):
result = search.query_for(model.Package).run({'q': 'umlaut'})
assert result['results'] == ['gils'], result['results']
@@ -319,8 +318,8 @@ def test_overall(self):
check_search_results('annakarenina', 1, ['annakarenina'])
check_search_results('warandpeace', 1, ['warandpeace'])
check_search_results('', 2)
-
- check_search_results('A Novel By Tolstoy', 1, ['annakarenina'])
+
+ check_search_results('Tolstoy', 1, ['annakarenina'])
check_search_results('title:Novel', 1, ['annakarenina'])
check_search_results('title:peace', 0)
check_search_results('name:warandpeace', 1)
@@ -332,7 +331,7 @@ def test_overall(self):
check_search_results(u'Flexible \u30a1', 2)
check_search_results(u'Flexible', 2)
check_search_results(u'flexible', 2)
-
+
class TestGeographicCoverage(TestController):
@classmethod
@@ -356,7 +355,7 @@ def setup_class(cls):
def teardown_class(self):
model.repo.rebuild_db()
search.clear()
-
+
def _do_search(self, q, expected_pkgs, count=None):
query = {
'q': q,
@@ -390,7 +389,7 @@ def test_0_basic(self):
self._do_search(u'great britain', ['gb'], 1)
def test_1_filtered(self):
- # TODO: solr is not currently set up to allow partial matches
+ # TODO: solr is not currently set up to allow partial matches
# and extras are not saved as multivalued so this
# test will fail. Make multivalued or remove?
from ckan.tests import SkipTest
@@ -420,7 +419,7 @@ def setup_class(cls):
def teardown_class(self):
model.repo.rebuild_db()
search.clear()
-
+
def _do_search(self, department, expected_pkgs, count=None):
result = search.query_for(model.Package).run({'q': 'department: %s' % department})
pkgs = result['results']
@@ -465,7 +464,7 @@ def setup_class(cls):
def teardown_class(self):
model.repo.rebuild_db()
search.clear()
-
+
def _do_search(self, q, wanted_results):
query = {
'q': q,
diff --git a/ckan/tests/lib/test_solr_package_search_synchronous_update.py b/ckan/tests/lib/test_solr_package_search_synchronous_update.py
index 13d7a05e2a3..0e0f14e54fb 100644
--- a/ckan/tests/lib/test_solr_package_search_synchronous_update.py
+++ b/ckan/tests/lib/test_solr_package_search_synchronous_update.py
@@ -71,7 +71,7 @@ def _remove_package(self, name=None):
def test_02_add_package_from_dict(self):
check_search_results('', 3)
- check_search_results('test-spatial', 1, ['council-owned-litter-bins'])
+ check_search_results('spatial', 1, ['council-owned-litter-bins'])
def test_03_update_package_from_dict(self):
package = model.Package.by_name('council-owned-litter-bins')
@@ -93,7 +93,7 @@ def test_03_update_package_from_dict(self):
model.repo.commit_and_remove()
check_search_results('', 3)
- check_search_results('test-spatial', 1, ['council-owned-litter-bins'])
+ check_search_results('spatial', 1, ['council-owned-litter-bins'])
def test_04_delete_package_from_dict(self):
package = model.Package.by_name('council-owned-litter-bins')
diff --git a/ckanext/multilingual/solr/schema.xml b/ckanext/multilingual/solr/schema.xml
index fb957d36fae..82f3e40769b 100644
--- a/ckanext/multilingual/solr/schema.xml
+++ b/ckanext/multilingual/solr/schema.xml
@@ -16,7 +16,7 @@
limitations under the License.
-->
-
+
@@ -354,6 +354,7 @@
+
@@ -436,7 +437,12 @@
+
+
+
+
+
diff --git a/doc/configuration.rst b/doc/configuration.rst
index 1cab244f685..f513bb11d9c 100644
--- a/doc/configuration.rst
+++ b/doc/configuration.rst
@@ -525,6 +525,16 @@ to occur asynchronously, set this option to 0.
Note, this is equivalent to explicitly load the `synchronous_search` plugin.
+ckan.search.solr_commit
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Example::
+
+ ckan.search.solr_commit = false
+
+Default value: ``true``
+
+Make CKAN commit changes to Solr after every dataset update. Set this to false if you are on Solr 4.0 and have automatic (soft) commits enabled, to improve dataset update/create speed (however, there may be a slight delay before a dataset appears in search results).
simple_search
^^^^^^^^^^^^^