Skip to content

Commit

Permalink
[3027] all tests pass for 1.4, 3.6, 4.0, changed mm
Browse files Browse the repository at this point in the history
  • Loading branch information
kindly committed Dec 4, 2012
1 parent 8ccc414 commit 783cf82
Show file tree
Hide file tree
Showing 8 changed files with 191 additions and 24 deletions.
163 changes: 163 additions & 0 deletions ckan/config/solr/schema-2.0.xml
@@ -0,0 +1,163 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<schema name="ckan" version="2.0">

<types>
  <!-- Primitive, non-analysed types. -->
  <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
  <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
  <!-- Element name normalised to "fieldType" so it matches every other declaration in this section. -->
  <fieldType name="binary" class="solr.BinaryField"/>

  <!-- Trie numeric types declared with precisionStep="0". -->
  <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
  <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
  <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
  <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>

  <!-- The same Trie classes declared with precisionStep="8" (t-prefixed names). -->
  <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
  <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
  <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
  <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>

  <!-- Date types: "date" uses precisionStep="0", "tdate" uses precisionStep="6". -->
  <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
  <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>

  <!-- Stemmed English text. Both analyzers tokenize on whitespace, split tokens
       with the word delimiter filter (including on case change), lowercase,
       apply the English Snowball stemmer (skipping words listed in
       protwords.txt) and fold to ASCII. -->
  <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
    <!-- Index side: word and number parts are also catenated. -->
    <analyzer type="index">
      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
      <filter class="solr.ASCIIFoldingFilterFactory"/>
    </analyzer>
    <!-- Query side: synonyms from synonyms.txt are expanded first; no catenation. -->
    <analyzer type="query">
      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
      <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
      <filter class="solr.ASCIIFoldingFilterFactory"/>
    </analyzer>
  </fieldType>

  <!-- A general unstemmed text field, useful when the language of the field
       is not known. No stemming, no ASCII folding and no splitting on case
       change. -->
  <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
    <!-- Index side: word and number parts are also catenated. -->
    <analyzer type="index">
      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
      <filter class="solr.LowerCaseFilterFactory"/>
    </analyzer>
    <!-- Query side: synonyms from synonyms.txt are expanded first; no catenation. -->
    <analyzer type="query">
      <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
      <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
      <filter class="solr.LowerCaseFilterFactory"/>
    </analyzer>
  </fieldType>
</types>


<fields>
  <!-- Identity: all three are required on every document. -->
  <field name="index_id" type="string" indexed="true" stored="true" required="true"/>
  <field name="id" type="string" indexed="true" stored="true" required="true"/>
  <field name="site_id" type="string" indexed="true" stored="true" required="true"/>

  <!-- Core metadata. -->
  <field name="title" type="text" indexed="true" stored="true"/>
  <field name="entity_type" type="string" indexed="true" stored="true" omitNorms="true"/>
  <field name="state" type="string" indexed="true" stored="true" omitNorms="true"/>
  <field name="name" type="string" indexed="true" stored="true" omitNorms="true"/>
  <field name="revision_id" type="string" indexed="true" stored="true" omitNorms="true"/>
  <field name="version" type="string" indexed="true" stored="true"/>
  <field name="url" type="string" indexed="true" stored="true" omitNorms="true"/>
  <field name="ckan_url" type="string" indexed="true" stored="true" omitNorms="true"/>
  <field name="download_url" type="string" indexed="true" stored="true" omitNorms="true"/>
  <field name="notes" type="text" indexed="true" stored="true"/>
  <field name="author" type="textgen" indexed="true" stored="true"/>
  <field name="author_email" type="textgen" indexed="true" stored="true"/>
  <field name="maintainer" type="textgen" indexed="true" stored="true"/>
  <field name="maintainer_email" type="textgen" indexed="true" stored="true"/>
  <field name="license" type="string" indexed="true" stored="true"/>
  <field name="license_id" type="string" indexed="true" stored="true"/>
  <field name="ratings_count" type="int" indexed="true" stored="false"/>
  <field name="ratings_average" type="float" indexed="true" stored="false"/>
  <field name="tags" type="string" indexed="true" stored="true" multiValued="true"/>
  <field name="groups" type="string" indexed="true" stored="true" multiValued="true"/>

  <field name="capacity" type="string" indexed="true" stored="true" multiValued="false"/>

  <!-- Resource fields, multi-valued. -->
  <field name="res_description" type="textgen" indexed="true" stored="true" multiValued="true"/>
  <field name="res_format" type="string" indexed="true" stored="true" multiValued="true"/>
  <field name="res_url" type="string" indexed="true" stored="true" multiValued="true"/>

  <!-- Catch-all field gathering the other searchable text fields; it is
       filled by the copyField rules further on in this schema. The "urls"
       field is likewise a copyField destination. Neither is stored. -->
  <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
  <field name="urls" type="text" indexed="true" stored="false" multiValued="true"/>

  <!-- Relationship fields: indexed only, multi-valued. -->
  <field name="depends_on" type="text" indexed="true" stored="false" multiValued="true"/>
  <field name="dependency_of" type="text" indexed="true" stored="false" multiValued="true"/>
  <field name="derives_from" type="text" indexed="true" stored="false" multiValued="true"/>
  <field name="has_derivation" type="text" indexed="true" stored="false" multiValued="true"/>
  <field name="links_to" type="text" indexed="true" stored="false" multiValued="true"/>
  <field name="linked_from" type="text" indexed="true" stored="false" multiValued="true"/>
  <field name="child_of" type="text" indexed="true" stored="false" multiValued="true"/>
  <field name="parent_of" type="text" indexed="true" stored="false" multiValued="true"/>

  <!-- Usage counters: indexed only. -->
  <field name="views_total" type="int" indexed="true" stored="false"/>
  <field name="views_recent" type="int" indexed="true" stored="false"/>
  <field name="resources_accessed_total" type="int" indexed="true" stored="false"/>
  <field name="resources_accessed_recent" type="int" indexed="true" stored="false"/>

  <!-- Timestamps; indexed_ts defaults to NOW when the document supplies no value. -->
  <field name="metadata_created" type="date" indexed="true" stored="true" multiValued="false"/>
  <field name="metadata_modified" type="date" indexed="true" stored="true" multiValued="false"/>

  <field name="indexed_ts" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>

  <!-- title_string holds the title as a plain string (rather than the
       analysed text type), which allows sorting on it.
       NOTE(review): no copyField in this schema writes to title_string, so
       it is presumably populated by the indexing code; confirm. -->
  <field name="title_string" type="string" indexed="true" stored="false"/>

  <!-- Stored but not indexed. -->
  <field name="data_dict" type="string" indexed="false" stored="true"/>

  <field name="_version_" type="string" indexed="true" stored="true"/>

  <!-- Dynamic fields, matched by name pattern. The bare "*" pattern accepts
       any otherwise undeclared field name as an indexed, unstored string. -->
  <dynamicField name="*_date" type="date" indexed="true" stored="true" multiValued="false"/>

  <dynamicField name="extras_*" type="text" indexed="true" stored="true" multiValued="false"/>
  <dynamicField name="vocab_*" type="string" indexed="true" stored="true" multiValued="true"/>
  <dynamicField name="*" type="string" indexed="true" stored="false"/>
</fields>

<!-- index_id is the unique document key; unqualified query terms are searched
     in the catch-all "text" field and combined with AND by default.
     NOTE(review): defaultSearchField and solrQueryParser defaultOperator are
     reported as deprecated in newer Solr releases; confirm against the
     targeted Solr version. -->
<uniqueKey>index_id</uniqueKey>
<defaultSearchField>text</defaultSearchField>
<solrQueryParser defaultOperator="AND"/>

<!-- Gather every URL-like field into "urls". -->
<copyField source="url" dest="urls"/>
<copyField source="ckan_url" dest="urls"/>
<copyField source="download_url" dest="urls"/>
<copyField source="res_url" dest="urls"/>
<!-- Gather the searchable fields into the catch-all "text" field. -->
<copyField source="extras_*" dest="text"/>
<copyField source="vocab_*" dest="text"/>
<copyField source="urls" dest="text"/>
<copyField source="name" dest="text"/>
<copyField source="title" dest="text"/>
<!-- NOTE(review): the next rule copies "text" onto itself; it looks redundant
     and may duplicate values supplied directly in "text". Confirm before
     removing. -->
<copyField source="text" dest="text"/>
<copyField source="license" dest="text"/>
<copyField source="notes" dest="text"/>
<copyField source="tags" dest="text"/>
<copyField source="groups" dest="text"/>
<copyField source="res_description" dest="text"/>
<copyField source="maintainer" dest="text"/>
<copyField source="author" dest="text"/>

</schema>
2 changes: 1 addition & 1 deletion ckan/lib/search/__init__.py
Expand Up @@ -30,7 +30,7 @@ def text_traceback():

SIMPLE_SEARCH = asbool(config.get('ckan.simple_search', False))

SUPPORTED_SCHEMA_VERSIONS = ['1.4']
SUPPORTED_SCHEMA_VERSIONS = ['2.0']

DEFAULT_OPTIONS = {
'limit': 20,
Expand Down
2 changes: 1 addition & 1 deletion ckan/lib/search/index.py
Expand Up @@ -236,7 +236,7 @@ def index_package(self, pkg_dict, defer_commit=False):
def commit(self):
try:
conn = make_connection()
conn.commit(wait_flush=False, wait_searcher=False)
conn.commit(wait_searcher=False)
except Exception, e:
log.exception(e)
raise SearchIndexError(e)
Expand Down
2 changes: 1 addition & 1 deletion ckan/lib/search/query.py
Expand Up @@ -338,7 +338,7 @@ def run(self, query):
if ':' not in query['q']:
query['defType'] = 'dismax'
query['tie'] = '0.1'
query['mm'] = '1'
query['mm'] = '2<-1 5<80%'
query['qf'] = query.get('qf', QUERY_FIELDS)

conn = make_connection()
Expand Down
6 changes: 3 additions & 3 deletions ckan/tests/functional/test_search.py
Expand Up @@ -62,12 +62,12 @@ def test_1_name(self):

def test_2_title(self):
# exact title, one word
res = self.app.get('/dataset?q=Opengov.se')
res = self.app.get('/dataset?q=Opengov')
result = self._check_results(res, 1, 'se-opengov')

# multiple words
res = self.app.get('/dataset?q=Government%20Expenditure')
result = self._check_results(res, 5, 'uk-government-expenditure')
result = self._check_results(res, 1, 'uk-government-expenditure')

class TestSearch2(FunctionalTestCase, PylonsTestCase):#, TestPackageForm):

Expand Down Expand Up @@ -158,7 +158,7 @@ def test_search(self):
res = self.app.get(offset)
assert 'Search - ' in res
form = res.forms['dataset-search']
form['q'] = str(self.non_active_name)
form['q'] = 'name:' + str(self.non_active_name)
results_page = form.submit()
assert 'Search - ' in results_page, results_page
assert '<strong>0</strong> datasets found' in results_page, (self.non_active_name, results_page)
29 changes: 14 additions & 15 deletions ckan/tests/lib/test_solr_package_search.py
Expand Up @@ -88,21 +88,20 @@ def test_1_name_token(self):

def test_2_title(self):
# exact title, one word
result = search.query_for(model.Package).run({'q': u'Opengov.se'})
result = search.query_for(model.Package).run({'q': u'Opengov'})

assert self._pkg_names(result) == 'se-opengov', self._pkg_names(result)
# multiple words
result = search.query_for(model.Package).run({'q': u'Government Expenditure'})
# uk-government-expenditure is the best match but all other results should be returned
assert self._pkg_names(result).startswith('uk-government-expenditure'), self._pkg_names(result)
# se-opengov has only government in tags, all others have it in title.
assert self._pkg_names(result).endswith('se-opengov'), self._pkg_names(result)
# multiple words wrong order
result = search.query_for(model.Package).run({'q': u'Expenditure Government'})
assert self._pkg_names(result).startswith('uk-government-expenditure'), self._pkg_names(result)
assert self._pkg_names(result).endswith('se-opengov'), self._pkg_names(result)
# multiple words all should match government

result = search.query_for(model.Package).run({'q': u'Expenditure Government China'})
assert len(result['results']) == 5, self._pkg_names(result)
assert len(result['results']) == 1, self._pkg_names(result)

def test_3_licence(self):
# this should result, but it is here to check that at least it does not error
Expand Down Expand Up @@ -136,7 +135,7 @@ def test_tags_field_with_capitals(self):
def dont_test_tags_field_with_basic_unicode(self):
result = search.query_for(model.Package).run({'q': u'greek omega \u03a9'})
assert self._check_entity_names(result, ['se-publications']), self._pkg_names(result)

def test_tags_token_simple(self):
result = search.query_for(model.Package).run({'q': u'tags:country-sweden'})
assert self._check_entity_names(result, ['se-publications', 'se-opengov']), self._pkg_names(result)
Expand All @@ -146,7 +145,7 @@ def test_tags_token_simple(self):
def test_tags_token_with_multi_word_tag(self):
result = search.query_for(model.Package).run({'q': u'tags:"todo split"'})
assert self._check_entity_names(result, ['us-gov-images']), self._pkg_names(result)

def test_tags_token_simple_with_deleted_tag(self):
# registry has been deleted
result = search.query_for(model.Package).run({'q': u'tags:registry'})
Expand Down Expand Up @@ -287,7 +286,7 @@ def test_search_notes_on(self):
pkgs = result['results']
count = result['count']
assert len(pkgs) == 2, pkgs

def test_search_foreign_chars(self):
result = search.query_for(model.Package).run({'q': 'umlaut'})
assert result['results'] == ['gils'], result['results']
Expand Down Expand Up @@ -319,8 +318,8 @@ def test_overall(self):
check_search_results('annakarenina', 1, ['annakarenina'])
check_search_results('warandpeace', 1, ['warandpeace'])
check_search_results('', 2)
check_search_results('A Novel By Tolstoy', 1, ['annakarenina'])

check_search_results('Tolstoy', 1, ['annakarenina'])
check_search_results('title:Novel', 1, ['annakarenina'])
check_search_results('title:peace', 0)
check_search_results('name:warandpeace', 1)
Expand All @@ -332,7 +331,7 @@ def test_overall(self):
check_search_results(u'Flexible \u30a1', 2)
check_search_results(u'Flexible', 2)
check_search_results(u'flexible', 2)


class TestGeographicCoverage(TestController):
@classmethod
Expand All @@ -356,7 +355,7 @@ def setup_class(cls):
def teardown_class(self):
model.repo.rebuild_db()
search.clear()

def _do_search(self, q, expected_pkgs, count=None):
query = {
'q': q,
Expand Down Expand Up @@ -390,7 +389,7 @@ def test_0_basic(self):
self._do_search(u'great britain', ['gb'], 1)

def test_1_filtered(self):
# TODO: solr is not currently set up to allow partial matches
# TODO: solr is not currently set up to allow partial matches
# and extras are not saved as multivalued so this
# test will fail. Make multivalued or remove?
from ckan.tests import SkipTest
Expand Down Expand Up @@ -420,7 +419,7 @@ def setup_class(cls):
def teardown_class(self):
model.repo.rebuild_db()
search.clear()

def _do_search(self, department, expected_pkgs, count=None):
result = search.query_for(model.Package).run({'q': 'department: %s' % department})
pkgs = result['results']
Expand Down Expand Up @@ -465,7 +464,7 @@ def setup_class(cls):
def teardown_class(self):
model.repo.rebuild_db()
search.clear()

def _do_search(self, q, wanted_results):
query = {
'q': q,
Expand Down
4 changes: 2 additions & 2 deletions ckan/tests/lib/test_solr_package_search_synchronous_update.py
Expand Up @@ -71,7 +71,7 @@ def _remove_package(self, name=None):

def test_02_add_package_from_dict(self):
check_search_results('', 3)
check_search_results('test-spatial', 1, ['council-owned-litter-bins'])
check_search_results('spatial', 1, ['council-owned-litter-bins'])

def test_03_update_package_from_dict(self):
package = model.Package.by_name('council-owned-litter-bins')
Expand All @@ -93,7 +93,7 @@ def test_03_update_package_from_dict(self):
model.repo.commit_and_remove()

check_search_results('', 3)
check_search_results('test-spatial', 1, ['council-owned-litter-bins'])
check_search_results('spatial', 1, ['council-owned-litter-bins'])

def test_04_delete_package_from_dict(self):
package = model.Package.by_name('council-owned-litter-bins')
Expand Down
7 changes: 6 additions & 1 deletion ckanext/multilingual/solr/schema.xml
Expand Up @@ -16,7 +16,7 @@
limitations under the License.
-->

<schema name="ckan" version="1.4">
<schema name="ckan" version="2.0">

<types>
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
Expand Down Expand Up @@ -436,7 +436,12 @@

<field name="data_dict" type="string" indexed="false" stored="true" />

<field name="_version_" type="string" indexed="true" stored="true"/>

<dynamicField name="*_date" type="date" indexed="true" stored="true" multiValued="false"/>

<dynamicField name="extras_*" type="text" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="vocab_*" type="string" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*" type="string" indexed="true" stored="false"/>
</fields>

Expand Down

0 comments on commit 783cf82

Please sign in to comment.