Skip to content

Commit

Permalink
Merge branch '1701-date-field-index'
Browse files Browse the repository at this point in the history
  • Loading branch information
joetsoi committed Jul 29, 2014
2 parents 4d31a45 + 0722f1b commit 06e3e97
Show file tree
Hide file tree
Showing 3 changed files with 171 additions and 115 deletions.
10 changes: 9 additions & 1 deletion ckan/lib/search/index.py
Expand Up @@ -3,6 +3,7 @@
import logging
import collections
import json
import datetime
from dateutil.parser import parse

import re
Expand Down Expand Up @@ -219,11 +220,18 @@ def index_package(self, pkg_dict, defer_commit=False):
# be needed? For my data not changing the keys seems to not cause a
# problem.
new_dict = {}
bogus_date = datetime.datetime(1, 1, 1)
for key, value in pkg_dict.items():
key = key.encode('ascii', 'ignore')
if key.endswith('_date'):
try:
value = parse(value).isoformat() + 'Z'
date = parse(value, default=bogus_date)
if date != bogus_date:
value = date.isoformat() + 'Z'
else:
# The date field was empty, so dateutil filled it with
# the default bogus date
value = None
except ValueError:
continue
new_dict[key] = value
Expand Down
174 changes: 162 additions & 12 deletions ckan/new_tests/lib/search/test_index.py
@@ -1,14 +1,164 @@
import datetime
import hashlib
import json
import nose.tools
import nose

import ckan.lib.search
from pylons import config
import ckan.lib.search as search
import ckan.new_tests.helpers as helpers

assert_equal = nose.tools.assert_equal
assert_in = helpers.assert_in
assert_not_in = helpers.assert_not_in


class TestSearchIndex(object):
    """Tests for ``PackageSearchIndex`` that run against a real Solr.

    Each test indexes a package dict through ``index_package`` and then
    queries Solr directly to inspect what was stored. The whole class is
    skipped when ``search.is_available()`` reports that Solr is not usable.
    """

    @classmethod
    def setup_class(cls):
        """Open the Solr connection and build the fixtures shared by tests.

        Raises:
            nose.SkipTest: if ``search.is_available()`` returns a false
                value.
        """
        if not search.is_available():
            raise nose.SkipTest('Solr not reachable')

        cls.solr_client = search.make_connection()

        # Filter query on the configured site id, passed to every Solr
        # query made by the tests below.
        cls.fq = " +site_id:\"%s\" " % config['ckan.site_id']

        cls.package_index = search.PackageSearchIndex()

        # Shared between tests: tests that need different values work on a
        # ``.copy()`` of this dict rather than modifying it in place.
        cls.base_package_dict = {
            'id': 'test-index',
            'name': 'monkey',
            'title': 'Monkey',
            'state': 'active',
            'private': False,
            'type': 'dataset',
            'owner_org': None,
            'metadata_created': datetime.datetime.now().isoformat(),
            'metadata_modified': datetime.datetime.now().isoformat(),
        }

    @classmethod
    def teardown_class(cls):
        """Close the Solr connection opened by ``setup_class``."""
        cls.solr_client.close()

    def teardown(self):
        # clear the search index after every test
        self.package_index.clear()

    def test_index_basic(self):
        """An indexed package is found by name with its basic fields."""
        self.package_index.index_package(self.base_package_dict)

        response = self.solr_client.query('name:monkey', fq=self.fq)

        assert_equal(len(response), 1)

        assert_equal(response.results[0]['id'], 'test-index')
        assert_equal(response.results[0]['name'], 'monkey')
        assert_equal(response.results[0]['title'], 'Monkey')

        # The stored index_id is expected to be the md5 hex digest of the
        # package id concatenated with the site id.
        index_id = hashlib.md5(
            '{0}{1}'.format(self.base_package_dict['id'],
                            config['ckan.site_id'])
        ).hexdigest()

        assert_equal(response.results[0]['index_id'], index_id)

    def test_no_state_no_index(self):
        """A package whose ``state`` is ``None`` is not found in Solr."""
        pkg_dict = self.base_package_dict.copy()
        pkg_dict.update({
            'state': None,
        })

        self.package_index.index_package(pkg_dict)

        response = self.solr_client.query('name:monkey', fq=self.fq)

        assert_equal(len(response), 0)

    def test_clear_index(self):
        """``clear()`` removes a previously indexed package."""
        self.package_index.index_package(self.base_package_dict)

        self.package_index.clear()

        response = self.solr_client.query('name:monkey', fq=self.fq)
        assert_equal(len(response), 0)

    def test_index_illegal_xml_chars(self):
        """The ``\\u0001`` control character is absent from the stored title."""
        pkg_dict = self.base_package_dict.copy()
        pkg_dict.update({
            'title': u'\u00c3a\u0001ltimo n\u00famero penguin',
            'notes': u'\u00c3a\u0001ltimo n\u00famero penguin',
        })
        self.package_index.index_package(pkg_dict)

        response = self.solr_client.query('name:monkey', fq=self.fq)

        assert_equal(len(response), 1)
        # Same text as the input title, minus the \u0001 character.
        assert_equal(response.results[0]['title'],
                     u'\u00c3altimo n\u00famero penguin')

    def test_index_date_field(self):
        """Extras named ``*_date`` with valid values come back as datetimes."""
        pkg_dict = self.base_package_dict.copy()
        pkg_dict.update({
            'extras': [
                {'key': 'test_date', 'value': '2014-03-22'},
                {'key': 'test_tim_date', 'value': '2014-03-22 05:42:14'},
            ]
        })

        self.package_index.index_package(pkg_dict)

        response = self.solr_client.query('name:monkey', fq=self.fq)

        assert_equal(len(response), 1)

        assert isinstance(response.results[0]['test_date'], datetime.datetime)
        assert_equal(response.results[0]['test_date'].strftime('%Y-%m-%d'),
                     '2014-03-22')
        assert_equal(
            response.results[0]['test_tim_date'].strftime('%Y-%m-%d %H:%M:%S'),
            '2014-03-22 05:42:14'
        )

    def test_index_date_field_wrong_value(self):
        """Unparseable ``*_date`` extras are left out of the indexed document.

        The package itself must still be indexed.
        """
        pkg_dict = self.base_package_dict.copy()
        pkg_dict.update({
            'extras': [
                {'key': 'test_wrong_date', 'value': 'Not a date'},
                {'key': 'test_another_wrong_date', 'value': '2014-13-01'},
            ]
        })

        self.package_index.index_package(pkg_dict)

        response = self.solr_client.query('name:monkey', fq=self.fq)

        assert_equal(len(response), 1)

        assert 'test_wrong_date' not in response.results[0]
        assert 'test_another_wrong_date' not in response.results[0]

    def test_index_date_field_empty_value(self):
        """An empty ``*_date`` extra is left out of the indexed document.

        The package itself must still be indexed.
        """
        pkg_dict = self.base_package_dict.copy()
        pkg_dict.update({
            'extras': [
                {'key': 'test_empty_date', 'value': ''},
            ]
        })

        self.package_index.index_package(pkg_dict)

        response = self.solr_client.query('name:monkey', fq=self.fq)

        assert_equal(len(response), 1)

        assert 'test_empty_date' not in response.results[0]


class TestPackageSearchIndex:
@staticmethod
def _get_pkg_dict():
Expand Down Expand Up @@ -44,11 +194,11 @@ def _get_pkg_dict_with_resources():
return pkg_dict

def test_index_package_stores_basic_solr_fields(self):
index = ckan.lib.search.index.PackageSearchIndex()
index = search.index.PackageSearchIndex()
pkg_dict = self._get_pkg_dict()

index.index_package(pkg_dict)
indexed_pkg = ckan.lib.search.show(pkg_dict['name'])
indexed_pkg = search.show(pkg_dict['name'])

# At root level are the fields that SOLR uses
assert_equal(indexed_pkg['name'], 'river-quality')
Expand All @@ -58,11 +208,11 @@ def test_index_package_stores_basic_solr_fields(self):
assert_equal(indexed_pkg['dataset_type'], 'dataset')

def test_index_package_stores_unvalidated_data_dict(self):
index = ckan.lib.search.index.PackageSearchIndex()
index = search.index.PackageSearchIndex()
pkg_dict = self._get_pkg_dict()

index.index_package(pkg_dict)
indexed_pkg = ckan.lib.search.show(pkg_dict['name'])
indexed_pkg = search.show(pkg_dict['name'])

# data_dict is the result of package_show, unvalidated
data_dict = json.loads(indexed_pkg['data_dict'])
Expand All @@ -72,11 +222,11 @@ def test_index_package_stores_unvalidated_data_dict(self):
assert_not_in('title', data_dict)

def test_index_package_stores_validated_data_dict(self):
index = ckan.lib.search.index.PackageSearchIndex()
index = search.index.PackageSearchIndex()
pkg_dict = self._get_pkg_dict()

index.index_package(pkg_dict)
indexed_pkg = ckan.lib.search.show(pkg_dict['name'])
indexed_pkg = search.show(pkg_dict['name'])

# validated_data_dict is the result of package_show, validated
validated_data_dict = json.loads(indexed_pkg['validated_data_dict'])
Expand All @@ -87,27 +237,27 @@ def test_index_package_stores_validated_data_dict(self):

def test_index_package_stores_validated_data_dict_without_unvalidated_data_dict(self):
    """The stored ``validated_data_dict`` must not contain ``data_dict``.

    This is a regression test for #1764.
    """
    index = search.index.PackageSearchIndex()
    pkg_dict = self._get_pkg_dict()

    index.index_package(pkg_dict)
    indexed_pkg = search.show(pkg_dict['name'])

    validated_data_dict = json.loads(indexed_pkg['validated_data_dict'])
    assert_not_in('data_dict', validated_data_dict)

def test_index_package_stores_resource_extras_in_config_file(self):
    """Only some resource extras are indexed as ``res_extras_*`` fields.

    ``alt_url`` is expected to be indexed; ``institution`` and ``city``
    are expected to be absent.
    """
    index = search.index.PackageSearchIndex()
    pkg_dict = self._get_pkg_dict_with_resources()

    index.index_package(pkg_dict)
    indexed_pkg = search.show(pkg_dict['name'])

    # Resource fields given by ckan.extra_resource_fields are indexed
    assert_equal(indexed_pkg['res_extras_alt_url'],
                 ['http://www.bar.com/riverquality.pdf',
                  'http://www.bar.com/riverquality.csv'])

    # Other resource fields are ignored
    assert_equal(indexed_pkg.get('res_extras_institution', None), None)
    assert_equal(indexed_pkg.get('res_extras_city', None), None)
102 changes: 0 additions & 102 deletions ckan/tests/lib/test_solr_search_index.py
Expand Up @@ -29,108 +29,6 @@ def test_solr_url_exists(self):
raise AssertionError('SOLR connection problem. Connection defined in development.ini as: solr_url=%s Error: %s' % (config['solr_url'], e))


class TestSolrSearchIndex(TestController):
    """
    Checks that package dicts dispatched to the search layer as 'new'
    operations can afterwards be found by querying Solr directly.
    """
    @classmethod
    def setup_class(cls):
        setup_test_search_index()
        CreateTestData.create()
        cls.solr = search.make_connection()
        cls.fq = " +site_id:\"%s\" " % config['ckan.site_id']

    @classmethod
    def teardown_class(cls):
        model.repo.rebuild_db()
        cls.solr.close()

    def teardown(self):
        # every test starts from an empty package index
        search.index_for('Package').clear()

    def _get_index_id(self, pkg_id):
        """Return the md5 hex digest of ``pkg_id`` joined with the site id."""
        raw_id = '%s%s' % (pkg_id, config['ckan.site_id'])
        return hashlib.md5(raw_id).hexdigest()

    def test_index(self):
        now = datetime.now()
        package = {
            'id': u'penguin-id',
            'title': u'penguin',
            'state': u'active',
            'type': u'dataset',
            'private': False,
            'owner_org': None,
            'metadata_created': now.isoformat(),
            'metadata_modified': now.isoformat(),
            'extras': [
                {'key': 'test_date', 'value': '2013-03-01'},
                {'key': 'test_wrong_date', 'value': 'Not a date'},
            ]
        }
        search.dispatch_by_operation('Package', package, 'new')

        found = self.solr.query('title:penguin', fq=self.fq)
        assert len(found) == 1, len(found)
        assert found.results[0]['index_id'] == self._get_index_id(package['id'])
        assert found.results[0]['title'] == 'penguin'

        # solrpy appears to alter microseconds and time zones, so the
        # timestamps are only compared down to whole seconds
        to_the_second = '%Y-%m-%d %H:%M:%S'
        created = found.results[0]['metadata_created']
        assert now.strftime(to_the_second) == created.strftime(to_the_second)
        modified = found.results[0]['metadata_modified']
        assert now.strftime(to_the_second) == modified.strftime(to_the_second)

    def test_no_state_not_indexed(self):
        package = {'title': 'penguin'}
        search.dispatch_by_operation('Package', package, 'new')
        found = self.solr.query('title:penguin', fq=self.fq)
        assert len(found) == 0, len(found)

    def test_index_clear(self):
        package = {
            'id': u'penguin-id',
            'title': u'penguin',
            'state': u'active',
            'type': u'dataset',
            'private': False,
            'owner_org': None,
            'metadata_created': datetime.now().isoformat(),
            'metadata_modified': datetime.now().isoformat(),
        }
        search.dispatch_by_operation('Package', package, 'new')
        found = self.solr.query('title:penguin', fq=self.fq)
        assert len(found) == 1, len(found)

        # First pass clears the populated index, second pass clears it
        # again whilst it is already empty.
        for _ in range(2):
            search.index_for('Package').clear()
            found = self.solr.query('title:penguin', fq=self.fq)
            assert len(found) == 0

    def test_index_illegal_xml_chars(self):
        dirty_text = u'\u00c3a\u0001ltimo n\u00famero penguin'
        package = {
            'id': u'penguin-id',
            'title': dirty_text,
            'notes': dirty_text,
            'state': u'active',
            'type': u'dataset',
            'private': False,
            'owner_org': None,
            'metadata_created': datetime.now().isoformat(),
            'metadata_modified': datetime.now().isoformat(),
        }
        search.dispatch_by_operation('Package', package, 'new')

        found = self.solr.query('title:penguin', fq=self.fq)
        assert len(found) == 1, len(found)
        assert found.results[0]['index_id'] == self._get_index_id(package['id'])
        # same text as the input, without the \u0001 control character
        assert found.results[0]['title'] == u'\u00c3altimo n\u00famero penguin'

class TestSolrSearch:
@classmethod
def setup_class(cls):
Expand Down

0 comments on commit 06e3e97

Please sign in to comment.