Skip to content

Commit

Permalink
Merge pull request #2352 from joetsoi/solrpy-is-dead-long-live-pysolr
Browse files Browse the repository at this point in the history
change solrpy library to pysolr
  • Loading branch information
wardi committed May 5, 2016
2 parents 01727cb + 64e8f9b commit 595b2d5
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 131 deletions.
41 changes: 28 additions & 13 deletions ckan/lib/search/common.py
@@ -1,5 +1,8 @@
from pylons import config
import datetime
import logging
import re
import pysolr
import simplejson
log = logging.getLogger(__name__)


Expand Down Expand Up @@ -48,23 +51,35 @@ def is_available():
"""
try:
conn = make_connection()
conn.query("*:*", rows=1)
conn.search(q="*:*", rows=1)
except Exception, e:
log.exception(e)
return False
finally:
if 'conn' in dir():
conn.close()

return True


def make_connection():
from solr import SolrConnection
def make_connection(decode_dates=True):
solr_url, solr_user, solr_password = SolrSettings.get()
assert solr_url is not None
if solr_user is not None and solr_password is not None:
return SolrConnection(solr_url, http_user=solr_user,
http_pass=solr_password)
if decode_dates:
decoder = simplejson.JSONDecoder(object_hook=solr_datetime_decoder)
return pysolr.Solr(solr_url, decoder=decoder)
else:
return SolrConnection(solr_url)
return pysolr.Solr(solr_url)


def solr_datetime_decoder(d):
for k, v in d.items():
if isinstance(v, basestring):
possible_datetime = re.search(pysolr.DATETIME_REGEX, v)
if possible_datetime:
date_values = possible_datetime.groupdict()
for dk, dv in date_values.items():
date_values[dk] = int(dv)

d[k] = datetime.datetime(date_values['year'],
date_values['month'],
date_values['day'],
date_values['hour'],
date_values['minute'],
date_values['second'])
return d
33 changes: 11 additions & 22 deletions ckan/lib/search/index.py
Expand Up @@ -8,8 +8,7 @@

import re

import solr

import pysolr
from pylons import config
from paste.deploy.converters import asbool

Expand Down Expand Up @@ -45,22 +44,19 @@ def escape_xml_illegal_chars(val, replacement=''):


def clear_index():
import solr.core
conn = make_connection()
query = "+site_id:\"%s\"" % (config.get('ckan.site_id'))
try:
conn.delete_query(query)
conn.commit()
conn.delete(q=query)
except socket.error, e:
err = 'Could not connect to SOLR %r: %r' % (conn.url, e)
log.error(err)
raise SearchIndexError(err)
except solr.core.SolrException, e:
except pysolr.SolrError, e:
err = 'SOLR %r exception: %r' % (conn.url, e)
log.error(err)
raise SearchIndexError(err)
finally:
conn.close()


class SearchIndex(object):
"""
Expand Down Expand Up @@ -287,31 +283,27 @@ def index_package(self, pkg_dict, defer_commit=False):
commit = not defer_commit
if not asbool(config.get('ckan.search.solr_commit', 'true')):
commit = False
conn.add_many([pkg_dict], _commit=commit)
except solr.core.SolrException, e:
msg = 'Solr returned an error: {0} {1} - {2}'.format(
e.httpcode, e.reason, e.body[:1000] # limit huge responses
conn.add(docs=[pkg_dict], commit=commit)
except pysolr.SolrError, e:
msg = 'Solr returned an error: {0}'.format(
e[:1000] # limit huge responses
)
raise SearchIndexError(msg)
except socket.error, e:
err = 'Could not connect to Solr using {0}: {1}'.format(conn.url, str(e))
log.error(err)
raise SearchIndexError(err)
finally:
conn.close()

commit_debug_msg = 'Not committed yet' if defer_commit else 'Committed'
log.debug('Updated index for %s [%s]' % (pkg_dict.get('name'), commit_debug_msg))

def commit(self):
try:
conn = make_connection()
conn.commit(wait_searcher=False)
conn.commit(waitSearcher=False)
except Exception, e:
log.exception(e)
raise SearchIndexError(e)
finally:
conn.close()


def delete_package(self, pkg_dict):
Expand All @@ -320,11 +312,8 @@ def delete_package(self, pkg_dict):
pkg_dict.get('id'), pkg_dict.get('id'),
config.get('ckan.site_id'))
try:
conn.delete_query(query)
if asbool(config.get('ckan.search.solr_commit', 'true')):
conn.commit()
commit = asbool(config.get('ckan.search.solr_commit', 'true'))
conn.delete(q=query, commit=commit)
except Exception, e:
log.exception(e)
raise SearchIndexError(e)
finally:
conn.close()
113 changes: 47 additions & 66 deletions ckan/lib/search/query.py
Expand Up @@ -2,11 +2,11 @@
import logging

from pylons import config
from solr import SolrException
import pysolr
from paste.deploy.converters import asbool
from paste.util.multidict import MultiDict
import six

from ckan.common import json
from ckan.lib.search.common import make_connection, SearchError, SearchQueryError
import ckan.logic as logic
import ckan.model as model
Expand Down Expand Up @@ -254,12 +254,8 @@ def get_all_entity_ids(self, max_results=1000):
fq += "+state:active "

conn = make_connection()
try:
data = conn.query(query, fq=fq, rows=max_results, fields='id')
finally:
conn.close()

return [r.get('id') for r in data.results]
data = conn.search(query, fq=fq, rows=max_results, fields='id')
return [r.get('id') for r in data.docs]

def get_index(self,reference):
query = {
Expand All @@ -268,26 +264,18 @@ def get_index(self,reference):
'wt': 'json',
'fq': 'site_id:"%s"' % config.get('ckan.site_id')}

conn = make_connection()
conn = make_connection(decode_dates=False)
log.debug('Package query: %r' % query)
try:
solr_response = conn.raw_query(**query)
except SolrException, e:
solr_response = conn.search(**query)
except pysolr.SolrError, e:
raise SearchError('SOLR returned an error running query: %r Error: %r' %
(query, e.reason))
try:
data = json.loads(solr_response)
(query, e))

if data['response']['numFound'] == 0:
raise SearchError('Dataset not found in the search index: %s' % reference)
else:
return data['response']['docs'][0]
except Exception, e:
if not isinstance(e, SearchError):
log.exception(e)
raise SearchError(e)
finally:
conn.close()
if solr_response.hits == 0:
raise SearchError('Dataset not found in the search index: %s' % reference)
else:
return solr_response.docs[0]


def run(self, query):
Expand Down Expand Up @@ -354,53 +342,46 @@ def run(self, query):
query['mm'] = query.get('mm', '2<-1 5<80%')
query['qf'] = query.get('qf', QUERY_FIELDS)

conn = make_connection()
conn = make_connection(decode_dates=False)
log.debug('Package query: %r' % query)
try:
solr_response = conn.raw_query(**query)
except SolrException, e:
solr_response = conn.search(**query)
except pysolr.SolrError, e:
# Error with the sort parameter. You see slightly different
# error messages depending on whether the SOLR JSON comes back
# or Jetty gets in the way converting it to HTML - not sure why
#
if "Can't determine a Sort Order" in e.body or \
"Can't determine Sort Order" in e.body or \
'Unknown sort order' in e.body:
raise SearchQueryError('Invalid "sort" parameter')
raise SearchError(
'SOLR returned an error running query: %r Error: %r' %
(query, e.body or e.reason))
try:
data = json.loads(solr_response)
response = data['response']
self.count = response.get('numFound', 0)
self.results = response.get('docs', [])

# #1683 Filter out the last row that is sometimes out of order
self.results = self.results[:rows_to_return]

# get any extras and add to 'extras' dict
for result in self.results:
extra_keys = filter(lambda x: x.startswith('extras_'), result.keys())
extras = {}
for extra_key in extra_keys:
value = result.pop(extra_key)
extras[extra_key[len('extras_'):]] = value
if extra_keys:
result['extras'] = extras

# if just fetching the id or name, return a list instead of a dict
if query.get('fl') in ['id', 'name']:
self.results = [r.get(query.get('fl')) for r in self.results]

# get facets and convert facets list to a dict
self.facets = data.get('facet_counts', {}).get('facet_fields', {})
for field, values in self.facets.iteritems():
self.facets[field] = dict(zip(values[0::2], values[1::2]))
except Exception, e:
log.exception(e)
raise SearchError(e)
finally:
conn.close()
if e.args and isinstance(e.args[0], str):
if "Can't determine a Sort Order" in e.args[0] or \
"Can't determine Sort Order" in e.args[0] or \
'Unknown sort order' in e.args[0]:
raise SearchQueryError('Invalid "sort" parameter')
raise SearchError('SOLR returned an error running query: %r Error: %r' %
(query, e))
self.count = solr_response.hits
self.results = solr_response.docs


# #1683 Filter out the last row that is sometimes out of order
self.results = self.results[:rows_to_return]

# get any extras and add to 'extras' dict
for result in self.results:
extra_keys = filter(lambda x: x.startswith('extras_'), result.keys())
extras = {}
for extra_key in extra_keys:
value = result.pop(extra_key)
extras[extra_key[len('extras_'):]] = value
if extra_keys:
result['extras'] = extras

# if just fetching the id or name, return a list instead of a dict
if query.get('fl') in ['id', 'name']:
self.results = [r.get(query.get('fl')) for r in self.results]

# get facets and convert facets list to a dict
self.facets = solr_response.facets.get('facet_fields', {})
for field, values in six.iteritems(self.facets):
self.facets[field] = dict(zip(values[0::2], values[1::2]))

return {'results': self.results, 'count': self.count}
15 changes: 5 additions & 10 deletions ckan/tests/legacy/lib/test_solr_search_index.py
@@ -1,7 +1,4 @@
from datetime import datetime
import hashlib
import socket
import solr
import pysolr
from pylons import config
from ckan import model
import ckan.lib.search as search
Expand All @@ -20,9 +17,8 @@ def test_solr_url_exists(self):
try:
# solr.SolrConnection.query will throw a socket.error if it
# can't connect to the SOLR instance
q = conn.query("*:*", rows=1)
conn.close()
except socket.error, e:
q = conn.search(q="*:*", rows=1)
except pysolr.SolrError, e:
if not config.get('solr_url'):
raise AssertionError("Config option 'solr_url' needs to be defined in this CKAN's development.ini. Default of %s didn't work: %s" % (search.DEFAULT_SOLR_URL, e))
else:
Expand All @@ -41,19 +37,18 @@ def setup_class(cls):
@classmethod
def teardown_class(cls):
model.repo.rebuild_db()
cls.solr.close()
search.index_for('Package').clear()

def test_0_indexing(self):
"""
Make sure that all packages created by CreateTestData.create_search_test_data
have been added to the search index.
"""
results = self.solr.query('*:*', fq=self.fq)
results = self.solr.search(q='*:*', fq=self.fq)
assert len(results) == 6, len(results)

def test_1_basic(self):
results = self.solr.query('sweden', fq=self.fq)
results = self.solr.search(q='sweden', fq=self.fq)
assert len(results) == 2
result_names = [r['name'] for r in results]
assert 'se-publications' in result_names
Expand Down

0 comments on commit 595b2d5

Please sign in to comment.