diff --git a/ckan/lib/cli.py b/ckan/lib/cli.py index baaa4674ba9..11272281628 100644 --- a/ckan/lib/cli.py +++ b/ckan/lib/cli.py @@ -277,7 +277,7 @@ class SearchIndexCommand(CkanCommand): '''Creates a search index for all datasets Usage: - search-index [-i] [-o] [-r] rebuild [dataset-name] - reindex dataset-name if given, if not then rebuild full search index (all datasets) + search-index [-i] [-o] [-r] [-e] rebuild [dataset-name] - reindex dataset-name if given, if not then rebuild full search index (all datasets) search-index check - checks for datasets not indexed search-index show {dataset-name} - shows index of a dataset search-index clear [dataset-name] - clears the search index for the provided dataset or for the whole ckan instance @@ -301,6 +301,13 @@ def __init__(self,name): self.parser.add_option('-r', '--refresh', dest='refresh', action='store_true', default=False, help='Refresh current index (does not clear the existing one)') + self.parser.add_option('-e', '--commit-each', dest='commit_each', + action='store_true', default=False, help= +'''Perform a commit after indexing each dataset. This ensures that changes are +immediately available on the search, but slows significantly the process. +Default is false.''' + ) + def command(self): self._load_config() @@ -322,14 +329,22 @@ def command(self): print 'Command %s not recognized' % cmd def rebuild(self): - from ckan.lib.search import rebuild + from ckan.lib.search import rebuild, commit + + # BY default we don't commit after each request to Solr, as it is + # a really heavy operation and slows things a lot if len(self.args) > 1: rebuild(self.args[1]) else: rebuild(only_missing=self.options.only_missing, force=self.options.force, - refresh=self.options.refresh) + refresh=self.options.refresh, + defer_commit=(not self.options.commit_each)) + + if not self.options.commit_each: + commit() + def check(self): from ckan.lib.search import check diff --git a/ckan/lib/search/__init__.py b/ckan/lib/search/__init__.py index ecf2ed568e5..bcd9414746c 100644 --- a/ckan/lib/search/__init__.py +++ b/ckan/lib/search/__init__.py @@ -132,7 +132,7 @@ def notify(self, entity, operation): log.warn("Discarded Sync. indexing for: %s" % entity) -def rebuild(package_id=None, only_missing=False, force=False, refresh=False): +def rebuild(package_id=None, only_missing=False, force=False, refresh=False, defer_commit=False): ''' Rebuilds the search index. @@ -176,12 +176,13 @@ def rebuild(package_id=None, only_missing=False, force=False, refresh=False): for pkg_id in package_ids: try: - package_index.insert_dict( + package_index.update_dict( get_action('package_show')( {'model': model, 'ignore_auth': True, 'validate': False}, {'id': pkg_id} - ) + ), + defer_commit ) except Exception, e: log.error('Error while indexing dataset %s: %s' % @@ -196,6 +197,11 @@ def rebuild(package_id=None, only_missing=False, force=False, refresh=False): log.info('Finished rebuilding search index.') +def commit(): + package_index = index_for(model.Package) + package_index.commit() + log.info('Commited pending changes on the search index') + def check(): from ckan import model package_query = query_for(model.Package) diff --git a/ckan/lib/search/index.py b/ckan/lib/search/index.py index 09ec2eec913..dee7e27c7e8 100644 --- a/ckan/lib/search/index.py +++ b/ckan/lib/search/index.py @@ -92,10 +92,10 @@ class PackageSearchIndex(SearchIndex): def remove_dict(self, pkg_dict): self.delete_package(pkg_dict) - def update_dict(self, pkg_dict): - self.index_package(pkg_dict) + def update_dict(self, pkg_dict, defer_commit=False): + self.index_package(pkg_dict, defer_commit) - def index_package(self, pkg_dict): + def index_package(self, pkg_dict, defer_commit=False): if pkg_dict is None: return pkg_dict['data_dict'] = json.dumps(pkg_dict) @@ -222,7 +222,20 @@ def index_package(self, pkg_dict): # send to solr: try: conn = make_connection() - conn.add_many([pkg_dict]) + commit = not defer_commit + conn.add_many([pkg_dict], _commit=commit) + except Exception, e: + log.exception(e) + raise SearchIndexError(e) + finally: + conn.close() + + commit_debug_msg = 'Not commited yet' if defer_commit else 'Commited' + log.debug('Updated index for %s [%s]' % (pkg_dict.get('name'), commit_debug_msg)) + + def commit(self): + try: + conn = make_connection() conn.commit(wait_flush=False, wait_searcher=False) except Exception, e: log.exception(e) @@ -230,7 +243,6 @@ def index_package(self, pkg_dict): finally: conn.close() - log.debug("Updated index for %s" % pkg_dict.get('name')) def delete_package(self, pkg_dict): conn = make_connection() diff --git a/ckan/logic/action/create.py b/ckan/logic/action/create.py index 9ed5a1ea586..0f53d241db9 100644 --- a/ckan/logic/action/create.py +++ b/ckan/logic/action/create.py @@ -98,7 +98,8 @@ def package_create(context, data_dict): which groups exist call ``group_list()`` :type groups: list of dictionaries - :returns: the newly created dataset + :returns: the newly created dataset (unless 'return_id_only' is set to True + in the context, in which case just the dataset id will be returned) :rtype: dictionary ''' @@ -162,7 +163,13 @@ def package_create(context, data_dict): ## this is added so that the rest controller can make a new location context["id"] = pkg.id log.debug('Created object %s' % str(pkg.name)) - return _get_action('package_show')(context, {'id':context['id']}) + + return_id_only = context.get('return_id_only', False) + + output = context['id'] if return_id_only \ + else _get_action('package_show')(context, {'id':context['id']}) + + return output def package_create_validate(context, data_dict): model = context['model'] diff --git a/ckan/logic/action/update.py b/ckan/logic/action/update.py index a06274c0714..d60c7927728 100644 --- a/ckan/logic/action/update.py +++ b/ckan/logic/action/update.py @@ -213,7 +213,9 @@ def package_update(context, data_dict): :param id: the name or id of the dataset to update :type id: string - :returns: the updated dataset + :returns: the updated dataset (if 'return_package_dict' is True in the + context, which is the default. Otherwise returns just the + dataset id) :rtype: dictionary ''' @@ -273,7 +275,13 @@ def package_update(context, data_dict): model.repo.commit() log.debug('Updated object %s' % str(pkg.name)) - return _get_action('package_show')(context, data_dict) + + return_id_only = context.get('return_id_only', False) + + output = data_dict['id'] if return_id_only \ + else _get_action('package_show')(context, {'id': data_dict['id']}) + + return output def package_update_validate(context, data_dict): model = context['model'] diff --git a/ckan/tests/lib/test_cli.py b/ckan/tests/lib/test_cli.py index 7bea16bb484..5aee50ace1a 100644 --- a/ckan/tests/lib/test_cli.py +++ b/ckan/tests/lib/test_cli.py @@ -80,7 +80,7 @@ def test_clear_and_rebuild_index(self): # Rebuild index self.search.args = () - self.search.options = FakeOptions(only_missing=False,force=False,refresh=False) + self.search.options = FakeOptions(only_missing=False,force=False,refresh=False,commit_each=False) self.search.rebuild() pkg_count = model.Session.query(model.Package).filter(model.Package.state==u'active').count() @@ -103,7 +103,7 @@ def test_clear_and_rebuild_only_one(self): # Rebuild index for annakarenina self.search.args = ('rebuild annakarenina').split() - self.search.options = FakeOptions(only_missing=False,force=False,refresh=False) + self.search.options = FakeOptions(only_missing=False,force=False,refresh=False,commit_each=False) self.search.rebuild() self.query.run({'q':'*:*'})