From c81ce4b6816e0669cf878603b1046558644b258f Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 1 Aug 2012 16:10:59 +0100 Subject: [PATCH 1/5] Avoid unnecessary commit request The commit can be sent on the same request with the dict to index --- ckan/lib/search/index.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ckan/lib/search/index.py b/ckan/lib/search/index.py index 09ec2eec913..1d912eef1c7 100644 --- a/ckan/lib/search/index.py +++ b/ckan/lib/search/index.py @@ -222,8 +222,7 @@ def index_package(self, pkg_dict): # send to solr: try: conn = make_connection() - conn.add_many([pkg_dict]) - conn.commit(wait_flush=False, wait_searcher=False) + conn.add_many([pkg_dict], _commit=True) except Exception, e: log.exception(e) raise SearchIndexError(e) From 7b65a54088db9f8ed0511aa200a6409ebe223392 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 2 Aug 2012 12:06:51 +0100 Subject: [PATCH 2/5] Commit only once at the end of a search index rebuild Commiting changes in the Solr search index is a heavy task that takes significant time. We are currently commiting changes after each update, which is probably fine for individual updated (ie users editing or creating a dataset), but when rebuilding the whole index it is unnecessary. A single commit at the end of the process is needed, and that speeds the reindexing about a 85%. A flag has been added (`-e` or `--commit-each`) to allow the old behaviour (commiting after each edit). --- ckan/lib/cli.py | 21 ++++++++++++++++++--- ckan/lib/search/__init__.py | 12 +++++++++--- ckan/lib/search/index.py | 23 ++++++++++++++++++----- ckan/tests/lib/test_cli.py | 4 ++-- 4 files changed, 47 insertions(+), 13 deletions(-) diff --git a/ckan/lib/cli.py b/ckan/lib/cli.py index baaa4674ba9..11272281628 100644 --- a/ckan/lib/cli.py +++ b/ckan/lib/cli.py @@ -277,7 +277,7 @@ class SearchIndexCommand(CkanCommand): '''Creates a search index for all datasets Usage: - search-index [-i] [-o] [-r] rebuild [dataset-name] - reindex dataset-name if given, if not then rebuild full search index (all datasets) + search-index [-i] [-o] [-r] [-e] rebuild [dataset-name] - reindex dataset-name if given, if not then rebuild full search index (all datasets) search-index check - checks for datasets not indexed search-index show {dataset-name} - shows index of a dataset search-index clear [dataset-name] - clears the search index for the provided dataset or for the whole ckan instance @@ -301,6 +301,13 @@ def __init__(self,name): self.parser.add_option('-r', '--refresh', dest='refresh', action='store_true', default=False, help='Refresh current index (does not clear the existing one)') + self.parser.add_option('-e', '--commit-each', dest='commit_each', + action='store_true', default=False, help= +'''Perform a commit after indexing each dataset. This ensures that changes are +immediately available on the search, but slows significantly the process. +Default is false.''' + ) + def command(self): self._load_config() @@ -322,14 +329,22 @@ def command(self): print 'Command %s not recognized' % cmd def rebuild(self): - from ckan.lib.search import rebuild + from ckan.lib.search import rebuild, commit + + # BY default we don't commit after each request to Solr, as it is + # a really heavy operation and slows things a lot if len(self.args) > 1: rebuild(self.args[1]) else: rebuild(only_missing=self.options.only_missing, force=self.options.force, - refresh=self.options.refresh) + refresh=self.options.refresh, + defer_commit=(not self.options.commit_each)) + + if not self.options.commit_each: + commit() + def check(self): from ckan.lib.search import check diff --git a/ckan/lib/search/__init__.py b/ckan/lib/search/__init__.py index ecf2ed568e5..bcd9414746c 100644 --- a/ckan/lib/search/__init__.py +++ b/ckan/lib/search/__init__.py @@ -132,7 +132,7 @@ def notify(self, entity, operation): log.warn("Discarded Sync. indexing for: %s" % entity) -def rebuild(package_id=None, only_missing=False, force=False, refresh=False): +def rebuild(package_id=None, only_missing=False, force=False, refresh=False, defer_commit=False): ''' Rebuilds the search index. @@ -176,12 +176,13 @@ def rebuild(package_id=None, only_missing=False, force=False, refresh=False): for pkg_id in package_ids: try: - package_index.insert_dict( + package_index.update_dict( get_action('package_show')( {'model': model, 'ignore_auth': True, 'validate': False}, {'id': pkg_id} - ) + ), + defer_commit ) except Exception, e: log.error('Error while indexing dataset %s: %s' % @@ -196,6 +197,11 @@ def rebuild(package_id=None, only_missing=False, force=False, refresh=False): log.info('Finished rebuilding search index.') +def commit(): + package_index = index_for(model.Package) + package_index.commit() + log.info('Commited pending changes on the search index') + def check(): from ckan import model package_query = query_for(model.Package) diff --git a/ckan/lib/search/index.py b/ckan/lib/search/index.py index 1d912eef1c7..dee7e27c7e8 100644 --- a/ckan/lib/search/index.py +++ b/ckan/lib/search/index.py @@ -92,10 +92,10 @@ class PackageSearchIndex(SearchIndex): def remove_dict(self, pkg_dict): self.delete_package(pkg_dict) - def update_dict(self, pkg_dict): - self.index_package(pkg_dict) + def update_dict(self, pkg_dict, defer_commit=False): + self.index_package(pkg_dict, defer_commit) - def index_package(self, pkg_dict): + def index_package(self, pkg_dict, defer_commit=False): if pkg_dict is None: return pkg_dict['data_dict'] = json.dumps(pkg_dict) @@ -222,14 +222,27 @@ def index_package(self, pkg_dict): # send to solr: try: conn = make_connection() - conn.add_many([pkg_dict], _commit=True) + commit = not defer_commit + conn.add_many([pkg_dict], _commit=commit) + except Exception, e: + log.exception(e) + raise SearchIndexError(e) + finally: + conn.close() + + commit_debug_msg = 'Not commited yet' if defer_commit else 'Commited' + log.debug('Updated index for %s [%s]' % (pkg_dict.get('name'), commit_debug_msg)) + + def commit(self): + try: + conn = make_connection() + conn.commit(wait_flush=False, wait_searcher=False) except Exception, e: log.exception(e) raise SearchIndexError(e) finally: conn.close() - log.debug("Updated index for %s" % pkg_dict.get('name')) def delete_package(self, pkg_dict): conn = make_connection() diff --git a/ckan/tests/lib/test_cli.py b/ckan/tests/lib/test_cli.py index 7bea16bb484..5aee50ace1a 100644 --- a/ckan/tests/lib/test_cli.py +++ b/ckan/tests/lib/test_cli.py @@ -80,7 +80,7 @@ def test_clear_and_rebuild_index(self): # Rebuild index self.search.args = () - self.search.options = FakeOptions(only_missing=False,force=False,refresh=False) + self.search.options = FakeOptions(only_missing=False,force=False,refresh=False,commit_each=False) self.search.rebuild() pkg_count = model.Session.query(model.Package).filter(model.Package.state==u'active').count() @@ -103,7 +103,7 @@ def test_clear_and_rebuild_only_one(self): # Rebuild index for annakarenina self.search.args = ('rebuild annakarenina').split() - self.search.options = FakeOptions(only_missing=False,force=False,refresh=False) + self.search.options = FakeOptions(only_missing=False,force=False,refresh=False,commit_each=False) self.search.rebuild() self.query.run({'q':'*:*'}) From 8473cd8bcb15d88805a309470129ed10fe8ba522 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 2 Aug 2012 16:55:10 +0100 Subject: [PATCH 3/5] Allow not returning the package dict when creating or updating This avoids calling package_show --- ckan/logic/action/create.py | 8 ++++++-- ckan/logic/action/update.py | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/ckan/logic/action/create.py b/ckan/logic/action/create.py index 9ed5a1ea586..83dc4c006ae 100644 --- a/ckan/logic/action/create.py +++ b/ckan/logic/action/create.py @@ -98,7 +98,8 @@ def package_create(context, data_dict): which groups exist call ``group_list()`` :type groups: list of dictionaries - :returns: the newly created dataset + :returns: the newly created dataset (if 'return_package_dict' is True in the + context, which is the default. Otherwise returns None) :rtype: dictionary ''' @@ -162,7 +163,10 @@ def package_create(context, data_dict): ## this is added so that the rest controller can make a new location context["id"] = pkg.id log.debug('Created object %s' % str(pkg.name)) - return _get_action('package_show')(context, {'id':context['id']}) + + return_package_dict = context.get('return_package_dict',True) + + return _get_action('package_show')(context, {'id':context['id']}) if return_package_dict else None def package_create_validate(context, data_dict): model = context['model'] diff --git a/ckan/logic/action/update.py b/ckan/logic/action/update.py index a06274c0714..bf530bd4c07 100644 --- a/ckan/logic/action/update.py +++ b/ckan/logic/action/update.py @@ -213,7 +213,8 @@ def package_update(context, data_dict): :param id: the name or id of the dataset to update :type id: string - :returns: the updated dataset + :returns: the updated dataset (if 'return_package_dict' is True in the + context, which is the default. Otherwise returns None) :rtype: dictionary ''' @@ -273,7 +274,10 @@ def package_update(context, data_dict): model.repo.commit() log.debug('Updated object %s' % str(pkg.name)) - return _get_action('package_show')(context, data_dict) + + return_package_dict = context.get('return_package_dict',True) + + return _get_action('package_show')(context, data_dict) if return_package_dict else None def package_update_validate(context, data_dict): model = context['model'] From 74f12575b32922c92b0f607c809692911ef641e1 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 7 Aug 2012 14:54:14 +0100 Subject: [PATCH 4/5] Return dataset id if 'return_pacakge_dict' is False --- ckan/logic/action/create.py | 8 ++++++-- ckan/logic/action/update.py | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/ckan/logic/action/create.py b/ckan/logic/action/create.py index 83dc4c006ae..664e1541dbc 100644 --- a/ckan/logic/action/create.py +++ b/ckan/logic/action/create.py @@ -99,7 +99,8 @@ def package_create(context, data_dict): :type groups: list of dictionaries :returns: the newly created dataset (if 'return_package_dict' is True in the - context, which is the default. Otherwise returns None) + context, which is the default. Otherwise returns just the dataset + id) :rtype: dictionary ''' @@ -166,7 +167,10 @@ def package_create(context, data_dict): return_package_dict = context.get('return_package_dict',True) - return _get_action('package_show')(context, {'id':context['id']}) if return_package_dict else None + output = _get_action('package_show')(context, {'id':context['id']}) \ + if return_package_dict else context['id'] + + return output def package_create_validate(context, data_dict): model = context['model'] diff --git a/ckan/logic/action/update.py b/ckan/logic/action/update.py index bf530bd4c07..9a4f9ff0831 100644 --- a/ckan/logic/action/update.py +++ b/ckan/logic/action/update.py @@ -214,7 +214,8 @@ def package_update(context, data_dict): :type id: string :returns: the updated dataset (if 'return_package_dict' is True in the - context, which is the default. Otherwise returns None) + context, which is the default. Otherwise returns just the + dataset id) :rtype: dictionary ''' @@ -277,7 +278,10 @@ def package_update(context, data_dict): return_package_dict = context.get('return_package_dict',True) - return _get_action('package_show')(context, data_dict) if return_package_dict else None + output = _get_action('package_show')(context, {'id': data_dict['id']}) \ + if return_package_dict else data_dict['id'] + + return output def package_update_validate(context, data_dict): model = context['model'] From 4006757ba6a6ead21c3deb17427fd6176b481ec2 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 14 Aug 2012 17:33:40 +0100 Subject: [PATCH 5/5] Use return_id_only instead of return_package_dict Suggested by @tobes, as it is more backwards compatible --- ckan/logic/action/create.py | 11 +++++------ ckan/logic/action/update.py | 6 +++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/ckan/logic/action/create.py b/ckan/logic/action/create.py index 664e1541dbc..0f53d241db9 100644 --- a/ckan/logic/action/create.py +++ b/ckan/logic/action/create.py @@ -98,9 +98,8 @@ def package_create(context, data_dict): which groups exist call ``group_list()`` :type groups: list of dictionaries - :returns: the newly created dataset (if 'return_package_dict' is True in the - context, which is the default. Otherwise returns just the dataset - id) + :returns: the newly created dataset (unless 'return_id_only' is set to True + in the context, in which case just the dataset id will be returned) :rtype: dictionary ''' @@ -165,10 +164,10 @@ def package_create(context, data_dict): context["id"] = pkg.id log.debug('Created object %s' % str(pkg.name)) - return_package_dict = context.get('return_package_dict',True) + return_id_only = context.get('return_id_only', False) - output = _get_action('package_show')(context, {'id':context['id']}) \ - if return_package_dict else context['id'] + output = context['id'] if return_id_only \ + else _get_action('package_show')(context, {'id':context['id']}) return output diff --git a/ckan/logic/action/update.py b/ckan/logic/action/update.py index 9a4f9ff0831..d60c7927728 100644 --- a/ckan/logic/action/update.py +++ b/ckan/logic/action/update.py @@ -276,10 +276,10 @@ def package_update(context, data_dict): log.debug('Updated object %s' % str(pkg.name)) - return_package_dict = context.get('return_package_dict',True) + return_id_only = context.get('return_id_only', False) - output = _get_action('package_show')(context, {'id': data_dict['id']}) \ - if return_package_dict else data_dict['id'] + output = data_dict['id'] if return_id_only \ + else _get_action('package_show')(context, {'id': data_dict['id']}) return output