From f2145778721c0c502cc742293fdfaced3e94c51b Mon Sep 17 00:00:00 2001 From: Stefan Oderbolz Date: Tue, 13 Jan 2015 14:46:14 +0100 Subject: [PATCH 1/7] Fetch remote organization via action api Organizations used to be returned by /api/2/rest/group, this is what the old implementation used to fetch the information to create the remote organization on the local instance of CKAN. With this commit the Action API is used to fetch the same information. --- ckanext/harvest/harvesters/ckanharvester.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 835e27ce9..2c5c4d9ae 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -21,10 +21,14 @@ class CKANHarvester(HarvesterBase): config = None api_version = 2 + action_api_version = 3 def _get_rest_api_offset(self): return '/api/%d/rest' % self.api_version + def _get_action_api_offset(self): + return '/api/%d/action' % self.action_api_version + def _get_search_api_offset(self): return '/api/%d/search' % self.api_version @@ -48,6 +52,15 @@ def _get_group(self, base_url, group_name): except Exception, e: raise e + def _get_organization(self, base_url, org_name): + url = base_url + self._get_action_api_offset() + '/organization_show?id=' + org_name + try: + content = self._get_content(url) + content_dict = json.loads(content) + return content_dict['result'] + except Exception, e: + raise e + def _set_config(self,config_str): if config_str: self.config = json.loads(config_str) @@ -324,7 +337,7 @@ def import_stage(self,harvest_object): log.info('Organization %s is not available' % remote_org) if remote_orgs == 'create': try: - org = self._get_group(harvest_object.source.url, remote_org) + org = self._get_organization(harvest_object.source.url, remote_org) for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name', 'type']: org.pop(key, None) get_action('organization_create')(context, org) From 0fd38e0e54a8e0bb84ca5717db89e1587b6a6c79 Mon Sep 17 00:00:00 2001 From: Stefan Oderbolz Date: Wed, 14 Jan 2015 00:10:27 +0100 Subject: [PATCH 2/7] Use _get_group as a fallback for remote orgs First try to get a remote org from the remote Action API, if this fails try to use the old rest api call, which works on older CKAN versions. Only if both options fail, its currently not possible to get the remote organization. --- ckanext/harvest/harvesters/ckanharvester.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 2c5c4d9ae..3fb3935b8 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -337,7 +337,14 @@ def import_stage(self,harvest_object): log.info('Organization %s is not available' % remote_org) if remote_orgs == 'create': try: - org = self._get_organization(harvest_object.source.url, remote_org) + try: + org = self._get_organization(harvest_object.source.url, remote_org) + except: + # fallback if remote CKAN exposes organizations as groups + # this especially targets older versions of CKAN + log.debug('_get_organization() failed, now trying _get_group()') + org = self._get_group(harvest_object.source.url, remote_org) + for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name', 'type']: org.pop(key, None) get_action('organization_create')(context, org) From ef35c21e2a587dc4136ff25acef774eabd7777fa Mon Sep 17 00:00:00 2001 From: Stefan Oderbolz Date: Thu, 15 Jan 2015 00:07:26 +0100 Subject: [PATCH 3/7] Improve exception handling with custom exception 1. Try whenever possible to catch specific exceptions 2. Raise custom exception where appropriate 3. Fix the exception handling in _get_group and _get_organization --- ckanext/harvest/harvesters/ckanharvester.py | 30 ++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 3fb3935b8..e6fe50349 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -40,8 +40,14 @@ def _get_content(self, url): api_key = self.config.get('api_key',None) if api_key: http_request.add_header('Authorization',api_key) - http_response = urllib2.urlopen(http_request) + try: + http_response = urllib2.urlopen(http_request) + except urllib2.HTTPError, e: + raise ContentFetchError( + 'Could not fetch url: %s, HTTP error code: %s' % + (url, e.code) + ) return http_response.read() def _get_group(self, base_url, group_name): @@ -49,8 +55,9 @@ def _get_group(self, base_url, group_name): try: content = self._get_content(url) return json.loads(content) - except Exception, e: - raise e + except (ContentFetchError, ValueError): + log.debug('Could not fetch/decode remote group'); + raise RemoteResourceError('Could not fetch/decode remote group') def _get_organization(self, base_url, org_name): url = base_url + self._get_action_api_offset() + '/organization_show?id=' + org_name @@ -58,8 +65,9 @@ def _get_organization(self, base_url, org_name): content = self._get_content(url) content_dict = json.loads(content) return content_dict['result'] - except Exception, e: - raise e + except (ContentFetchError, ValueError, KeyError): + log.debug('Could not fetch/decode remote group'); + raise RemoteResourceError('Could not fetch/decode remote organization') def _set_config(self,config_str): if config_str: @@ -294,7 +302,7 @@ def import_stage(self,harvest_object): if remote_groups == 'create': try: group = self._get_group(harvest_object.source.url, group_name) - except: + except RemoteResourceError: log.error('Could not get remote group %s' % group_name) continue @@ -339,10 +347,9 @@ def import_stage(self,harvest_object): try: try: org = self._get_organization(harvest_object.source.url, remote_org) - except: + except RemoteResourceError: # fallback if remote CKAN exposes organizations as groups # this especially targets older versions of CKAN - log.debug('_get_organization() failed, now trying _get_group()') org = self._get_group(harvest_object.source.url, remote_org) for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name', 'type']: @@ -350,7 +357,7 @@ def import_stage(self,harvest_object): get_action('organization_create')(context, org) log.info('Organization %s has been newly created' % remote_org) validated_org = org['id'] - except: + except (RemoteResourceError, ValidationError): log.error('Could not get remote org %s' % remote_org) package_dict['owner_org'] = validated_org or local_org @@ -425,3 +432,8 @@ def import_stage(self,harvest_object): except Exception, e: self._save_object_error('%r'%e,harvest_object,'Import') +class ContentFetchError(Exception): + pass + +class RemoteResourceError(Exception): + pass From 935b9dda0100325028b01eb45a363b5730670c16 Mon Sep 17 00:00:00 2001 From: Stefan Oderbolz Date: Thu, 15 Jan 2015 00:09:43 +0100 Subject: [PATCH 4/7] Munge group name before fetching remote group The API call /api/2/rest/package/ returns the display name of the group instead of its ID. To properly match the group, munge the name before calling /api/2/rest/group --- ckanext/harvest/harvesters/ckanharvester.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index e6fe50349..a5b3f321f 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -5,6 +5,7 @@ from ckan.model import Session, Package from ckan.logic import ValidationError, NotFound, get_action from ckan.lib.helpers import json +from ckan.lib.munge import munge_name from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \ HarvestObjectError @@ -51,7 +52,7 @@ def _get_content(self, url): return http_response.read() def _get_group(self, base_url, group_name): - url = base_url + self._get_rest_api_offset() + '/group/' + group_name + url = base_url + self._get_rest_api_offset() + '/group/' + munge_name(group_name) try: content = self._get_content(url) return json.loads(content) From b978c26e70fa61024fcac2982a37782571b720a3 Mon Sep 17 00:00:00 2001 From: Stefan Oderbolz Date: Thu, 15 Jan 2015 00:49:11 +0100 Subject: [PATCH 5/7] Use ContentFetchError instead of generic Exception --- ckanext/harvest/harvesters/ckanharvester.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index a5b3f321f..70018ae63 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -177,7 +177,7 @@ def gather_stage(self,harvest_job): url = base_rest_url + '/revision/%s' % revision_id try: content = self._get_content(url) - except Exception,e: + except ContentFetchError,e: self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job) continue @@ -204,7 +204,7 @@ def gather_stage(self,harvest_job): url = base_rest_url + '/package' try: content = self._get_content(url) - except Exception,e: + except ContentFetchError,e: self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job) return None @@ -241,7 +241,7 @@ def fetch_stage(self,harvest_object): # Get contents try: content = self._get_content(url) - except Exception,e: + except ContentFetchError,e: self._save_object_error('Unable to get content for package: %s: %r' % \ (url, e),harvest_object) return None From 191c39ce5c13cd240dd9c5545f548bac09e214e1 Mon Sep 17 00:00:00 2001 From: Stefan Oderbolz Date: Thu, 15 Jan 2015 10:57:24 +0100 Subject: [PATCH 6/7] Catch the more general URLError instead of HTTPError HTTPError is a subclass of URLError, so catch URLError is enough. I think the HTTP error code is not as important in this situation, so catching the more generic error seems like the best solution. --- ckanext/harvest/harvesters/ckanharvester.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 70018ae63..26251dcf8 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -44,10 +44,10 @@ def _get_content(self, url): try: http_response = urllib2.urlopen(http_request) - except urllib2.HTTPError, e: + except urllib2.URLError, e: raise ContentFetchError( - 'Could not fetch url: %s, HTTP error code: %s' % - (url, e.code) + 'Could not fetch url: %s, error: %s' % + (url, e.reason) ) return http_response.read() From c1bcee9684b23e1910055b75b673bc4b146a3336 Mon Sep 17 00:00:00 2001 From: Stefan Oderbolz Date: Thu, 15 Jan 2015 11:36:15 +0100 Subject: [PATCH 7/7] Use str() to get the error message --- ckanext/harvest/harvesters/ckanharvester.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 26251dcf8..7575bad65 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -47,7 +47,7 @@ def _get_content(self, url): except urllib2.URLError, e: raise ContentFetchError( 'Could not fetch url: %s, error: %s' % - (url, e.reason) + (url, str(e)) ) return http_response.read()