Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fetch remote organization via action api #121

Merged
merged 7 commits into from Jan 15, 2015
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
53 changes: 43 additions & 10 deletions ckanext/harvest/harvesters/ckanharvester.py
Expand Up @@ -5,6 +5,7 @@
from ckan.model import Session, Package
from ckan.logic import ValidationError, NotFound, get_action
from ckan.lib.helpers import json
from ckan.lib.munge import munge_name

from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \
HarvestObjectError
Expand All @@ -21,10 +22,14 @@ class CKANHarvester(HarvesterBase):
config = None

api_version = 2
action_api_version = 3

def _get_rest_api_offset(self):
return '/api/%d/rest' % self.api_version

def _get_action_api_offset(self):
return '/api/%d/action' % self.action_api_version

def _get_search_api_offset(self):
return '/api/%d/search' % self.api_version

Expand All @@ -36,17 +41,34 @@ def _get_content(self, url):
api_key = self.config.get('api_key',None)
if api_key:
http_request.add_header('Authorization',api_key)
http_response = urllib2.urlopen(http_request)

try:
http_response = urllib2.urlopen(http_request)
except urllib2.URLError, e:
raise ContentFetchError(
'Could not fetch url: %s, error: %s' %
(url, str(e))
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this also needs to catch urllib2.URLError, for cases such as the URL having bad syntax or the socket connection timing out.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. I'll replace it with urllib2.URLError (which is the superclass of urllib2.HTTPError); this should cover the exceptions that can be raised at this point.

return http_response.read()

def _get_group(self, base_url, group_name):
    """Fetch a group's dictionary from the remote REST API.

    :param base_url: base URL of the remote instance; the REST API offset
        and ``/group/<name>`` are appended to it.
    :param group_name: name of the remote group; it is passed through
        ``munge_name`` before being placed in the URL path.
    :returns: the decoded JSON response.
    :raises RemoteResourceError: if the content could not be fetched or
        could not be decoded as JSON.
    """
    url = base_url + self._get_rest_api_offset() + '/group/' + munge_name(group_name)
    try:
        content = self._get_content(url)
        return json.loads(content)
    except (ContentFetchError, ValueError):
        # Only fetch and JSON-decoding failures are translated; a broad
        # ``except Exception`` placed before this handler would make it
        # unreachable and leak arbitrary exception types to callers that
        # catch RemoteResourceError.
        log.debug('Could not fetch/decode remote group')
        raise RemoteResourceError('Could not fetch/decode remote group')

def _get_organization(self, base_url, org_name):
    """Fetch an organization's dictionary from the remote action API.

    Requests ``organization_show`` with ``org_name`` as the ``id`` query
    parameter and returns the ``result`` member of the decoded response.

    :param base_url: base URL of the remote instance; the action API
        offset is appended to it.
    :param org_name: name or id of the remote organization.
    :returns: the value stored under ``'result'`` in the JSON response.
    :raises RemoteResourceError: if the content could not be fetched,
        could not be decoded as JSON, or has no usable ``'result'`` member.
    """
    # NOTE(review): org_name is appended to the query string unescaped --
    # confirm callers only pass URL-safe names.
    url = base_url + self._get_action_api_offset() + '/organization_show?id=' + org_name
    try:
        content = self._get_content(url)
        content_dict = json.loads(content)
        return content_dict['result']
    except (ContentFetchError, ValueError, KeyError, TypeError):
        # TypeError covers a response that decoded to something other than
        # a JSON object (list, string, number, null), which cannot be
        # indexed by 'result'.
        log.debug('Could not fetch/decode remote organization')
        raise RemoteResourceError('Could not fetch/decode remote organization')

def _set_config(self,config_str):
if config_str:
Expand Down Expand Up @@ -155,7 +177,7 @@ def gather_stage(self,harvest_job):
url = base_rest_url + '/revision/%s' % revision_id
try:
content = self._get_content(url)
except Exception,e:
except ContentFetchError,e:
self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job)
continue

Expand All @@ -182,7 +204,7 @@ def gather_stage(self,harvest_job):
url = base_rest_url + '/package'
try:
content = self._get_content(url)
except Exception,e:
except ContentFetchError,e:
self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job)
return None

Expand Down Expand Up @@ -219,7 +241,7 @@ def fetch_stage(self,harvest_object):
# Get contents
try:
content = self._get_content(url)
except Exception,e:
except ContentFetchError,e:
self._save_object_error('Unable to get content for package: %s: %r' % \
(url, e),harvest_object)
return None
Expand Down Expand Up @@ -281,7 +303,7 @@ def import_stage(self,harvest_object):
if remote_groups == 'create':
try:
group = self._get_group(harvest_object.source.url, group_name)
except:
except RemoteResourceError:
log.error('Could not get remote group %s' % group_name)
continue

Expand Down Expand Up @@ -324,13 +346,19 @@ def import_stage(self,harvest_object):
log.info('Organization %s is not available' % remote_org)
if remote_orgs == 'create':
try:
org = self._get_group(harvest_object.source.url, remote_org)
try:
org = self._get_organization(harvest_object.source.url, remote_org)
except RemoteResourceError:
# fallback if remote CKAN exposes organizations as groups
# this especially targets older versions of CKAN
org = self._get_group(harvest_object.source.url, remote_org)

for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name', 'type']:
org.pop(key, None)
get_action('organization_create')(context, org)
log.info('Organization %s has been newly created' % remote_org)
validated_org = org['id']
except:
except (RemoteResourceError, ValidationError):
log.error('Could not get remote org %s' % remote_org)

package_dict['owner_org'] = validated_org or local_org
Expand Down Expand Up @@ -405,3 +433,8 @@ def import_stage(self,harvest_object):
except Exception, e:
self._save_object_error('%r'%e,harvest_object,'Import')

class ContentFetchError(Exception):
    """Raised when the content of a remote URL could not be fetched."""

class RemoteResourceError(Exception):
    """Raised when a remote group or organization could not be fetched or decoded."""