Skip to content

Commit

Permalink
Fixes for XML harvester
Browse files: browse the repository at this point in the history
  • Loading branch information
amercader committed May 16, 2014
1 parent 57ba862 commit d289c58
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 5 deletions.
7 changes: 4 additions & 3 deletions ckanext/dcat/formats/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,11 @@ def get_xml_tree(self):
else:
xml_str = self.xml_str
self.xml_tree = etree.fromstring(xml_str, parser=parser)

if self.base_class and ':' in self.base_class:
ns = self.base_class.split(':')[0]
if self.base_class.replace(ns, self.xml_tree.nsmap[ns]) != self.xml_tree.tag:
parts = self.base_class.split(':')
ns = parts[0]
name = parts[1]
if '{{{ns}}}{name}'.format(ns=self.xml_tree.nsmap[ns], name=name) != self.xml_tree.tag:
elements = self.xml_tree.xpath(self.base_class, namespaces=self.xml_tree.nsmap)
if len(elements):
self.xml_tree = elements[0]
Expand Down
11 changes: 9 additions & 2 deletions ckanext/dcat/harvesters.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def _get_content(self, url, harvest_job, page=1):
# first we try a HEAD request which may not be supported
did_get = False
r = requests.head(url)
if r.status_code == 405:
if r.status_code == 405 or r.status_code == 400:
r = requests.get(url, stream=True)
did_get = True
r.raise_for_status()
Expand Down Expand Up @@ -303,6 +303,9 @@ def import_stage(self,harvest_object):


package_dict, dcat_dict = self._get_package_dict(harvest_object)
if not package_dict:
return False

if not package_dict.get('name'):
package_dict['name'] = self._get_package_name(harvest_object, package_dict['title'])

Expand Down Expand Up @@ -394,7 +397,11 @@ def _get_package_dict(self, harvest_object):

content = harvest_object.content

dataset = formats.xml.DCATDataset(content)
try:
dataset = formats.xml.DCATDataset(content)
except ValueError, e:
self._save_object_error('Content does not look like dcat:Dataset for harvest object {0}'.format(harvest_object.id), harvest_object, 'Import')
return None, None
dcat_dict = dataset.read_values()

package_dict = converters.dcat_to_ckan(dcat_dict)
Expand Down

0 comments on commit d289c58

Please sign in to comment.