Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Revert "#276 Coupled Resource table now gets updated during harvest."
Reverting changes for "#276 Coupled Resource" on master as it is INSPIRE-specific. Moving to datagovuk/ckanext-spatial branch dgu.

This reverts commit 0153687.

Conflicts:

	ckanext/spatial/harvesters.py
  • Loading branch information
David Read committed Feb 4, 2013
1 parent 84b75ea commit c771a76
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 437 deletions.
6 changes: 6 additions & 0 deletions ckanext/spatial/bin/coupled_resources.py
Expand Up @@ -234,13 +234,17 @@ def add_coupling(cls, service_record, dataset_record,
from ckanext.harvest.model import HarvestCoupledResource

if dataset_harvest_object.harvest_source_reference != harvest_source_reference:
rev = model.repo.new_revision()
rev.author = 'Couple migration'
dataset_harvest_object.harvest_source_reference = harvest_source_reference
model.Session.commit()
q = model.Session.query(HarvestCoupledResource) \
.filter_by(service_record_package_id=service_record.id) \
.filter_by(dataset_record_package_id=dataset_record.id) \
.filter_by(harvest_source_reference=harvest_source_reference)
if q.count() == 0:
rev = model.repo.new_revision()
rev.author = 'Couple migration'
obj = HarvestCoupledResource(
service_record_package_id=service_record.id,
dataset_record_package_id=dataset_record.id,
Expand All @@ -267,6 +271,8 @@ def ensure_dataset_is_in_couple_table(cls, dataset_record):
harvest_object = harvest_objects[0]
harvest_source_reference = harvest_object.harvest_source_reference

rev = model.repo.new_revision()
rev.author = 'Couple migration'
obj = HarvestCoupledResource(
dataset_record_package_id=dataset_record.id,
harvest_source_reference=harvest_source_reference)
Expand Down
28 changes: 4 additions & 24 deletions ckanext/spatial/harvesters.py
Expand Up @@ -46,7 +46,6 @@
from ckanext.spatial.model import GeminiDocument
from ckanext.spatial.lib.csw_client import CswService
from ckanext.spatial.validation import Validators
from ckanext.spatial.lib.coupled_resource import extract_guid, update_coupled_resources

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -155,7 +154,7 @@ def import_stage(self, harvest_object):
if debug_exception_mode:
raise

def import_gemini_object(self, gemini_string, harvest_source_reference):
def import_gemini_object(self, gemini_string):
'''Imports the Gemini metadata into CKAN.
The harvest_source_reference is an ID that the harvest_source uses
Expand All @@ -178,10 +177,6 @@ def import_gemini_object(self, gemini_string, harvest_source_reference):
# may raise Exception for errors
package_dict = self.write_package_from_gemini_string(unicode_gemini_string)

if package_dict:
package = Session.query(Package).get(package_dict['id'])
update_coupled_resources(package, harvest_source_reference)


def write_package_from_gemini_string(self, content):
'''Create or update a Package based on some content that has
Expand Down Expand Up @@ -429,10 +424,11 @@ def write_package_from_gemini_string(self, content):
self.obj.current = True
self.obj.save()


assert gemini_guid == [e['value'] for e in package['extras'] if e['key'] == 'guid'][0]
assert self.obj.id == [e['value'] for e in package['extras'] if e['key'] == 'harvest_object_id'][0]

return package # i.e. a package_dict
return package

@classmethod
def _process_responsible_organisation(cls, responsible_organisations):
Expand Down Expand Up @@ -493,8 +489,7 @@ def gen_new_name(self, title):
counter = counter + 1
return None

@classmethod
def _extract_first_licence_url(cls, licences):
def _extract_first_licence_url(self, licences):
'''Given a list of pieces of licence info, hunt for the first one
which looks like a URL and return it. Otherwise returns None.'''
for licence in licences:
Expand All @@ -505,14 +500,6 @@ def _extract_first_licence_url(cls, licences):

def _create_package_from_data(self, package_dict, package = None):
'''
Given a package_dict describing a package, creates or updates
a package object. If you supply package then it will update it,
otherwise it will create a new one.
Uses the logic layer to create it.
Returns a package_dict of the resulting package.
{'name': 'council-owned-litter-bins',
'notes': 'Location of Council owned litter bins within Borough.',
'resources': [{'description': 'Resource locator',
Expand Down Expand Up @@ -600,7 +587,6 @@ def get_gemini_string_and_guid(self,content,url=None):

return gemini_string, gemini_guid


class GeminiCswHarvester(GeminiHarvester, SingletonPlugin):
'''
A Harvester for CSW servers
Expand Down Expand Up @@ -647,9 +633,6 @@ def gather_stage(self, harvest_job):
# Create a new HarvestObject for this identifier
obj = HarvestObject(guid=identifier, job=harvest_job,
harvest_source_reference=guid)
# NB: Gemini uses GUID for the harvest_source_reference
# whereas INSPIRE specifies the Unique Resource
# Identifier
obj.save()

ids.append(obj.id)
Expand Down Expand Up @@ -824,9 +807,6 @@ def gather_stage(self,harvest_job):
job=harvest_job,
content=gemini_string,
harvest_source_reference=url)
# NB: Gemini uses WAF URL for the
# harvest_source_reference whereas INSPIRE
# specifies the Unique Resource Identifier
obj.save()

ids.append(obj.id)
Expand Down
153 changes: 1 addition & 152 deletions ckanext/spatial/lib/coupled_resource.py
@@ -1,17 +1,7 @@
import re
import logging

from ckan.lib.base import json
from ckanext.harvest.model import HarvestCoupledResource
from ckan import model

log = logging.getLogger(__name__)

guid_matcher = None

class CoupledResourceParseError(Exception):
    '''Raised when the href of a coupled resource entry cannot be turned
    into a harvest source reference (wrong number of hrefs, blank href or
    blank resulting reference).'''

def extract_guid(csw_url):
'''Given a CSW GetRecordByID URL, identify the record\'s ID (GUID).
Returns the GUID or None if it can\'t find it.'''
Expand Down Expand Up @@ -42,147 +32,6 @@ def extract_gemini_harvest_source_reference(coupled_href):
if not coupled_href.startswith('http'):
return
guid = extract_guid(coupled_href)
return guid or coupled_href.strip()

def extract_harvest_source_reference_from_coupled_resource(coupled_resource_dict):
    '''Given a coupled_resource_dict, returns the harvest_source_reference.

    :param coupled_resource_dict: dict describing one coupled resource. Its
        'href' value must be a sequence containing exactly one non-blank
        string.
    :returns: the (non-blank) value that
        extract_gemini_harvest_source_reference derives from that href.

    May raise CoupledResourceParseError.
    '''
    href = coupled_resource_dict['href']
    # "!=" instead of the Python 2-only "<>" operator.
    if len(href) != 1:
        raise CoupledResourceParseError('Coupled resource href is not a list of 1: %r' % href)
    href = href[0]
    if not href.strip():
        raise CoupledResourceParseError('Coupled resource href is blank.')
    ref = extract_gemini_harvest_source_reference(href)
    if not ref:
        # The helper returned nothing usable for this href.
        raise CoupledResourceParseError('Coupled resource harvest source reference is blank')
    return ref

def _package_name(package_or_none):
return package_or_none.name if package_or_none else None

def update_coupled_resources(package, harvest_source_reference):
    '''Update the harvest_coupled_resource_table with the details of this
    harvested package\'s couplings.

    :param package: the Package object containing extra fields with couples
                    to update in the table.
    :param harvest_source_reference: the ref of this package being harvested.
                    This is not relevant if it is a service record, but
                    essential if it is a dataset.

    Packages whose 'resource-type' extra is neither 'service', 'dataset'
    nor 'series' are left untouched. Always returns None.
    '''
    resource_type = package.extras['resource-type']
    if resource_type == 'service':
        _update_couples_for_service(package)
    elif resource_type in ('dataset', 'series'):
        _update_couples_for_dataset(package, harvest_source_reference)


def _update_couples_for_service(package):
    '''Make the HarvestCoupledResource rows of a service record match the
    couples listed in its 'coupled-resource' extra.'''
    # When a service record is harvested, ensure the couples listed
    # in it match the couples in the HarvestCoupledResource objects,
    # ignoring their dataset values (they might be filled in or not).
    pkg_couples = json.loads(package.extras['coupled-resource'])
    log.info('Service Record %s has %i coupled resources to update',
             package.name, len(pkg_couples))

    table_couples_matching_service = HarvestCoupledResource.get_by_service_record(package)
    # Cross them off as we go; whatever is left at the end is stale.
    table_couples_not_matching_pkg = table_couples_matching_service.all()

    for pkg_couple in pkg_couples:
        try:
            ref = extract_harvest_source_reference_from_coupled_resource(pkg_couple)
        except CoupledResourceParseError as e:
            log.warning('Error parsing couple: %s Ignoring couple=%s', e, pkg_couple)
            continue

        # Match both service and ref
        matching_table_couples = table_couples_matching_service.filter_by(
            harvest_source_reference=ref)
        if matching_table_couples.count() > 0:
            # Test: test_02_reharvest_existing_service
            # Note down the matches so we don't delete them later
            for matching_table_couple in matching_table_couples:
                log.info('Service couple is already there (%s, %s, %s)',
                         package.name, ref,
                         _package_name(matching_table_couple.dataset_record))
                # The same ref may be listed twice in the record, in which
                # case the couple has already been crossed off.
                if matching_table_couple in table_couples_not_matching_pkg:
                    table_couples_not_matching_pkg.remove(matching_table_couple)
            continue

        # Match just ref with blank service
        matching_table_couples = \
            HarvestCoupledResource.get_by_harvest_source_reference(ref)\
            .filter_by(service_record=None)
        if matching_table_couples.count() == 0:
            # Test: test_06_harvest_service_not_matching_a_dataset
            # create the row
            obj = HarvestCoupledResource(service_record=package,
                                         harvest_source_reference=ref)
            model.Session.add(obj)
            log.info('Ref is new for this service - adding (%s, %s, None)',
                     package.name, ref)
            model.Session.commit()
        else:
            # Test: test_04_harvest_service_to_match_existing_dataset
            for matching_table_couple in matching_table_couples:
                # fill in the service value
                matching_table_couple.service_record = package
                log.info('Service filled into couple matching ref (%s, %s, %s)',
                         package.name, ref,
                         _package_name(matching_table_couple.dataset_record))
                model.Session.commit()

    # Delete service value for any table_couples not matching the package
    # Test: test_08_reharvest_existing_service_to_delete_and_add_couples
    for table_couple in table_couples_not_matching_pkg:
        # Log the couple's own ref: the "ref" loop variable above belongs to
        # the last parsed package couple and is not even bound when the
        # record lists no parseable couples.
        log.info('Service couple not matched - deleted service (%s->None, %s, %s)',
                 _package_name(table_couple.service_record),
                 table_couple.harvest_source_reference,
                 _package_name(table_couple.dataset_record))
        table_couple.service_record = None
        model.Session.commit()


def _update_couples_for_dataset(package, ref):
    '''Make the HarvestCoupledResource rows of a dataset (or series) record
    refer to it only under the given harvest source reference.'''
    # When a dataset (or dataset series) record is harvested, for its
    # dataset_record_package_id there should be one ref - any with another
    # ref is removed. And for the dataset_record_package_id and ref combo
    # there should be one or more HarvestCoupledResource objects (with
    # a service or without).
    assert ref

    # Couples where this dataset is under a different ref
    # Test: test_07_reharvest_existing_dataset_but_with_changed_ref
    outdated_couples = model.Session.query(HarvestCoupledResource) \
        .filter_by(dataset_record=package) \
        .filter(HarvestCoupledResource.harvest_source_reference != ref)
    for couple in outdated_couples:
        log.info('Ref %s has been replaced for this dataset record with '
                 '%s. Removing link to the dataset record (%s, %s, %s->None)',
                 couple.harvest_source_reference, ref,
                 _package_name(couple.service_record),
                 couple.harvest_source_reference,
                 _package_name(couple.dataset_record))
        couple.dataset_record = None
        model.Session.commit()

    # Couples with this ref
    for couple in HarvestCoupledResource.get_by_harvest_source_reference(ref):
        if couple.dataset_record != package:
            # Test: test_03_harvest_dataset_to_match_existing_service
            log.info('Linking ref to this dataset record (%s, %s, %s->%s)',
                     _package_name(couple.service_record),
                     ref,
                     _package_name(couple.dataset_record),
                     package.name)
            couple.dataset_record = package
            model.Session.commit()
        else:
            # Test: test_01_reharvest_existing_dataset
            log.info('Couple for this dataset and ref already exists (%s, %s, %s)',
                     _package_name(couple.service_record),
                     ref,
                     _package_name(couple.dataset_record))

    # No couples for this ref
    couples = HarvestCoupledResource.get_by_harvest_source_reference(ref)
    if couples.count() == 0:
        # Test: test_05_harvest_dataset_not_matching_a_service
        obj = HarvestCoupledResource(dataset_record=package,
                                     harvest_source_reference=ref)
        model.Session.add(obj)
        log.info('Ref is new - adding new dataset couple (None, %s, %s)',
                 ref, package.name)
        model.Session.commit()

0 comments on commit c771a76

Please sign in to comment.