Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not duplicate harvest_extras if exist in root schema #521

Merged
merged 3 commits into from
Mar 14, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
112 changes: 61 additions & 51 deletions ckanext/harvest/plugin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,40 @@ def before_dataset_search(self, search_params):

return search_params

def _add_or_update_harvest_metadata(self, key, value, data_dict):
"""Adds extras fields or updates them if already exist."""
if not data_dict.get("extras"):
data_dict["extras"] = []

for e in data_dict.get("extras"):
if e.get("key") == key:
e.update({"value": value})
break
else:
data_dict["extras"].append({"key": key, "value": value})

def before_dataset_index(self, pkg_dict):
"""Adds harvest metadata to the extra field of the dataset.

This method will add or update harvest related metadata in `pkg_dict`,
`data_dict` and `validated_data_dict` so it can be obtained when
calling package_show API (that depends on Solr data). This metadata will
be stored in the `extras` field of the dictionaries ONLY if it does not
already exist in the root schema.

Note: If another extension adds any harvest extra to the `package_show`
schema then this method will not add them again in the `extras` field to avoid
validation errors when updating a package.

If the harvest extra has been added to the root schema, then we will not update
them since it is responsibility of the package validators to do it.
"""
# Fix to support Solr8
if isinstance(pkg_dict.get('status'), dict):
try:
pkg_dict['status'] = json.dumps(pkg_dict['status'])
except ValueError:
pkg_dict.pop('status', None)

harvest_object = model.Session.query(HarvestObject) \
.filter(HarvestObject.package_id == pkg_dict["id"]) \
Expand All @@ -125,59 +158,36 @@ def before_dataset_index(self, pkg_dict):
).order_by(HarvestObject.import_finished.desc()) \
.first()

if harvest_object:

data_dict = json.loads(pkg_dict["data_dict"])

validated_data_dict = json.loads(pkg_dict["validated_data_dict"])

harvest_extras = [
("harvest_object_id", harvest_object.id),
("harvest_source_id", harvest_object.source.id),
("harvest_source_title", harvest_object.source.title),
]

for key, value in harvest_extras:

# If the harvest extras are there, update them. This can
# happen eg when calling package_update or resource_update,
# which call package_show
harvest_not_found = True
harvest_not_found_validated = True
if not data_dict.get("extras"):
data_dict["extras"] = []

for e in data_dict.get("extras"):
if e.get("key") == key:
e.update({"value": value})
harvest_not_found = False
if harvest_not_found:
data_dict["extras"].append({"key": key, "value": value})

if not validated_data_dict.get("extras"):
validated_data_dict["extras"] = []

for e in validated_data_dict.get("extras"):
if e.get("key") == key:
e.update({"value": value})
harvest_not_found_validated = False
if harvest_not_found_validated:
validated_data_dict["extras"].append({"key": key, "value": value})

# The commented line isn't cataloged correctly, if we pass the
# basic key the extras are prepended and the system works as
# expected.
# pkg_dict['extras_{0}'.format(key)] = value
if not harvest_object:
return pkg_dict

harvest_extras = [
("harvest_object_id", harvest_object.id),
("harvest_source_id", harvest_object.source.id),
("harvest_source_title", harvest_object.source.title),
]

data_dict = json.loads(pkg_dict["data_dict"])
for key, value in harvest_extras:
if key in data_dict.keys():
data_dict[key] = value
continue
self._add_or_update_harvest_metadata(key, value, data_dict)

validated_data_dict = json.loads(pkg_dict["validated_data_dict"])
for key, value in harvest_extras:
if key in validated_data_dict.keys():
validated_data_dict[key] = value
continue
self._add_or_update_harvest_metadata(key, value, validated_data_dict)

# Add harvest extras to main indexed pkg_dict
for key, value in harvest_extras:
if key not in pkg_dict.keys():
pkg_dict[key] = value

pkg_dict["data_dict"] = json.dumps(data_dict)
pkg_dict["validated_data_dict"] = json.dumps(validated_data_dict)

if isinstance(pkg_dict.get('status'), dict):
try:
pkg_dict['status'] = json.dumps(pkg_dict['status'])
except ValueError:
pkg_dict.pop('status', None)
pkg_dict["data_dict"] = json.dumps(data_dict)
pkg_dict["validated_data_dict"] = json.dumps(validated_data_dict)

return pkg_dict

Expand Down