From 34fa49064521618396f205a13b2e95accedfc59a Mon Sep 17 00:00:00 2001 From: pdelboca Date: Thu, 9 Mar 2023 09:59:41 -0300 Subject: [PATCH 1/3] Do not duplicate harvest_extras if exist in root schema --- ckanext/harvest/plugin/__init__.py | 114 +++++++++++++++-------------- 1 file changed, 61 insertions(+), 53 deletions(-) diff --git a/ckanext/harvest/plugin/__init__.py b/ckanext/harvest/plugin/__init__.py index a2a0b4f5..6fea9a4b 100644 --- a/ckanext/harvest/plugin/__init__.py +++ b/ckanext/harvest/plugin/__init__.py @@ -116,7 +116,37 @@ def before_dataset_search(self, search_params): return search_params + def _add_or_update_harvest_metadata(self, key, value, data_dict): + """Adds extras fields or updates them if already exist.""" + if not data_dict.get("extras"): + data_dict["extras"] = [] + + for e in data_dict.get("extras"): + if e.get("key") == key: + e.update({"value": value}) + break + else: + data_dict["extras"].append({"key": key, "value": value}) + def before_dataset_index(self, pkg_dict): + """Adds harvest metadata to the extra field of the dataset. + + This method will add or update harvest related metadata in `pkg_dict`, + `data_dict` and `validated_data_dict` so it can be obtained when + calling package_show API (that depends on Solr data). This metadata will + be stored in the `extra` field of the dictionaries. + + By default all harvest metadata will go in the extra field. If + another extension adds any of them to the `package_show` schema + then it will not be added again in the `extras` field to avoid + validation errors when updating a package. 
+ """ + # Fix to support Solr8 + if isinstance(pkg_dict.get('status'), dict): + try: + pkg_dict['status'] = json.dumps(pkg_dict['status']) + except ValueError: + pkg_dict.pop('status', None) harvest_object = model.Session.query(HarvestObject) \ .filter(HarvestObject.package_id == pkg_dict["id"]) \ @@ -125,59 +155,37 @@ def before_dataset_index(self, pkg_dict): ).order_by(HarvestObject.import_finished.desc()) \ .first() - if harvest_object: - - data_dict = json.loads(pkg_dict["data_dict"]) - - validated_data_dict = json.loads(pkg_dict["validated_data_dict"]) - - harvest_extras = [ - ("harvest_object_id", harvest_object.id), - ("harvest_source_id", harvest_object.source.id), - ("harvest_source_title", harvest_object.source.title), - ] - - for key, value in harvest_extras: - - # If the harvest extras are there, update them. This can - # happen eg when calling package_update or resource_update, - # which call package_show - harvest_not_found = True - harvest_not_found_validated = True - if not data_dict.get("extras"): - data_dict["extras"] = [] - - for e in data_dict.get("extras"): - if e.get("key") == key: - e.update({"value": value}) - harvest_not_found = False - if harvest_not_found: - data_dict["extras"].append({"key": key, "value": value}) - - if not validated_data_dict.get("extras"): - validated_data_dict["extras"] = [] - - for e in validated_data_dict.get("extras"): - if e.get("key") == key: - e.update({"value": value}) - harvest_not_found_validated = False - if harvest_not_found_validated: - validated_data_dict["extras"].append({"key": key, "value": value}) - - # The commented line isn't cataloged correctly, if we pass the - # basic key the extras are prepended and the system works as - # expected. 
- # pkg_dict['extras_{0}'.format(key)] = value - pkg_dict[key] = value - - pkg_dict["data_dict"] = json.dumps(data_dict) - pkg_dict["validated_data_dict"] = json.dumps(validated_data_dict) - - if isinstance(pkg_dict.get('status'), dict): - try: - pkg_dict['status'] = json.dumps(pkg_dict['status']) - except ValueError: - pkg_dict.pop('status', None) + if not harvest_object: + return pkg_dict + + harvest_extras = [ + ("harvest_object_id", harvest_object.id), + ("harvest_source_id", harvest_object.source.id), + ("harvest_source_title", harvest_object.source.title), + ] + + # Add harvest extras to data_dict + data_dict = json.loads(pkg_dict["data_dict"]) + for key, value in harvest_extras: + if key in data_dict.keys(): + data_dict[key] = value + continue + self._add_or_update_harvest_metadata(key, value, data_dict) + + # Add harvest extras to validated_data_dict + validated_data_dict = json.loads(pkg_dict["validated_data_dict"]) + for key, value in harvest_extras: + if key in validated_data_dict.keys(): + validated_data_dict[key] = value + continue + self._add_or_update_harvest_metadata(key, value, validated_data_dict) + + # Add harvest extras to main indexed pkg_dict + for key, value in harvest_extras: + pkg_dict[key] = value + + pkg_dict["data_dict"] = json.dumps(data_dict) + pkg_dict["validated_data_dict"] = json.dumps(validated_data_dict) return pkg_dict From 62a16d0bfffdb847a1d1f379ef0fc1915a2238d7 Mon Sep 17 00:00:00 2001 From: pdelboca Date: Mon, 13 Mar 2023 09:19:13 -0300 Subject: [PATCH 2/3] Improve docstring --- ckanext/harvest/plugin/__init__.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ckanext/harvest/plugin/__init__.py b/ckanext/harvest/plugin/__init__.py index 6fea9a4b..b66cbf10 100644 --- a/ckanext/harvest/plugin/__init__.py +++ b/ckanext/harvest/plugin/__init__.py @@ -134,11 +134,11 @@ def before_dataset_index(self, pkg_dict): This method will add or update harvest related metadata in `pkg_dict`, `data_dict` and 
`validated_data_dict` so it can be obtained when calling package_show API (that depends on Solr data). This metadata will - be stored in the `extra` field of the dictionaries. + be stored in the `extras` field of the dictionaries ONLY if it does not + already exist in the root schema. - By default all harvest metadata will go in the extra field. If - another extension adds any of them to the `package_show` schema - then it will not be added again in the `extras` field to avoid + Note: If another extension adds any harvest extra to the `package_show` + schema then this method will add them again in the `extras` field to avoid validation errors when updating a package. """ # Fix to support Solr8 @@ -164,7 +164,6 @@ def before_dataset_index(self, pkg_dict): ("harvest_source_title", harvest_object.source.title), ] - # Add harvest extras to data_dict data_dict = json.loads(pkg_dict["data_dict"]) for key, value in harvest_extras: if key in data_dict.keys(): @@ -172,7 +171,6 @@ def before_dataset_index(self, pkg_dict): continue self._add_or_update_harvest_metadata(key, value, data_dict) - # Add harvest extras to validated_data_dict validated_data_dict = json.loads(pkg_dict["validated_data_dict"]) for key, value in harvest_extras: if key in validated_data_dict.keys(): From 10c2fecec389989ea69e3f1da73f5c20516f8d98 Mon Sep 17 00:00:00 2001 From: pdelboca Date: Mon, 13 Mar 2023 12:04:11 -0300 Subject: [PATCH 3/3] Do not update extra in pkg_dict if it already exists. This is a responsibility of the package. We are skipping any override since users will expect the behaviour of the custom logic added in package schema and validators. 
--- ckanext/harvest/plugin/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ckanext/harvest/plugin/__init__.py b/ckanext/harvest/plugin/__init__.py index b66cbf10..5eabbe09 100644 --- a/ckanext/harvest/plugin/__init__.py +++ b/ckanext/harvest/plugin/__init__.py @@ -138,8 +138,11 @@ def before_dataset_index(self, pkg_dict): already exist in the root schema. Note: If another extension adds any harvest extra to the `package_show` - schema then this method will add them again in the `extras` field to avoid + schema then this method will not add them again in the `extras` field to avoid validation errors when updating a package. + + If the harvest extra has been added to the root schema, then we will not update + them since it is responsibility of the package validators to do it. """ # Fix to support Solr8 if isinstance(pkg_dict.get('status'), dict): @@ -180,7 +183,8 @@ def before_dataset_index(self, pkg_dict): # Add harvest extras to main indexed pkg_dict for key, value in harvest_extras: - pkg_dict[key] = value + if key not in pkg_dict.keys(): + pkg_dict[key] = value pkg_dict["data_dict"] = json.dumps(data_dict) pkg_dict["validated_data_dict"] = json.dumps(validated_data_dict)