[#56] Allow providing a dataset schema to profiles

This allows checking whether a field should be stored as a custom field or as an extra
ckan · May 8, 2024 · 65abb1f · 65abb1f
1 parent 48b5e61
commit 65abb1f
Show file tree

Hide file tree

Showing 2 changed files with 70 additions and 7 deletions.
diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py
@@ -33,7 +33,7 @@
 
 class RDFProcessor(object):
 
-    def __init__(self, profiles=None, compatibility_mode=False):
+    def __init__(self, profiles=None, dataset_schema='dataset', compatibility_mode=False):
         '''
         Creates a parser or serializer instance
 
@@ -56,6 +56,8 @@ def __init__(self, profiles=None, compatibility_mode=False):
             raise RDFProfileException(
                 'No suitable RDF profiles could be loaded')
 
+        self.dataset_schema = dataset_schema
+
         if not compatibility_mode:
             compatibility_mode = p.toolkit.asbool(
                 config.get(COMPAT_MODE_CONFIG_OPTION, False))
@@ -177,11 +179,16 @@ def datasets(self):
         for dataset_ref in self._datasets():
             dataset_dict = {}
             for profile_class in self._profiles:
-                profile = profile_class(self.g, self.compatibility_mode)
+                profile = profile_class(
+                    self.g,
+                    dataset_schema=self.dataset_schema,
+                    compatibility_mode=self.compatibility_mode
+                )
                 profile.parse_dataset(dataset_dict, dataset_ref)
 
             yield dataset_dict
 
+
 class RDFSerializer(RDFProcessor):
     '''
     A CKAN to RDF serializer based on rdflib
@@ -245,7 +252,7 @@ def graph_from_dataset(self, dataset_dict):
         dataset_ref = URIRef(dataset_uri(dataset_dict))
 
         for profile_class in self._profiles:
-            profile = profile_class(self.g, self.compatibility_mode)
+            profile = profile_class(self.g, compatibility_mode=self.compatibility_mode)
             profile.graph_from_dataset(dataset_dict, dataset_ref)
 
         return dataset_ref
@@ -263,7 +270,7 @@ def graph_from_catalog(self, catalog_dict=None):
         catalog_ref = URIRef(catalog_uri())
 
         for profile_class in self._profiles:
-            profile = profile_class(self.g, self.compatibility_mode)
+            profile = profile_class(self.g, compatibility_mode=self.compatibility_mode)
             profile.graph_from_catalog(catalog_dict, catalog_ref)
 
         return catalog_ref

diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py
@@ -5,6 +5,7 @@
 
 from dateutil.parser import parse as parse_date
 
+import ckantoolkit as toolkit
 from ckantoolkit import config
 from ckantoolkit import url_for
 
@@ -15,7 +16,6 @@
 from geomet import wkt, InvalidGeoJSONException
 
 from ckan.model.license import LicenseRegister
-from ckan.plugins import toolkit
 from ckan.lib.munge import munge_tag
 from ckan.lib.helpers import resource_formats
 from ckanext.dcat.utils import resource_uri, publisher_uri_organization_fallback, DCAT_EXPOSE_SUBCATALOGS, DCAT_CLEAN_TAGS
@@ -55,6 +55,19 @@
 
 DISTRIBUTION_LICENSE_FALLBACK_CONFIG = 'ckanext.dcat.resource.inherit.license'
 
+ROOT_DATASET_FIELDS = [  # keys _set_dataset_value keeps at the dict root
+    'name',
+    'title',
+    'url',
+    'version',
+    'tags',
+    'license_id',
+    'maintainer',
+    'maintainer_email',
+    'author',
+    'author_email',
+]
+
 
 class URIRefOrLiteral(object):
     '''Helper which creates an URIRef if the value appears to be an http URL,
@@ -111,7 +124,9 @@ class RDFProfile(object):
        custom profiles
     '''
 
-    def __init__(self, graph, compatibility_mode=False):
+    _dataset_schema = None
+
+    def __init__(self, graph, dataset_schema='dataset', compatibility_mode=False):
         '''Class constructor
 
         Graph is an rdflib.Graph instance.
@@ -130,6 +145,15 @@ def __init__(self, graph, compatibility_mode=False):
         # _license().
         self._licenceregister_cache = None
 
+        schema_show = toolkit.get_action("scheming_dataset_schema_show")
+        if schema_show:
+            try:
+                schema = schema_show({}, {"type": dataset_schema})
+            except toolkit.ObjectNotFound:
+                raise toolkit.ObjectNotFound(f"Unknown dataset schema: {dataset_schema}")
+
+            self._dataset_schema = schema
+
     def _datasets(self):
         '''
         Generator that returns all DCAT datasets on the graph
@@ -695,6 +719,38 @@ def _add_spatial_to_dict(self, dataset_dict, key, spatial):
                 {'key': 'spatial_{0}'.format(key) if key != 'geom' else 'spatial',
                  'value': spatial.get(key)})
 
+    def _schema_field(self, key):
+        '''
+        Returns the first entry of the schema's `dataset_fields` whose `field_name`
+        equals key, or None if no schema is loaded or no field matches
+        '''
+        if not self._dataset_schema:
+            return None  # no schema was loaded for this profile
+
+        for field in self._dataset_schema['dataset_fields']:
+            if field['field_name'] == key:
+                return field
+
+    def _set_dataset_value(self, dataset_dict, key, value):
+        '''
+        Sets the value for a given key in a CKAN dataset dict (modified in place)
+
+        Keys in ROOT_DATASET_FIELDS, and keys matching a field of the dataset
+        schema (if one was loaded), are set at the dict root level. Any other key
+        is appended to `extras` as a {'key': key, 'value': value} dict.
+
+        The `extras` list is created when it is missing or empty. Returns the
+        same dataset_dict that was passed in.
+        '''
+        if self._schema_field(key) or key in ROOT_DATASET_FIELDS:
+            dataset_dict[key] = value
+        else:
+            if not dataset_dict.get('extras'):  # missing, None or empty
+                dataset_dict['extras'] = []
+            dataset_dict['extras'].append({'key': key, 'value': value})
+
+        return dataset_dict
+
     def _get_dataset_value(self, dataset_dict, key, default=None):
         '''
         Returns the value for the given key on a CKAN dict
@@ -1021,7 +1077,7 @@ def parse_dataset(self, dataset_dict, dataset_ref):
                 ):
             value = self._object_value(dataset_ref, predicate)
             if value:
-                dataset_dict['extras'].append({'key': key, 'value': value})
+                self._set_dataset_value(dataset_dict, key, value)
 
         #  Lists
         for key, predicate, in (