[#56] Consolidate and simplify publisher handling

Values for dct:publisher are obtained from the first of the following sources that is present:

1. A scheming `publisher` field (assuming the `euro_dcat_ap_scheming` profile is loaded)
2. The legacy `publisher_*` extras
3. The dataset's own organization

For the last case, a sample schema for organizations has been added that implements all the publisher properties supported by the processors.
ckan · May 29, 2024 · f9467d4 · f9467d4
1 parent 2e4b4bc
commit f9467d4
Show file tree

Hide file tree

Showing 6 changed files with 257 additions and 42 deletions.
diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py
@@ -126,6 +126,13 @@ class RDFProfile(object):
 
     _dataset_schema = None
 
+    # Cache for mappings of licenses URL/title to ID built when needed in
+    # _license().
+    _licenceregister_cache = None
+
+    # Cache for organization_show details (used for publisher fallback)
+    _org_cache: dict = {}
+
     def __init__(self, graph, dataset_type='dataset', compatibility_mode=False):
         '''Class constructor
 
@@ -144,10 +151,6 @@ def __init__(self, graph, dataset_type='dataset', compatibility_mode=False):
 
         self.compatibility_mode = compatibility_mode
 
-        # Cache for mappings of licenses URL/title to ID built when needed in
-        # _license().
-        self._licenceregister_cache = None
-
         try:
             schema_show = toolkit.get_action("scheming_dataset_schema_show")
             try:
@@ -1365,45 +1368,61 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
             )
 
         # Publisher
-        if any([
+        publisher_ref = None
+
+        if dataset_dict.get('publisher'):
+            # Scheming publisher field: will be handled in a separate profile
+            pass
+        elif any([
             self._get_dataset_value(dataset_dict, 'publisher_uri'),
             self._get_dataset_value(dataset_dict, 'publisher_name'),
-            dataset_dict.get('organization'),
         ]):
-
+            # Legacy publisher_* extras
             publisher_uri = self._get_dataset_value(dataset_dict, 'publisher_uri')
-            publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict)
             publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
             if publisher_uri:
-                publisher_details = CleanedURIRef(publisher_uri)
-            elif not publisher_name and publisher_uri_fallback:
-                # neither URI nor name are available, use organization as fallback
-                publisher_details = CleanedURIRef(publisher_uri_fallback)
+                publisher_ref = CleanedURIRef(publisher_uri)
             else:
                 # No publisher_uri
-                publisher_details = BNode()
-
-            g.add((publisher_details, RDF.type, FOAF.Organization))
-            g.add((dataset_ref, DCT.publisher, publisher_details))
-
-            # In case no name and URI are available, again fall back to organization.
-            # If no name but an URI is available, the name literal remains empty to
-            # avoid mixing organization and dataset values.
-            if not publisher_name and not publisher_uri and dataset_dict.get('organization'):
-                publisher_name = dataset_dict['organization']['title']
-
-            g.add((publisher_details, FOAF.name, Literal(publisher_name)))
-            # TODO: It would make sense to fallback these to organization
-            # fields but they are not in the default schema and the
-            # `organization` object in the dataset_dict does not include
-            # custom fields
+                publisher_ref = BNode()
+            publisher_details = {
+                'name': publisher_name,
+                'email': self._get_dataset_value(dataset_dict, 'publisher_email'),
+                'url': self._get_dataset_value(dataset_dict, 'publisher_url'),
+                'type': self._get_dataset_value(dataset_dict, 'publisher_type'),
+            }
+        elif dataset_dict.get('organization'):
+            # Fall back to dataset org
+            org_id = dataset_dict['organization']['id']
+            org_dict = None
+            if org_id in self._org_cache:
+                org_dict = self._org_cache[org_id]
+            else:
+                try:
+                    org_dict = toolkit.get_action('organization_show')(
+                        {'ignore_auth': True}, {'id': org_id})
+                    self._org_cache[org_id] = org_dict
+                except toolkit.ObjectNotFound:
+                    pass
+            if org_dict:
+                publisher_ref = CleanedURIRef(publisher_uri_organization_fallback(dataset_dict))
+                publisher_details = {
+                    'name': org_dict.get('title'),
+                    'email': org_dict.get('email'),
+                    'url': org_dict.get('url'),
+                    'type': org_dict.get('dcat_type'),
+                }
+        # Add to graph
+        if publisher_ref:
+            g.add((publisher_ref, RDF.type, FOAF.Organization))
+            g.add((dataset_ref, DCT.publisher, publisher_ref))
             items = [
-                ('publisher_email', FOAF.mbox, None, Literal),
-                ('publisher_url', FOAF.homepage, None, URIRef),
-                ('publisher_type', DCT.type, None, URIRefOrLiteral),
+                ('name', FOAF.name, None, Literal),
+                ('email', FOAF.mbox, None, Literal),
+                ('url', FOAF.homepage, None, URIRef),
+                ('type', DCT.type, None, URIRefOrLiteral),
             ]
-
-            self._add_triples_from_dict(dataset_dict, publisher_details, items)
+            self._add_triples_from_dict(publisher_details, publisher_ref, items)
 
         # Temporal
         start = self._get_dataset_value(dataset_dict, 'temporal_start')
@@ -2207,6 +2226,33 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
                     _type=URIRef, value_modifier=self._add_mailto
                 )
 
+        publisher = dataset_dict.get("publisher")
+        if isinstance(publisher, list) and len(publisher):
+            publisher = publisher[0]
+            publisher_uri = publisher.get('uri')
+            if publisher_uri:
+                publisher_ref = CleanedURIRef(publisher_uri)
+            else:
+                publisher_ref = BNode()
+
+            self.g.add((publisher_ref, RDF.type, FOAF.Organization))
+            self.g.add((dataset_ref, DCT.publisher, publisher_ref))
+
+            self._add_triple_from_dict(
+                publisher, publisher_ref, FOAF.name, 'name'
+            )
+            self._add_triple_from_dict(
+                publisher, publisher_ref, FOAF.homepage, 'url', URIRef
+            )
+            self._add_triple_from_dict(
+                publisher, publisher_ref, DCT.type, 'type', URIRefOrLiteral
+            )
+            self._add_triple_from_dict(
+                publisher, publisher_ref,
+                VCARD.hasEmail, 'email',
+                _type=URIRef, value_modifier=self._add_mailto
+            )
+
         resources = dataset_dict.get('resources', [])
         for resource in resources:
             if resource.get('access_services'):

diff --git a/ckanext/dcat/schemas/dcat_ap_2.1.yaml b/ckanext/dcat/schemas/dcat_ap_2.1.yaml
@@ -3,7 +3,6 @@ dataset_type: dataset
 about: A reimplementation of the default CKAN dataset schema
 about_url: http://github.com/ckan/ckanext-dcat
 
-
 dataset_fields:
 
 - field_name: title
@@ -23,6 +22,11 @@ dataset_fields:
   form_snippet: markdown.html
   form_placeholder: eg. Some useful notes about the data
 
+- field_name: tag_string
+  label: Keywords
+  preset: tag_string_autocomplete
+  form_placeholder: eg. economy, mental health, government
+
 - field_name: contact
   label: Contact points
   repeating_label: Contact point
@@ -38,10 +42,28 @@ dataset_fields:
       label: Email
       display_snippet: email.html
 
-- field_name: tag_string
-  label: Keywords
-  preset: tag_string_autocomplete
-  form_placeholder: eg. economy, mental health, government
+- field_name: publisher
+  label: Publisher
+  repeating_label: Publisher
+  repeating_once: true
+  repeating_subfields:
+
+    - field_name: uri
+      label: URI
+
+    - field_name: name
+      label: Name
+
+    - field_name: email
+      label: Email
+      display_snippet: email.html
+
+    - field_name: url
+      label: URL
+      display_snippet: link.html
+
+    - field_name: type
+      label: Type
 
 - field_name: license_id
   label: License
@@ -209,4 +231,3 @@ resource_fields:
 # Note: if not provided, this will be autogenerated
 - field_name: uri
   label: URI
-
diff --git a/ckanext/dcat/schemas/publisher_organization.yaml b/ckanext/dcat/schemas/publisher_organization.yaml
@@ -0,0 +1,35 @@
+scheming_version: 2
+about_url: http://github.com/ckan/ckanext-dcat
+description: >
+  An organization schema that implements the properties supported
+  by default in the dct:publisher property of a dcat:Dataset
+
+fields:
+
+- field_name: title
+  label: Name
+  validators: ignore_missing unicode_safe
+  form_snippet: large_text.html
+  form_attrs: {data-module: slug-preview-target}
+
+- field_name: name
+  label: URL
+  validators: not_empty unicode_safe name_validator group_name_validator
+  form_snippet: slug.html
+  form_placeholder: my-theme
+
+- field_name: notes
+  label: Description
+  form_snippet: markdown.html
+  form_placeholder: A little information about this organization.
+
+- field_name: email
+  label: Email
+  display_snippet: email.html
+
+- field_name: url
+  label: URL
+  display_snippet: link.html
+
+- field_name: dcat_type
+  label: Type
diff --git a/ckanext/dcat/templates/scheming/form_snippets/repeating_subfields.html b/ckanext/dcat/templates/scheming/form_snippets/repeating_subfields.html
@@ -0,0 +1,8 @@
+{% ckan_extends %}
+
+{% block add_button %}
+  {# Hide the Add button if we only want one set of subfields #}
+  {% if not field.repeating_once %}
+     {{ super() }}
+  {% endif %}
+{% endblock %}
diff --git a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py
@@ -1,6 +1,7 @@
 from builtins import str
 from builtins import object
 import json
+import uuid
 
 import pytest
 
@@ -17,7 +18,7 @@
 from ckanext.dcat import utils
 from ckanext.dcat.processors import RDFSerializer, HYDRA
 from ckanext.dcat.profiles import (DCAT, DCT, ADMS, XSD, VCARD, FOAF, SCHEMA,
-                                   SKOS, LOCN, GSP, OWL, SPDX, GEOJSON_IMT, 
+                                   SKOS, LOCN, GSP, OWL, SPDX, GEOJSON_IMT,
                                    DISTRIBUTION_LICENSE_FALLBACK_CONFIG)
 from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS
 from ckanext.dcat.tests.utils import BaseSerializeTest
@@ -398,11 +399,17 @@ def test_publisher_extras(self):
         assert self._triple(g, publisher, DCT.type, URIRef(extras['publisher_type']))
 
     def test_publisher_org(self):
+        org_id = str(uuid.uuid4())
+        factories.Organization(
+            id=org_id,
+            name='publisher1',
+            title='Example Publisher from Org'
+        )
         dataset = {
             'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
             'name': 'test-dataset',
             'organization': {
-                'id': '',
+                'id': org_id,
                 'name': 'publisher1',
                 'title': 'Example Publisher from Org',
             }