Skip to content

Commit

Permalink
[#56] Consolidate and simplify publisher handling
Browse files Browse the repository at this point in the history
Values for dct:publisher are obtained from the first one of these that
is present:

1. A scheming `publisher` field (assuming the `euro_dcat_ap_scheming`
   profile is loaded)
2. The legacy `publisher_*` extras
3. The dataset's own organization

For the last case, a sample schema for organizations has been added that
implements all the publisher properties supported by the processors.
  • Loading branch information
amercader committed May 29, 2024
1 parent 2e4b4bc commit f9467d4
Show file tree
Hide file tree
Showing 6 changed files with 257 additions and 42 deletions.
112 changes: 79 additions & 33 deletions ckanext/dcat/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,13 @@ class RDFProfile(object):

_dataset_schema = None

# Cache for mappings of licenses URL/title to ID built when needed in
# _license().
_licenceregister_cache = None

# Cache for organization_show details (used for publisher fallback)
_org_cache: dict = {}

def __init__(self, graph, dataset_type='dataset', compatibility_mode=False):
'''Class constructor
Expand All @@ -144,10 +151,6 @@ def __init__(self, graph, dataset_type='dataset', compatibility_mode=False):

self.compatibility_mode = compatibility_mode

# Cache for mappings of licenses URL/title to ID built when needed in
# _license().
self._licenceregister_cache = None

try:
schema_show = toolkit.get_action("scheming_dataset_schema_show")
try:
Expand Down Expand Up @@ -1365,45 +1368,61 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
)

# Publisher
if any([
publisher_ref = None

if dataset_dict.get('publisher'):
# Scheming publisher field: will be handled in a separate profile
pass
elif any([
self._get_dataset_value(dataset_dict, 'publisher_uri'),
self._get_dataset_value(dataset_dict, 'publisher_name'),
dataset_dict.get('organization'),
]):

# Legacy publisher_* extras
publisher_uri = self._get_dataset_value(dataset_dict, 'publisher_uri')
publisher_uri_fallback = publisher_uri_organization_fallback(dataset_dict)
publisher_name = self._get_dataset_value(dataset_dict, 'publisher_name')
if publisher_uri:
publisher_details = CleanedURIRef(publisher_uri)
elif not publisher_name and publisher_uri_fallback:
# neither URI nor name are available, use organization as fallback
publisher_details = CleanedURIRef(publisher_uri_fallback)
publisher_ref = CleanedURIRef(publisher_uri)
else:
# No publisher_uri
publisher_details = BNode()

g.add((publisher_details, RDF.type, FOAF.Organization))
g.add((dataset_ref, DCT.publisher, publisher_details))

# In case no name and URI are available, again fall back to organization.
# If no name but an URI is available, the name literal remains empty to
# avoid mixing organization and dataset values.
if not publisher_name and not publisher_uri and dataset_dict.get('organization'):
publisher_name = dataset_dict['organization']['title']

g.add((publisher_details, FOAF.name, Literal(publisher_name)))
# TODO: It would make sense to fallback these to organization
# fields but they are not in the default schema and the
# `organization` object in the dataset_dict does not include
# custom fields
publisher_ref = BNode()
publisher_details = {
'name': publisher_name,
'email': self._get_dataset_value(dataset_dict, 'publisher_email'),
'url': self._get_dataset_value(dataset_dict, 'publisher_url'),
'type': self._get_dataset_value(dataset_dict, 'publisher_type'),
}
elif dataset_dict.get('organization'):
# Fall back to dataset org
org_id = dataset_dict['organization']['id']
org_dict = None
if org_id in self._org_cache:
org_dict = self._org_cache[org_id]
else:
try:
org_dict = toolkit.get_action('organization_show')(
{'ignore_auth': True}, {'id': org_id})
self._org_cache[org_id] = org_dict
except toolkit.ObjectNotFound:
pass
if org_dict:
publisher_ref = CleanedURIRef(publisher_uri_organization_fallback(dataset_dict))
publisher_details = {
'name': org_dict.get('title'),
'email': org_dict.get('email'),
'url': org_dict.get('url'),
'type': org_dict.get('dcat_type'),
}
# Add to graph
if publisher_ref:
g.add((publisher_ref, RDF.type, FOAF.Organization))
g.add((dataset_ref, DCT.publisher, publisher_ref))
items = [
('publisher_email', FOAF.mbox, None, Literal),
('publisher_url', FOAF.homepage, None, URIRef),
('publisher_type', DCT.type, None, URIRefOrLiteral),
('name', FOAF.name, None, Literal),
('email', FOAF.mbox, None, Literal),
('url', FOAF.homepage, None, URIRef),
('type', DCT.type, None, URIRefOrLiteral),
]

self._add_triples_from_dict(dataset_dict, publisher_details, items)
self._add_triples_from_dict(publisher_details, publisher_ref, items)

# Temporal
start = self._get_dataset_value(dataset_dict, 'temporal_start')
Expand Down Expand Up @@ -2207,6 +2226,33 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
_type=URIRef, value_modifier=self._add_mailto
)

publisher = dataset_dict.get("publisher")
if isinstance(publisher, list) and len(publisher):
publisher = publisher[0]
publisher_uri = publisher.get('uri')
if publisher_uri:
publisher_ref = CleanedURIRef(publisher_uri)
else:
publisher_ref = BNode()

self.g.add((publisher_ref, RDF.type, FOAF.Organization))
self.g.add((dataset_ref, DCT.publisher, publisher_ref))

self._add_triple_from_dict(
publisher, publisher_ref, FOAF.name, 'name'
)
self._add_triple_from_dict(
publisher, publisher_ref, FOAF.homepage, 'url', URIRef
)
self._add_triple_from_dict(
publisher, publisher_ref, DCT.type, 'type', URIRefOrLiteral
)
self._add_triple_from_dict(
publisher, publisher_ref,
VCARD.hasEmail, 'email',
_type=URIRef, value_modifier=self._add_mailto
)

resources = dataset_dict.get('resources', [])
for resource in resources:
if resource.get('access_services'):
Expand Down
33 changes: 27 additions & 6 deletions ckanext/dcat/schemas/dcat_ap_2.1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ dataset_type: dataset
about: A reimplementation of the default CKAN dataset schema
about_url: http://github.com/ckan/ckanext-dcat


dataset_fields:

- field_name: title
Expand All @@ -23,6 +22,11 @@ dataset_fields:
form_snippet: markdown.html
form_placeholder: eg. Some useful notes about the data

- field_name: tag_string
label: Keywords
preset: tag_string_autocomplete
form_placeholder: eg. economy, mental health, government

- field_name: contact
label: Contact points
repeating_label: Contact point
Expand All @@ -38,10 +42,28 @@ dataset_fields:
label: Email
display_snippet: email.html

- field_name: tag_string
label: Keywords
preset: tag_string_autocomplete
form_placeholder: eg. economy, mental health, government
- field_name: publisher
label: Publisher
repeating_label: Publisher
repeating_once: true
repeating_subfields:

- field_name: uri
label: URI

- field_name: name
label: Name

- field_name: email
label: Email
display_snippet: email.html

- field_name: url
label: URL
display_snippet: link.html

- field_name: type
label: Type

- field_name: license_id
label: License
Expand Down Expand Up @@ -209,4 +231,3 @@ resource_fields:
# Note: if not provided, this will be autogenerated
- field_name: uri
label: URI

35 changes: 35 additions & 0 deletions ckanext/dcat/schemas/publisher_organization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Sample organization schema exposing the organization fields that are used
# to describe a dataset's dct:publisher when the dataset itself carries no
# publisher information and the owning organization is used as a fallback.
scheming_version: 2
about_url: http://github.com/ckan/ckanext-dcat
description: >
  An organization schema that implements the properties supported
  by default in the dct:publisher property of a dcat:Dataset
fields:

# Display name of the organization; serialized as the publisher's foaf:name
# by the organization fallback.
- field_name: title
  label: Name
  validators: ignore_missing unicode_safe
  form_snippet: large_text.html
  form_attrs: {data-module: slug-preview-target}

# URL slug of the organization (not the publisher homepage, see `url` below).
- field_name: name
  label: URL
  validators: not_empty unicode_safe name_validator group_name_validator
  form_snippet: slug.html
  form_placeholder: my-organization

- field_name: notes
  label: Description
  form_snippet: markdown.html
  form_placeholder: A little information about this organization.

# Serialized as foaf:mbox of the publisher by the organization fallback.
- field_name: email
  label: Email
  display_snippet: email.html

# Serialized as foaf:homepage of the publisher by the organization fallback.
- field_name: url
  label: URL
  display_snippet: link.html

# Serialized as dct:type of the publisher (read as `dcat_type` from the
# organization dict, emitted as a URI or a literal).
- field_name: dcat_type
  label: Type
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{% ckan_extends %}

{% block add_button %}
{#
Hide the Add button if we only want one set of subfields: when the field
sets `repeating_once` nothing is rendered here, otherwise the parent
template's button is rendered unchanged via super().
#}
{% if not field.repeating_once %}
{{ super() }}
{% endif %}
{% endblock %}
11 changes: 9 additions & 2 deletions ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from builtins import str
from builtins import object
import json
import uuid

import pytest

Expand All @@ -17,7 +18,7 @@
from ckanext.dcat import utils
from ckanext.dcat.processors import RDFSerializer, HYDRA
from ckanext.dcat.profiles import (DCAT, DCT, ADMS, XSD, VCARD, FOAF, SCHEMA,
SKOS, LOCN, GSP, OWL, SPDX, GEOJSON_IMT,
SKOS, LOCN, GSP, OWL, SPDX, GEOJSON_IMT,
DISTRIBUTION_LICENSE_FALLBACK_CONFIG)
from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS
from ckanext.dcat.tests.utils import BaseSerializeTest
Expand Down Expand Up @@ -398,11 +399,17 @@ def test_publisher_extras(self):
assert self._triple(g, publisher, DCT.type, URIRef(extras['publisher_type']))

def test_publisher_org(self):
org_id = str(uuid.uuid4())
factories.Organization(
id=org_id,
name='publisher1',
title='Example Publisher from Org'
)
dataset = {
'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
'name': 'test-dataset',
'organization': {
'id': '',
'id': org_id,
'name': 'publisher1',
'title': 'Example Publisher from Org',
}
Expand Down
Loading

0 comments on commit f9467d4

Please sign in to comment.