Skip to content

Commit

Permalink
[#56] Separate scheming compat profile, parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
amercader committed May 20, 2024
1 parent a77d5c2 commit 35657ef
Show file tree
Hide file tree
Showing 3 changed files with 127 additions and 35 deletions.
146 changes: 112 additions & 34 deletions ckanext/dcat/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,18 @@ def _schema_field(self, key):
if field['field_name'] == key:
return field

def _schema_resource_field(self, key):
'''
Returns the schema field information if the provided key exists as a field in
the resources fields of the dataset schema (if one was provided)
'''
if not self._dataset_schema:
return None

for field in self._dataset_schema['resource_fields']:
if field['field_name'] == key:
return field

def _set_dataset_value(self, dataset_dict, key, value):
'''
Sets the value for a given key in a CKAN dataset dict
Expand Down Expand Up @@ -758,6 +770,15 @@ def _set_list_dataset_value(self, dataset_dict, key, value):
else:
return self._set_dataset_value(dataset_dict, key, json.dumps(value))

def _set_list_resource_value(self, resource_dict, key, value):
schema_field = self._schema_resource_field(key)
if schema_field and 'scheming_multiple_text' in schema_field['validators']:
resource_dict[key] = value
else:
resource_dict[key] = json.dumps(value)

return resource_dict

def _get_dataset_value(self, dataset_dict, key, default=None):
'''
Returns the value for the given key on a CKAN dict
Expand Down Expand Up @@ -1084,7 +1105,7 @@ def parse_dataset(self, dataset_dict, dataset_ref):
):
value = self._object_value(dataset_ref, predicate)
if value:
self._set_dataset_value(dataset_dict, key, value)
dataset_dict['extras'].append({'key': key, 'value': value})

# Lists
for key, predicate, in (
Expand All @@ -1101,7 +1122,8 @@ def parse_dataset(self, dataset_dict, dataset_ref):
):
values = self._object_value_list(dataset_ref, predicate)
if values:
self._set_list_dataset_value(dataset_dict, key, values)
dataset_dict['extras'].append({'key': key,
'value': json.dumps(values)})

# Contact details
contact = self._contact_details(dataset_ref, DCAT.contactPoint)
Expand All @@ -1110,7 +1132,7 @@ def parse_dataset(self, dataset_dict, dataset_ref):
contact = self._contact_details(dataset_ref, ADMS.contactPoint)

if contact:
for key in ('uri', 'name', 'email'):
for key in ('uri', 'name', 'email'):
if contact.get(key):
dataset_dict['extras'].append(
{'key': 'contact_{0}'.format(key),
Expand Down Expand Up @@ -1336,32 +1358,6 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
_type=URIRef, value_modifier=self._add_mailto
)

# TODO: this will go into a separate profile
contact = dataset_dict.get("contact")
if isinstance(contact, list) and len(contact):
for item in contact:
contact_uri = item.get('uri')
if contact_uri:
contact_details = CleanedURIRef(contact_uri)
else:
contact_details = BNode()

g.add((contact_details, RDF.type, VCARD.Organization))
g.add((dataset_ref, DCAT.contactPoint, contact_details))

self._add_triple_from_dict(
item, contact_details,
VCARD.fn, 'name'
)
# Add mail address as URIRef, and ensure it has a mailto: prefix
self._add_triple_from_dict(
item, contact_details,
VCARD.hasEmail, 'email',
_type=URIRef, value_modifier=self._add_mailto
)



# Publisher
if any([
self._get_dataset_value(dataset_dict, 'publisher_uri'),
Expand Down Expand Up @@ -1752,8 +1748,6 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
]
self._add_list_triples_from_dict(resource_dict, distribution, items)

# TODO: this will go into a separate profile

access_service_list = resource_dict.get('access_services', [])
if isinstance(access_service_list, str):
try:
Expand Down Expand Up @@ -1796,9 +1790,8 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
]
self._add_list_triples_from_dict(access_service_dict, access_service_node, items)

# TODO: re-enable when separating into a profile
# if access_service_list:
# resource_dict['access_services'] = json.dumps(access_service_list)
if access_service_list:
resource_dict['access_services'] = json.dumps(access_service_list)

def graph_from_catalog(self, catalog_dict, catalog_ref):

Expand Down Expand Up @@ -2097,3 +2090,88 @@ def _distribution_url_graph(self, distribution, resource_dict):
def _distribution_numbers_graph(self, distribution, resource_dict):
if resource_dict.get('size'):
self.g.add((distribution, SCHEMA.contentSize, Literal(resource_dict['size'])))


# TODO: split all these classes in different files
class EuropeanDCATAPSchemingProfile(RDFProfile):
    '''
    This is a compatibility profile meant to add support for ckanext-scheming
    to the existing `euro_dcat_ap` and `euro_dcat_ap_2` profiles.
    It does not add or remove any properties from these profiles, it just
    transforms the resulting dataset_dict so it is compatible with a
    ckanext-scheming schema
    TODO: summarize changes and link to docs
    '''

    def parse_dataset(self, dataset_dict, dataset_ref):
        '''
        Reshape the dataset_dict produced by the base profiles so it fits
        the scheming schema: promote schema-declared extras to root keys,
        JSON-decode `scheming_multiple_text` values and load repeating
        subfields (both migrated extras and JSON-encoded resource values).
        '''
        if not self._dataset_schema:
            # Not using scheming
            return dataset_dict

        # Move extras to root

        extras_to_remove = []
        extras = dataset_dict.get('extras', [])
        for extra in extras:
            if self._schema_field(extra['key']):
                # This is a field defined in the dataset schema
                dataset_dict[extra['key']] = extra['value']
                extras_to_remove.append(extra['key'])

        dataset_dict['extras'] = [e for e in extras if e['key'] not in extras_to_remove]

        # Parse lists
        def _parse_list_value(data_dict, field_name):
            schema_field = self._schema_field(field_name) or self._schema_resource_field(field_name)

            if schema_field and 'scheming_multiple_text' in schema_field.get('validators', []):
                if isinstance(data_dict[field_name], str):
                    try:
                        data_dict[field_name] = json.loads(data_dict[field_name])
                    except ValueError:
                        pass

        for field_name in dataset_dict.keys():
            _parse_list_value(dataset_dict, field_name)

        for resource_dict in dataset_dict.get('resources', []):
            for field_name in resource_dict.keys():
                _parse_list_value(resource_dict, field_name)

        # Repeating subfields
        for schema_field in self._dataset_schema['dataset_fields']:
            if 'repeating_subfields' in schema_field:
                # Check if existing extras need to be migrated
                field_name = schema_field['field_name']
                new_extras = []
                new_dict = {}
                # Hoisted out of the loop: the set of valid subfield names
                # is the same for every extra
                subfield_names = [f['field_name'] for f in schema_field['repeating_subfields']]
                for extra in dataset_dict.get('extras', []):
                    if extra['key'].startswith(f'{field_name}_'):
                        # Strip the full field name prefix. Splitting on the
                        # first underscore instead would truncate wrongly when
                        # field_name itself contains underscores (e.g.
                        # 'contact_point' + 'contact_point_name')
                        subfield = extra['key'][len(field_name) + 1:]
                        if subfield in subfield_names:
                            new_dict[subfield] = extra['value']
                        else:
                            new_extras.append(extra)
                    else:
                        new_extras.append(extra)
                if new_dict:
                    dataset_dict[field_name] = [new_dict]
                    dataset_dict['extras'] = new_extras

        for schema_field in self._dataset_schema['resource_fields']:
            if 'repeating_subfields' in schema_field:
                # Check if value needs to be loaded from JSON
                field_name = schema_field['field_name']
                for resource_dict in dataset_dict.get('resources', []):
                    if resource_dict.get(field_name) and isinstance(resource_dict[field_name], str):
                        try:
                            # TODO: load only subfields in schema?
                            resource_dict[field_name] = json.loads(resource_dict[field_name])
                        except ValueError:
                            pass

        return dataset_dict
15 changes: 14 additions & 1 deletion examples/dataset.rdf
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
xmlns:time="http://www.w3.org/2006/time#"
xmlns:dct="http://purl.org/dc/terms/"
xmlns:dcat="http://www.w3.org/ns/dcat#"
xmlns:dcatap="http://data.europa.eu/r5r/"
xmlns:foaf="http://xmlns.com/foaf/0.1/"
xmlns:adms="http://www.w3.org/ns/adms#"
xmlns:schema="http://schema.org/"
Expand Down Expand Up @@ -96,7 +97,19 @@
<spdx:algorithm rdf:resource="http://spdx.org/rdf/terms#checksumAlgorithm_sha1"/>
</spdx:Checksum>
</spdx:checksum>
</dcat:Distribution>
<dcat:accessService>
<dcat:DataService>
<dcatap:availability rdf:resource="http://publications.europa.eu/resource/authority/planned-availability/AVAILABLE"/>
<dct:title>Sparql-end Point</dct:title>
<dcat:endpointURL rdf:resource="http://publications.europa.eu/webapi/rdf/sparql"/>
<dct:description>This SPARQL end point allow to directly query the EU Whoiswho content (organization / membership / person)</dct:description>
<dcat:endpointDescription>SPARQL url description</dcat:endpointDescription>
<dct:license rdf:resource="http://publications.europa.eu/resource/authority/licence/COM_REUSE"/>
<dct:accessRights rdf:resource="http://publications.europa.eu/resource/authority/access-right/PUBLIC"/>
</dcat:DataService>
</dcat:accessService>

</dcat:Distribution>
</dcat:distribution>
</dcat:Dataset>
</rdf:RDF>
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
[ckan.rdf.profiles]
euro_dcat_ap=ckanext.dcat.profiles:EuropeanDCATAPProfile
euro_dcat_ap_2=ckanext.dcat.profiles:EuropeanDCATAP2Profile
euro_dcat_ap_scheming=ckanext.dcat.profiles:EuropeanDCATAPSchemingProfile
schemaorg=ckanext.dcat.profiles:SchemaOrgProfile
[babel.extractors]
Expand Down

0 comments on commit 35657ef

Please sign in to comment.