Provide a new plugin for indexing repeating subfields

The `scheming_subfields_index` plugin groups the values of each repeating subfield into a single text field so that they become searchable. The values are indexed as `extras_{field_name}__{key}`; `extras_*` is a dynamic `text` Solr field, which allows free-text search on them. Also adds tests and updates the docs.
ckan · Jun 14, 2024 · 0b67416 · 0b67416
1 parent 9a33dcd
commit 0b67416
Show file tree

Hide file tree

Showing 5 changed files with 94 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -280,10 +280,13 @@ When using a plain string translations will be provided with gettext:
 This field is the parent of group of repeating subfields. The value is
 a list of fields entered the same way as normal fields.
 
-> **_NOTE:_** CKAN needs an IPackageController plugin with `before_index` to
-> convert repeating subfields to formats that can be indexed by solr. For
-> testing you may use the included `scheming_nerf_index` plugin to encode
-> all repeating fields as JSON strings to prevent solr errors.
+> [!NOTE]
+> CKAN needs an IPackageController plugin with `before_dataset_index` to
+> convert repeating subfields to formats that can be indexed by solr. The
+> included `scheming_subfields_index` plugin groups the values of each
+> subfield into a single text field so that they can be found by free-text
+> search. If you require more precise handling of a particular subfield,
+> you will need to customize the Solr schema to add the necessary fields.
 
 `repeating_label` may be used to provide a singular version of the label
 for each group.

diff --git a/ckanext/scheming/plugins.py b/ckanext/scheming/plugins.py
@@ -499,6 +499,51 @@ def before_index(self, data_dict):
         return data_dict
 
 
+class SchemingSubfieldsIndexPlugin(p.SingletonPlugin):
+    """
+    Index repeating subfields as flattened free-text Solr fields.
+
+    Repeating subfields are lists of dicts, which cause failures on an
+    unmodified Solr schema. For every dataset field that declares
+    `repeating_subfields`, the values of each subfield key are joined with
+    spaces into `extras_{field_name}__{key}`, a text Solr field that allows
+    free-text search on its value, and the original list is removed from
+    the indexed dict. Nested dict values and `None` values are not indexed.
+
+    Ideally you probably want to select the relevant subfields that get
+    indexed. If you require more precise handling of a particular subfield,
+    you will need to customize the Solr schema to add the particular fields
+    needed.
+    """
+    p.implements(p.IPackageController, inherit=True)
+
+    def before_dataset_index(self, data_dict):
+        """Delegate to `before_index` so both hook names behave the same."""
+        return self.before_index(data_dict)
+
+    def before_index(self, data_dict):
+        """
+        Replace repeating-subfield lists in `data_dict` with flat text fields.
+
+        :param data_dict: the dataset dict about to be indexed; it is
+            modified in place
+        :returns: the same `data_dict`, untouched when its `type` has no
+            scheming schema
+        """
+        schemas = SchemingDatasetsPlugin.instance._expanded_schemas
+        if data_dict['type'] not in schemas:
+            return data_dict
+
+        for field in schemas[data_dict['type']]['dataset_fields']:
+            field_name = field['field_name']
+            if field_name not in data_dict or 'repeating_subfields' not in field:
+                continue
+
+            # pop() fetches the groups and drops the un-indexable list in one
+            # step; `or []` tolerates a field that is present but None.
+            for item in data_dict.pop(field_name) or []:
+                for key, value in item.items():
+                    if value is None or isinstance(value, dict):
+                        continue
+                    # Coerce to text: subfields may hold numbers or booleans,
+                    # and str.join / string concatenation reject non-strings.
+                    if isinstance(value, list):
+                        text = ' '.join(str(part) for part in value)
+                    else:
+                        text = str(value)
+
+                    # Index a flattened version
+                    new_key = f'extras_{field_name}__{key}'
+                    existing = data_dict.get(new_key)
+                    if existing:
+                        data_dict[new_key] = f'{existing} {text}'
+                    else:
+                        data_dict[new_key] = text
+
+        return data_dict
+
+
+
+
 def _load_schemas(schemas, type_field):
     out = {}
     for n in schemas:

diff --git a/ckanext/scheming/tests/test_subfields.py b/ckanext/scheming/tests/test_subfields.py
@@ -0,0 +1,40 @@
+from unittest import mock
+
+import pytest
+import ckantoolkit
+
+from ckantoolkit.tests.factories import Dataset
+from ckantoolkit.tests.helpers import call_action
+
+
+# Two repeating-subfield groups stored under the `contact_address` field.
+_contact_addresses = [
+    {
+        "address": "Maple Street 123",
+        "city": "New Paris",
+        "country": "Maplonia",
+    },
+    {
+        "address": "Rose Avenue 452",
+        "city": "Old York",
+        "country": "Rosestan",
+    },
+]
+
+# Dataset payload shared by the tests in this module.
+dataset_dict = dict(
+    name="test-dataset",
+    type="test-subfields",
+    contact_address=_contact_addresses,
+)
+
+
+@pytest.mark.usefixtures("with_plugins", "clean_db")
+def test_repeating_subfields_index():
+    """Creating a dataset sends the flattened subfield values to Solr."""
+    expected = {
+        "extras_contact_address__city": "New Paris Old York",
+        "extras_contact_address__country": "Maplonia Rosestan",
+    }
+
+    with mock.patch("ckan.lib.search.index.make_connection") as make_connection:
+        call_action("package_create", **dataset_dict)
+
+    # The recorded calls outlive the patch; the second one carries the
+    # documents sent to Solr.
+    indexed_doc = make_connection.mock_calls[1].kwargs["docs"][0]
+    for solr_field, joined_values in expected.items():
+        assert indexed_doc[solr_field] == joined_values
+
+
+@pytest.mark.usefixtures("with_plugins", "clean_db")
+def test_repeating_subfields_search():
+    """A free-text query on a subfield value returns the dataset first."""
+    created = call_action("package_create", **dataset_dict)
+
+    hits = call_action("package_search", q="Old York")["results"]
+
+    assert hits[0]["id"] == created["id"]
diff --git a/setup.py b/setup.py
@@ -38,6 +38,7 @@
     scheming_groups=ckanext.scheming.plugins:SchemingGroupsPlugin
     scheming_organizations=ckanext.scheming.plugins:SchemingOrganizationsPlugin
     scheming_nerf_index=ckanext.scheming.plugins:SchemingNerfIndexPlugin
+    scheming_subfields_index=ckanext.scheming.plugins:SchemingSubfieldsIndexPlugin
     scheming_test_subclass=ckanext.scheming.tests.plugins:SchemingTestSubclass
     scheming_test_plugin=ckanext.scheming.tests.plugins:SchemingTestSchemaPlugin
     scheming_test_validation=ckanext.scheming.tests.plugins:SchemingTestValidationPlugin

diff --git a/test.ini b/test.ini
@@ -12,7 +12,7 @@ port = 5000
 use = config:../../src/ckan/test-core.ini
 
 ckan.plugins = scheming_datasets scheming_groups scheming_organizations
-               scheming_test_plugin scheming_nerf_index
+               scheming_test_plugin scheming_subfields_index
 scheming.dataset_schemas = ckanext.scheming:ckan_dataset.yaml
                            ckanext.scheming.tests:test_schema.json
 			   ckanext.scheming.tests:test_subfields.yaml