Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CKAN -> Frictionless format conversion and resource path generation #19

Merged
merged 11 commits into from
Jul 13, 2020
Merged
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ __pycache__/
.Python
env/
/.venv/
build/
/build/
develop-eggs/
dist/
sdist/
Expand Down
19 changes: 0 additions & 19 deletions .gitlab-ci.yml

This file was deleted.

4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ CKAN_CLI := $(shell which ckan | head -n1)

TEST_INI_PATH := ./test.ini
SENTINELS := .make-status
TEST_PATH :=
TEST_EXTRA_ARGS :=

PYTHON_VERSION := $(shell $(PYTHON) -c 'import sys; print(sys.version_info[0])')

Expand Down Expand Up @@ -216,7 +218,7 @@ $(SENTINELS)/tests-passed: $(SENTINELS)/test-setup $(shell find $(PACKAGE_DIR) -
--with-pylons=$(TEST_INI_PATH) \
--nologcapture \
--with-doctest \
$(COVERAGE_ARG) $(PACKAGE_DIR)/tests/$(TEST_PATH)
$(COVERAGE_ARG) $(TEST_EXTRA_ARGS) $(PACKAGE_DIR)/tests/$(TEST_PATH)
@touch $@

## Add test users
Expand Down
116 changes: 108 additions & 8 deletions ckanext/versioning/datapackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,128 @@

See http://specs.frictionlessdata.io/data-package/ for datapackage specs
"""
# import frictionless_ckan_mapper.ckan_to_frictionless as ctf
# import frictionless_ckan_mapper.frictionless_to_ckan as ftc
import posixpath
import re
from typing import Any, Dict, Optional

import frictionless_ckan_mapper.ckan_to_frictionless as ctf
import frictionless_ckan_mapper.frictionless_to_ckan as ftc
from six import iteritems

def dataset_to_frictionless(package):
FALLBACK_RESOURCE_PATH = 'resource'


def dataset_to_frictionless(ckan_dataset):
    """Build a Frictionless datapackage dict from a CKAN dataset dict

    The dataset is converted while keeping any `path` already set on its
    resources; the resource paths of the result are then normalized in place
    before the package is returned.
    """
    frictionless_package = _convert_excluding_path(ckan_dataset)
    _normalize_resource_paths(frictionless_package)
    return frictionless_package


def frictionless_to_dataset(package):
def frictionless_to_dataset(datapackage):
    """Build a CKAN dataset dict from a Frictionless datapackage dict

    The conversion is delegated entirely to frictionless_ckan_mapper.
    """
    ckan_dataset = ftc.package(datapackage)
    return ckan_dataset


def _convert_excluding_path(ckan_dataset):
    """Convert a CKAN dataset to a Frictionless package, keeping preset `path` values

    frictionless_ckan_mapper overrides a resource's `path` if the resource has a
    URL, but a `path` that was already set on the CKAN resource has to win. The
    preset values are therefore remembered by resource position before the
    conversion and written back onto the converted resources afterwards.
    """
    preset_paths = {}
    for position, resource in enumerate(ckan_dataset.get('resources', [])):
        if 'path' in resource:
            preset_paths[position] = resource['path']

    converted = ctf.dataset(ckan_dataset)

    for position, preset in iteritems(preset_paths):
        converted['resources'][position]['path'] = preset

    return converted


def _normalize_resource_paths(package):
    """Normalize the `path` of every resource in a datapackage, in place

    For each resource that `_get_resource_path` yields a path for:

    * path segments made only of dots (`./`, `../`) are dropped,
    * leading slashes are stripped, so the path is always relative,
    * a path left empty by the steps above becomes FALLBACK_RESOURCE_PATH,
    * a path already used by another resource of the package gets a suffix
      based on the resource position, e.g. `data/foo.csv` -> `data/foo-2.csv`.

    Resources for which no path can be determined are skipped.

    :param package: Frictionless datapackage dict; its resource dicts are mutated
    :returns: None
    """
    # Only whole segments consisting of dots are matched (the match has to
    # start at the beginning of the path or at a slash), so a name that merely
    # ends with a dot, such as `data./foo.csv`, is not rewritten.
    dot_segments_re = re.compile(r'(?:^|/)(?:\.+/)+')
    used_paths = set()

    for position, resource in enumerate(package.get('resources', [])):
        path = _get_resource_path(resource)
        if path is None:
            continue

        path = dot_segments_re.sub('/', path).lstrip('/')
        if not path:
            path = FALLBACK_RESOURCE_PATH

        if path in used_paths:
            candidate = _add_filename_suffix(path, '-{}'.format(position))
            attempt = 1
            # The position-based name may itself be taken already, e.g. when an
            # earlier resource is literally called `foo-2.csv`
            while candidate in used_paths:
                candidate = _add_filename_suffix(
                    path, '-{}-{}'.format(position, attempt))
                attempt += 1
            path = candidate

        # Record the final path, suffixed or not, so that later resources
        # cannot silently reuse it
        used_paths.add(path)
        resource['path'] = path


def _get_resource_path(resource):
# type: (Dict[str, Any]) -> str
"""Get the `path` value for a resource

Apply the following rules in order

Generate an initial value for path:
If a path variable exists, use it. END
If url exists, use it. END
If sha256 is set use the name and format: {NAME-LOWER-CASE}.{FORMAT}

If path contains /../ or begins with /, normalize by striping any leading / and replacing any /../ with -.

Ensure generated path does not conflict with other path values in the same datapackage:
If it conflicts use suffixes based on position in resource list
e.g. "{NAME}-{POS}.{FORMAT}"

"""

if 'path' in resource:
path = resource['path']
elif 'url' in resource:
path = resource.pop('url')
elif 'sha256' in resource and 'name' in resource and 'format' in resource:
path = '{name}.{format}'.format(name=resource['name'], format=resource['format']).lower()
else:
path = None
return path


def _add_filename_suffix(original, suffix):
# type: (str, str) -> str
"""Add a suffix to a filename
"""
parts = original.rsplit('.', 1)
filename = '{}{}'.format(parts[0], suffix)
if len(parts) > 1:
filename += '.{}'.format(parts[1])
return filename


def update_ckan_dict(ckan_dict, dataset):
    """Update the CKAN package dict with metadata from metastore

    `ckan_dict` is updated in place with every key of `dataset`; if the result
    has any extras, they are then filtered by `_normalize_extras`.

    :param ckan_dict: CKAN package dict to update (mutated)
    :param dataset: dict of values to merge into `ckan_dict`
    :returns: the same `ckan_dict` object
    """
    ckan_dict.update(dataset)
    # Truthiness instead of len(): `extras` may be present but None, which
    # len() would raise a TypeError on
    if ckan_dict.get('extras'):
        ckan_dict['extras'] = _normalize_extras(ckan_dict)
    return ckan_dict


def _normalize_extras(ckan_dict):
"""Normalize extras returned by frictionless-ckan-mapper

This removes any extras item that already exists in the main CKAN package dict,
because we know this will not pass validation
"""
return [e for e in ckan_dict['extras'] if e['key'] not in ckan_dict]
1 change: 0 additions & 1 deletion ckanext/versioning/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,4 @@ def setup(self):
config['ckanext.versioning.backend_config'] = json.dumps({"uri": self._backend_dir})

def teardown(self):
super(MetastoreBackendTestBase, self).setup()
shutil.rmtree(self._backend_dir)
66 changes: 66 additions & 0 deletions ckanext/versioning/tests/test_datapackage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""Datapackage formatting related tests
"""
from nose.tools import assert_equals
from parameterized import parameterized

from ckanext.versioning import datapackage

SHA256 = '0f1128046248f83dc9b9ab187e16fad0ff596128f1524d05a9a77c4ad932f10a'


@parameterized([
    (
        {"path": "i/have/a/path.csv", "url": "i/also/have/a/url.csv", "id": "r-1"},
        "i/have/a/path.csv",
    ),
    (
        {"url": "https://example.com/data.csv", "sha256": SHA256, "name": "my-resource",
         "format": "csv", "id": "r-1"},
        "https://example.com/data.csv",
    ),
    (
        {"sha256": SHA256, "name": "my-resource", "format": "csv", "id": "r-1"},
        "my-resource.csv",
    ),
    (
        {"sha256": SHA256, "name": "MyResource", "format": "csv", "id": "r-1"},
        "myresource.csv",
    ),
    (
        {"url": "data/myfile.csv", "id": "r-1"},
        "data/myfile.csv",
    ),
    (
        {"name": "my-resource", "format": "csv", "id": "r-1"},
        None,
    ),
])
def test_resource_path_is_added(resource, expected_path):
    """The path of a single resource is taken from `path`, `url` or name + format
    """
    dataset = {"name": "my package", "resources": [resource]}
    converted = datapackage.dataset_to_frictionless(dataset)
    assert_equals(converted['resources'][0].get('path'), expected_path)


def test_resource_path_multiple_resources():
    """Each resource of a multi-resource dataset gets its own expected path
    """
    ckan_resources = [
        {"url": "data/foo.csv", "name": "resource 1", "id": "resource-1"},
        {"url": "https://example.com/data.csv", "name": "resource 2", "id": "resource-2"},
        {"path": "an/existing/path.csv", "name": "resource 3", "id": "resource-3"},
        {"name": "my-resource", "format": "xls", "sha256": SHA256, "id": "resource-4"},
    ]
    expected_paths = [
        'data/foo.csv',
        'https://example.com/data.csv',
        'an/existing/path.csv',
        'my-resource.xls',
    ]

    converted = datapackage.dataset_to_frictionless(
        {"name": "my package", "resources": ckan_resources})
    resources = converted['resources']

    for position, expected in enumerate(expected_paths):
        assert_equals(resources[position]['path'], expected)


@parameterized([
    ('/absolute/data.csv', 'absolute/data.csv'),
    ('./relative/data.csv', 'relative/data.csv'),
    ('local/with/../../../parent/ref.csv', 'local/with/parent/ref.csv'),
    ('local/with/./././parent/ref.csv', 'local/with/parent/ref.csv'),
    ('../upper/dir/ref.csv', 'upper/dir/ref.csv'),
])
def test_resource_path_relative_dirs_normalization(input, expected):
    """Leading slashes and `./` / `../` segments are removed from resource paths
    """
    dataset = {"name": "my package", "resources": [{"url": input, "id": "r-1"}]}
    converted = datapackage.dataset_to_frictionless(dataset)
    assert_equals(converted['resources'][0].get('path'), expected)


def test_resource_path_conflicting_paths_fixed():
    """Resources that would share a path get a position-based suffix
    """
    ckan_resources = [
        {"url": "data/foo.csv", "name": "resource 1", "id": "r-1"},
        {"url": "data/bar.csv", "name": "resource 2", "id": "r-2"},
        {"path": "data/foo.csv", "name": "resource 3", "id": "r-3"},
        {"name": "data/foo", "format": "csv", "sha256": SHA256, "id": "r-4"},
        {"path": "../data/foo.csv", "name": "resource 5", "id": "r-5"},
    ]
    expected_paths = [
        'data/foo.csv',
        'data/bar.csv',
        'data/foo-2.csv',
        'data/foo-3.csv',
        'data/foo-4.csv',
    ]

    converted = datapackage.dataset_to_frictionless(
        {"name": "my package", "resources": ckan_resources})
    resources = converted['resources']

    for position, expected in enumerate(expected_paths):
        assert_equals(resources[position]['path'], expected)
8 changes: 4 additions & 4 deletions ckanext/versioning/tests/test_functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_package_show_renders_version(self):
app = self._get_test_app()
context = self._get_context(self.user)

version = test_helpers.call_action(
tag = test_helpers.call_action(
'dataset_tag_create',
context,
dataset=self.dataset['id'],
Expand All @@ -58,7 +58,7 @@ def test_package_show_renders_version(self):
url = toolkit.url_for(
'versioning.show',
package_id=self.dataset['id'],
tag=version['name'])
tag=tag['name'])

environ = {'REMOTE_USER': self.user_name}
res = app.get(url, extra_environ=environ)
Expand All @@ -69,7 +69,7 @@ def test_package_show_renders_alert_info(self):
app = self._get_test_app()
context = self._get_context(self.user)

version = test_helpers.call_action(
tag = test_helpers.call_action(
'dataset_tag_create',
context,
dataset=self.dataset['id'],
Expand All @@ -86,7 +86,7 @@ def test_package_show_renders_alert_info(self):
url = toolkit.url_for(
'versioning.show',
package_id=self.dataset['id'],
tag=version['name'])
tag=tag['name'])

environ = {'REMOTE_USER': self.user_name}
res = app.get(url, extra_environ=environ)
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ ckan-datapackage-tools==0.1.0
metastore-lib==0.1.1
python-dateutil==2.8.1
datapackage
git+https://github.com/frictionlessdata/frictionless-ckan-mapper.git#egg=frictionless-ckan-mapper
frictionless-ckan-mapper==1.0.3