diff --git a/PROCESSORS.md b/PROCESSORS.md index 960043c..f7d6b0f 100644 --- a/PROCESSORS.md +++ b/PROCESSORS.md @@ -619,6 +619,24 @@ You can use `update_resource` to rename a resource like so: update_resource('current-name', name='new-name') ``` +#### update_schema.py +Update schema properties for one or more resources in the package + +```python +def update_schema(resources, **metadata): + pass +``` + +- `resources` + - A name of a resource to operate on + - A regular expression matching resource names + - A list of resource names + - `None` indicates operation should be done on all resources + - The index of the resource in the package +- `metadata` - Any allowed schema property (according to the [spec](https://frictionlessdata.io/specs/table-schema/#descriptor)) can be provided here. + +You can use `update_schema` to add a `missingValues` property, change the primary key etc. + #### set_primary_key.py Updates the primary key for one or more resources in the package diff --git a/dataflows/VERSION b/dataflows/VERSION index ff8026f..9c3f756 100644 --- a/dataflows/VERSION +++ b/dataflows/VERSION @@ -1 +1 @@ -0.0.66 +0.0.67 diff --git a/dataflows/base/schema_validator.py b/dataflows/base/schema_validator.py index 7d11c9a..7cb643f 100644 --- a/dataflows/base/schema_validator.py +++ b/dataflows/base/schema_validator.py @@ -52,3 +52,8 @@ def schema_validator(resource, iterator, continue yield row + + +schema_validator.drop = drop +schema_validator.ignore = ignore +schema_validator.raise_exception = raise_exception diff --git a/dataflows/processors/__init__.py b/dataflows/processors/__init__.py index 3e62caa..3008426 100644 --- a/dataflows/processors/__init__.py +++ b/dataflows/processors/__init__.py @@ -22,3 +22,4 @@ from .unstream import unstream from .update_package import update_package, add_metadata from .update_resource import update_resource +from .update_schema import update_schema diff --git a/dataflows/processors/update_schema.py 
b/dataflows/processors/update_schema.py new file mode 100644 index 0000000..234aec6 --- /dev/null +++ b/dataflows/processors/update_schema.py @@ -0,0 +1,21 @@ +from dataflows import PackageWrapper +from dataflows.helpers.resource_matcher import ResourceMatcher + + +def update_schema(resources, **props): + + def func(package: PackageWrapper): + matcher = ResourceMatcher(resources, package.pkg) + for resource in package.pkg.descriptor['resources']: + if matcher.match(resource['name']): + resource.setdefault('schema', {}).update(props) + yield package.pkg + + res_iter = iter(package) + for r in res_iter: + if matcher.match(r.res.name): + yield r.it + else: + yield r + + return func diff --git a/tests/test_lib.py b/tests/test_lib.py index d169736..22e51c4 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -763,6 +763,22 @@ def test_update_resource(): assert dp.descriptor['resources'][4]['source'] == 'thewild' +def test_update_schema(): + from dataflows import Flow, printer, update_schema, validate + + f = Flow( + [['a', '-'], ['a', 0]], + update_schema(-1, missingValues=['-']), + validate(), + printer() + ) + results, dp, stats = f.results() + print(dp.descriptor) + assert results[0] == [ + dict(col0='a', col1=None), + dict(col0='a', col1=0), + ] + def test_set_type_resources(): from dataflows import Flow, set_type, validate