Skip to content

Commit

Permalink
Adds the 'update_resource' processor (#23)
Browse files Browse the repository at this point in the history
* Introduce the update resource processor

* v0.0.15

* Renaming add_metadata to update_package, update docs

* lint
  • Loading branch information
akariv committed Oct 9, 2018
1 parent ae621ae commit 1387bd8
Show file tree
Hide file tree
Showing 7 changed files with 87 additions and 6 deletions.
29 changes: 26 additions & 3 deletions PROCESSORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ DataFlows comes with a few built-in processors which do most of the heavy liftin
- **filter_rows** - Filter rows based on inclusive and exclusive value filters

### Manipulate package
- **add_metadata** - Add high-level metadata about your package
- **update_package** - Updates metadata of entire package
- **update_resource** - Updates metadata of one or more resources
- **concatenate** - Concatenate multiple streams of data to a single one, resolving differently named columns along the way
- **duplicate** - Duplicate a single stream of data to make two streams

Expand Down Expand Up @@ -365,16 +366,38 @@ def filter_rows(equals=tuple(), not_equals=tuple(), resources=None):
Both `in` and `out` should be a list of dicts.

### Manipulate package
#### add_metadata.py
#### update_package.py
Add high-level metadata about your package

```python
def add_metadata(**metadata):
def update_package(**metadata):
pass
```

- `metadata` - Any allowed property (according to the [spec](https://frictionlessdata.io/specs/data-package/#metadata)) can be provided here.

(`add_metadata` is an alias for `update_package` kept for backward compatibility)

#### update_resource.py
Update metadata for one or more resources in the package

```python
def update_resource(resources, **metadata):
pass
```

- `resources` - Selects which resources to operate on. One of:
  - A name of a resource to operate on
  - A regular expression matching resource names
  - A list of resource names
  - `None` indicates operation should be done on all resources
- `metadata` - Any allowed property (according to the [spec](https://frictionlessdata.io/specs/data-resource/#metadata)) can be provided here.

You can use `update_resource` to rename a resource like so:
```python
update_resource('current-name', name='new-name')
```

#### concatenate.py
Concatenate multiple streams of data to a single one, resolving differently named columns along the way.

Expand Down
2 changes: 1 addition & 1 deletion dataflows/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.14
0.0.15
3 changes: 2 additions & 1 deletion dataflows/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from .dumpers import dump_to_path, dump_to_zip, dump_to_sql

from .add_computed_field import add_computed_field
from .add_metadata import add_metadata
from .cache import cache
from .concatenate import concatenate
from .delete_fields import delete_fields
Expand All @@ -15,3 +14,5 @@
from .join import join, join_self
from .sort_rows import sort_rows
from .unpivot import unpivot
from .update_package import update_package, add_metadata
from .update_resource import update_resource
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import copy


def add_metadata(**metadata):
def update_package(**metadata):

metadata = copy.deepcopy(metadata)
if 'resources' in metadata:
Expand All @@ -13,3 +13,6 @@ def func(package):
yield from package

return func


add_metadata = update_package
22 changes: 22 additions & 0 deletions dataflows/processors/update_resource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from dataflows import PackageWrapper
from dataflows.helpers.resource_matcher import ResourceMatcher


def update_resource(resource, **props):
    """Build a processor that merges ``props`` into matching resource descriptors.

    Args:
        resource: Selector handed as-is to ``ResourceMatcher`` to decide which
            resources are affected.
            NOTE(review): PROCESSORS.md documents this parameter under the
            name ``resources``; the name here is kept so existing keyword
            callers keep working.
        **props: Descriptor properties to set on every matching resource
            (applied with ``dict.update``, so existing keys are overwritten).

    Returns:
        A generator function taking a ``PackageWrapper``. It first yields the
        package (``package.pkg``) with the updated descriptors and then one
        item per resource of the package.
    """
    matcher = ResourceMatcher(resource)

    def func(package: PackageWrapper):
        # Descriptors are patched in place before the package is yielded.
        for descriptor in package.pkg.descriptor['resources']:
            if not matcher.match(descriptor['name']):
                continue
            descriptor.update(props)
        yield package.pkg

        # Matching resources are passed on through their ``it`` attribute,
        # all the others are passed on as the wrapper object itself.
        for wrapped in package:
            yield wrapped.it if matcher.match(wrapped.res.name) else wrapped

    return func
14 changes: 14 additions & 0 deletions tests/test_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,3 +270,17 @@ def rename(package: PackageWrapper):
)
results, dp, stats = f.results()
print(dp.descriptor)
assert dp.descriptor['resources'][0]['name'] == 'renamed'


def test_rename_resource2():
    """``update_resource`` with a ``None`` selector renames the first resource."""
    from dataflows import Flow, printer, update_resource

    rows = ({'a': x} for x in range(10))
    flow = Flow(
        rows,
        update_resource(None, name='renamed'),
        printer(),
    )
    _, datapackage, _ = flow.results()
    print(datapackage.descriptor)
    first_resource = datapackage.descriptor['resources'][0]
    assert first_resource['name'] == 'renamed'
18 changes: 18 additions & 0 deletions tests/test_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,3 +362,21 @@ def processor(package):
assert stats['foo'] == 20
for f in expected_files:
assert os.path.exists(cache_path + '/' + f)


def test_update_resource():
    """``update_resource`` touches only the resources named in the selector.

    Six single-column resources are streamed through the flow; the test
    relies on them being named ``res_1`` .. ``res_6`` (as the original
    assertions already did). Only ``res_1``, ``res_3`` and ``res_5`` are
    selected, so only those descriptors may carry ``source == 'thewild'``.
    """
    from dataflows import Flow, printer, update_resource

    def rows(key):
        # Bind ``key`` per resource. A generator expression nested directly in
        # a comprehension reads the loop variable lazily, so every resource
        # would otherwise end up with the last key ('f').
        return ({key: x} for x in range(10))

    f = Flow(
        *[rows(k) for k in 'abcdef'],
        update_resource(['res_1', 'res_3', 'res_5'], source='thewild'),
        printer()
    )
    results, dp, stats = f.results()
    print(dp.descriptor)

    resources = dp.descriptor['resources']
    for index in (0, 2, 4):
        assert resources[index]['source'] == 'thewild'
    # Resources that were not selected must be left untouched.
    for index in (1, 3, 5):
        assert resources[index].get('source') != 'thewild'

0 comments on commit 1387bd8

Please sign in to comment.