Skip to content

Commit

Permalink
Load from env var (#27)
Browse files Browse the repository at this point in the history
* Load from env var

* v0.0.20
  • Loading branch information
akariv committed Oct 15, 2018
1 parent 607b84c commit ab2f52c
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 25 deletions.
2 changes: 2 additions & 0 deletions PROCESSORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def load(source, name=None, resources=None, **options):
- a remote URL (e.g. `https://path.to/the/data.csv`)
- Other supported links, based on the current support of schemes and formats in [tabulator](https://github.com/frictionlessdata/tabulator-py#schemes)
- a local path or remote URL to a datapackage.json file (e.g. `https://path.to/data_package/datapackage.json`)
- a reference to an environment variable containing the source location,
in the form of `env://ENV_VAR`
- a tuple containing (datapackage_descriptor, resources_iterator)
- `resources` - optional, relevant only if source points to a datapackage.json file or datapackage / resources tuple. Value should be one of the following:
- Name of a single resource to load
Expand Down
2 changes: 1 addition & 1 deletion dataflows/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.19
0.0.20
54 changes: 30 additions & 24 deletions dataflows/processors/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,31 +23,37 @@ def process_datapackage(self, dp: Package):
for resource_descriptor in datapackage_descriptor['resources']:
if self.resource_matcher.match(resource_descriptor['name']):
dp.add_resource(resource_descriptor)
elif os.path.basename(self.load_source) == 'datapackage.json':
self.load_dp = Package(self.load_source)
self.resource_matcher = ResourceMatcher(self.resources, self.load_dp)
dp.descriptor.setdefault('resources', [])
for resource in self.load_dp.resources:
if self.resource_matcher.match(resource.name):
dp.add_resource(resource.descriptor)
else:
if os.path.exists(self.load_source):
base_path = os.path.dirname(self.load_source) or '.'
self.load_source = os.path.basename(self.load_source)
else: # load_source is string:
if self.load_source.startswith('env://'):
env_var = self.load_source[6:]
self.load_source = os.environ.get(env_var)
if self.load_source is None:
raise ValueError(f"Couldn't find value for env var '{env_var}'")
if os.path.basename(self.load_source) == 'datapackage.json':
self.load_dp = Package(self.load_source)
self.resource_matcher = ResourceMatcher(self.resources, self.load_dp)
dp.descriptor.setdefault('resources', [])
for resource in self.load_dp.resources:
if self.resource_matcher.match(resource.name):
dp.add_resource(resource.descriptor)
else:
base_path = None
descriptor = dict(path=self.load_source,
profile='tabular-data-resource')
if 'format' in self.options:
descriptor['format'] = self.options['format']
self.res = Resource(descriptor,
base_path=base_path,
**self.options)
self.res.infer(confidence=1, limit=1000)
if self.name is not None:
self.res.descriptor['name'] = self.name
self.res.descriptor['path'] = '{name}.{format}'.format(**self.res.descriptor)
dp.add_resource(self.res.descriptor)
if os.path.exists(self.load_source):
base_path = os.path.dirname(self.load_source) or '.'
self.load_source = os.path.basename(self.load_source)
else:
base_path = None
descriptor = dict(path=self.load_source,
profile='tabular-data-resource')
if 'format' in self.options:
descriptor['format'] = self.options['format']
self.res = Resource(descriptor,
base_path=base_path,
**self.options)
self.res.infer(confidence=1, limit=1000)
if self.name is not None:
self.res.descriptor['name'] = self.name
self.res.descriptor['path'] = '{name}.{format}'.format(**self.res.descriptor)
dp.add_resource(self.res.descriptor)
return dp

def process_resources(self, resources):
Expand Down
18 changes: 18 additions & 0 deletions tests/test_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,24 @@ def test_load_from_package():
assert [list(res) for res in ds.res_iter] == [[{'foo': 'bar'}]]


def test_load_from_env_var():
    """Verify that ``load`` accepts an ``env://VAR`` source.

    A one-row flow is first dumped to ``data/load_from_package``; the
    ``MY_DATAPACKAGE`` environment variable is then pointed at the dumped
    ``datapackage.json`` and loaded back through ``env://MY_DATAPACKAGE``.
    """
    import os
    from dataflows import load, dump_to_path

    # Write the fixture datapackage that the environment variable refers to.
    Flow(
        [{'foo': 'bar'}],
        dump_to_path('data/load_from_package')
    ).process()

    env_var = 'MY_DATAPACKAGE'
    previous_value = os.environ.get(env_var)
    os.environ[env_var] = 'data/load_from_package/datapackage.json'
    try:
        results, dp, _ = Flow(
            load('env://' + env_var)
        ).results()
    finally:
        # os.environ is process-wide: put it back the way we found it so the
        # variable does not leak into tests that run after this one.
        if previous_value is None:
            os.environ.pop(env_var, None)
        else:
            os.environ[env_var] = previous_value

    assert len(dp.resources) == 1
    assert results == [[{'foo': 'bar'}]]


def test_load_from_package_resource_matching():
from dataflows import dump_to_path, load

Expand Down

0 comments on commit ab2f52c

Please sign in to comment.