diff --git a/PROCESSORS.md b/PROCESSORS.md
index 645f97e..cb6f90e 100644
--- a/PROCESSORS.md
+++ b/PROCESSORS.md
@@ -45,6 +45,8 @@ def load(source, name=None, resources=None, **options):
     - a remote URL (e.g. `https://path.to/the/data.csv`)
     - Other supported links, based on the current support of schemes and formats in [tabulator](https://github.com/frictionlessdata/tabulator-py#schemes)
     - a local path or remote URL to a datapackage.json file (e.g. `https://path.to/data_package/datapackage.json`)
+    - a reference to an environment variable containing the source location,
+      in the form of `env://ENV_VAR`
     - a tuple containing (datapackage_descriptor, resources_iterator)
 - `resources` - optional, relevant only if source points to a datapackage.json file or datapackage / resourecs tuple. Value should be one of the following:
     - Name of a single resource to load
diff --git a/dataflows/processors/load.py b/dataflows/processors/load.py
index bef0f2d..7e0b291 100644
--- a/dataflows/processors/load.py
+++ b/dataflows/processors/load.py
@@ -23,31 +23,37 @@ def process_datapackage(self, dp: Package):
             for resource_descriptor in datapackage_descriptor['resources']:
                 if self.resource_matcher.match(resource_descriptor['name']):
                     dp.add_resource(resource_descriptor)
-        elif os.path.basename(self.load_source) == 'datapackage.json':
-            self.load_dp = Package(self.load_source)
-            self.resource_matcher = ResourceMatcher(self.resources, self.load_dp)
-            dp.descriptor.setdefault('resources', [])
-            for resource in self.load_dp.resources:
-                if self.resource_matcher.match(resource.name):
-                    dp.add_resource(resource.descriptor)
-        else:
-            if os.path.exists(self.load_source):
-                base_path = os.path.dirname(self.load_source) or '.'
-                self.load_source = os.path.basename(self.load_source)
+        else:  # load_source is string:
+            if self.load_source.startswith('env://'):
+                env_var = self.load_source[6:]
+                self.load_source = os.environ.get(env_var)
+                if self.load_source is None:
+                    raise ValueError(f"Couldn't find value for env var '{env_var}'")
+            if os.path.basename(self.load_source) == 'datapackage.json':
+                self.load_dp = Package(self.load_source)
+                self.resource_matcher = ResourceMatcher(self.resources, self.load_dp)
+                dp.descriptor.setdefault('resources', [])
+                for resource in self.load_dp.resources:
+                    if self.resource_matcher.match(resource.name):
+                        dp.add_resource(resource.descriptor)
             else:
-                base_path = None
-            descriptor = dict(path=self.load_source,
-                              profile='tabular-data-resource')
-            if 'format' in self.options:
-                descriptor['format'] = self.options['format']
-            self.res = Resource(descriptor,
-                                base_path=base_path,
-                                **self.options)
-            self.res.infer(confidence=1, limit=1000)
-            if self.name is not None:
-                self.res.descriptor['name'] = self.name
-                self.res.descriptor['path'] = '{name}.{format}'.format(**self.res.descriptor)
-            dp.add_resource(self.res.descriptor)
+                if os.path.exists(self.load_source):
+                    base_path = os.path.dirname(self.load_source) or '.'
+                    self.load_source = os.path.basename(self.load_source)
+                else:
+                    base_path = None
+                descriptor = dict(path=self.load_source,
+                                  profile='tabular-data-resource')
+                if 'format' in self.options:
+                    descriptor['format'] = self.options['format']
+                self.res = Resource(descriptor,
+                                    base_path=base_path,
+                                    **self.options)
+                self.res.infer(confidence=1, limit=1000)
+                if self.name is not None:
+                    self.res.descriptor['name'] = self.name
+                    self.res.descriptor['path'] = '{name}.{format}'.format(**self.res.descriptor)
+                dp.add_resource(self.res.descriptor)
         return dp
 
     def process_resources(self, resources):
diff --git a/tests/test_lib.py b/tests/test_lib.py
index 5186129..cfef210 100644
--- a/tests/test_lib.py
+++ b/tests/test_lib.py
@@ -279,6 +279,24 @@ def test_load_from_package():
     assert [list(res) for res in ds.res_iter] == [[{'foo': 'bar'}]]
 
 
+def test_load_from_env_var():
+    import os
+    from dataflows import load, dump_to_path
+
+    Flow(
+        [{'foo': 'bar'}],
+        dump_to_path('data/load_from_package')
+    ).process()
+
+    os.environ['MY_DATAPACKAGE'] = 'data/load_from_package/datapackage.json'
+    results, dp, _ = Flow(
+        load('env://MY_DATAPACKAGE')
+    ).results()
+
+    assert len(dp.resources) == 1
+    assert results == [[{'foo': 'bar'}]]
+
+
 def test_load_from_package_resource_matching():
     from dataflows import dump_to_path, load
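
For reference, a minimal usage sketch of the new `env://` source handling, mirroring `test_load_from_env_var` above; the variable name `MY_DATAPACKAGE` and the `data/load_from_package` path are taken from that test and are purely illustrative:

```python
import os
from dataflows import Flow, load, dump_to_path

# Write a small datapackage to disk so there is something to point the env var at.
Flow([{'foo': 'bar'}], dump_to_path('data/load_from_package')).process()

# load() strips the 'env://' prefix and reads the actual source location from the
# named environment variable; if the variable is unset, a ValueError is raised.
os.environ['MY_DATAPACKAGE'] = 'data/load_from_package/datapackage.json'
results, dp, _ = Flow(load('env://MY_DATAPACKAGE')).results()

assert results == [[{'foo': 'bar'}]]
```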