From af3f7d9f47d83a6c3101aba141b5b7e541ee29c5 Mon Sep 17 00:00:00 2001 From: Adam Kariv Date: Tue, 22 Dec 2020 09:34:39 +0200 Subject: [PATCH] Adds geojson support (#155) * fix default serializer * basic support geopoint * using self.fields and checking for geojson * moving to write_transformed_row * fixes * finalizing * Some code cleanup * Lint fixes Co-authored-by: Giuseppe PERONATO --- PROCESSORS.md | 2 +- data/cities_location.csv | 4 ++ dataflows/processors/dumpers/file_dumper.py | 5 ++- dataflows/processors/dumpers/file_formats.py | 43 ++++++++++++++++++-- tests/test_lib.py | 9 ++++ 5 files changed, 56 insertions(+), 7 deletions(-) create mode 100644 data/cities_location.csv diff --git a/PROCESSORS.md b/PROCESSORS.md index 0f74215..527cd97 100644 --- a/PROCESSORS.md +++ b/PROCESSORS.md @@ -144,7 +144,7 @@ def dump_to_path(out_path='.', - `force-format` - Specifies whether to force all output files to be generated with the same format - if `True` (the default), all resources will use the same format - if `False`, format will be deduced from the file extension. Resources with unknown extensions will be discarded. -- `format` - Specifies the type of output files to be generated (if `force-format` is true): `csv` (the default) or `json` +- `format` - Specifies the type of output files to be generated (if `force-format` is true): `csv` (the default), `json` or `geojson` - `temporal-format-property` - Specifies a property to be used for temporal values serialization. For example, if some field has a property `outputFormat: %d/%m/%y` setting `temporal-format-property` to `outputFormat` will lead to using this format for this field serialization. - `add-filehash-to-path`: Specifies whether to include file md5 hash into the resource path. Defaults to `False`. 
If `True` Embeds hash in path like so: - If original path is `path/to/the/file.ext` diff --git a/data/cities_location.csv b/data/cities_location.csv new file mode 100644 index 0000000..019f5a7 --- /dev/null +++ b/data/cities_location.csv @@ -0,0 +1,4 @@ +id,city,lat,long +1,london,51.509865,-0.118092 +2,paris,48.8566,2.3522 +3,rome,41.9028,2.4964 diff --git a/dataflows/processors/dumpers/file_dumper.py b/dataflows/processors/dumpers/file_dumper.py index 0ee916d..fba5a41 100644 --- a/dataflows/processors/dumpers/file_dumper.py +++ b/dataflows/processors/dumpers/file_dumper.py @@ -6,7 +6,7 @@ from datapackage import Resource from .dumper_base import DumperBase -from .file_formats import CSVFormat, JSONFormat +from .file_formats import CSVFormat, JSONFormat, GeoJSONFormat class FileDumper(DumperBase): @@ -34,7 +34,8 @@ def process_datapackage(self, datapackage): file_format = file_format[1:] file_formatter = { 'csv': CSVFormat, - 'json': JSONFormat + 'json': JSONFormat, + 'geojson': GeoJSONFormat }.get(file_format) if file_format is not None: self.file_formatters[resource.name] = file_formatter diff --git a/dataflows/processors/dumpers/file_formats.py b/dataflows/processors/dumpers/file_formats.py index 3b149e9..2448f9e 100644 --- a/dataflows/processors/dumpers/file_formats.py +++ b/dataflows/processors/dumpers/file_formats.py @@ -161,6 +161,7 @@ class JSONFormat(FileFormat): 'geopoint': lambda d: list(map(float, d)), 'yearmonth': lambda d: '{:04d}-{:02d}'.format(*d), } + NULL_VALUE = None PYTHON_DIALECT = { @@ -176,10 +177,12 @@ class JSONFormat(FileFormat): } def __init__(self, file, schema, **options): - writer = file - writer.write('[') - writer.__first = True - super(JSONFormat, self).__init__(writer, schema, default_serializer=identity, **options) + self.initialize_file(file) + super(JSONFormat, self).__init__(file, schema, default_serializer=identity, **options) + + def initialize_file(self, file): + file.write('[') + file.__first = True @classmethod def 
prepare_resource(cls, resource):
@@ -200,3 +203,54 @@ def write_transformed_row(self, transformed_row):
 
     def finalize_file(self):
         self.writer.write(']')
+
+
+class GeoJSONFormat(JSONFormat):
+    """JSON-based file format that emits a GeoJSON FeatureCollection.
+
+    Each row is written as one Feature inside the ``features`` array.
+    """
+
+    def initialize_file(self, file):
+        """Open the FeatureCollection object, then the ``features`` array."""
+        file.write('{"type": "FeatureCollection","features":')
+        super(GeoJSONFormat, self).initialize_file(file)
+
+    def write_transformed_row(self, transformed_row):
+        """Write one row as a GeoJSON Feature.
+
+        The first ``geopoint`` or ``geojson`` field supplies the geometry;
+        every other field is stored under the feature's ``properties``.
+        """
+        properties = dict()
+        # RFC 7946 allows ``"geometry": null`` for unlocated features, so a
+        # row without any geo field still yields a valid Feature rather
+        # than leaving ``geometry`` undefined.
+        geometry = None
+        has_geometry = False
+        for k, v in transformed_row.items():
+            field_type = self.fields[k].type
+            if not has_geometry and field_type == "geopoint":
+                geometry = dict(
+                    type='Point',
+                    coordinates=v
+                )
+                has_geometry = True
+            elif not has_geometry and field_type == "geojson":
+                geometry = v
+                has_geometry = True
+            else:
+                # Keep iterating after the geometry is found: fields that
+                # come after it must not be dropped from the properties.
+                properties[k] = v
+        feature = dict(
+            geometry=geometry,
+            type='Feature',
+            properties=properties
+        )
+        super(GeoJSONFormat, self).write_transformed_row(feature)
+
+    def finalize_file(self):
+        """Close the ``features`` array and the enclosing object."""
+        super(GeoJSONFormat, self).finalize_file()
+        self.writer.write('}')
diff --git a/tests/test_lib.py b/tests/test_lib.py
index 7ad51c6..c610fac 100644
--- a/tests/test_lib.py
+++ b/tests/test_lib.py
@@ -1999,3 +1999,13 @@ def test_dump_to_zip():
     zz = dump_to_zip('out/test_dump_to_zip.zip')
     Flow([dict(a=1)], zz).process()
     assert zz.out_file.closed
+
+def test_dump_to_geojson():
+    """Smoke test: dump a resource with a computed geopoint field as geojson."""
+    from dataflows import Flow, dump_to_path, load, add_computed_field, delete_fields
+    Flow(
+        load('data/cities_location.csv'),
+        add_computed_field(target=dict(name='Location', type='geopoint'), operation='format', with_='{lat}, {long}'),
+        delete_fields(['lat', 'long']),
+        dump_to_path(out_path='out', format='geojson'),
+    ).process()