From af3f7d9f47d83a6c3101aba141b5b7e541ee29c5 Mon Sep 17 00:00:00 2001
From: Adam Kariv <akariv@users.noreply.github.com>
Date: Tue, 22 Dec 2020 09:34:39 +0200
Subject: [PATCH] Adds geojson support (#155)

* fix default serializer

* basic support geopoint

* using self.fields and checking for geojson

* moving to write_transformed_row

* fixes

* finalizing

* Some code cleanup

* Lint fixes

Co-authored-by: Giuseppe PERONATO <giuseppe.peronato@idiap.ch>
---
 PROCESSORS.md                                |  2 +-
 data/cities_location.csv                     |  4 ++
 dataflows/processors/dumpers/file_dumper.py  |  5 ++-
 dataflows/processors/dumpers/file_formats.py | 43 ++++++++++++++++++--
 tests/test_lib.py                            |  9 ++++
 5 files changed, 56 insertions(+), 7 deletions(-)
 create mode 100644 data/cities_location.csv

diff --git a/PROCESSORS.md b/PROCESSORS.md
index 0f74215..527cd97 100644
--- a/PROCESSORS.md
+++ b/PROCESSORS.md
@@ -144,7 +144,7 @@ def dump_to_path(out_path='.',
 - `force-format` - Specifies whether to force all output files to be generated with the same format
     - if `True` (the default), all resources will use the same format
     - if `False`, format will be deduced from the file extension. Resources with unknown extensions will be discarded.
-- `format` - Specifies the type of output files to be generated (if `force-format` is true): `csv` (the default) or `json`
+- `format` - Specifies the type of output files to be generated (if `force-format` is true): `csv` (the default), `json` or `geojson`
 - `temporal-format-property` - Specifies a property to be used for temporal values serialization. For example, if some field has a property `outputFormat: %d/%m/%y` setting `temporal-format-property` to `outputFormat` will lead to using this format for this field serialization.
 - `add-filehash-to-path`: Specifies whether to include file md5 hash into the resource path. Defaults to `False`. If `True` Embeds hash in path like so:
     - If original path is `path/to/the/file.ext`
diff --git a/data/cities_location.csv b/data/cities_location.csv
new file mode 100644
index 0000000..019f5a7
--- /dev/null
+++ b/data/cities_location.csv
@@ -0,0 +1,4 @@
+id,city,lat,long
+1,london,51.509865,-0.118092
+2,paris,48.8566,2.3522
+3,rome,41.9028,12.4964
diff --git a/dataflows/processors/dumpers/file_dumper.py b/dataflows/processors/dumpers/file_dumper.py
index 0ee916d..fba5a41 100644
--- a/dataflows/processors/dumpers/file_dumper.py
+++ b/dataflows/processors/dumpers/file_dumper.py
@@ -6,7 +6,7 @@
 from datapackage import Resource
 
 from .dumper_base import DumperBase
-from .file_formats import CSVFormat, JSONFormat
+from .file_formats import CSVFormat, JSONFormat, GeoJSONFormat
 
 
 class FileDumper(DumperBase):
@@ -34,7 +34,8 @@ def process_datapackage(self, datapackage):
                 file_format = file_format[1:]
             file_formatter = {
                 'csv': CSVFormat,
-                'json': JSONFormat
+                'json': JSONFormat,
+                'geojson': GeoJSONFormat
             }.get(file_format)
             if file_format is not None:
                 self.file_formatters[resource.name] = file_formatter
diff --git a/dataflows/processors/dumpers/file_formats.py b/dataflows/processors/dumpers/file_formats.py
index 3b149e9..2448f9e 100644
--- a/dataflows/processors/dumpers/file_formats.py
+++ b/dataflows/processors/dumpers/file_formats.py
@@ -161,6 +161,7 @@ class JSONFormat(FileFormat):
         'geopoint': lambda d: list(map(float, d)),
         'yearmonth': lambda d: '{:04d}-{:02d}'.format(*d),
     }
+
     NULL_VALUE = None
 
     PYTHON_DIALECT = {
@@ -176,10 +177,12 @@ class JSONFormat(FileFormat):
     }
 
     def __init__(self, file, schema, **options):
-        writer = file
-        writer.write('[')
-        writer.__first = True
-        super(JSONFormat, self).__init__(writer, schema, default_serializer=identity, **options)
+        self.initialize_file(file)
+        super(JSONFormat, self).__init__(file, schema, default_serializer=identity, **options)
+
+    def initialize_file(self, file):
+        file.write('[')
+        file.__first = True
 
     @classmethod
     def prepare_resource(cls, resource):
@@ -200,3 +203,35 @@ def write_transformed_row(self, transformed_row):
 
     def finalize_file(self):
         self.writer.write(']')
+
+
+class GeoJSONFormat(JSONFormat):
+    """Dump rows as the Features of a single GeoJSON FeatureCollection."""
+
+    def initialize_file(self, file):
+        # Open the wrapper object; the parent then starts the features array.
+        file.write('{"type": "FeatureCollection","features":')
+        super(GeoJSONFormat, self).initialize_file(file)
+
+    def write_transformed_row(self, transformed_row):
+        """Write a row as a Feature whose geometry is its first geo field."""
+        # Null geometry is valid GeoJSON; rows lacking a geo field still dump.
+        geometry = None
+        properties = dict()
+        for k, v in transformed_row.items():
+            field_type = self.fields[k].type
+            if geometry is None and field_type == "geopoint":
+                geometry = dict(type='Point', coordinates=v)
+            elif geometry is None and field_type == "geojson":
+                geometry = v
+            else:
+                # No early exit: fields after the geometry stay in properties.
+                properties[k] = v
+        feature = dict(geometry=geometry, type='Feature',
+                       properties=properties)
+        super(GeoJSONFormat, self).write_transformed_row(feature)
+
+    def finalize_file(self):
+        # Close the features array (parent) before closing the wrapper object.
+        super(GeoJSONFormat, self).finalize_file()
+        self.writer.write('}')
diff --git a/tests/test_lib.py b/tests/test_lib.py
index 7ad51c6..c610fac 100644
--- a/tests/test_lib.py
+++ b/tests/test_lib.py
@@ -1999,3 +1999,12 @@ def test_dump_to_zip():
     zz = dump_to_zip('out/test_dump_to_zip.zip')
     Flow([dict(a=1)], zz).process()
     assert zz.out_file.closed
+
+def test_dump_to_geojson():
+    from dataflows import Flow, dump_to_path, load, add_computed_field, delete_fields
+    # Table Schema geopoints are 'lon, lat' - the same axis order GeoJSON uses.
+    location = dict(name='Location', type='geopoint')
+    Flow(load('data/cities_location.csv'),
+         add_computed_field(target=location, operation='format', with_='{long}, {lat}'),
+         delete_fields(['lat', 'long']),
+         dump_to_path(out_path='out', format='geojson')).process()