0.0.9 Add validate processor

datahq · Sep 13, 2018 · a76f030
1 parent e57bbf8
commit a76f030
Show file tree

Hide file tree

Showing 6 changed files with 48 additions and 5 deletions.
diff --git a/PROCESSORS.md b/PROCESSORS.md
@@ -14,7 +14,8 @@ DataFlows comes with a few built-in processors which do most of the heavy liftin
 - **delete_fields** - Removes some columns from the data
 - **add_computed_field** - Adds new fields whose values are based on existing columns
 - **find_replace** - Look for specific patterns in specific fields and replace them with new data
-- **set_type** - Parse incoming data based on provided schema, validate the data in the process
+- **set_type** - Modify schema, parse incoming data based on new schema, validate the data in the process
+- **validate** - Parse incoming data based on existing schema, validate the incoming data in the process
 
 ### Manipulate the entire resource
 - **sort_rows** - Sort incoming data based on key
@@ -243,7 +244,7 @@ Sets a field's data type and type options and validates its data based on its ne
 This processor modifies the last resource in the package.
 
 ```python
-def set_Type(name, **options):
+def set_type(name, **options):
     pass
 ```
 
@@ -254,6 +255,14 @@ def set_Type(name, **options):
   etc.
  (more info on possible options can be found in the [tableschema spec](https://frictionlessdata.io/specs/table-schema/))
 
+#### validate.py
+Validate incoming data based on existing type definitions.
+
+```python
+def validate():
+    pass
+```
+
 ### Manipulate the entire resource
 #### sort_rows.py
 Sort incoming data based on key.

diff --git a/dataflows/VERSION b/dataflows/VERSION
@@ -1 +1 @@
-0.0.8
+0.0.9
diff --git a/dataflows/processors/__init__.py b/dataflows/processors/__init__.py
@@ -1,6 +1,7 @@
 from .load import load
 from .printer import printer
 from .set_type import set_type
+from .validate import validate
 from .dumpers import dump_to_path, dump_to_zip
 
 from .add_computed_field import add_computed_field

diff --git a/dataflows/processors/validate.py b/dataflows/processors/validate.py
@@ -0,0 +1,10 @@
+from .. import DataStreamProcessor, schema_validator
+
+
+class validate(DataStreamProcessor):
+
+    def __init__(self):
+        super(validate, self).__init__()
+
+    def process_resource(self, res):
+        yield from super(validate, self).process_resource(schema_validator(res.res, res))
diff --git a/dataflows/templates/main.tpl.py b/dataflows/templates/main.tpl.py
@@ -1,5 +1,5 @@
 from dataflows import Flow, load, dump_to_path, dump_to_zip, printer, add_metadata
-from dataflows import sort_rows, filter_rows, find_replace, delete_fields, set_type, unpivot
+from dataflows import sort_rows, filter_rows, find_replace, delete_fields, set_type, validate, unpivot
 
 
 {% if 'custom' in processing %}
@@ -43,8 +43,9 @@ def {{slug}}():
         delete_fields(['field_name']),  # Pass a list of field names to delete from the data
         {% endif %}
         {% if 'set_type' in processing %}
-        set_type('field_name', type='number', constraints=dict(minimum=3)),  # There are quite a few options you can user here
+        set_type('field_name', type='number', constraints=dict(minimum=3)),  # There are quite a few options you can use here
                                                                              # Take a look at https://frictionlessdata.io/specs/table-schema/
+        # Or you can simply use validate() here instead
         {% endif %}
         {% if 'unpivot' in processing %}
         unpivot(unpivot_fields, extra_keys, extra_value),  # See documentation on the meaning of each of these parameters

diff --git a/tests/test_examples.py b/tests/test_examples.py
@@ -110,6 +110,28 @@ def filter_pythagorean_triplets(rows):
     )
     _ = f.process()
 
+def test_validate():
+    from dataflows import Flow, validate, set_type, printer
+
+    def adder(row):
+        row['a'] += 0.5
+        row['a'] = str(row['a'])
+
+
+    f = Flow(
+        (dict(a=x) for x in range(10)),
+        set_type('a', type='integer'),
+        adder,
+        validate(),
+        printer()
+    )
+    try:
+        _ = f.process()
+        assert False
+    except ValueError:
+        pass
+
+
 def test_example_7():
     from dataflows import Flow, load, dump_to_path