Merge pull request #142 from dimagi/cz/data-types

Add explicit data type support
dimagi · Jun 3, 2020 · d377788 · d377788
2 parents 9995d75 + 98d5faf
commit d377788
Show file tree

Hide file tree

Showing 13 changed files with 262 additions and 97 deletions.
diff --git a/commcare_export/cli.py b/commcare_export/cli.py
@@ -15,6 +15,7 @@
 from commcare_export import excel_query
 from commcare_export import writers
 from commcare_export.checkpoint import CheckpointManagerProvider
+from commcare_export.misc import default_to_json
 from commcare_export.utils import get_checkpoint_manager
 from commcare_export.commcare_hq_client import CommCareHqClient, LATEST_KNOWN_VERSION
 from commcare_export.commcare_minilinq import CommCareHqEnv
@@ -328,7 +329,7 @@ def main_with_args(args):
     exit_status = evaluate_query(env, query)
 
     if args.output_format == 'json':
-        print(json.dumps(list(writer.tables.values()), indent=4, default=RepeatableIterator.to_jvalue))
+        print(json.dumps(list(writer.tables.values()), indent=4, default=default_to_json))
 
     return exit_status
 

diff --git a/commcare_export/data_types.py b/commcare_export/data_types.py
@@ -0,0 +1,24 @@
+import sqlalchemy
+
+DATA_TYPE_TEXT = 'text'
+DATA_TYPE_BOOLEAN = 'boolean'
+DATA_TYPE_DATE = 'date'
+DATA_TYPE_DATETIME = 'datetime'
+DATA_TYPE_INTEGER = 'integer'
+
+DATA_TYPES_TO_SQLALCHEMY_TYPES = {
+    DATA_TYPE_BOOLEAN: sqlalchemy.Boolean(),
+    DATA_TYPE_DATETIME: sqlalchemy.DateTime(),
+    DATA_TYPE_DATE: sqlalchemy.Date(),
+    DATA_TYPE_INTEGER: sqlalchemy.Integer(),
+}
+
+class UnknownDataType(Exception):
+    pass
+
+
+def get_sqlalchemy_type(data_type):
+    if data_type not in DATA_TYPES_TO_SQLALCHEMY_TYPES:
+        raise UnknownDataType(data_type)
+    else:
+        return DATA_TYPES_TO_SQLALCHEMY_TYPES[data_type]
diff --git a/commcare_export/env.py b/commcare_export/env.py
@@ -451,7 +451,7 @@ def lookup(self, key): raise NotFound()
 
     def emit_table(self, table_spec):
         self.emitted = True
-        table_spec['rows'] = self._unwrap_row_vals(table_spec['rows'])
+        table_spec.rows = self._unwrap_row_vals(table_spec.rows)
         self.writer.write_table(table_spec)
 
     def has_emitted_tables(self):

diff --git a/commcare_export/excel_query.py b/commcare_export/excel_query.py
@@ -306,13 +306,18 @@ def parse_sheet(worksheet, mappings=None, column_enforcer=None):
     else:
         output_table_name = worksheet.title
     output_headings = get_column_by_name(worksheet, 'field')
+    output_types = get_column_by_name(worksheet, 'data type') or []
     output_fields = compile_fields(worksheet, mappings=mappings)
 
     if not output_fields:
         headings = []
+        data_types = []
         source = source_expr
         body = None
     else:
+        # note: if we want to add data types to the columns added by the column_enforcer
+        # this will have to conditionally move into the if/else below
+        data_types = [Literal(data_type.value) for data_type in output_types]
         if column_enforcer is not None:
             (headings, body) = require_column_in_sheet(worksheet.title,
                                                        data_source,
@@ -333,12 +338,14 @@ def parse_sheet(worksheet, mappings=None, column_enforcer=None):
         source,
         body,
         root_doc_expr,
+        data_types,
     )
 
 
-class SheetParts(namedtuple('SheetParts', 'name headings source body root_expr')):
-    def __new__(cls, name, headings, source, body, root_expr=None):
-        return super(SheetParts, cls).__new__(cls, name, headings, source, body, root_expr)
+class SheetParts(namedtuple('SheetParts', 'name headings source body root_expr data_types')):
+    def __new__(cls, name, headings, source, body, root_expr=None, data_types=None):
+        data_types = data_types or []
+        return super(SheetParts, cls).__new__(cls, name, headings, source, body, root_expr, data_types)
 
     @property
     def columns(self):
@@ -435,7 +442,8 @@ def get_multi_emit_query(source, sheets, missing_value):
                     source=root_expr,
                     body=sheet.body
                 ),
-                missing_value=missing_value
+                missing_value=missing_value,
+                data_types=sheet.data_types,
             )
         )
 
@@ -464,7 +472,8 @@ def _get_source(source, root_expr):
             source=_get_source(sheet.source, sheet.root_expr),
             body=sheet.body
         ),
-        missing_value=missing_value
+        missing_value=missing_value,
+        data_types=sheet.data_types,
     )
     return Bind('checkpoint_manager', Apply(Reference('get_checkpoint_manager'), Literal([sheet.name])), emit)
 

diff --git a/commcare_export/minilinq.py b/commcare_export/minilinq.py
@@ -8,6 +8,8 @@
 
 from commcare_export.repeatable_iterator import RepeatableIterator
 
+from commcare_export.specs import TableSpec
+
 logger = logging.getLogger(__name__)
 
 class MiniLinq(object):
@@ -400,12 +402,13 @@ class Emit(MiniLinq):
     are actually lists - it is just crashy instead.
     """
 
-    def __init__(self, table, headings, source, missing_value=None):
+    def __init__(self, table, headings, source, missing_value=None, data_types=None):
         "(str, [str], [MiniLinq]) -> MiniLinq"
         self.table = table
         self.headings = headings
         self.source = source
         self.missing_value = missing_value
+        self.data_types = data_types or []
 
     @unwrap('cell')
     def coerce_cell_blithely(self, cell):
@@ -428,31 +431,37 @@ def coerce_row(self, row):
 
     def eval(self, env):
         rows = self.source.eval(env)
-        env.emit_table({'name': self.table,
-                        'headings': [heading.eval(env) for heading in self.headings],
-                        'rows': map(self.coerce_row, rows)})
+        env.emit_table(TableSpec(
+            name=self.table,
+            headings=[heading.eval(env) for heading in self.headings],
+            rows=list(map(self.coerce_row, rows)),
+        ))
 
     @classmethod
     def from_jvalue(cls, jvalue):
         fields = jvalue['Emit']
-
-        return cls(table    = fields['table'],
-                   source   = MiniLinq.from_jvalue(fields['source']),
-                   headings = [MiniLinq.from_jvalue(heading) for heading in fields['headings']],
-                   missing_value=fields.get('missing_value'))
+        return cls(
+            table=fields['table'],
+            source=MiniLinq.from_jvalue(fields['source']),
+            headings=[MiniLinq.from_jvalue(heading) for heading in fields['headings']],
+            missing_value=fields.get('missing_value'),
+            data_types=[MiniLinq.from_jvalue(data_type) for data_type in fields.get('data_types') or []],
+        )
 
     def to_jvalue(self):
         return {'Emit': {'table': self.table,
                          'headings': [heading.to_jvalue() for heading in self.headings],
                          'source': self.source.to_jvalue(),
-                         'missing_value': self.missing_value}}
+                         'missing_value': self.missing_value,
+                         'data_types': [data_type.to_jvalue() for data_type in self.data_types]}}
 
     def __eq__(self, other):
         return (
             isinstance(other, Emit) and self.table == other.table
             and self.headings == other.headings
             and self.source == other.source
             and self.missing_value == other.missing_value
+            and self.data_types == other.data_types
         )
 
     def __repr__(self):

diff --git a/commcare_export/misc.py b/commcare_export/misc.py
@@ -47,3 +47,10 @@ def unwrap_val(val):
         val = val.value
 
     return val
+
+
+def default_to_json(obj):
+    if hasattr(obj, 'toJSON'):
+        return obj.toJSON()
+    else:
+        return RepeatableIterator.to_jvalue(obj)
diff --git a/commcare_export/specs.py b/commcare_export/specs.py
@@ -0,0 +1,26 @@
+
+
+class TableSpec(object):
+
+    def __init__(self, name, headings, rows, data_types=None):
+        self.name = name
+        self.headings = headings
+        self.rows = rows
+        self.data_types = data_types or []
+
+    def __eq__(self, other):
+        return (
+            isinstance(other, TableSpec)
+            and other.name == self.name
+            and other.headings == self.headings
+            and other.rows == self.rows
+            and other.data_types == self.data_types
+        )
+
+    def toJSON(self):
+        return {
+            'name': self.name,
+            'headings': self.headings,
+            'rows': self.rows,
+            'data_types': self.data_types,
+        }