Merge c3994cb into a8b49f7
snopoke committed Jun 25, 2021
2 parents a8b49f7 + c3994cb commit 76ef093
Showing 6 changed files with 180 additions and 36 deletions.
13 changes: 10 additions & 3 deletions commcare_export/cli.py
@@ -91,6 +91,9 @@ def add_to_parser(self, parser, **additional_kwargs):
help="Export tables containing mobile worker data and "
"location data and add a commcare_userid field to any "
"exported form or case"),
Argument('export-root-if-no-subdocument', default=False, action='store_true', help=(
"Use this when you are exporting a nested document, e.g. form.form..case or messaging-event.messages.[*],"
" and you want a record exported even if the nested document does not exist or is empty."))
]


@@ -154,18 +157,22 @@ def _get_query(args, writer, column_enforcer=None):
writer.supports_multi_table_write,
writer.max_column_length,
writer.required_columns,
column_enforcer
column_enforcer,
args.export_root_if_no_subdocument
)


def _get_query_from_file(query_arg, missing_value, combine_emits,
max_column_length, required_columns, column_enforcer):
max_column_length, required_columns, column_enforcer,
value_or_root):
if os.path.exists(query_arg):
if os.path.splitext(query_arg)[1] in ['.xls', '.xlsx']:
import openpyxl
workbook = openpyxl.load_workbook(query_arg)
return excel_query.get_queries_from_excel(
workbook, missing_value, combine_emits,
max_column_length, required_columns, column_enforcer
max_column_length, required_columns, column_enforcer,
value_or_root
)
else:
with io.open(query_arg, encoding='utf-8') as fh:
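For context, a hedged sketch (not part of the diff) of how the new flag reaches the Excel query compiler: cli.py passes args.export_root_if_no_subdocument down as value_or_root, which a caller could also set directly. The workbook filename below is hypothetical.

import openpyxl

from commcare_export import excel_query

# Hypothetical query workbook with a nested data source such as form.form..case
workbook = openpyxl.load_workbook('nested_query.xlsx')
query = excel_query.get_queries_from_excel(
    workbook,
    missing_value='---',
    value_or_root=True,  # what --export-root-if-no-subdocument turns on
)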
30 changes: 26 additions & 4 deletions commcare_export/env.py
@@ -11,6 +11,8 @@

from jsonpath_rw import jsonpath
from jsonpath_rw.parser import parse as parse_jsonpath

from commcare_export.jsonpath_utils import split_leftmost
from commcare_export.misc import unwrap, unwrap_val

from commcare_export.repeatable_iterator import RepeatableIterator
@@ -179,7 +181,8 @@ class JsonPathEnv(Env):
"""
def __init__(self, bindings=None):
self.__bindings = bindings or {}

self.__restrict_to_root = bool(jsonpath.Fields("__root_only").find(self.__bindings))

# Currently hardcoded because it is a global in jsonpath-rw
# Probably not widely used, but will require refactor if so
jsonpath.auto_id_field = "id"
@@ -198,14 +201,19 @@ def lookup(self, name):
else:
raise NotFound(unwrap_val(name))

def iter(jsonpath_expr=jsonpath_expr): # Capture closure
if self.__restrict_to_root and str(jsonpath_expr) != 'id': # special case for 'id'
expr, _ = split_leftmost(jsonpath_expr)
if not isinstance(expr, jsonpath.Root):
return RepeatableIterator(lambda : iter(()))

def iterator(jsonpath_expr=jsonpath_expr): # Capture closure
for datum in jsonpath_expr.find(self.__bindings):
# HACK: The auto id from jsonpath_rw is good, but we lose it when we do .value here,
# so just slap it on if not present
if isinstance(datum.value, dict) and 'id' not in datum.value:
datum.value['id'] = jsonpath.AutoIdForDatum(datum).value
yield datum
return RepeatableIterator(iter)
return RepeatableIterator(iterator)

def bind(self, *args):
"(str, ??) -> Env | ({str: ??}) -> Env"
@@ -433,7 +441,20 @@ def template(format_template, *args):


def _or(*args):
unwrapped_args = (unwrap_val(arg) for arg in args)
return _or_impl(unwrap_val, *args)


def _or_raw(*args):
def unwrap_iter(arg):
if isinstance(arg, RepeatableIterator):
return list(arg)
return arg

return _or_impl(unwrap_iter, *args)


def _or_impl(_unwrap, *args):
unwrapped_args = (_unwrap(arg) for arg in args)
vals = (val for val in unwrapped_args if val is not None and val != [])
try:
return next(vals)
Expand Down Expand Up @@ -498,6 +519,7 @@ def __init__(self, d=None):
'or': _or,
'sha1': sha1,
'substr': substr,
'_or_raw': _or_raw, # for internal use
})
return super(BuiltInEnv, self).__init__(d)

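As an aside (not part of the diff), the selection rule shared by 'or' and '_or_raw' via _or_impl can be summarised by this standalone sketch, which treats None and the empty list as "no value". The only difference between the two builtins is the unwrapping step: 'or' applies unwrap_val, while '_or_raw' materialises RepeatableIterators into lists so that an empty iterator also falls through to the next argument.

def first_non_empty(*values):
    # Mirrors _or_impl: return the first value that is neither None nor [].
    for val in values:
        if val is not None and val != []:
            return val
    return None

assert first_non_empty(None, [], 'fallback') == 'fallback'
assert first_non_empty([1, 2], 'ignored') == [1, 2]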
50 changes: 32 additions & 18 deletions commcare_export/excel_query.py
@@ -9,6 +9,7 @@
from jsonpath_rw.parser import parse as parse_jsonpath

from commcare_export.exceptions import LongFieldsException, MissingColumnException, ReservedTableNameException
from commcare_export.jsonpath_utils import split_leftmost
from commcare_export.map_format import compile_map_format_via
from commcare_export.minilinq import *

@@ -193,18 +194,8 @@ def compile_fields(worksheet, mappings=None):
for field, source_field, alt_source_fields, map_via, format_via in args
]

def split_leftmost(jsonpath_expr):
if isinstance(jsonpath_expr, jsonpath.Child):
further_leftmost, rest = split_leftmost(jsonpath_expr.left)
return further_leftmost, rest.child(jsonpath_expr.right)
elif isinstance(jsonpath_expr, jsonpath.Descendants):
further_leftmost, rest = split_leftmost(jsonpath_expr.left)
return further_leftmost, jsonpath.Descendants(rest, jsonpath_expr.right)
else:
return (jsonpath_expr, jsonpath.This())


def compile_source(worksheet):
def compile_source(worksheet, value_or_root=False):
"""
Compiles just the part of the Excel Spreadsheet that
indicates the API endpoint to hit along with optional filters
@@ -260,7 +251,26 @@ def compile_source(worksheet):
if data_source_jsonpath is None or isinstance(data_source_jsonpath, jsonpath.This) or isinstance(data_source_jsonpath, jsonpath.Root):
return data_source, api_query, None
else:
return data_source, api_query, Reference(str(data_source_jsonpath))
if value_or_root:
# if the jsonpath doesn't yield a value, yield the root document
expr = get_value_or_root_expression(data_source_jsonpath)
else:
expr = Reference(str(data_source_jsonpath))
return data_source, api_query, expr


def get_value_or_root_expression(value_expression):
"""Return expression used when iterating over a nested document but also wanting
a record if the value expression returns an empty result."""

# We add a bind here so that in JsonPathEnv we can restrict expressions to only those that reference
# the root. That prevents us from mistakenly getting values from the root that happen to have the
# same name as those in the child.
root_expr = Bind("__root_only", Literal(True), Reference("$"))
return Apply(
Reference('_or_raw'), Reference(str(value_expression)), root_expr
)


# If the source is expected to provide a column, then require that it is
# already present or can be added without conflicting with an existing
@@ -296,9 +306,9 @@ def require_column_in_sheet(sheet_name, data_source, table_name, output_headings

return (headings, body)

def parse_sheet(worksheet, mappings=None, column_enforcer=None):
def parse_sheet(worksheet, mappings=None, column_enforcer=None, value_or_root=False):
mappings = mappings or {}
data_source, source_expr, root_doc_expr = compile_source(worksheet)
data_source, source_expr, root_doc_expr = compile_source(worksheet, value_or_root)

table_name_column = get_column_by_name(worksheet, 'table name')
if table_name_column:
@@ -355,7 +365,7 @@ def columns(self):
]


def parse_workbook(workbook, column_enforcer=None):
def parse_workbook(workbook, column_enforcer=None, value_or_root=False):
"""
Returns a MiniLinq corresponding to the Excel configuration, which
consists of the following sheets:
@@ -378,7 +388,7 @@ def parse_workbook(workbook, column_enforcer=None):
parsed_sheets = []
for sheet in emit_sheets:
try:
sheet_parts = parse_sheet(workbook[sheet], mappings, column_enforcer)
sheet_parts = parse_sheet(workbook[sheet], mappings, column_enforcer, value_or_root)
except Exception as e:
msg = 'Ignoring sheet "{}": {}'.format(sheet, str(e))
if logger.isEnabledFor(logging.DEBUG):
@@ -509,14 +519,18 @@ def check_columns(parsed_sheets, columns):
if errors_by_sheet:
raise MissingColumnException(errors_by_sheet)


blacklisted_tables = []


def blacklist(table_name):
blacklisted_tables.append(table_name)


def get_queries_from_excel(workbook, missing_value=None, combine_emits=False,
max_column_length=None, required_columns=None,
column_enforcer=None):
parsed_sheets = parse_workbook(workbook, column_enforcer)
column_enforcer=None, value_or_root=False):
parsed_sheets = parse_workbook(workbook, column_enforcer, value_or_root)
for sheet in parsed_sheets:
if sheet.name in blacklisted_tables:
raise ReservedTableNameException(sheet.name)
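To make the fallback concrete, here is a hedged sketch (not part of the diff) of the expression get_value_or_root_expression builds for a nested data source such as form..case; it matches the structure asserted in the new test further down.

from commcare_export.excel_query import get_value_or_root_expression
from commcare_export.minilinq import Apply, Bind, Literal, Reference

expr = get_value_or_root_expression('form..case')
expected = Apply(
    Reference('_or_raw'),
    Reference('form..case'),
    # The Bind marks the fallback branch so that JsonPathEnv only resolves
    # '$'-rooted references there (see the env.py change above).
    Bind('__root_only', Literal(True), Reference('$')),
)
assert expr.to_jvalue() == expected.to_jvalue()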
12 changes: 12 additions & 0 deletions commcare_export/jsonpath_utils.py
@@ -0,0 +1,12 @@
from jsonpath_rw import jsonpath


def split_leftmost(jsonpath_expr):
if isinstance(jsonpath_expr, jsonpath.Child):
further_leftmost, rest = split_leftmost(jsonpath_expr.left)
return further_leftmost, rest.child(jsonpath_expr.right)
elif isinstance(jsonpath_expr, jsonpath.Descendants):
further_leftmost, rest = split_leftmost(jsonpath_expr.left)
return further_leftmost, jsonpath.Descendants(rest, jsonpath_expr.right)
else:
return jsonpath_expr, jsonpath.This()
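
For illustration (not part of the diff), split_leftmost peels off the leftmost segment of a parsed jsonpath, which is how JsonPathEnv decides whether an expression is rooted at '$'. The comments describe the expected results given the recursion above.

from jsonpath_rw.parser import parse as parse_jsonpath

from commcare_export.jsonpath_utils import split_leftmost

leftmost, rest = split_leftmost(parse_jsonpath('form.case.name'))
# leftmost is the 'form' field; rest is the remaining 'case.name' path

leftmost, rest = split_leftmost(parse_jsonpath('name'))
# a single field splits into the field itself and jsonpath.This()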
66 changes: 61 additions & 5 deletions tests/test_excel_query.py
@@ -412,7 +412,7 @@ def test_multi_emit(self):
)
])

self._compare_minilinq_to_compiled(minilinq, '008_multiple-tables.xlsx', combine=True)
self._compare_minilinq_to_compiled(minilinq, '008_multiple-tables.xlsx', combine_emits=True)

def test_multi_emit_no_combine(self):
minilinq = List([
@@ -461,7 +461,7 @@ def test_multi_emit_no_combine(self):
)
])

self._compare_minilinq_to_compiled(minilinq, '008_multiple-tables.xlsx', combine=False)
self._compare_minilinq_to_compiled(minilinq, '008_multiple-tables.xlsx', combine_emits=False)

def test_multi_emit_with_organization(self):
minilinq = List([
@@ -522,11 +522,67 @@ def test_multi_emit_with_organization(self):
])

column_enforcer = ColumnEnforcer()
self._compare_minilinq_to_compiled(minilinq, '008_multiple-tables.xlsx', combine=True,
self._compare_minilinq_to_compiled(minilinq, '008_multiple-tables.xlsx', combine_emits=True,
column_enforcer=column_enforcer)

def _compare_minilinq_to_compiled(self, minilinq, filename, combine=False, column_enforcer=None):
def test_value_or_root(self):
minilinq = List([
Bind("checkpoint_manager",
Apply(Reference('get_checkpoint_manager'), Literal("form"), Literal(["Forms"])),
Emit(
table="Forms",
headings=[Literal("id"), Literal("name")],
missing_value='---',
source=Map(
source=Apply(Reference("api_data"), Literal("form"), Reference('checkpoint_manager')),
body=List([
Reference("id"),
Reference("form.name"),
]),
)
)
),
Bind("checkpoint_manager",
Apply(Reference('get_checkpoint_manager'), Literal("form"), Literal(["Cases"])),
Emit(
table="Cases",
headings=[Literal("case_id")],
missing_value='---',
source=Map(
source=FlatMap(
body=Apply(
Reference("_or_raw"),
Reference("form..case"),
Bind("__root_only", Literal(True), Reference("$"))
),
source=Apply(Reference("api_data"), Literal("form"), Reference('checkpoint_manager'))
),
body=List([
Reference("@case_id"),
]),
)
)
),
Bind("checkpoint_manager",
Apply(Reference('get_checkpoint_manager'), Literal("case"), Literal(["Other cases"])),
Emit(
table="Other cases",
headings=[Literal("id")],
missing_value='---',
source=Map(
source=Apply(Reference("api_data"), Literal("case"), Reference('checkpoint_manager')),
body=List([
Reference("id")
])
)
)
)
])

self._compare_minilinq_to_compiled(minilinq, '008_multiple-tables.xlsx', combine_emits=False, value_or_root=True)

def _compare_minilinq_to_compiled(self, minilinq, filename, **kwargs):
print("Parsing {}".format(filename))
abs_path = os.path.join(os.path.dirname(__file__), filename)
compiled = get_queries_from_excel(openpyxl.load_workbook(abs_path), missing_value='---', combine_emits=combine, column_enforcer=column_enforcer)
compiled = get_queries_from_excel(openpyxl.load_workbook(abs_path), missing_value='---', **kwargs)
assert compiled.to_jvalue() == minilinq.to_jvalue(), filename
45 changes: 39 additions & 6 deletions tests/test_minilinq.py
@@ -1,23 +1,21 @@
# -*- coding: utf-8 -*-
import inspect
import types
import unittest
from itertools import *

import pytest
from six.moves import map, xrange
from six.moves import xrange

from jsonpath_rw import jsonpath

from commcare_export.minilinq import *
from commcare_export.repeatable_iterator import RepeatableIterator
from commcare_export.env import *
from commcare_export.excel_query import get_value_or_root_expression
from commcare_export.minilinq import *
from commcare_export.writers import JValueTableWriter


class LazinessException(Exception): pass
def die(msg): raise LazinessException(msg) # Hack: since "raise" is a statement not an expression, need a funcall wrapping it


class TestMiniLinq(unittest.TestCase):

@classmethod
@@ -88,6 +86,33 @@ def test_eval_auto_id_reference_nested(self):
# Reference('$.foo.id'):
# '1.bid' -> 'bid'

def test_flatmap_value_or_root(self):
"""Low level test case for 'value-or-root' use case"""
env = BuiltInEnv() | JsonPathEnv({})

data = [
{"id": 1, "foo": {'id': 'bid', 'name': 'zip'}, "bar": [{'baz': 'a1'}, {'baz': 'a2', 'id': 'bazzer'}]},
{"id": 2, "foo": {'id': 'bid', 'name': 'zap'}, "bar": []},
{"id": 3, "foo": {'id': 'bid', 'name': 'mip'}, "bar": {}},
# {"id": 4, "foo": {'id': 'bid', 'name': 'map'}, "bar": None}, # fails with TypeError from jsonpath
{"id": 5, "foo": {'id': 'bid', 'name': 'mop'}},
{"id": 6, "foo": {'id': 'bid', 'name': 'mop'}, "baz": "root_bazz"},
]
value_or_root = get_value_or_root_expression('bar.[*]')
flatmap = FlatMap(source=Literal(data), body=value_or_root)
mmap = Map(source=flatmap, body=List([
Reference("id"), Reference('baz'), Reference('$.id'), Reference('$.foo.id'), Reference('$.foo.name')
]))
self.check_case(mmap.eval(env), [
['1.bar.1.bar.[0]', 'a1', '1', '1.bid', 'zip'],
['1.bar.bazzer', 'a2', '1', '1.bid', 'zip'],
['2', [], '2', '2.bid', 'zap'],
['3.bar.3.bar.[0]', [], '3', '3.bid', 'mip'],
# ['4.bar.[0]', [], '4', '4.bid', 'map'],
['5', [], '5', '5.bid', 'mop'],
['6', [], '6', '6.bid', 'mop'],
])

def test_eval_collapsed_list(self):
"""
Special case to handle XML -> JSON conversion where there just happened to be a single value at save time
@@ -147,6 +172,14 @@ def test_or(self):
assert Apply(Reference("or"), Reference('a.b'), Reference('a.c')).eval(env) == 'c val'
assert Apply(Reference("or"), Reference('a.b'), Reference('a.d')).eval(env) is None

env = env.replace({'a': [], 'b': [1, 2], 'c': 2})
self.check_case(Apply(Reference("or"), Reference('a.[*]'), Reference('b')).eval(env), [1, 2])
self.check_case(Apply(Reference("or"), Reference('b.[*]'), Reference('c')).eval(env), [1, 2])
self.check_case(
Apply(Reference("or"), Reference('a.[*]'), Reference('$')).eval(env),
{'a': [], 'b': [1, 2], 'c': 2, 'id': '$'}
)

def test_attachment_url(self):
env = BuiltInEnv({'commcarehq_base_url': 'https://www.commcarehq.org'}) | JsonPathEnv({'id': '123', 'domain': 'd1', 'photo': 'a.jpg'})
expected = 'https://www.commcarehq.org/a/d1/api/form/attachment/123/a.jpg'
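Finally, a hedged sketch (not part of the diff) of the root restriction that JsonPathEnv applies when its bindings contain the __root_only marker, which is what keeps the _or_raw fallback from picking up same-named fields on the child document. The exact return shapes are assumptions based on the lookup code above.

from commcare_export.env import JsonPathEnv

env = JsonPathEnv({'__root_only': True, 'name': 'root-name'})

# A reference that is not rooted at '$' is suppressed entirely.
assert list(env.lookup('name')) == []

# A '$'-rooted reference still resolves against the bindings.
assert [datum.value for datum in env.lookup('$.name')] == ['root-name']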
