Skip to content

Commit

Permalink
Merge 4d194d2 into a8b49f7
Browse files Browse the repository at this point in the history
  • Loading branch information
snopoke committed Jun 25, 2021
2 parents a8b49f7 + 4d194d2 commit 0ec3899
Show file tree
Hide file tree
Showing 6 changed files with 229 additions and 36 deletions.
13 changes: 10 additions & 3 deletions commcare_export/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ def add_to_parser(self, parser, **additional_kwargs):
help="Export tables containing mobile worker data and "
"location data and add a commcare_userid field to any "
"exported form or case"),
Argument('export-root-if-no-subdocument', default=False, action='store_true', help=(
"Use this when you are exporting a nested document e.g. form.form..case, messaging-event.messages.[*]"
" And you want to have a record exported even if the nested document does not exist or is empty."))
]


Expand Down Expand Up @@ -154,18 +157,22 @@ def _get_query(args, writer, column_enforcer=None):
writer.supports_multi_table_write,
writer.max_column_length,
writer.required_columns,
column_enforcer
column_enforcer,
args.export_root_if_no_subdocument
)


def _get_query_from_file(query_arg, missing_value, combine_emits,
max_column_length, required_columns, column_enforcer):
max_column_length, required_columns, column_enforcer,
value_or_root):
if os.path.exists(query_arg):
if os.path.splitext(query_arg)[1] in ['.xls', '.xlsx']:
import openpyxl
workbook = openpyxl.load_workbook(query_arg)
return excel_query.get_queries_from_excel(
workbook, missing_value, combine_emits,
max_column_length, required_columns, column_enforcer
max_column_length, required_columns, column_enforcer,
value_or_root
)
else:
with io.open(query_arg, encoding='utf-8') as fh:
Expand Down
30 changes: 26 additions & 4 deletions commcare_export/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

from jsonpath_rw import jsonpath
from jsonpath_rw.parser import parse as parse_jsonpath

from commcare_export.jsonpath_utils import split_leftmost
from commcare_export.misc import unwrap, unwrap_val

from commcare_export.repeatable_iterator import RepeatableIterator
Expand Down Expand Up @@ -179,7 +181,8 @@ class JsonPathEnv(Env):
"""
def __init__(self, bindings=None):
self.__bindings = bindings or {}

self.__restrict_to_root = bool(jsonpath.Fields("__root_only").find(self.__bindings))

    # Currently hardcoded because it is a global in jsonpath-rw
# Probably not widely used, but will require refactor if so
jsonpath.auto_id_field = "id"
Expand All @@ -198,14 +201,19 @@ def lookup(self, name):
else:
raise NotFound(unwrap_val(name))

def iter(jsonpath_expr=jsonpath_expr): # Capture closure
if self.__restrict_to_root and str(jsonpath_expr) != 'id': # special case for 'id'
expr, _ = split_leftmost(jsonpath_expr)
if not isinstance(expr, jsonpath.Root):
return RepeatableIterator(lambda : iter(()))

def iterator(jsonpath_expr=jsonpath_expr): # Capture closure
for datum in jsonpath_expr.find(self.__bindings):
# HACK: The auto id from jsonpath_rw is good, but we lose it when we do .value here,
# so just slap it on if not present
if isinstance(datum.value, dict) and 'id' not in datum.value:
datum.value['id'] = jsonpath.AutoIdForDatum(datum).value
yield datum
return RepeatableIterator(iter)
return RepeatableIterator(iterator)

def bind(self, *args):
"(str, ??) -> Env | ({str: ??}) -> Env"
Expand Down Expand Up @@ -433,7 +441,20 @@ def template(format_template, *args):


def _or(*args):
unwrapped_args = (unwrap_val(arg) for arg in args)
return _or_impl(unwrap_val, *args)


def _or_raw(*args):
    """Variant of ``_or`` that materializes RepeatableIterator arguments.

    Each argument that is a RepeatableIterator is expanded to a list before
    the first-non-empty selection in ``_or_impl``; all other arguments are
    passed through unchanged.
    """
    def materialize(arg):
        # Expand lazy iterators so emptiness can be tested by _or_impl.
        return list(arg) if isinstance(arg, RepeatableIterator) else arg

    return _or_impl(materialize, *args)


def _or_impl(_unwrap, *args):
unwrapped_args = (_unwrap(arg) for arg in args)
vals = (val for val in unwrapped_args if val is not None and val != [])
try:
return next(vals)
Expand Down Expand Up @@ -498,6 +519,7 @@ def __init__(self, d=None):
'or': _or,
'sha1': sha1,
'substr': substr,
'_or_raw': _or_raw, # for internal use
})
return super(BuiltInEnv, self).__init__(d)

Expand Down
50 changes: 32 additions & 18 deletions commcare_export/excel_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from jsonpath_rw.parser import parse as parse_jsonpath

from commcare_export.exceptions import LongFieldsException, MissingColumnException, ReservedTableNameException
from commcare_export.jsonpath_utils import split_leftmost
from commcare_export.map_format import compile_map_format_via
from commcare_export.minilinq import *

Expand Down Expand Up @@ -193,18 +194,8 @@ def compile_fields(worksheet, mappings=None):
for field, source_field, alt_source_fields, map_via, format_via in args
]

def split_leftmost(jsonpath_expr):
if isinstance(jsonpath_expr, jsonpath.Child):
further_leftmost, rest = split_leftmost(jsonpath_expr.left)
return further_leftmost, rest.child(jsonpath_expr.right)
elif isinstance(jsonpath_expr, jsonpath.Descendants):
further_leftmost, rest = split_leftmost(jsonpath_expr.left)
return further_leftmost, jsonpath.Descendants(rest, jsonpath_expr.right)
else:
return (jsonpath_expr, jsonpath.This())


def compile_source(worksheet):
def compile_source(worksheet, value_or_root=False):
"""
Compiles just the part of the Excel Spreadsheet that
indicates the API endpoint to hit along with optional filters
Expand Down Expand Up @@ -260,7 +251,26 @@ def compile_source(worksheet):
if data_source_jsonpath is None or isinstance(data_source_jsonpath, jsonpath.This) or isinstance(data_source_jsonpath, jsonpath.Root):
return data_source, api_query, None
else:
return data_source, api_query, Reference(str(data_source_jsonpath))
if value_or_root:
            # if the jsonpath doesn't yield a value, yield the root document
expr = get_value_or_root_expression(data_source_jsonpath)
else:
expr = Reference(str(data_source_jsonpath))
return data_source, api_query, expr


def get_value_or_root_expression(value_expression):
    """Return expression used when iterating over a nested document but also wanting
    a record if the value expression returns an empty result."""

    # We add a bind here so that in JsonPathEnv we can restrict expressions to only those that reference
    # the root. That prevents us from mistakenly getting values from the root that happen to have the
    # same name as those in the child.
    fall_back_to_root = Bind("__root_only", Literal(True), Reference("$"))
    value_ref = Reference(str(value_expression))
    return Apply(Reference('_or_raw'), value_ref, fall_back_to_root)


# If the source is expected to provide a column, then require that it is
# already present or can be added without conflicting with an existing
Expand Down Expand Up @@ -296,9 +306,9 @@ def require_column_in_sheet(sheet_name, data_source, table_name, output_headings

return (headings, body)

def parse_sheet(worksheet, mappings=None, column_enforcer=None):
def parse_sheet(worksheet, mappings=None, column_enforcer=None, value_or_root=False):
mappings = mappings or {}
data_source, source_expr, root_doc_expr = compile_source(worksheet)
data_source, source_expr, root_doc_expr = compile_source(worksheet, value_or_root)

table_name_column = get_column_by_name(worksheet, 'table name')
if table_name_column:
Expand Down Expand Up @@ -355,7 +365,7 @@ def columns(self):
]


def parse_workbook(workbook, column_enforcer=None):
def parse_workbook(workbook, column_enforcer=None, value_or_root=False):
"""
Returns a MiniLinq corresponding to the Excel configuration, which
consists of the following sheets:
Expand All @@ -378,7 +388,7 @@ def parse_workbook(workbook, column_enforcer=None):
parsed_sheets = []
for sheet in emit_sheets:
try:
sheet_parts = parse_sheet(workbook[sheet], mappings, column_enforcer)
sheet_parts = parse_sheet(workbook[sheet], mappings, column_enforcer, value_or_root)
except Exception as e:
msg = 'Ignoring sheet "{}": {}'.format(sheet, str(e))
if logger.isEnabledFor(logging.DEBUG):
Expand Down Expand Up @@ -509,14 +519,18 @@ def check_columns(parsed_sheets, columns):
if errors_by_sheet:
raise MissingColumnException(errors_by_sheet)


blacklisted_tables = []


def blacklist(table_name):
    # Register a reserved table name; get_queries_from_excel raises
    # ReservedTableNameException for any sheet whose name appears here.
    blacklisted_tables.append(table_name)


def get_queries_from_excel(workbook, missing_value=None, combine_emits=False,
max_column_length=None, required_columns=None,
column_enforcer=None):
parsed_sheets = parse_workbook(workbook, column_enforcer)
column_enforcer=None, value_or_root=False):
parsed_sheets = parse_workbook(workbook, column_enforcer, value_or_root)
for sheet in parsed_sheets:
if sheet.name in blacklisted_tables:
raise ReservedTableNameException(sheet.name)
Expand Down
12 changes: 12 additions & 0 deletions commcare_export/jsonpath_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from jsonpath_rw import jsonpath


def split_leftmost(jsonpath_expr):
    """Split a jsonpath expression into its leftmost component and the rest.

    Returns a ``(leftmost, remainder)`` tuple. For Child and Descendants
    expressions the split recurses down the left side and rebuilds the
    remainder with the same operator; any other expression is itself the
    leftmost component, paired with ``jsonpath.This()``.
    """
    if isinstance(jsonpath_expr, jsonpath.Child):
        leftmost, remainder = split_leftmost(jsonpath_expr.left)
        return leftmost, remainder.child(jsonpath_expr.right)
    if isinstance(jsonpath_expr, jsonpath.Descendants):
        leftmost, remainder = split_leftmost(jsonpath_expr.left)
        return leftmost, jsonpath.Descendants(remainder, jsonpath_expr.right)
    return jsonpath_expr, jsonpath.This()
66 changes: 61 additions & 5 deletions tests/test_excel_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ def test_multi_emit(self):
)
])

self._compare_minilinq_to_compiled(minilinq, '008_multiple-tables.xlsx', combine=True)
self._compare_minilinq_to_compiled(minilinq, '008_multiple-tables.xlsx', combine_emits=True)

def test_multi_emit_no_combine(self):
minilinq = List([
Expand Down Expand Up @@ -461,7 +461,7 @@ def test_multi_emit_no_combine(self):
)
])

self._compare_minilinq_to_compiled(minilinq, '008_multiple-tables.xlsx', combine=False)
self._compare_minilinq_to_compiled(minilinq, '008_multiple-tables.xlsx', combine_emits=False)

def test_multi_emit_with_organization(self):
minilinq = List([
Expand Down Expand Up @@ -522,11 +522,67 @@ def test_multi_emit_with_organization(self):
])

column_enforcer = ColumnEnforcer()
self._compare_minilinq_to_compiled(minilinq, '008_multiple-tables.xlsx', combine=True,
self._compare_minilinq_to_compiled(minilinq, '008_multiple-tables.xlsx', combine_emits=True,
column_enforcer=column_enforcer)

def _compare_minilinq_to_compiled(self, minilinq, filename, combine=False, column_enforcer=None):
def test_value_or_root(self):
minilinq = List([
Bind("checkpoint_manager",
Apply(Reference('get_checkpoint_manager'), Literal("form"), Literal(["Forms"])),
Emit(
table="Forms",
headings=[Literal("id"), Literal("name")],
missing_value='---',
source=Map(
source=Apply(Reference("api_data"), Literal("form"), Reference('checkpoint_manager')),
body=List([
Reference("id"),
Reference("form.name"),
]),
)
)
),
Bind("checkpoint_manager",
Apply(Reference('get_checkpoint_manager'), Literal("form"), Literal(["Cases"])),
Emit(
table="Cases",
headings=[Literal("case_id")],
missing_value='---',
source=Map(
source=FlatMap(
body=Apply(
Reference("_or_raw"),
Reference("form..case"),
Bind("__root_only", Literal(True), Reference("$"))
),
source=Apply(Reference("api_data"), Literal("form"), Reference('checkpoint_manager'))
),
body=List([
Reference("@case_id"),
]),
)
)
),
Bind("checkpoint_manager",
Apply(Reference('get_checkpoint_manager'), Literal("case"), Literal(["Other cases"])),
Emit(
table="Other cases",
headings=[Literal("id")],
missing_value='---',
source=Map(
source=Apply(Reference("api_data"), Literal("case"), Reference('checkpoint_manager')),
body=List([
Reference("id")
])
)
)
)
])

self._compare_minilinq_to_compiled(minilinq, '008_multiple-tables.xlsx', combine_emits=False, value_or_root=True)

def _compare_minilinq_to_compiled(self, minilinq, filename, **kwargs):
print("Parsing {}".format(filename))
abs_path = os.path.join(os.path.dirname(__file__), filename)
compiled = get_queries_from_excel(openpyxl.load_workbook(abs_path), missing_value='---', combine_emits=combine, column_enforcer=column_enforcer)
compiled = get_queries_from_excel(openpyxl.load_workbook(abs_path), missing_value='---', **kwargs)
assert compiled.to_jvalue() == minilinq.to_jvalue(), filename

0 comments on commit 0ec3899

Please sign in to comment.