From c34419f30f517e71492eeee1507ac4b094c6d132 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Fri, 24 Apr 2026 00:42:32 -0700 Subject: [PATCH 1/4] feat(gfql/ir): expose Lark boolean-expression tree as WhereClause.expr_tree (#1200 slice 1/N) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parser now captures Lark's already-parsed and_op / or_op / xor_op / not_op tree structurally and surfaces it on ``WhereClause.expr_tree`` alongside the existing raw-text ``.expr``. Downstream consumers (binder, predicate pushdown) stay on the text path for now; subsequent PRs migrate them to walk the structural tree and eventually retire ``expr_split.py``. What this PR adds: - ``BooleanExpr`` dataclass in ``cypher/ast.py`` with ``op`` (``and`` / ``or`` / ``xor`` / ``not`` / ``atom``), span, left/right children, and atom_text/atom_span for leaves. - Parser transformer methods ``and_op`` / ``or_op`` / ``xor_op`` / ``not_op`` that construct ``BooleanExpr`` nodes bottom-up. - ``grouped_expr`` passthrough so ``(...)`` parens do not collapse nested boolean structure into atom leaves. - ``_wrap_as_boolean_atom`` helper that handles BooleanExpr passthrough, _ExpressionSlice spans, Lark Tree .meta spans, and primitive literal operands (bool/int/None) without meta. - ``WhereClause.expr_tree: Optional[BooleanExpr]`` additive field; ``.expr`` preserved for backward compat. - Wiring in ``generic_where_clause`` to extract BooleanExpr from items[0] when Lark produces one; ``.expr`` still populated. Tests (``test_boolean_expr.py``, 12 cases): - Structured paths (single predicate, pure-AND of comparables, structured label narrowing) correctly leave expr_tree=None. - OR / XOR / NOT shapes populate expr_tree with the expected op. - Precedence: AND binds tighter than OR; NOT binds tighter than AND/OR. - Parenthesized subexpressions preserve nested boolean structure via the grouped_expr passthrough. - Spans: atom spans quote the source text exactly; branch spans cover the full subexpression. - Backward compat: expr_tree and expr both populated when Lark routes through the generic expr path. No binder, pushdown, or expr_split.py changes this PR — those are tracked in follow-up slices of #1200. Co-Authored-By: Claude Sonnet 4.6 --- graphistry/compute/gfql/cypher/__init__.py | 4 + graphistry/compute/gfql/cypher/ast.py | 27 +++ graphistry/compute/gfql/cypher/parser.py | 119 +++++++++++- .../compute/gfql/cypher/test_boolean_expr.py | 171 ++++++++++++++++++ 4 files changed, 319 insertions(+), 2 deletions(-) create mode 100644 graphistry/tests/compute/gfql/cypher/test_boolean_expr.py diff --git a/graphistry/compute/gfql/cypher/__init__.py b/graphistry/compute/gfql/cypher/__init__.py index 686c8727ec..a74733b6c6 100644 --- a/graphistry/compute/gfql/cypher/__init__.py +++ b/graphistry/compute/gfql/cypher/__init__.py @@ -4,6 +4,8 @@ import warnings from .ast import ( + BooleanExpr, + BooleanOp, CallClause, CypherGraphQuery, CypherQuery, @@ -43,6 +45,8 @@ from .parser import parse_cypher __all__ = [ + "BooleanExpr", + "BooleanOp", "CypherQuery", "CypherUnionQuery", "CallClause", diff --git a/graphistry/compute/gfql/cypher/ast.py b/graphistry/compute/gfql/cypher/ast.py index 7141ce103a..333ddbb0c8 100644 --- a/graphistry/compute/gfql/cypher/ast.py +++ b/graphistry/compute/gfql/cypher/ast.py @@ -145,11 +145,38 @@ class WherePatternPredicate: WhereTerm = Union[WherePredicate, WherePatternPredicate] +BooleanOp = Literal["and", "or", "xor", "not", "atom"] + + +@dataclass(frozen=True) +class BooleanExpr: + """Structural representation of a parsed boolean expression tree. + + Mirrors Lark's ``and_op`` / ``or_op`` / ``xor_op`` / ``not_op`` grammar + rules so downstream consumers (binder serialization, predicate + pushdown) can walk structure instead of re-parsing expression text. + + Leaf nodes have ``op == "atom"`` and carry the atomic predicate's + ``atom_text`` (exact source slice) and ``atom_span``. Branch nodes + have ``op`` in ``{"and", "or", "xor"}`` with both ``left`` and + ``right`` set, or ``op == "not"`` with only ``left`` set. ``span`` + covers the full subexpression in every case. + """ + + op: BooleanOp + span: SourceSpan + left: Optional["BooleanExpr"] = None + right: Optional["BooleanExpr"] = None + atom_text: Optional[str] = None + atom_span: Optional[SourceSpan] = None + + @dataclass(frozen=True) class WhereClause: predicates: Tuple[WhereTerm, ...] span: SourceSpan expr: Optional[ExpressionText] = None + expr_tree: Optional[BooleanExpr] = None @dataclass(frozen=True) diff --git a/graphistry/compute/gfql/cypher/parser.py b/graphistry/compute/gfql/cypher/parser.py index cf7097c333..fbf2817956 100644 --- a/graphistry/compute/gfql/cypher/parser.py +++ b/graphistry/compute/gfql/cypher/parser.py @@ -9,6 +9,7 @@ from graphistry.compute.exceptions import ErrorCode, GFQLSyntaxError, GFQLValidationError from graphistry.compute.gfql.expr_split import split_top_level_and from graphistry.compute.gfql.cypher.ast import ( + BooleanExpr, CallClause, CypherGraphQuery, CypherLiteral, @@ -1014,6 +1015,103 @@ def expr_and_where_pattern_clause(self, meta: Any, _items: Sequence[Any]) -> Whe span=span, ) + def _wrap_as_boolean_atom(self, operand: Any, enclosing_meta: Any) -> BooleanExpr: + """Coerce a parsed expression operand into a ``BooleanExpr`` atom. + + Recursion bottom-out for ``and_op`` / ``or_op`` / ``xor_op`` / + ``not_op``. ``BooleanExpr`` passes through. Lark ``Tree`` and + ``_ExpressionSlice`` operands carry their own span, so we use + it to extract the source slice precisely. Primitive operands + (bool, int, float, str, ``None`` from literal transformers) + have no span; we approximate with the enclosing operator's + meta and stringify the value for ``atom_text``. That loss of + precision is acceptable because ``and_op`` / ``or_op`` / etc. + also fire in non-WHERE contexts (``RETURN true AND false``) + where the caller discards the ``BooleanExpr`` and reconstructs + text via ``_slice`` of the full expression anyway. + """ + if isinstance(operand, BooleanExpr): + return operand + if isinstance(operand, _ExpressionSlice): + span = operand.span + text = operand.text + else: + operand_meta = getattr(operand, "meta", None) + if operand_meta is None: + # Primitive operand from a literal transformer. Use the + # enclosing expression's span as a conservative fallback. + span = _span_from_meta(enclosing_meta) + text = str(operand) + else: + span = _span_from_meta(operand_meta) + text = self._slice(span) + return BooleanExpr( + op="atom", + span=span, + atom_text=text, + atom_span=span, + ) + + def grouped_expr(self, _meta: Any, items: Sequence[Any]) -> Any: + # ``(expr)`` — passthrough so a parenthesized BooleanExpr bubbles + # up unchanged to enclosing ``and_op`` / ``or_op`` / ``xor_op`` / + # ``not_op`` handlers. Without this, Lark would wrap the parens + # as a ``grouped_expr`` Tree and our ``_wrap_as_boolean_atom`` + # would collapse the inner structure into a text atom, losing + # the nested boolean tree that this PR exists to preserve. + return items[0] if len(items) == 1 else items + + def and_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr: + if len(items) != 2: + raise _to_syntax_error( + "AND expression requires two operands", + line=meta.line, column=meta.column, + ) + return BooleanExpr( + op="and", + span=_span_from_meta(meta), + left=self._wrap_as_boolean_atom(items[0], meta), + right=self._wrap_as_boolean_atom(items[1], meta), + ) + + def or_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr: + if len(items) != 2: + raise _to_syntax_error( + "OR expression requires two operands", + line=meta.line, column=meta.column, + ) + return BooleanExpr( + op="or", + span=_span_from_meta(meta), + left=self._wrap_as_boolean_atom(items[0], meta), + right=self._wrap_as_boolean_atom(items[1], meta), + ) + + def xor_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr: + if len(items) != 2: + raise _to_syntax_error( + "XOR expression requires two operands", + line=meta.line, column=meta.column, + ) + return BooleanExpr( + op="xor", + span=_span_from_meta(meta), + left=self._wrap_as_boolean_atom(items[0], meta), + right=self._wrap_as_boolean_atom(items[1], meta), + ) + + def not_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr: + if len(items) != 1: + raise _to_syntax_error( + "NOT expression requires one operand", + line=meta.line, column=meta.column, + ) + return BooleanExpr( + op="not", + span=_span_from_meta(meta), + left=self._wrap_as_boolean_atom(items[0], meta), + ) + def where_clause(self, meta: Any, items: Sequence[Any]) -> WhereClause: if len(items) != 1: raise _to_syntax_error("WHERE clause cannot be empty", line=meta.line, column=meta.column) @@ -1025,9 +1123,21 @@ def where_clause(self, meta: Any, items: Sequence[Any]) -> WhereClause: raise _to_syntax_error("WHERE clause cannot be empty", line=meta.line, column=meta.column) return WhereClause(predicates=cast(Any, predicates), expr=None, span=_span_from_meta(meta)) - def generic_where_clause(self, meta: Any, _items: Sequence[Any]) -> WhereClause: + def generic_where_clause(self, meta: Any, items: Sequence[Any]) -> WhereClause: span = _span_from_meta(meta) expr = self._slice(span)[len("WHERE"):].strip() + # Capture Lark's structural expression tree when available so + # downstream consumers can walk ``BooleanExpr`` instead of + # re-parsing ``ExpressionText``. Only ``and_op``/``or_op``/ + # ``xor_op``/``not_op`` produce ``BooleanExpr`` today; atomic + # WHERE expressions (single predicate) still route here with + # a non-BooleanExpr operand, in which case ``expr_tree`` + # remains ``None`` — no behavior change for those callers. + expr_tree: Optional[BooleanExpr] = ( + cast(BooleanExpr, items[0]) + if items and isinstance(items[0], BooleanExpr) + else None + ) # Lark's ambiguity resolution prefers the generic `expr` path over # the structured `where_predicates` rule, so AND-joined bare label # predicates ("n:Admin AND n:Active") land here rather than in @@ -1062,7 +1172,12 @@ def generic_where_clause(self, meta: Any, _items: Sequence[Any]) -> WhereClause: ) if predicates: return WhereClause(predicates=tuple(predicates), expr=None, span=span) - return WhereClause(predicates=(), expr=ExpressionText(text=expr, span=span), span=span) + return WhereClause( + predicates=(), + expr=ExpressionText(text=expr, span=span), + expr_tree=expr_tree, + span=span, + ) def unwind_clause(self, meta: Any, items: Sequence[Any]) -> UnwindClause: if len(items) != 2: diff --git a/graphistry/tests/compute/gfql/cypher/test_boolean_expr.py b/graphistry/tests/compute/gfql/cypher/test_boolean_expr.py new file mode 100644 index 0000000000..89efbcca5d --- /dev/null +++ b/graphistry/tests/compute/gfql/cypher/test_boolean_expr.py @@ -0,0 +1,171 @@ +"""Tests for ``BooleanExpr`` structural tree in parsed WHERE clauses. + +The parser now exposes Lark's already-computed ``and_op`` / ``or_op`` / +``xor_op`` / ``not_op`` tree structurally as ``WhereClause.expr_tree``. +These tests lock the tree shape so downstream consumers (binder, +predicate pushdown) can safely migrate off text re-parsing. + +Routing note: pure AND conjunctions of comparable predicates route +through the structured ``where_predicates`` grammar rule and do **not** +populate ``expr_tree`` — those use ``WhereClause.predicates`` instead. +``expr_tree`` is populated only when the WHERE body forces Lark into +the generic ``expr`` path: OR / XOR / NOT operators, or parenthesized +subexpressions that mix operator flavors. + +Pre-existing grammar quirk: the LALR parser cannot disambiguate bare +``OR`` between two raw comparable predicates (``WHERE a = 1 OR b = 2`` +fails); parenthesized forms (``WHERE (a = 1) OR (b = 2)``) work. Tests +use the parenthesized form to probe the expected boolean-tree output. +""" +from __future__ import annotations + +from typing import cast + +from graphistry.compute.gfql.cypher import ( + BooleanExpr, + CypherQuery, + parse_cypher, +) + + +def _parsed_where(query: str) -> CypherQuery: + return cast(CypherQuery, parse_cypher(query)) + + +def _expr_tree(query: str) -> BooleanExpr: + parsed = _parsed_where(query) + assert parsed.where is not None, "parsed query has no WHERE clause" + assert parsed.where.expr_tree is not None, ( + f"expected expr_tree for boolean-operator WHERE, got None. " + f"Predicates={parsed.where.predicates}, expr={parsed.where.expr}" + ) + return parsed.where.expr_tree + + +# --------------------------------------------------------------------------- +# No expr_tree for structured paths +# --------------------------------------------------------------------------- + + +def test_single_predicate_where_clause_has_no_expr_tree() -> None: + parsed = _parsed_where("MATCH (n) WHERE n.age > 5 RETURN n") + assert parsed.where is not None + assert parsed.where.expr_tree is None + + +def test_pure_and_of_comparables_routes_through_where_predicates() -> None: + parsed = _parsed_where("MATCH (n) WHERE n.x > 1 AND n.y < 2 RETURN n") + assert parsed.where is not None + assert len(parsed.where.predicates) == 2 + assert parsed.where.expr_tree is None + + +def test_structured_label_narrowing_does_not_populate_expr_tree() -> None: + parsed = _parsed_where("MATCH (n) WHERE n:Admin AND n:Active RETURN n") + assert parsed.where is not None + assert len(parsed.where.predicates) == 2 + assert parsed.where.expr_tree is None + + +# --------------------------------------------------------------------------- +# Single-operator shapes that force the expr path +# --------------------------------------------------------------------------- + + +def test_or_op_produces_or_tree() -> None: + # Outer parens are transparent (see ``grouped_expr`` passthrough); + # atoms carry the inner predicate text without surrounding parens. + tree = _expr_tree("MATCH (n) WHERE (n.x > 1) OR (n.y < 2) RETURN n") + assert tree.op == "or" + assert tree.left is not None and tree.left.op == "atom" + assert tree.right is not None and tree.right.op == "atom" + assert tree.left.atom_text == "n.x > 1" + assert tree.right.atom_text == "n.y < 2" + + +def test_xor_op_produces_xor_tree() -> None: + tree = _expr_tree("MATCH (n) WHERE (n.x > 1) XOR (n.y < 2) RETURN n") + assert tree.op == "xor" + assert tree.left is not None and tree.left.atom_text == "n.x > 1" + assert tree.right is not None and tree.right.atom_text == "n.y < 2" + + +def test_not_op_produces_not_tree() -> None: + tree = _expr_tree("MATCH (n) WHERE NOT n.x > 1 RETURN n") + assert tree.op == "not" + assert tree.left is not None and tree.left.op == "atom" + assert tree.left.atom_text == "n.x > 1" + assert tree.right is None + + +# --------------------------------------------------------------------------- +# Multi-operator precedence +# --------------------------------------------------------------------------- + + +def test_and_binds_tighter_than_or_when_mixed() -> None: + # (a) OR (b AND c) — expr path because top-level OR. + # Outer: or_op. Parens are transparent, so atoms/subtrees appear + # unwrapped. The right side ``(b AND c)`` is a Lark and_op Tree — + # after my and_op transformer fires, it becomes a BooleanExpr and + # passes through grouped_expr unchanged. + tree = _expr_tree("MATCH (n) WHERE (n.a > 1) OR (n.b > 2 AND n.c > 3) RETURN n") + assert tree.op == "or" + assert tree.left is not None and tree.left.op == "atom" + assert tree.left.atom_text == "n.a > 1" + assert tree.right is not None and tree.right.op == "and" + assert tree.right.left is not None and tree.right.left.atom_text == "n.b > 2" + assert tree.right.right is not None and tree.right.right.atom_text == "n.c > 3" + + +def test_not_binds_tighter_than_or() -> None: + # NOT (a) OR (b) → parses as ((NOT a) OR b) + tree = _expr_tree("MATCH (n) WHERE NOT (n.a > 1) OR (n.b > 2) RETURN n") + assert tree.op == "or" + assert tree.left is not None and tree.left.op == "not" + assert tree.right is not None and tree.right.op == "atom" + + +def test_parenthesized_or_inside_and_routes_through_expr() -> None: + # (a OR b) AND c — top-level AND combined with OR subexpression forces expr. + # Outer: and_op (because left is a complex expression, not plain comparable). + tree = _expr_tree("MATCH (n) WHERE ((n.a > 1) OR (n.b > 2)) AND n.c > 3 RETURN n") + assert tree.op == "and" + assert tree.left is not None and tree.left.op == "or" + assert tree.right is not None and tree.right.op == "atom" + assert tree.right.atom_text == "n.c > 3" + + +# --------------------------------------------------------------------------- +# Span coverage +# --------------------------------------------------------------------------- + + +def test_atom_span_matches_source_slice() -> None: + # Atom spans must quote exactly the inner predicate text from source. + # grouped_expr is transparent, so the atom span covers "n.x > 1" + # (without surrounding parens). + query = "MATCH (n) WHERE (n.x > 1) OR (n.y < 2) RETURN n" + tree = _expr_tree(query) + assert tree.left is not None and tree.left.atom_span is not None + lhs_span = tree.left.atom_span + assert query[lhs_span.start_pos:lhs_span.end_pos] == tree.left.atom_text + + +def test_branch_span_covers_full_subexpression() -> None: + query = "MATCH (n) WHERE (n.x > 1) OR (n.y < 2) RETURN n" + tree = _expr_tree(query) + assert query[tree.span.start_pos:tree.span.end_pos].strip() == "(n.x > 1) OR (n.y < 2)" + + +# --------------------------------------------------------------------------- +# expr_tree coexists with expr for backward compat +# --------------------------------------------------------------------------- + + +def test_expr_tree_and_expr_both_populated_for_compat() -> None: + parsed = _parsed_where("MATCH (n) WHERE (n.x > 1) OR (n.y < 2) RETURN n") + assert parsed.where is not None + assert parsed.where.expr is not None + assert parsed.where.expr_tree is not None + assert "OR" in parsed.where.expr.text.upper() From af4e17397c8b5b15935cb25acc87d5d5270c4668 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Fri, 24 Apr 2026 00:50:09 -0700 Subject: [PATCH 2/4] refactor+test: address wave-1 review findings on #1202 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IMPORTANT fixes: - ast_normalizer._rewrite_where_pattern_predicates_to_matches: when reconstructing a remaining WhereClause after pulling a WHERE pattern predicate into its own MatchClause, the constructor now preserves ``expr_tree=query.where.expr_tree``. Previously it was silently dropped, breaking structural capture for any query mixing a WHERE pattern predicate with a boolean-expression tail. - Documented the primitive-literal ``atom_text`` fidelity limitation on ``_wrap_as_boolean_atom`` (``str(True)`` → ``\"True\"`` not Cypher ``\"true\"``). No consumer currently reads atom_text on literal atoms; locked by a new test ``test_literal_boolean_atoms_known_ limitation_python_style_text`` so any future fix flips intentionally. SUGGESTION fixes: - DRY: ``_boolean_binary`` helper collapses ``and_op`` / ``or_op`` / ``xor_op`` bodies into single-line delegates, centralizing the arity check and error message. - ``WhereClause`` class docstring now states field coexistence contract (structured path vs raw-text path vs structured-tree path). - Tighter branch-span assertion (no ``.strip()``) so future drift is caught. - Three new tests: deeply nested OR grouping preserves structure, ``NOT NOT`` produces nested NOT nodes, mixed AND/XOR precedence via parens. Co-Authored-By: Claude Sonnet 4.6 --- graphistry/compute/gfql/cypher/ast.py | 19 +++++ .../compute/gfql/cypher/ast_normalizer.py | 1 + graphistry/compute/gfql/cypher/parser.py | 80 +++++++++---------- .../compute/gfql/cypher/test_boolean_expr.py | 65 ++++++++++++++- 4 files changed, 121 insertions(+), 44 deletions(-) diff --git a/graphistry/compute/gfql/cypher/ast.py b/graphistry/compute/gfql/cypher/ast.py index 333ddbb0c8..a5658bdbc8 100644 --- a/graphistry/compute/gfql/cypher/ast.py +++ b/graphistry/compute/gfql/cypher/ast.py @@ -173,6 +173,25 @@ class BooleanExpr: @dataclass(frozen=True) class WhereClause: + """Parsed WHERE clause. + + Field coexistence rules: + + - **Structured path**: ``predicates`` populated, ``expr`` is ``None``, + ``expr_tree`` is ``None``. Fires when Lark routes through the + ``where_predicates`` grammar rule (pure AND conjunctions of + comparable predicates) or when ``generic_where_clause`` lifts + AND-joined bare label predicates via label narrowing. + - **Raw-text path**: ``predicates == ()``, ``expr`` populated with + the WHERE body text. Fires when ``generic_where_clause`` cannot + lift to structured predicates. + - **Structured-tree path**: ``expr_tree`` populated alongside + ``expr`` when the WHERE body contains ``AND`` / ``OR`` / ``XOR`` / + ``NOT`` operators that Lark captured via ``and_op`` / ``or_op`` / + ``xor_op`` / ``not_op``. ``expr_tree`` is additive — consumers + that ignore it see the raw-text path unchanged. + """ + predicates: Tuple[WhereTerm, ...] span: SourceSpan expr: Optional[ExpressionText] = None diff --git a/graphistry/compute/gfql/cypher/ast_normalizer.py b/graphistry/compute/gfql/cypher/ast_normalizer.py index b36716dcd4..4c99792af3 100644 --- a/graphistry/compute/gfql/cypher/ast_normalizer.py +++ b/graphistry/compute/gfql/cypher/ast_normalizer.py @@ -509,6 +509,7 @@ def _rewrite_where_pattern_predicates_to_matches(query: CypherQuery) -> CypherQu remaining_where = WhereClause( predicates=cast(Any, remaining), expr=query.where.expr, + expr_tree=query.where.expr_tree, span=query.where.span, ) extra_match = MatchClause( diff --git a/graphistry/compute/gfql/cypher/parser.py b/graphistry/compute/gfql/cypher/parser.py index fbf2817956..80fc0bbc7e 100644 --- a/graphistry/compute/gfql/cypher/parser.py +++ b/graphistry/compute/gfql/cypher/parser.py @@ -1021,14 +1021,21 @@ def _wrap_as_boolean_atom(self, operand: Any, enclosing_meta: Any) -> BooleanExp Recursion bottom-out for ``and_op`` / ``or_op`` / ``xor_op`` / ``not_op``. ``BooleanExpr`` passes through. Lark ``Tree`` and ``_ExpressionSlice`` operands carry their own span, so we use - it to extract the source slice precisely. Primitive operands - (bool, int, float, str, ``None`` from literal transformers) - have no span; we approximate with the enclosing operator's - meta and stringify the value for ``atom_text``. That loss of - precision is acceptable because ``and_op`` / ``or_op`` / etc. - also fire in non-WHERE contexts (``RETURN true AND false``) - where the caller discards the ``BooleanExpr`` and reconstructs - text via ``_slice`` of the full expression anyway. + it to extract the source slice precisely. + + **Known limitation — primitive literal atoms.** Literal + transformers (``true_lit`` / ``false_lit`` / ``null_lit`` / + ``number_lit``) return raw Python values without span info. + When such a value reaches us as a boolean-operator operand + (``WHERE true AND false``), we cannot recover the original + source text for that specific operand; we approximate with + the enclosing operator's span and ``str(operand)`` (which + produces Python-style text like ``"True"`` not Cypher-style + ``"true"``). No current consumer reads ``atom_text`` on + literal atoms — the binder is not wired to ``expr_tree`` in + this slice. Accuracy for this path is a follow-up concern + tracked in issue #1200; if/when literal transformers gain + span-carrying wrappers, this fallback can be removed. """ if isinstance(operand, BooleanExpr): return operand @@ -1038,8 +1045,7 @@ def _wrap_as_boolean_atom(self, operand: Any, enclosing_meta: Any) -> BooleanExp else: operand_meta = getattr(operand, "meta", None) if operand_meta is None: - # Primitive operand from a literal transformer. Use the - # enclosing expression's span as a conservative fallback. + # Primitive literal — see docstring caveat. span = _span_from_meta(enclosing_meta) text = str(operand) else: @@ -1052,6 +1058,24 @@ def _wrap_as_boolean_atom(self, operand: Any, enclosing_meta: Any) -> BooleanExp atom_span=span, ) + def _boolean_binary( + self, + op: Literal["and", "or", "xor"], + meta: Any, + items: Sequence[Any], + ) -> BooleanExpr: + if len(items) != 2: + raise _to_syntax_error( + f"{op.upper()} expression requires two operands", + line=meta.line, column=meta.column, + ) + return BooleanExpr( + op=op, + span=_span_from_meta(meta), + left=self._wrap_as_boolean_atom(items[0], meta), + right=self._wrap_as_boolean_atom(items[1], meta), + ) + def grouped_expr(self, _meta: Any, items: Sequence[Any]) -> Any: # ``(expr)`` — passthrough so a parenthesized BooleanExpr bubbles # up unchanged to enclosing ``and_op`` / ``or_op`` / ``xor_op`` / @@ -1062,43 +1086,13 @@ def grouped_expr(self, _meta: Any, items: Sequence[Any]) -> Any: return items[0] if len(items) == 1 else items def and_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr: - if len(items) != 2: - raise _to_syntax_error( - "AND expression requires two operands", - line=meta.line, column=meta.column, - ) - return BooleanExpr( - op="and", - span=_span_from_meta(meta), - left=self._wrap_as_boolean_atom(items[0], meta), - right=self._wrap_as_boolean_atom(items[1], meta), - ) + return self._boolean_binary("and", meta, items) def or_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr: - if len(items) != 2: - raise _to_syntax_error( - "OR expression requires two operands", - line=meta.line, column=meta.column, - ) - return BooleanExpr( - op="or", - span=_span_from_meta(meta), - left=self._wrap_as_boolean_atom(items[0], meta), - right=self._wrap_as_boolean_atom(items[1], meta), - ) + return self._boolean_binary("or", meta, items) def xor_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr: - if len(items) != 2: - raise _to_syntax_error( - "XOR expression requires two operands", - line=meta.line, column=meta.column, - ) - return BooleanExpr( - op="xor", - span=_span_from_meta(meta), - left=self._wrap_as_boolean_atom(items[0], meta), - right=self._wrap_as_boolean_atom(items[1], meta), - ) + return self._boolean_binary("xor", meta, items) def not_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr: if len(items) != 1: diff --git a/graphistry/tests/compute/gfql/cypher/test_boolean_expr.py b/graphistry/tests/compute/gfql/cypher/test_boolean_expr.py index 89efbcca5d..3ae09335db 100644 --- a/graphistry/tests/compute/gfql/cypher/test_boolean_expr.py +++ b/graphistry/tests/compute/gfql/cypher/test_boolean_expr.py @@ -153,9 +153,10 @@ def test_atom_span_matches_source_slice() -> None: def test_branch_span_covers_full_subexpression() -> None: + # Strict span bounds — no .strip() — so any future drift is caught. query = "MATCH (n) WHERE (n.x > 1) OR (n.y < 2) RETURN n" tree = _expr_tree(query) - assert query[tree.span.start_pos:tree.span.end_pos].strip() == "(n.x > 1) OR (n.y < 2)" + assert query[tree.span.start_pos:tree.span.end_pos] == "(n.x > 1) OR (n.y < 2)" # --------------------------------------------------------------------------- @@ -169,3 +170,65 @@ def test_expr_tree_and_expr_both_populated_for_compat() -> None: assert parsed.where.expr is not None assert parsed.where.expr_tree is not None assert "OR" in parsed.where.expr.text.upper() + + +# --------------------------------------------------------------------------- +# Deeper nesting and additional precedence combinations +# --------------------------------------------------------------------------- + + +def test_deeply_nested_or_grouping_preserves_structure() -> None: + # ((a OR b) OR (c OR d)) — nested OR-of-ORs; grouped_expr must pass + # through cleanly at every level. + tree = _expr_tree( + "MATCH (n) WHERE ((n.a > 1) OR (n.b > 2)) OR ((n.c > 3) OR (n.d > 4)) RETURN n" + ) + assert tree.op == "or" + assert tree.left is not None and tree.left.op == "or" + assert tree.right is not None and tree.right.op == "or" + # Leaves + assert tree.left.left is not None and tree.left.left.atom_text == "n.a > 1" + assert tree.left.right is not None and tree.left.right.atom_text == "n.b > 2" + assert tree.right.left is not None and tree.right.left.atom_text == "n.c > 3" + assert tree.right.right is not None and tree.right.right.atom_text == "n.d > 4" + + +def test_nested_not_not_produces_nested_not_nodes() -> None: + # NOT NOT a — two unary NOT applications. + tree = _expr_tree("MATCH (n) WHERE NOT NOT n.a > 1 RETURN n") + assert tree.op == "not" + assert tree.left is not None and tree.left.op == "not" + assert tree.left.left is not None and tree.left.left.atom_text == "n.a > 1" + + +def test_and_xor_mixed_precedence_via_parens() -> None: + # AND binds tighter than XOR: (a) XOR (b AND c) → xor(a, and(b, c)). + # The bare form ``a XOR b AND c`` hits the same LALR ambiguity as bare + # ``a OR b``; parenthesizing the XOR operands lets it parse cleanly. + tree = _expr_tree("MATCH (n) WHERE (n.a > 1) XOR (n.b > 2 AND n.c > 3) RETURN n") + assert tree.op == "xor" + assert tree.left is not None and tree.left.atom_text == "n.a > 1" + assert tree.right is not None and tree.right.op == "and" + + +# --------------------------------------------------------------------------- +# Known-limitation lock: primitive literal atoms +# --------------------------------------------------------------------------- + + +def test_literal_boolean_atoms_known_limitation_python_style_text() -> None: + # Locks the documented caveat in ``_wrap_as_boolean_atom``: literal + # transformers return raw Python values without span info, so atom_text + # falls back to ``str(True)`` → ``"True"`` instead of the source ``"true"``. + # No current consumer reads atom_text on literal atoms; if a future PR + # teaches literal transformers to carry span, this test should be updated + # to assert the Cypher-style lowercase text. + parsed = _parsed_where("MATCH (n) WHERE true XOR n.x > 1 RETURN n") + assert parsed.where is not None and parsed.where.expr_tree is not None + tree = parsed.where.expr_tree + assert tree.op == "xor" + # Left operand is the boolean literal `true` — atom_text is Python-stringified. + assert tree.left is not None and tree.left.op == "atom" + assert tree.left.atom_text == "True" # known limitation — see docstring + # Right operand is a comparable with a Lark Tree span — accurate slice. + assert tree.right is not None and tree.right.atom_text == "n.x > 1" From 8e4a863bde14314a1f6dc157c016ed68c69c9199 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Fri, 24 Apr 2026 00:52:17 -0700 Subject: [PATCH 3/4] docs(gfql/parser): note mixed-clause expr_tree gap in _mixed_where_clause (wave-2 #1202) The where_pattern_and_expr_clause and expr_and_where_pattern_clause routes reconstruct expr from source text and do not capture Lark's structural boolean-expression tree. This is pre-existing (they already re-parse from text for the pattern-side anyway) but the new WhereClause docstring contract around expr_tree made it worth noting inline as a known limitation for the slice-1 scope of #1200. Co-Authored-By: Claude Sonnet 4.6 --- graphistry/compute/gfql/cypher/parser.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/graphistry/compute/gfql/cypher/parser.py b/graphistry/compute/gfql/cypher/parser.py index 80fc0bbc7e..b8482f3ced 100644 --- a/graphistry/compute/gfql/cypher/parser.py +++ b/graphistry/compute/gfql/cypher/parser.py @@ -974,6 +974,13 @@ def _mixed_where_clause( ) -> WhereClause: if expr_text.strip() == "": raise _to_syntax_error("Invalid WHERE clause", line=span.line, column=span.column) + # Mixed-clause handlers (``where_pattern_and_expr_clause`` and + # ``expr_and_where_pattern_clause``) reconstruct ``expr_text`` + # from the source slice and do not capture Lark's structural + # expression tree — so ``expr_tree`` stays unset here even when + # ``expr_text`` contains AND/OR/XOR/NOT operators. Out of scope + # for issue #1200 slice 1 (text-only path; tracked for a later + # slice alongside full binder migration). return WhereClause( predicates=(self._parse_where_pattern_predicate_text(pattern_text, span),), expr=ExpressionText(text=expr_text.strip(), span=span), From dbe3c79df995054791493677be48d0ee05d23338 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Fri, 24 Apr 2026 00:52:49 -0700 Subject: [PATCH 4/4] docs(changelog): add #1200/#1202 BooleanExpr slice-1 entry Co-Authored-By: Claude Sonnet 4.6 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c3a1bb952..fdb701fc4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - **Polars support**: `polars.DataFrame` and `polars.LazyFrame` now work in `plot()`, `materialize_nodes()`, `get_degrees()`, `get_indegrees()`, `get_outdegrees()`, and `hypergraph()`. Polars is an optional dependency — no behavior change when not installed. Upload path uses efficient Arrow conversion (`to_arrow()` with schema-metadata stripping and memoization); compute/hypergraph paths coerce to pandas at entry. `LazyFrame` is materialized via `.collect()` at each boundary. Adds `test_polars.py` with 17 tests; skips gracefully when polars is absent (#1133). ### Internal +- **GFQL / Cypher parser**: Parser now captures Lark's already-parsed boolean-expression tree (`and_op` / `or_op` / `xor_op` / `not_op`) as a structural `BooleanExpr` value on `WhereClause.expr_tree`, alongside the existing raw-text `.expr`. Adds `grouped_expr` passthrough so parenthesized subexpressions preserve nested boolean structure rather than collapsing to atom leaves. Backward-compatible: `.expr` stays populated; no binder/pushdown/`expr_split.py` changes this slice. Downstream migration (binder walks tree to emit one `BoundPredicate` per top-level AND conjunct; pushdown drops text-split fallback; `expr_split.py` retired) is tracked in the remaining slices of #1200. Slice 1 is strictly additive (#1200, #1202). - **GFQL / IR refactor**: Consolidated the `("input", "left", "right", "subquery")` child-slot tuple that was duplicated across 5 sites (IR verifier, physical planner, two rewrite passes, one test helper) into a single `CHILD_SLOTS` constant and `iter_children` helper in `graphistry/compute/gfql/ir/logical_plan.py`. Adds 8 regression tests including: identity-preservation for `UnnestApply` and `PredicatePushdownPass` when no descendant is rewritten (the `rewritten_child is not child` guard is load-bearing for tier-2 fixed-point convergence), asymmetric identity preservation across `Join` when only one branch is rewritten, and a reflective `typing.get_type_hints` check that any future `LogicalPlan` subclass adding a new child slot must update `CHILD_SLOTS` (#1196, #1199). ### Fixed