diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c3a1bb952..fdb701fc4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - **Polars support**: `polars.DataFrame` and `polars.LazyFrame` now work in `plot()`, `materialize_nodes()`, `get_degrees()`, `get_indegrees()`, `get_outdegrees()`, and `hypergraph()`. Polars is an optional dependency — no behavior change when not installed. Upload path uses efficient Arrow conversion (`to_arrow()` with schema-metadata stripping and memoization); compute/hypergraph paths coerce to pandas at entry. `LazyFrame` is materialized via `.collect()` at each boundary. Adds `test_polars.py` with 17 tests; skips gracefully when polars is absent (#1133). ### Internal +- **GFQL / Cypher parser**: Parser now captures Lark's already-parsed boolean-expression tree (`and_op` / `or_op` / `xor_op` / `not_op`) as a structural `BooleanExpr` value on `WhereClause.expr_tree`, alongside the existing raw-text `.expr`. Adds `grouped_expr` passthrough so parenthesized subexpressions preserve nested boolean structure rather than collapsing to atom leaves. Backward-compatible: `.expr` stays populated; no binder/pushdown/`expr_split.py` changes this slice. Downstream migration (binder walks tree to emit one `BoundPredicate` per top-level AND conjunct; pushdown drops text-split fallback; `expr_split.py` retired) is tracked in the remaining slices of #1200. Slice 1 is strictly additive (#1200, #1202). - **GFQL / IR refactor**: Consolidated the `("input", "left", "right", "subquery")` child-slot tuple that was duplicated across 5 sites (IR verifier, physical planner, two rewrite passes, one test helper) into a single `CHILD_SLOTS` constant and `iter_children` helper in `graphistry/compute/gfql/ir/logical_plan.py`. Adds 8 regression tests including: identity-preservation for `UnnestApply` and `PredicatePushdownPass` when no descendant is rewritten (the `rewritten_child is not child` guard is load-bearing for tier-2 fixed-point convergence), asymmetric identity preservation across `Join` when only one branch is rewritten, and a reflective `typing.get_type_hints` check that any future `LogicalPlan` subclass adding a new child slot must update `CHILD_SLOTS` (#1196, #1199). ### Fixed diff --git a/graphistry/compute/gfql/cypher/__init__.py b/graphistry/compute/gfql/cypher/__init__.py index 686c8727ec..a74733b6c6 100644 --- a/graphistry/compute/gfql/cypher/__init__.py +++ b/graphistry/compute/gfql/cypher/__init__.py @@ -4,6 +4,8 @@ import warnings from .ast import ( + BooleanExpr, + BooleanOp, CallClause, CypherGraphQuery, CypherQuery, @@ -43,6 +45,8 @@ from .parser import parse_cypher __all__ = [ + "BooleanExpr", + "BooleanOp", "CypherQuery", "CypherUnionQuery", "CallClause", diff --git a/graphistry/compute/gfql/cypher/ast.py b/graphistry/compute/gfql/cypher/ast.py index 7141ce103a..a5658bdbc8 100644 --- a/graphistry/compute/gfql/cypher/ast.py +++ b/graphistry/compute/gfql/cypher/ast.py @@ -145,11 +145,57 @@ class WherePatternPredicate: WhereTerm = Union[WherePredicate, WherePatternPredicate] +BooleanOp = Literal["and", "or", "xor", "not", "atom"] + + +@dataclass(frozen=True) +class BooleanExpr: + """Structural representation of a parsed boolean expression tree. + + Mirrors Lark's ``and_op`` / ``or_op`` / ``xor_op`` / ``not_op`` grammar + rules so downstream consumers (binder serialization, predicate + pushdown) can walk structure instead of re-parsing expression text. + + Leaf nodes have ``op == "atom"`` and carry the atomic predicate's + ``atom_text`` (exact source slice) and ``atom_span``. Branch nodes + have ``op`` in ``{"and", "or", "xor"}`` with both ``left`` and + ``right`` set, or ``op == "not"`` with only ``left`` set. ``span`` + covers the full subexpression in every case. + """ + + op: BooleanOp + span: SourceSpan + left: Optional["BooleanExpr"] = None + right: Optional["BooleanExpr"] = None + atom_text: Optional[str] = None + atom_span: Optional[SourceSpan] = None + + @dataclass(frozen=True) class WhereClause: + """Parsed WHERE clause. + + Field coexistence rules: + + - **Structured path**: ``predicates`` populated, ``expr`` is ``None``, + ``expr_tree`` is ``None``. Fires when Lark routes through the + ``where_predicates`` grammar rule (pure AND conjunctions of + comparable predicates) or when ``generic_where_clause`` lifts + AND-joined bare label predicates via label narrowing. + - **Raw-text path**: ``predicates == ()``, ``expr`` populated with + the WHERE body text. Fires when ``generic_where_clause`` cannot + lift to structured predicates. + - **Structured-tree path**: ``expr_tree`` populated alongside + ``expr`` when the WHERE body contains ``AND`` / ``OR`` / ``XOR`` / + ``NOT`` operators that Lark captured via ``and_op`` / ``or_op`` / + ``xor_op`` / ``not_op``. ``expr_tree`` is additive — consumers + that ignore it see the raw-text path unchanged. + """ + predicates: Tuple[WhereTerm, ...] span: SourceSpan expr: Optional[ExpressionText] = None + expr_tree: Optional[BooleanExpr] = None @dataclass(frozen=True) diff --git a/graphistry/compute/gfql/cypher/ast_normalizer.py b/graphistry/compute/gfql/cypher/ast_normalizer.py index b36716dcd4..4c99792af3 100644 --- a/graphistry/compute/gfql/cypher/ast_normalizer.py +++ b/graphistry/compute/gfql/cypher/ast_normalizer.py @@ -509,6 +509,7 @@ def _rewrite_where_pattern_predicates_to_matches(query: CypherQuery) -> CypherQu remaining_where = WhereClause( predicates=cast(Any, remaining), expr=query.where.expr, + expr_tree=query.where.expr_tree, span=query.where.span, ) extra_match = MatchClause( diff --git a/graphistry/compute/gfql/cypher/parser.py b/graphistry/compute/gfql/cypher/parser.py index cf7097c333..b8482f3ced 100644 --- a/graphistry/compute/gfql/cypher/parser.py +++ b/graphistry/compute/gfql/cypher/parser.py @@ -9,6 +9,7 @@ from graphistry.compute.exceptions import ErrorCode, GFQLSyntaxError, GFQLValidationError from graphistry.compute.gfql.expr_split import split_top_level_and from graphistry.compute.gfql.cypher.ast import ( + BooleanExpr, CallClause, CypherGraphQuery, CypherLiteral, @@ -973,6 +974,13 @@ def _mixed_where_clause( ) -> WhereClause: if expr_text.strip() == "": raise _to_syntax_error("Invalid WHERE clause", line=span.line, column=span.column) + # Mixed-clause handlers (``where_pattern_and_expr_clause`` and + # ``expr_and_where_pattern_clause``) reconstruct ``expr_text`` + # from the source slice and do not capture Lark's structural + # expression tree — so ``expr_tree`` stays unset here even when + # ``expr_text`` contains AND/OR/XOR/NOT operators. Out of scope + # for issue #1200 slice 1 (text-only path; tracked for a later + # slice alongside full binder migration). return WhereClause( predicates=(self._parse_where_pattern_predicate_text(pattern_text, span),), expr=ExpressionText(text=expr_text.strip(), span=span), @@ -1014,6 +1022,97 @@ def expr_and_where_pattern_clause(self, meta: Any, _items: Sequence[Any]) -> Whe span=span, ) + def _wrap_as_boolean_atom(self, operand: Any, enclosing_meta: Any) -> BooleanExpr: + """Coerce a parsed expression operand into a ``BooleanExpr`` atom. + + Recursion bottom-out for ``and_op`` / ``or_op`` / ``xor_op`` / + ``not_op``. ``BooleanExpr`` passes through. Lark ``Tree`` and + ``_ExpressionSlice`` operands carry their own span, so we use + it to extract the source slice precisely. + + **Known limitation — primitive literal atoms.** Literal + transformers (``true_lit`` / ``false_lit`` / ``null_lit`` / + ``number_lit``) return raw Python values without span info. + When such a value reaches us as a boolean-operator operand + (``WHERE true AND false``), we cannot recover the original + source text for that specific operand; we approximate with + the enclosing operator's span and ``str(operand)`` (which + produces Python-style text like ``"True"`` not Cypher-style + ``"true"``). No current consumer reads ``atom_text`` on + literal atoms — the binder is not wired to ``expr_tree`` in + this slice. Accuracy for this path is a follow-up concern + tracked in issue #1200; if/when literal transformers gain + span-carrying wrappers, this fallback can be removed. + """ + if isinstance(operand, BooleanExpr): + return operand + if isinstance(operand, _ExpressionSlice): + span = operand.span + text = operand.text + else: + operand_meta = getattr(operand, "meta", None) + if operand_meta is None: + # Primitive literal — see docstring caveat. + span = _span_from_meta(enclosing_meta) + text = str(operand) + else: + span = _span_from_meta(operand_meta) + text = self._slice(span) + return BooleanExpr( + op="atom", + span=span, + atom_text=text, + atom_span=span, + ) + + def _boolean_binary( + self, + op: Literal["and", "or", "xor"], + meta: Any, + items: Sequence[Any], + ) -> BooleanExpr: + if len(items) != 2: + raise _to_syntax_error( + f"{op.upper()} expression requires two operands", + line=meta.line, column=meta.column, + ) + return BooleanExpr( + op=op, + span=_span_from_meta(meta), + left=self._wrap_as_boolean_atom(items[0], meta), + right=self._wrap_as_boolean_atom(items[1], meta), + ) + + def grouped_expr(self, _meta: Any, items: Sequence[Any]) -> Any: + # ``(expr)`` — passthrough so a parenthesized BooleanExpr bubbles + # up unchanged to enclosing ``and_op`` / ``or_op`` / ``xor_op`` / + # ``not_op`` handlers. Without this, Lark would wrap the parens + # as a ``grouped_expr`` Tree and our ``_wrap_as_boolean_atom`` + # would collapse the inner structure into a text atom, losing + # the nested boolean tree that this PR exists to preserve. + return items[0] if len(items) == 1 else items + + def and_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr: + return self._boolean_binary("and", meta, items) + + def or_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr: + return self._boolean_binary("or", meta, items) + + def xor_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr: + return self._boolean_binary("xor", meta, items) + + def not_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr: + if len(items) != 1: + raise _to_syntax_error( + "NOT expression requires one operand", + line=meta.line, column=meta.column, + ) + return BooleanExpr( + op="not", + span=_span_from_meta(meta), + left=self._wrap_as_boolean_atom(items[0], meta), + ) + def where_clause(self, meta: Any, items: Sequence[Any]) -> WhereClause: if len(items) != 1: raise _to_syntax_error("WHERE clause cannot be empty", line=meta.line, column=meta.column) @@ -1025,9 +1124,21 @@ def where_clause(self, meta: Any, items: Sequence[Any]) -> WhereClause: raise _to_syntax_error("WHERE clause cannot be empty", line=meta.line, column=meta.column) return WhereClause(predicates=cast(Any, predicates), expr=None, span=_span_from_meta(meta)) - def generic_where_clause(self, meta: Any, _items: Sequence[Any]) -> WhereClause: + def generic_where_clause(self, meta: Any, items: Sequence[Any]) -> WhereClause: span = _span_from_meta(meta) expr = self._slice(span)[len("WHERE"):].strip() + # Capture Lark's structural expression tree when available so + # downstream consumers can walk ``BooleanExpr`` instead of + # re-parsing ``ExpressionText``. Only ``and_op``/``or_op``/ + # ``xor_op``/``not_op`` produce ``BooleanExpr`` today; atomic + # WHERE expressions (single predicate) still route here with + # a non-BooleanExpr operand, in which case ``expr_tree`` + # remains ``None`` — no behavior change for those callers. + expr_tree: Optional[BooleanExpr] = ( + cast(BooleanExpr, items[0]) + if items and isinstance(items[0], BooleanExpr) + else None + ) # Lark's ambiguity resolution prefers the generic `expr` path over # the structured `where_predicates` rule, so AND-joined bare label # predicates ("n:Admin AND n:Active") land here rather than in @@ -1062,7 +1173,12 @@ def generic_where_clause(self, meta: Any, _items: Sequence[Any]) -> WhereClause: ) if predicates: return WhereClause(predicates=tuple(predicates), expr=None, span=span) - return WhereClause(predicates=(), expr=ExpressionText(text=expr, span=span), span=span) + return WhereClause( + predicates=(), + expr=ExpressionText(text=expr, span=span), + expr_tree=expr_tree, + span=span, + ) def unwind_clause(self, meta: Any, items: Sequence[Any]) -> UnwindClause: if len(items) != 2: diff --git a/graphistry/tests/compute/gfql/cypher/test_boolean_expr.py b/graphistry/tests/compute/gfql/cypher/test_boolean_expr.py new file mode 100644 index 0000000000..3ae09335db --- /dev/null +++ b/graphistry/tests/compute/gfql/cypher/test_boolean_expr.py @@ -0,0 +1,234 @@ +"""Tests for ``BooleanExpr`` structural tree in parsed WHERE clauses. + +The parser now exposes Lark's already-computed ``and_op`` / ``or_op`` / +``xor_op`` / ``not_op`` tree structurally as ``WhereClause.expr_tree``. +These tests lock the tree shape so downstream consumers (binder, +predicate pushdown) can safely migrate off text re-parsing. + +Routing note: pure AND conjunctions of comparable predicates route +through the structured ``where_predicates`` grammar rule and do **not** +populate ``expr_tree`` — those use ``WhereClause.predicates`` instead. +``expr_tree`` is populated only when the WHERE body forces Lark into +the generic ``expr`` path: OR / XOR / NOT operators, or parenthesized +subexpressions that mix operator flavors. + +Pre-existing grammar quirk: the LALR parser cannot disambiguate bare +``OR`` between two raw comparable predicates (``WHERE a = 1 OR b = 2`` +fails); parenthesized forms (``WHERE (a = 1) OR (b = 2)``) work. Tests +use the parenthesized form to probe the expected boolean-tree output. +""" +from __future__ import annotations + +from typing import cast + +from graphistry.compute.gfql.cypher import ( + BooleanExpr, + CypherQuery, + parse_cypher, +) + + +def _parsed_where(query: str) -> CypherQuery: + return cast(CypherQuery, parse_cypher(query)) + + +def _expr_tree(query: str) -> BooleanExpr: + parsed = _parsed_where(query) + assert parsed.where is not None, "parsed query has no WHERE clause" + assert parsed.where.expr_tree is not None, ( + f"expected expr_tree for boolean-operator WHERE, got None. " + f"Predicates={parsed.where.predicates}, expr={parsed.where.expr}" + ) + return parsed.where.expr_tree + + +# --------------------------------------------------------------------------- +# No expr_tree for structured paths +# --------------------------------------------------------------------------- + + +def test_single_predicate_where_clause_has_no_expr_tree() -> None: + parsed = _parsed_where("MATCH (n) WHERE n.age > 5 RETURN n") + assert parsed.where is not None + assert parsed.where.expr_tree is None + + +def test_pure_and_of_comparables_routes_through_where_predicates() -> None: + parsed = _parsed_where("MATCH (n) WHERE n.x > 1 AND n.y < 2 RETURN n") + assert parsed.where is not None + assert len(parsed.where.predicates) == 2 + assert parsed.where.expr_tree is None + + +def test_structured_label_narrowing_does_not_populate_expr_tree() -> None: + parsed = _parsed_where("MATCH (n) WHERE n:Admin AND n:Active RETURN n") + assert parsed.where is not None + assert len(parsed.where.predicates) == 2 + assert parsed.where.expr_tree is None + + +# --------------------------------------------------------------------------- +# Single-operator shapes that force the expr path +# --------------------------------------------------------------------------- + + +def test_or_op_produces_or_tree() -> None: + # Outer parens are transparent (see ``grouped_expr`` passthrough); + # atoms carry the inner predicate text without surrounding parens. + tree = _expr_tree("MATCH (n) WHERE (n.x > 1) OR (n.y < 2) RETURN n") + assert tree.op == "or" + assert tree.left is not None and tree.left.op == "atom" + assert tree.right is not None and tree.right.op == "atom" + assert tree.left.atom_text == "n.x > 1" + assert tree.right.atom_text == "n.y < 2" + + +def test_xor_op_produces_xor_tree() -> None: + tree = _expr_tree("MATCH (n) WHERE (n.x > 1) XOR (n.y < 2) RETURN n") + assert tree.op == "xor" + assert tree.left is not None and tree.left.atom_text == "n.x > 1" + assert tree.right is not None and tree.right.atom_text == "n.y < 2" + + +def test_not_op_produces_not_tree() -> None: + tree = _expr_tree("MATCH (n) WHERE NOT n.x > 1 RETURN n") + assert tree.op == "not" + assert tree.left is not None and tree.left.op == "atom" + assert tree.left.atom_text == "n.x > 1" + assert tree.right is None + + +# --------------------------------------------------------------------------- +# Multi-operator precedence +# --------------------------------------------------------------------------- + + +def test_and_binds_tighter_than_or_when_mixed() -> None: + # (a) OR (b AND c) — expr path because top-level OR. + # Outer: or_op. Parens are transparent, so atoms/subtrees appear + # unwrapped. The right side ``(b AND c)`` is a Lark and_op Tree — + # after my and_op transformer fires, it becomes a BooleanExpr and + # passes through grouped_expr unchanged. + tree = _expr_tree("MATCH (n) WHERE (n.a > 1) OR (n.b > 2 AND n.c > 3) RETURN n") + assert tree.op == "or" + assert tree.left is not None and tree.left.op == "atom" + assert tree.left.atom_text == "n.a > 1" + assert tree.right is not None and tree.right.op == "and" + assert tree.right.left is not None and tree.right.left.atom_text == "n.b > 2" + assert tree.right.right is not None and tree.right.right.atom_text == "n.c > 3" + + +def test_not_binds_tighter_than_or() -> None: + # NOT (a) OR (b) → parses as ((NOT a) OR b) + tree = _expr_tree("MATCH (n) WHERE NOT (n.a > 1) OR (n.b > 2) RETURN n") + assert tree.op == "or" + assert tree.left is not None and tree.left.op == "not" + assert tree.right is not None and tree.right.op == "atom" + + +def test_parenthesized_or_inside_and_routes_through_expr() -> None: + # (a OR b) AND c — top-level AND combined with OR subexpression forces expr. + # Outer: and_op (because left is a complex expression, not plain comparable). + tree = _expr_tree("MATCH (n) WHERE ((n.a > 1) OR (n.b > 2)) AND n.c > 3 RETURN n") + assert tree.op == "and" + assert tree.left is not None and tree.left.op == "or" + assert tree.right is not None and tree.right.op == "atom" + assert tree.right.atom_text == "n.c > 3" + + +# --------------------------------------------------------------------------- +# Span coverage +# --------------------------------------------------------------------------- + + +def test_atom_span_matches_source_slice() -> None: + # Atom spans must quote exactly the inner predicate text from source. + # grouped_expr is transparent, so the atom span covers "n.x > 1" + # (without surrounding parens). + query = "MATCH (n) WHERE (n.x > 1) OR (n.y < 2) RETURN n" + tree = _expr_tree(query) + assert tree.left is not None and tree.left.atom_span is not None + lhs_span = tree.left.atom_span + assert query[lhs_span.start_pos:lhs_span.end_pos] == tree.left.atom_text + + +def test_branch_span_covers_full_subexpression() -> None: + # Strict span bounds — no .strip() — so any future drift is caught. + query = "MATCH (n) WHERE (n.x > 1) OR (n.y < 2) RETURN n" + tree = _expr_tree(query) + assert query[tree.span.start_pos:tree.span.end_pos] == "(n.x > 1) OR (n.y < 2)" + + +# --------------------------------------------------------------------------- +# expr_tree coexists with expr for backward compat +# --------------------------------------------------------------------------- + + +def test_expr_tree_and_expr_both_populated_for_compat() -> None: + parsed = _parsed_where("MATCH (n) WHERE (n.x > 1) OR (n.y < 2) RETURN n") + assert parsed.where is not None + assert parsed.where.expr is not None + assert parsed.where.expr_tree is not None + assert "OR" in parsed.where.expr.text.upper() + + +# --------------------------------------------------------------------------- +# Deeper nesting and additional precedence combinations +# --------------------------------------------------------------------------- + + +def test_deeply_nested_or_grouping_preserves_structure() -> None: + # ((a OR b) OR (c OR d)) — nested OR-of-ORs; grouped_expr must pass + # through cleanly at every level. + tree = _expr_tree( + "MATCH (n) WHERE ((n.a > 1) OR (n.b > 2)) OR ((n.c > 3) OR (n.d > 4)) RETURN n" + ) + assert tree.op == "or" + assert tree.left is not None and tree.left.op == "or" + assert tree.right is not None and tree.right.op == "or" + # Leaves + assert tree.left.left is not None and tree.left.left.atom_text == "n.a > 1" + assert tree.left.right is not None and tree.left.right.atom_text == "n.b > 2" + assert tree.right.left is not None and tree.right.left.atom_text == "n.c > 3" + assert tree.right.right is not None and tree.right.right.atom_text == "n.d > 4" + + +def test_nested_not_not_produces_nested_not_nodes() -> None: + # NOT NOT a — two unary NOT applications. + tree = _expr_tree("MATCH (n) WHERE NOT NOT n.a > 1 RETURN n") + assert tree.op == "not" + assert tree.left is not None and tree.left.op == "not" + assert tree.left.left is not None and tree.left.left.atom_text == "n.a > 1" + + +def test_and_xor_mixed_precedence_via_parens() -> None: + # AND binds tighter than XOR: (a) XOR (b AND c) → xor(a, and(b, c)). + # The bare form ``a XOR b AND c`` hits the same LALR ambiguity as bare + # ``a OR b``; parenthesizing the XOR operands lets it parse cleanly. + tree = _expr_tree("MATCH (n) WHERE (n.a > 1) XOR (n.b > 2 AND n.c > 3) RETURN n") + assert tree.op == "xor" + assert tree.left is not None and tree.left.atom_text == "n.a > 1" + assert tree.right is not None and tree.right.op == "and" + + +# --------------------------------------------------------------------------- +# Known-limitation lock: primitive literal atoms +# --------------------------------------------------------------------------- + + +def test_literal_boolean_atoms_known_limitation_python_style_text() -> None: + # Locks the documented caveat in ``_wrap_as_boolean_atom``: literal + # transformers return raw Python values without span info, so atom_text + # falls back to ``str(True)`` → ``"True"`` instead of the source ``"true"``. + # No current consumer reads atom_text on literal atoms; if a future PR + # teaches literal transformers to carry span, this test should be updated + # to assert the Cypher-style lowercase text. + parsed = _parsed_where("MATCH (n) WHERE true XOR n.x > 1 RETURN n") + assert parsed.where is not None and parsed.where.expr_tree is not None + tree = parsed.where.expr_tree + assert tree.op == "xor" + # Left operand is the boolean literal `true` — atom_text is Python-stringified. + assert tree.left is not None and tree.left.op == "atom" + assert tree.left.atom_text == "True" # known limitation — see docstring + # Right operand is a comparable with a Lark Tree span — accurate slice. + assert tree.right is not None and tree.right.atom_text == "n.x > 1"