Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
- **Polars support**: `polars.DataFrame` and `polars.LazyFrame` now work in `plot()`, `materialize_nodes()`, `get_degrees()`, `get_indegrees()`, `get_outdegrees()`, and `hypergraph()`. Polars is an optional dependency — no behavior change when not installed. Upload path uses efficient Arrow conversion (`to_arrow()` with schema-metadata stripping and memoization); compute/hypergraph paths coerce to pandas at entry. `LazyFrame` is materialized via `.collect()` at each boundary. Adds `test_polars.py` with 17 tests; skips gracefully when polars is absent (#1133).

### Internal
- **GFQL / Cypher parser**: Parser now captures Lark's already-parsed boolean-expression tree (`and_op` / `or_op` / `xor_op` / `not_op`) as a structural `BooleanExpr` value on `WhereClause.expr_tree`, alongside the existing raw-text `.expr`. Adds `grouped_expr` passthrough so parenthesized subexpressions preserve nested boolean structure rather than collapsing to atom leaves. Backward-compatible: `.expr` stays populated; no binder/pushdown/`expr_split.py` changes this slice. Downstream migration (binder walks tree to emit one `BoundPredicate` per top-level AND conjunct; pushdown drops text-split fallback; `expr_split.py` retired) is tracked in the remaining slices of #1200. Slice 1 is strictly additive (#1200, #1202).
- **GFQL / IR refactor**: Consolidated the `("input", "left", "right", "subquery")` child-slot tuple that was duplicated across 5 sites (IR verifier, physical planner, two rewrite passes, one test helper) into a single `CHILD_SLOTS` constant and `iter_children` helper in `graphistry/compute/gfql/ir/logical_plan.py`. Adds 8 regression tests including: identity-preservation for `UnnestApply` and `PredicatePushdownPass` when no descendant is rewritten (the `rewritten_child is not child` guard is load-bearing for tier-2 fixed-point convergence), asymmetric identity preservation across `Join` when only one branch is rewritten, and a reflective `typing.get_type_hints` check that any future `LogicalPlan` subclass adding a new child slot must update `CHILD_SLOTS` (#1196, #1199).

### Fixed
Expand Down
4 changes: 4 additions & 0 deletions graphistry/compute/gfql/cypher/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import warnings

from .ast import (
BooleanExpr,
BooleanOp,
CallClause,
CypherGraphQuery,
CypherQuery,
Expand Down Expand Up @@ -43,6 +45,8 @@
from .parser import parse_cypher

__all__ = [
"BooleanExpr",
"BooleanOp",
"CypherQuery",
"CypherUnionQuery",
"CallClause",
Expand Down
46 changes: 46 additions & 0 deletions graphistry/compute/gfql/cypher/ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,11 +145,57 @@ class WherePatternPredicate:
WhereTerm = Union[WherePredicate, WherePatternPredicate]


# Node kinds for BooleanExpr: binary "and"/"or"/"xor", unary "not", and
# leaf "atom" (an atomic predicate carried as source text).
BooleanOp = Literal["and", "or", "xor", "not", "atom"]


@dataclass(frozen=True)
class BooleanExpr:
    """Structural representation of a parsed boolean expression tree.

    Mirrors Lark's ``and_op`` / ``or_op`` / ``xor_op`` / ``not_op`` grammar
    rules so downstream consumers (binder serialization, predicate
    pushdown) can walk structure instead of re-parsing expression text.

    Leaf nodes have ``op == "atom"`` and carry the atomic predicate's
    ``atom_text`` (exact source slice) and ``atom_span``. Branch nodes
    have ``op`` in ``{"and", "or", "xor"}`` with both ``left`` and
    ``right`` set, or ``op == "not"`` with only ``left`` set. ``span``
    covers the full subexpression in every case.
    """

    # Node kind — see BooleanOp; determines which optional fields are set.
    op: BooleanOp
    # Source span of the full subexpression rooted at this node.
    span: SourceSpan
    # Left operand for binary ops; sole operand for "not"; None for atoms.
    left: Optional["BooleanExpr"] = None
    # Right operand for binary ops; None for "not" and atoms.
    right: Optional["BooleanExpr"] = None
    # Exact source slice of the atomic predicate (atoms only).
    atom_text: Optional[str] = None
    # Span of the atom's source slice (atoms only).
    atom_span: Optional[SourceSpan] = None


@dataclass(frozen=True)
class WhereClause:
    """Parsed WHERE clause.

    Field coexistence rules:

    - **Structured path**: ``predicates`` populated, ``expr`` is ``None``,
      ``expr_tree`` is ``None``. Fires when Lark routes through the
      ``where_predicates`` grammar rule (pure AND conjunctions of
      comparable predicates) or when ``generic_where_clause`` lifts
      AND-joined bare label predicates via label narrowing.
    - **Raw-text path**: ``predicates == ()``, ``expr`` populated with
      the WHERE body text. Fires when ``generic_where_clause`` cannot
      lift to structured predicates.
    - **Structured-tree path**: ``expr_tree`` populated alongside
      ``expr`` when the WHERE body contains ``AND`` / ``OR`` / ``XOR`` /
      ``NOT`` operators that Lark captured via ``and_op`` / ``or_op`` /
      ``xor_op`` / ``not_op``. ``expr_tree`` is additive — consumers
      that ignore it see the raw-text path unchanged.
    """

    # Structured predicate terms; empty tuple on the raw-text path.
    predicates: Tuple[WhereTerm, ...]
    # Source span of the whole WHERE clause.
    span: SourceSpan
    # Raw WHERE body text when structured lifting was not possible.
    expr: Optional[ExpressionText] = None
    # Structural boolean tree captured from Lark, when available;
    # always additive alongside ``expr``, never a replacement for it.
    expr_tree: Optional[BooleanExpr] = None


@dataclass(frozen=True)
Expand Down
1 change: 1 addition & 0 deletions graphistry/compute/gfql/cypher/ast_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,7 @@ def _rewrite_where_pattern_predicates_to_matches(query: CypherQuery) -> CypherQu
remaining_where = WhereClause(
predicates=cast(Any, remaining),
expr=query.where.expr,
expr_tree=query.where.expr_tree,
span=query.where.span,
)
extra_match = MatchClause(
Expand Down
120 changes: 118 additions & 2 deletions graphistry/compute/gfql/cypher/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from graphistry.compute.exceptions import ErrorCode, GFQLSyntaxError, GFQLValidationError
from graphistry.compute.gfql.expr_split import split_top_level_and
from graphistry.compute.gfql.cypher.ast import (
BooleanExpr,
CallClause,
CypherGraphQuery,
CypherLiteral,
Expand Down Expand Up @@ -973,6 +974,13 @@ def _mixed_where_clause(
) -> WhereClause:
if expr_text.strip() == "":
raise _to_syntax_error("Invalid WHERE clause", line=span.line, column=span.column)
# Mixed-clause handlers (``where_pattern_and_expr_clause`` and
# ``expr_and_where_pattern_clause``) reconstruct ``expr_text``
# from the source slice and do not capture Lark's structural
# expression tree — so ``expr_tree`` stays unset here even when
# ``expr_text`` contains AND/OR/XOR/NOT operators. Out of scope
# for issue #1200 slice 1 (text-only path; tracked for a later
# slice alongside full binder migration).
return WhereClause(
predicates=(self._parse_where_pattern_predicate_text(pattern_text, span),),
expr=ExpressionText(text=expr_text.strip(), span=span),
Expand Down Expand Up @@ -1014,6 +1022,97 @@ def expr_and_where_pattern_clause(self, meta: Any, _items: Sequence[Any]) -> Whe
span=span,
)

def _wrap_as_boolean_atom(self, operand: Any, enclosing_meta: Any) -> BooleanExpr:
    """Coerce one parsed operand into a ``BooleanExpr`` atom leaf.

    This is the recursion bottom-out used by ``and_op`` / ``or_op`` /
    ``xor_op`` / ``not_op``: an operand that is already a
    ``BooleanExpr`` passes through untouched; ``_ExpressionSlice``
    operands and Lark ``Tree`` operands each carry their own span, so
    the exact source slice is recovered from it.

    **Known limitation — primitive literal atoms.** Literal
    transformers (``true_lit`` / ``false_lit`` / ``null_lit`` /
    ``number_lit``) return bare Python values with no span attached.
    When one of those reaches us as a boolean operand (e.g.
    ``WHERE true AND false``) the original operand text is
    unrecoverable, so we fall back to the enclosing operator's span
    and ``str(operand)`` — which yields Python-style text such as
    ``"True"`` rather than Cypher-style ``"true"``. No current
    consumer reads ``atom_text`` on literal atoms (the binder is not
    wired to ``expr_tree`` in this slice); accuracy here is tracked
    under issue #1200 and this fallback can go away once literal
    transformers carry spans.
    """
    if isinstance(operand, BooleanExpr):
        return operand

    if isinstance(operand, _ExpressionSlice):
        # Slice already knows exactly where it came from.
        atom_span, atom_text = operand.span, operand.text
    elif getattr(operand, "meta", None) is not None:
        # Lark Tree — recover the precise source slice via its meta.
        atom_span = _span_from_meta(operand.meta)
        atom_text = self._slice(atom_span)
    else:
        # Primitive literal — see the docstring caveat above.
        atom_span = _span_from_meta(enclosing_meta)
        atom_text = str(operand)

    return BooleanExpr(
        op="atom",
        span=atom_span,
        atom_text=atom_text,
        atom_span=atom_span,
    )

def _boolean_binary(
    self,
    op: Literal["and", "or", "xor"],
    meta: Any,
    items: Sequence[Any],
) -> BooleanExpr:
    """Build the ``BooleanExpr`` branch node for a binary boolean rule.

    Shared implementation behind the ``and_op`` / ``or_op`` / ``xor_op``
    handlers: validates arity, then wraps each operand as an atom (or
    passes nested ``BooleanExpr`` subtrees through unchanged).
    """
    if len(items) != 2:
        raise _to_syntax_error(
            f"{op.upper()} expression requires two operands",
            line=meta.line, column=meta.column,
        )
    lhs, rhs = items
    return BooleanExpr(
        op=op,
        span=_span_from_meta(meta),
        left=self._wrap_as_boolean_atom(lhs, meta),
        right=self._wrap_as_boolean_atom(rhs, meta),
    )

def grouped_expr(self, _meta: Any, items: Sequence[Any]) -> Any:
    """Passthrough for ``(expr)`` — keep nested boolean structure.

    A parenthesized subexpression is forwarded unchanged so that a
    nested ``BooleanExpr`` reaches the enclosing ``and_op`` /
    ``or_op`` / ``xor_op`` / ``not_op`` handler intact. Without this
    hook, Lark would hand the parens up as a ``grouped_expr`` Tree
    and ``_wrap_as_boolean_atom`` would flatten the inner tree into a
    single text atom, losing the structure this capture exists for.
    """
    if len(items) == 1:
        return items[0]
    return items

def and_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr:
    """Lark handler for the ``and_op`` rule — builds an ``"and"`` branch node."""
    return self._boolean_binary("and", meta, items)

def or_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr:
    """Lark handler for the ``or_op`` rule — builds an ``"or"`` branch node."""
    return self._boolean_binary("or", meta, items)

def xor_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr:
    """Lark handler for the ``xor_op`` rule — builds an ``"xor"`` branch node."""
    return self._boolean_binary("xor", meta, items)

def not_op(self, meta: Any, items: Sequence[Any]) -> BooleanExpr:
    """Lark handler for ``not_op`` — unary node with its operand in ``left``.

    Validates that exactly one operand was captured, then wraps it as an
    atom (or passes a nested ``BooleanExpr`` through unchanged).
    """
    if len(items) != 1:
        raise _to_syntax_error(
            "NOT expression requires one operand",
            line=meta.line, column=meta.column,
        )
    (operand,) = items
    return BooleanExpr(
        op="not",
        span=_span_from_meta(meta),
        left=self._wrap_as_boolean_atom(operand, meta),
    )

def where_clause(self, meta: Any, items: Sequence[Any]) -> WhereClause:
if len(items) != 1:
raise _to_syntax_error("WHERE clause cannot be empty", line=meta.line, column=meta.column)
Expand All @@ -1025,9 +1124,21 @@ def where_clause(self, meta: Any, items: Sequence[Any]) -> WhereClause:
raise _to_syntax_error("WHERE clause cannot be empty", line=meta.line, column=meta.column)
return WhereClause(predicates=cast(Any, predicates), expr=None, span=_span_from_meta(meta))

def generic_where_clause(self, meta: Any, _items: Sequence[Any]) -> WhereClause:
def generic_where_clause(self, meta: Any, items: Sequence[Any]) -> WhereClause:
span = _span_from_meta(meta)
expr = self._slice(span)[len("WHERE"):].strip()
# Capture Lark's structural expression tree when available so
# downstream consumers can walk ``BooleanExpr`` instead of
# re-parsing ``ExpressionText``. Only ``and_op``/``or_op``/
# ``xor_op``/``not_op`` produce ``BooleanExpr`` today; atomic
# WHERE expressions (single predicate) still route here with
# a non-BooleanExpr operand, in which case ``expr_tree``
# remains ``None`` — no behavior change for those callers.
expr_tree: Optional[BooleanExpr] = (
cast(BooleanExpr, items[0])
if items and isinstance(items[0], BooleanExpr)
else None
)
# Lark's ambiguity resolution prefers the generic `expr` path over
# the structured `where_predicates` rule, so AND-joined bare label
# predicates ("n:Admin AND n:Active") land here rather than in
Expand Down Expand Up @@ -1062,7 +1173,12 @@ def generic_where_clause(self, meta: Any, _items: Sequence[Any]) -> WhereClause:
)
if predicates:
return WhereClause(predicates=tuple(predicates), expr=None, span=span)
return WhereClause(predicates=(), expr=ExpressionText(text=expr, span=span), span=span)
return WhereClause(
predicates=(),
expr=ExpressionText(text=expr, span=span),
expr_tree=expr_tree,
span=span,
)

def unwind_clause(self, meta: Any, items: Sequence[Any]) -> UnwindClause:
if len(items) != 2:
Expand Down
Loading
Loading