dask · phofl · Feb 15, 2024 · Feb 15, 2024 · Feb 16, 2024 · Feb 16, 2024
diff --git a/dask_expr/_collection.py b/dask_expr/_collection.py
@@ -757,6 +757,7 @@ def shuffle(
                 shuffle_method,
                 options,
                 index_shuffle=on_index,
+                _branch_id=expr.BranchId(0),
             )
         )
 
@@ -4780,6 +4781,7 @@ def merge(
             shuffle_method=shuffle_method,
             _npartitions=npartitions,
             broadcast=broadcast,
+            _branch_id=expr.BranchId(0),
         )
     )
 
@@ -4866,7 +4868,7 @@ def merge_asof(
 
     from dask_expr._merge_asof import MergeAsof
 
-    return new_collection(MergeAsof(left, right, **kwargs))
+    return new_collection(MergeAsof(left, right, **kwargs, _branch_id=expr.BranchId(0)))
 
 
 def from_map(

diff --git a/dask_expr/_core.py b/dask_expr/_core.py
@@ -5,7 +5,7 @@
 import weakref
 from collections import defaultdict
 from collections.abc import Generator
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Literal, NamedTuple
 
 import dask
 import pandas as pd
@@ -29,6 +29,10 @@
 ]
 
 
+class BranchId(NamedTuple):
+    branch_id: int
+
+
 def _unpack_collections(o):
     if isinstance(o, Expr):
         return o
@@ -43,9 +47,17 @@ class Expr:
     _parameters = []
     _defaults = {}
     _instances = weakref.WeakValueDictionary()
+    _branch_id_required = False
+    _reuse_consumer = False
 
-    def __new__(cls, *args, **kwargs):
+    def __new__(cls, *args, _branch_id=None, **kwargs):
+        cls._check_branch_id_given(args, _branch_id)
         operands = list(args)
+        if _branch_id is None and len(operands) and isinstance(operands[-1], BranchId):
+            _branch_id = operands.pop(-1)
+        elif _branch_id is None:
+            _branch_id = BranchId(0)
+
         for parameter in cls._parameters[len(operands) :]:
             try:
                 operands.append(kwargs.pop(parameter))
@@ -54,13 +66,23 @@ def __new__(cls, *args, **kwargs):
         assert not kwargs, kwargs
         inst = object.__new__(cls)
         inst.operands = [_unpack_collections(o) for o in operands]
+        inst._branch_id = _branch_id
         _name = inst._name
         if _name in Expr._instances:
             return Expr._instances[_name]
 
         Expr._instances[_name] = inst
         return inst
 
+    @classmethod
+    def _check_branch_id_given(cls, args, _branch_id):
-    def _check_branch_id_given(cls, args, _branch_id):
+    def _maybe_check_branch_id_given(cls, args, _branch_id):
-    def _check_branch_id_given(cls, args, _branch_id):
+    def _maybe_check_branch_id_given(cls, args, _branch_id):
+        if not cls._branch_id_required:
+            return
+        operands = list(args)
+        if _branch_id is None and len(operands) and isinstance(operands[-1], BranchId):
+            _branch_id = operands.pop(-1)
+        assert _branch_id is not None, "BranchId not found"
+
     def _tune_down(self):
         return None
 
@@ -116,7 +138,10 @@ def _tree_repr_lines(self, indent=0, recursive=True):
                 elif is_arraylike(op):
                     op = "<array>"
                 header = self._tree_repr_argument_construction(i, op, header)
-
+        if self._branch_id.branch_id != 0:
+            header = self._tree_repr_argument_construction(
+                i + 1, f" branch_id={self._branch_id.branch_id}", header
+            )
         lines = [header] + lines
         lines = [" " * indent + line for line in lines]
 
@@ -218,7 +243,7 @@ def _layer(self) -> dict:
 
         return {(self._name, i): self._task(i) for i in range(self.npartitions)}
 
-    def rewrite(self, kind: str):
+    def rewrite(self, kind: str, cache):
         """Rewrite an expression
 
         This leverages the ``._{kind}_down`` and ``._{kind}_up``
@@ -231,6 +256,9 @@ def rewrite(self, kind: str):
         changed:
             whether or not any change occured
         """
+        if self._name in cache:
+            return cache[self._name]
+
         expr = self
         down_name = f"_{kind}_down"
         up_name = f"_{kind}_up"
@@ -267,21 +295,46 @@ def rewrite(self, kind: str):
             changed = False
             for operand in expr.operands:
                 if isinstance(operand, Expr):
-                    new = operand.rewrite(kind=kind)
+                    new = operand.rewrite(kind=kind, cache=cache)
+                    cache[operand._name] = new
                     if new._name != operand._name:
                         changed = True
                 else:
                     new = operand
                 new_operands.append(new)
 
             if changed:
-                expr = type(expr)(*new_operands)
+                expr = type(expr)(*new_operands, _branch_id=expr._branch_id)
                 continue
             else:
                 break
 
         return expr
 
+    def _reuse_up(self, parent):
+        return
+
+    def _reuse_down(self):
+        if not self.dependencies():
+            return
+        return self._bubble_branch_id_down()
+
+    def _bubble_branch_id_down(self):
-    def _bubble_branch_id_down(self):
+    def _propagate_branch_id_down(self):
-    def _bubble_branch_id_down(self):
+    def _propagate_branch_id_down(self):
+        b_id = self._branch_id
+        if b_id.branch_id <= 0:
+            return
+        if any(b_id.branch_id != d._branch_id.branch_id for d in self.dependencies()):
+            ops = [
+                op._substitute_branch_id(b_id) if isinstance(op, Expr) else op
+                for op in self.operands
+            ]
+            return type(self)(*ops)
+
+    def _substitute_branch_id(self, branch_id):
-    def _substitute_branch_id(self, branch_id):
+    def _maybe_substitute_branch_id(self, branch_id):
-    def _substitute_branch_id(self, branch_id):
+    def _maybe_substitute_branch_id(self, branch_id):
+        if self._branch_id.branch_id != 0:
+            return self
+        return type(self)(*self.operands, branch_id)
+
     def simplify_once(self, dependents: defaultdict, simplified: dict):
         """Simplify an expression
 
@@ -346,7 +399,7 @@ def simplify_once(self, dependents: defaultdict, simplified: dict):
                 new_operands.append(new)
 
             if changed:
-                expr = type(expr)(*new_operands)
+                expr = type(expr)(*new_operands, _branch_id=expr._branch_id)
 
             break
 
@@ -391,7 +444,7 @@ def lower_once(self):
             new_operands.append(new)
 
         if changed:
-            out = type(out)(*new_operands)
+            out = type(out)(*new_operands, _branch_id=out._branch_id)
 
         return out
 
@@ -426,6 +479,23 @@ def _lower(self):
 
     @functools.cached_property
     def _name(self):
+        return (
+            funcname(type(self)).lower()
+            + "-"
+            + _tokenize_deterministic(*self.operands, self._branch_id)
+        )
+
+    @functools.cached_property
+    def _dep_name(self):
+        # The name identifies every expression uniquely. The dependents name
+        # is used during optimization to capture the dependents of any given
+        # expression. A reuse consumer will have the same dependents independently
+        # of the branch_id parameter, since we want to reuse everything that comes
+        # before us and split branches up everything that is processed after
-        # before us and split branches up everything that is processed after
+        # before us and split branches up for everything that is processed after
-        # before us and split branches up everything that is processed after
+        # before us and split branches up for everything that is processed after
+        # us. So we have to ignore the branch_id from tokenization for those
+        # nodes.
+        if not self._reuse_consumer:
+            return self._name
         return (
             funcname(type(self)).lower() + "-" + _tokenize_deterministic(*self.operands)
         )
@@ -554,7 +624,7 @@ def _substitute(self, old, new, _seen):
                 new_exprs.append(operand)
 
         if update:  # Only recreate if something changed
-            return type(self)(*new_exprs)
+            return type(self)(*new_exprs, _branch_id=self._branch_id)
         else:
             _seen.add(self._name)
         return self
@@ -580,7 +650,7 @@ def substitute_parameters(self, substitutions: dict) -> Expr:
             else:
                 new_operands.append(operand)
         if changed:
-            return type(self)(*new_operands)
+            return type(self)(*new_operands, _branch_id=self._branch_id)
         return self
 
     def _node_label_args(self):
@@ -741,5 +811,5 @@ def collect_dependents(expr) -> defaultdict:
 
         for dep in node.dependencies():
             stack.append(dep)
-            dependents[dep._name].append(weakref.ref(node))
+            dependents[dep._dep_name].append(weakref.ref(node))
     return dependents