Refresh documentation for annotations and spans (#8593)

dask · Mar 28, 2024 · e434793 · e434793
1 parent 30d5df7
commit e434793
Show file tree

Hide file tree

Showing 4 changed files with 104 additions and 21 deletions.
diff --git a/distributed/tests/test_client.py b/distributed/tests/test_client.py
@@ -7275,6 +7275,20 @@ async def test_annotations_submit_map(c, s, a, b):
     assert not b.state.tasks
 
 
+@gen_cluster(client=True)
+async def test_annotations_global_vs_local(c, s, a, b):
+    """Test that local annotations take precedence over global annotations"""
+    with dask.annotate(foo=1):
+        x = delayed(inc)(1, dask_key_name="x")
+    y = delayed(inc)(2, dask_key_name="y")
+    with dask.annotate(foo=2):
+        xf, yf = c.compute([x, y])
+
+    await c.gather([xf, yf])
+    assert s.tasks["x"].annotations == {"foo": 1}
+    assert s.tasks["y"].annotations == {"foo": 2}
+
+
 @gen_cluster(client=True)
 async def test_workers_collection_restriction(c, s, a, b):
     da = pytest.importorskip("dask.array")

diff --git a/distributed/tests/test_spans.py b/distributed/tests/test_spans.py
@@ -841,3 +841,21 @@ def f():
 
     # No annotation is created for the default span
     assert await c.submit(dask.get_annotations) == {}
+
+
+@gen_cluster(client=True)
+async def test_span_on_persist(c, s, a, b):
+    """As a workaround to lack of annotations support in dask-expr and loss of
+    annotations due to low level optimization in dask.array, you can use span() to wrap
+    calls to persist() and compute()
+    """
+    x = delayed(inc)(1, dask_key_name="x")
+    with span("x") as x_id:
+        x = c.persist(x)
+    y = delayed(inc)(x, dask_key_name="y")
+    with span("y") as y_id:
+        y = c.compute(y)
+    assert await y == 3
+
+    assert s.tasks["x"].group.span_id == x_id
+    assert s.tasks["y"].group.span_id == y_id
diff --git a/docs/source/resources.rst b/docs/source/resources.rst
@@ -143,19 +143,49 @@ memory as actual resources and uses these in normal scheduling operation.
 Resources with collections
 --------------------------
 
-You can also use resources with Dask collections, like arrays, dataframes, and
-delayed objects. You can annotate operations on collections with specific resources
-that should be required perform the computation using the dask annotations machinery.
+You can also use resources with Dask collections, like arrays and delayed objects. You
+can annotate operations on collections with specific resources that should be required
+to perform the computation using the dask annotations machinery.
 
 .. code-block:: python
 
-    x = dd.read_csv(...)
+    # Read note below!
+    dask.config.set({"optimization.fuse.active": False})
+    x = da.read_zarr(...)
     with dask.annotate(resources={'GPU': 1}):
-        y = x.map_partitions(func1)
-    z = y.map_partitions(func2)
+        y = x.map_blocks(func1)
+    z = y.map_blocks(func2)
+    z.compute()
 
-    z.compute(optimize_graph=False)
+.. note::
 
-In most cases (such as the case above) the annotations for ``y`` may be lost during
-graph optimization before execution. You can avoid that by passing the
-``optimize_graph=False`` keyword.
+    This feature is currently supported for dataframes only when
+    ``with dask.annotate(...):`` wraps the ``compute()`` or ``persist()`` call; in
+    that case, the annotation applies to the whole graph, starting from and excluding
+    any previously persisted collections.
+
+    For other collections, like arrays and delayed objects, annotations can get lost
+    during the optimization phase. To prevent this issue, you must set:
+
+    >>> dask.config.set({"optimization.fuse.active": False})
+
+    Or in dask.yaml:
+
+    .. code-block:: yaml
+
+        optimization:
+          fuse:
+            active: false
+
+    A possible workaround, which also works for dataframes, is to perform
+    intermediate calls to ``persist()``. Note, however, that this can significantly
+    impact optimizations and reduce overall performance.
+
+    .. code-block:: python
+
+        x = dd.read_parquet(...)
+        with dask.annotate(resources={'GPU': 1}):
+            y = x.map_partitions(func1).persist()
+        z = y.map_partitions(func2)
+        del y  # Release distributed memory for y as soon as possible
+        z.compute()
diff --git a/docs/source/spans.rst b/docs/source/spans.rst
@@ -26,23 +26,21 @@ For example:
 .. code-block:: python
 
     import dask.config
-    import dask.dataframe as dd
+    import dask.array as da
     from distributed import Client, span
 
+    # Read important note below
     dask.config.set({"optimization.fuse.active": False})
     client = Client()
 
     with span("Alice's workflow"):
         with span("data load"):
-            df = dd.read_parquet(...)
-
+            a = da.read_zarr(...)
         with span("ML preprocessing"):
-            df = preprocess(df)
-
+            a = preprocess(a)
         with span("Model training"):
-            model = train(df)
-
-    model = model.compute()
+            model = train(a)
+        model = model.compute()
 
 Note how the :func:`span` context manager can be nested.
 The example will create the following spans on the scheduler:
@@ -95,10 +93,16 @@ Additionally, spans can be queried using scheduler extensions or
 
 User API
 --------
-.. warning::
+.. important::
 
-    Spans are based on annotations, and just like annotations they can be lost during
-    optimization. To prevent this issue, you must set
+    Dataframes have a minimum granularity of a single call to ``compute()`` or
+    ``persist()`` and can't break it down further into groups of operations. If the
+    example above used dataframes, everything would have been uniformly tagged as
+    "Alice's workflow", as it is the span that's active during ``compute()``.
+
+    In other collections, such as arrays and delayed objects, spans that don't wrap
+    around a call to ``compute()`` or ``persist()`` can get lost during the
+    optimization phase. To prevent this issue, you must set
 
     >>> dask.config.set({"optimization.fuse.active": False})
 
@@ -110,6 +114,23 @@ User API
           fuse:
             active: false
 
+    A possible workaround, which also works for dataframes, is to perform
+    intermediate calls to ``persist()``. Note, however, that this can significantly
+    impact optimizations and reduce overall performance.
+
+    .. code-block:: python
+
+        with span("Alice's workflow"):
+            with span("data load"):
+                a = dd.read_parquet(...).persist()
+            with span("ML preprocessing"):
+                b = preprocess(a).persist()
+                del a  # Release distributed memory for a as soon as possible
+            with span("Model training"):
+                model = train(b).persist()
+                del b  # Release distributed memory for b as soon as possible
+                model = model.compute()
+
 .. autofunction:: span