Skip to content

Flaky behavior with Distributed MemorySampler #7082

@hayesgb

Description

@hayesgb

When running the following, I periodically get MemorySampler errors.

I also see this behavior with both query 1 and query 2 from the coiled-runtime benchmark suite.

dask == 2022.9.1
distributed == 2022.9.1

from pathlib import Path
from uuid import uuid1

import coiled
import dask.dataframe as dd
from dask import config
from dask.distributed import Client, performance_report
from distributed.diagnostics import MemorySampler
import pandas as pd

def main(n_workers: int = 10) -> None:
    """Run the h2o groupby benchmark five times on a fresh Coiled cluster,
    sampling cluster memory with ``MemorySampler`` around each run.

    Args:
        n_workers: Number of workers to request for the Coiled cluster.
            (The original script read an undefined global ``n_workers``,
            which raised ``NameError``; it is now a parameter with a
            default so the script is self-contained.)
    """
    cluster = coiled.Cluster(
        name=f"h2o-benchmarks-{uuid1().hex}",
        n_workers=n_workers,
        worker_vm_types=["t3.large"],  # 2CPU, 8GiB
        scheduler_vm_types=["t3.large"],
        software="sept-release-arrow-nightly-3",
    )
    client = Client(cluster)
    # Make sure the Active Memory Manager is running before sampling.
    if not client.amm.running():
        client.amm.start()
    ddf = dd.read_parquet(
        "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2_parquet/*.parquet",
        engine="pyarrow",
    )
    ms = MemorySampler()
    for i in range(5):
        try:
            # Restart workers so each run starts from a clean memory state.
            client.restart()
            print(f"Run {i}")
            with ms.sample():
                print("starting sample...")
                ddf_q3 = (
                    ddf.groupby("id3", dropna=False)
                    .agg({"v1": "sum", "v3": "mean"}, split_out=6)
                    .compute()
                )
        except Exception as e:
            # Best-effort benchmark sweep: report the failure and continue
            # with the next run instead of aborting everything.
            print(f"run failed with {e}")

# Script entry point. Note: the original line was missing the trailing
# colon, which is a SyntaxError and would prevent the script from running
# at all.
if __name__ == "__main__":
    main()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions