In [1]:
import coiled
import dask
import dask.dataframe as dd
from dask.distributed import Client

In [2]:
# Request the named three-worker Coiled cluster used for every benchmark below.
cluster = coiled.Cluster(
    n_workers=3,
    name="powers-benchmark2",
)

In [3]:
# Attach a distributed client to the Coiled cluster so that subsequent
# `.compute()` calls run on the remote workers.
# `Client` is imported explicitly (`from dask.distributed import Client`):
# `import dask` on its own does not load the `dask.distributed` submodule, so
# `dask.distributed.Client` only resolves when some other import happens to
# have loaded it first.
# NOTE(review): the recorded output of this cell reports different dask /
# distributed / numpy versions on the client than on the scheduler and
# workers -- consider aligning the local environment with the cluster's.
client = Client(cluster)


+-------------+-----------+-----------+-----------+
| Package     | client    | scheduler | workers   |
+-------------+-----------+-----------+-----------+
| dask        | 2021.11.2 | 2022.01.0 | 2022.01.0 |
| distributed | 2021.11.2 | 2022.01.0 | 2022.01.0 |
| numpy       | 1.22.0    | 1.21.5    | 1.21.5    |
+-------------+-----------+-----------+-----------+


## Coiled Quickstart

In [9]:
%%time

df = dd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv",
    parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
    dtype={
        "payment_type": "UInt8",
        "VendorID": "UInt8",
        "passenger_count": "UInt8",
        "RatecodeID": "UInt8",
        "store_and_fwd_flag": "category",
        "PULocationID": "UInt16",
        "DOLocationID": "UInt16",
    },
    storage_options={"anon": True},
    blocksize="16 MiB",
)

df.groupby("passenger_count").tip_amount.mean().compute()

CPU times: user 1.06 s, sys: 197 ms, total: 1.26 s
Wall time: 2min


passenger_count
0    2.122789
1    2.206790
2    2.214306
3    2.137775
4    2.023804
5    2.235441
6    2.221105
7    6.675962
8    7.111625
9    7.377822
Name: tip_amount, dtype: float64

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError
Traceback (most recent call last):
  File "/Users/powers/opt/miniconda3/envs/standard-coiled/lib/python3.9/site-packages/distributed/comm/tcp.py", line 398, in connect
    stream = await self.client.connect(
  File "/Users/powers/opt/miniconda3/envs/standard-coiled/lib/python3.9/site-packages/tornado/tcpclient.py", line 275, in connect
    af, addr, stream = await connector.start(connect_timeout=timeout)
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/powers/opt/miniconda3/envs/standard-coiled/lib/python3.9/asyncio/tasks.py", line 492, in wait_for
    fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of th

## Same query with coiled-datasets (Parquet)

In [7]:
%%time

ddf = dd.read_parquet(
    "s3://coiled-datasets/nyc-tlc/2019",
    engine="pyarrow",
    columns=["passenger_count", "tip_amount"],
)
ddf.groupby("passenger_count").tip_amount.mean().compute()

CPU times: user 343 ms, sys: 121 ms, total: 464 ms
Wall time: 16 s


passenger_count
0.0    2.122789
1.0    2.206790
2.0    2.214306
3.0    2.137775
4.0    2.023804
5.0    2.235441
6.0    2.221105
7.0    6.675962
8.0    7.111625
9.0    7.377822
Name: tip_amount, dtype: float64

## Same query with coiled-datasets (Parquet, with metadata)

In [8]:
%%time

ddf = dd.read_parquet(
    "s3://coiled-datasets/nyc-tlc-with-metadata/2019",
    engine="pyarrow",
    columns=["passenger_count", "tip_amount"],
)
ddf.groupby("passenger_count").tip_amount.mean().compute()

CPU times: user 220 ms, sys: 25.2 ms, total: 245 ms
Wall time: 13.6 s


passenger_count
0.0    2.122789
1.0    2.206790
2.0    2.214306
3.0    2.137775
4.0    2.023804
5.0    2.235441
6.0    2.221105
7.0    6.675962
8.0    7.111625
9.0    7.377822
Name: tip_amount, dtype: float64

In [15]:
# Restart the cluster through the client. Per the distributed documentation
# this restarts the workers and discards results they were holding -- presumably
# done here to reset state between benchmark runs; confirm intent.
# As the last expression of the cell, the call's return value is displayed.
client.restart()

0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: http://3.235.46.20:8787,

0,1
Dashboard: http://3.235.46.20:8787,Workers: 3
Total threads: 6,Total memory: 22.43 GiB

0,1
Comm: tls://10.4.8.186:8786,Workers: 3
Dashboard: http://10.4.8.186:8787/status,Total threads: 6
Started: 5 minutes ago,Total memory: 22.43 GiB

0,1
Comm: tls://10.4.4.10:42781,Total threads: 2
Dashboard: http://10.4.4.10:41941/status,Memory: 7.48 GiB
Nanny: tls://10.4.4.10:34335,
Local directory: /dask-worker-space/worker-z2cu5f1g,Local directory: /dask-worker-space/worker-z2cu5f1g

0,1
Comm: tls://10.4.6.106:40989,Total threads: 2
Dashboard: http://10.4.6.106:33189/status,Memory: 7.48 GiB
Nanny: tls://10.4.6.106:34921,
Local directory: /dask-worker-space/worker-rma7aisv,Local directory: /dask-worker-space/worker-rma7aisv

0,1
Comm: tls://10.4.3.193:35553,Total threads: 2
Dashboard: http://10.4.3.193:34483/status,Memory: 7.48 GiB
Nanny: tls://10.4.3.193:38737,
Local directory: /dask-worker-space/worker-pln8_1bl,Local directory: /dask-worker-space/worker-pln8_1bl
