# Dask persist()

In [1]:
import dask.dataframe as dd
import pandas as pd

In [2]:
# Toy dataset: a four-row pandas frame, then the same data as a Dask DataFrame
# split into two partitions.
df = pd.DataFrame(
    {
        "col1": ["a", "b", "c", "d"],
        "col2": [1, 2, 3, 4],
    }
)
ddf = dd.from_pandas(df, npartitions=2)

In [3]:
# Persist the toy Dask DataFrame; the following cell works on `persisted_ddf`
# instead of `ddf`.
persisted_ddf = ddf.persist()

In [4]:
# Row count of the persisted toy frame (recorded output: 4, matching the four
# rows built from `df`).
len(persisted_ddf)

4

## Large dataset persist

In [19]:
import coiled
import dask
import dask.dataframe as dd

In [20]:
# Request a Coiled cluster called "powers" with five workers.
cluster = coiled.Cluster(
    name="powers",
    n_workers=5,
)



Found software environment build
Created FW rules: coiled-dask-matthew24-72845-firewall
Created scheduler VM: coiled-dask-matthew24-72845-scheduler (type: t3a.medium, ip: ['52.3.226.154'])


In [21]:
# Import the submodule explicitly: `import dask` on its own does not guarantee
# that `dask.distributed` is loaded, so the attribute access below would only
# work if some other import happened to pull it in.
import dask.distributed

# Connect this notebook session to the cluster created above.
client = dask.distributed.Client(cluster)


+-------------+-----------+-----------+-----------+
| Package     | client    | scheduler | workers   |
+-------------+-----------+-----------+-----------+
| cloudpickle | 1.6.0     | 2.0.0     | 2.0.0     |
| dask        | 2021.08.1 | 2021.10.0 | 2021.10.0 |
| distributed | 2021.08.1 | 2021.10.0 | 2021.10.0 |
| numpy       | 1.21.2    | 1.21.3    | 1.21.3    |
| pandas      | 1.3.2     | 1.3.4     | 1.3.4     |
+-------------+-----------+-----------+-----------+


In [22]:
# Point `ddf` at the 20-year timeseries Parquet dataset on S3, read anonymously
# over SSL with the pyarrow engine.
# NOTE(review): this rebinds `ddf`, which earlier held the four-row toy frame.
ddf = dd.read_parquet(
    "s3://coiled-datasets/timeseries/20-years/parquet",
    engine="pyarrow",
    storage_options={"anon": True, "use_ssl": True},
)

In [23]:
# Keep only the rows whose `id` is greater than 1150 (1103 rows in the
# recorded run).
res = ddf.loc[ddf["id"] > 1150]

In [10]:
%%time
# Baseline on the un-persisted `res`: row count (recorded wall time 1min 27s).
len(res)

CPU times: user 405 ms, sys: 94.7 ms, total: 500 ms
Wall time: 1min 27s


1103

In [12]:
%%time
# Baseline on the un-persisted `res`: number of distinct `name` values
# (recorded result 26, wall time 1min 2s).
res.name.nunique().compute()

CPU times: user 257 ms, sys: 19.7 ms, total: 277 ms
Wall time: 1min 2s


26

In [13]:
%%time
# Persist the filtered frame. In the recorded run this call itself returned in
# about 34 ms, and the same queries on `persisted_res` then took roughly 2 s
# instead of over a minute on `res`.
persisted_res = res.persist()

CPU times: user 32.6 ms, sys: 3.22 ms, total: 35.8 ms
Wall time: 33.8 ms


In [17]:
%%time
# Same row count as before, now on the persisted frame (recorded wall time
# 1.95 s, result 1103).
len(persisted_res)

CPU times: user 131 ms, sys: 5.92 ms, total: 137 ms
Wall time: 1.95 s


1103

In [18]:
%%time
# Same distinct-`name` count as before, now on the persisted frame (recorded
# wall time 2.43 s, result 26).
persisted_res.name.nunique().compute()

CPU times: user 217 ms, sys: 8.55 ms, total: 226 ms
Wall time: 2.43 s


26

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError


## "Persisting" by writing to disk

In [25]:
%%time
# Write the filtered rows back to S3 as Parquet so they can be re-read later
# without redoing the filter. The partition count is passed by keyword because
# `repartition`'s first positional parameter is `divisions`, so a bare `2`
# reads ambiguously.
res.repartition(npartitions=2).to_parquet(
    "s3://coiled-datasets/tmp/matt/disk-persist", engine="pyarrow"
)

CPU times: user 305 ms, sys: 56.6 ms, total: 361 ms
Wall time: 1min 10s


In [26]:
# Re-read the Parquet files written to the `disk-persist` location.
# NOTE(review): this rebinds `df`, which held the pandas toy frame in the first
# section; a distinct name would avoid the shadowing.
df = dd.read_parquet(
    "s3://coiled-datasets/tmp/matt/disk-persist",
    engine="pyarrow",
    storage_options={"anon": True, "use_ssl": True},
)

In [27]:
%%time
# Row count of the frame re-read from disk (recorded wall time 392 ms,
# result 1103).
len(df)

CPU times: user 23.5 ms, sys: 9.03 ms, total: 32.5 ms
Wall time: 392 ms


1103

In [28]:
%%time
# Distinct `name` values in the frame re-read from disk (recorded wall time
# 331 ms, result 26).
df.name.nunique().compute()

CPU times: user 25.4 ms, sys: 6.13 ms, total: 31.5 ms
Wall time: 331 ms


26

## Repartitioning and persisting

In [29]:
# Collapse the filtered frame to two partitions before persisting it. The count
# is passed by keyword because `repartition`'s first positional parameter is
# `divisions`, so a bare `2` reads ambiguously.
res2 = res.repartition(npartitions=2)

In [31]:
# Persist the two-partition version of the filtered frame; the timing cells
# that follow run against `persisted_res2`.
persisted_res2 = res2.persist()

In [34]:
%%time
# Row count on the repartitioned, persisted frame (recorded wall time 260 ms,
# versus 1.95 s on `persisted_res`).
len(persisted_res2)

CPU times: user 23.2 ms, sys: 2.95 ms, total: 26.1 ms
Wall time: 260 ms


1103

In [35]:
%%time
# Distinct `name` values on the repartitioned, persisted frame (recorded wall
# time 274 ms, versus 2.43 s on `persisted_res`).
persisted_res2.name.nunique().compute()

CPU times: user 24.1 ms, sys: 3.72 ms, total: 27.8 ms
Wall time: 274 ms


26

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError
