# Dask compute()

In [5]:
import dask.dataframe as dd
import pandas as pd

In [6]:
# Start from a small in-memory pandas DataFrame (four rows, two columns) ...
df = pd.DataFrame(
    data={
        "col1": ["a", "b", "c", "d"],
        "col2": [1, 2, 3, 4],
    }
)

# ... and turn it into a Dask DataFrame made of two partitions.
ddf = dd.from_pandas(df, npartitions=2)

In [10]:
# compute() evaluates the Dask DataFrame and returns the full result, which is printed here.
print(ddf.compute())

  col1  col2
0    a     1
1    b     2
2    c     3
3    d     4


In [9]:
# Check what compute() hands back for a Dask DataFrame (a pandas DataFrame, per the output below).
type(ddf.compute())

pandas.core.frame.DataFrame

## Large compute

In [1]:
import coiled
import dask
import dask.array as da
import dask.dataframe as dd
from dask.distributed import Client

In [2]:
# Request a Coiled cluster named "demo-cluster" with five workers.
cluster = coiled.Cluster(name="demo-cluster", n_workers=5)



Found software environment build
Created FW rules: coiled-dask-matthew24-72168-firewall
Created scheduler VM: coiled-dask-matthew24-72168-scheduler (type: t3a.medium, ip: ['34.231.169.11'])


In [3]:
# Connect this notebook session to the Coiled cluster created above.
# `Client` is imported explicitly from `dask.distributed`: a bare `import dask`
# does not by itself load the `dask.distributed` submodule, so reaching it as
# an attribute of `dask` only works when some other import happened to load it.
client = Client(cluster)


+-------------+-----------+-----------+-----------+
| Package     | client    | scheduler | workers   |
+-------------+-----------+-----------+-----------+
| cloudpickle | 1.6.0     | 2.0.0     | 2.0.0     |
| dask        | 2021.08.1 | 2021.10.0 | 2021.10.0 |
| distributed | 2021.08.1 | 2021.10.0 | 2021.10.0 |
| numpy       | 1.21.2    | 1.21.3    | 1.21.3    |
| pandas      | 1.3.2     | 1.3.4     | 1.3.4     |
+-------------+-----------+-----------+-----------+


In [4]:
# Point a Dask DataFrame at the 20-year timeseries Parquet dataset in the
# coiled-datasets S3 bucket, read with the pyarrow engine.
# Note that this rebinds the name `ddf`.
ddf = dd.read_parquet(
    "s3://coiled-datasets/timeseries/20-years/parquet",
    engine="pyarrow",
    storage_options=dict(anon=True, use_ssl=True),
)

### Filter then compute works

In [5]:
# Keep only the rows whose "id" exceeds 1150. The result is still a Dask
# DataFrame; it is turned into a concrete result by compute() in a later cell.
res = ddf.loc[ddf["id"] > 1150]

In [6]:
# Preview the first rows of the filtered result.
res.head()

Unnamed: 0_level_0,id,name,x,y
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [None]:
# Compute the filtered subset. Per the section heading, computing after
# filtering is the variant that works for this dataset.
df = res.compute()

In [None]:
# Number of rows that passed the filter.
len(df)

In [None]:
# Peek at the first rows of the computed result.
df.head()

### Too much data to compute

In [5]:
# Counter-example: compute() on the unfiltered dataset asks for all of it at
# once, which the section heading flags as too much data to compute.
# Note that this also rebinds `df`.
df = ddf.compute()

## Avoid unnecessary compute calls

In [9]:
%%time

# Evaluate both reductions with a single dask.compute() call.
# The recorded wall time for this cell (~33 s) is roughly half that of the
# variant that calls .compute() once per reduction (~1 min 3 s).
id_min, id_max = dask.compute(
    ddf["id"].min(),
    ddf["id"].max(),
)

CPU times: user 288 ms, sys: 48.3 ms, total: 336 ms
Wall time: 32.9 s


In [8]:
%%time

# For comparison: one .compute() call per reduction. The recorded wall time
# (~1 min 3 s) is about twice that of the single dask.compute() call (~33 s).
id_min = ddf["id"].min().compute()
id_max = ddf["id"].max().compute()

CPU times: user 388 ms, sys: 117 ms, total: 505 ms
Wall time: 1min 3s


## Dask Array Example

In [10]:
import numpy as np

In [12]:
# A tiny 2x2 NumPy array, written one row per line, used as input for the
# Dask Array example.
arr1 = np.array(
    [
        [1, 2],
        [3, 4],
    ]
)

In [14]:
# Wrap the NumPy array in a Dask array.
# Uses the explicitly imported `dask.array` module (aliased `da`) instead of
# reaching for `dask.array` through the top-level package, which only works
# when some other import has already loaded that submodule.
darr1 = da.from_array(arr1)

In [15]:
# compute() evaluates the Dask array and returns the concrete values.
darr1.compute()

array([[1, 2],
       [3, 4]])

In [16]:
# Check what compute() hands back for a Dask array (a NumPy ndarray, per the output below).
type(darr1.compute())

numpy.ndarray