In [1]:
import coiled
import dask.distributed
import dask.dataframe as dd

In [2]:
# Launch a Coiled cluster with 5 workers using the "coiled/default" configuration.
# NOTE(review): no cell in the visible notebook shuts this cluster down —
# confirm it is closed elsewhere to avoid leaving cloud resources running.
cluster = coiled.Cluster(
    configuration="coiled/default",
    n_workers=5,
)

In [3]:
# Attach a distributed client to the Coiled cluster created above.
# Presumably this client becomes the scheduler used by the dask calls in the
# following cells — TODO confirm.
client = dask.distributed.Client(cluster)

In [4]:
# Open one parquet dataset per year (2000 through 2004). The five paths differ
# only in the trailing year, so they are built from a single template rather
# than five copy-pasted literals; the per-year names are kept because the
# concat cell below refers to them.
d2000, d2001, d2002, d2003, d2004 = (
    dd.read_parquet(f"s3://coiled-datasets/timeseries/7d/parquet/{year}")
    for year in range(2000, 2005)
)

In [5]:
# Combine the five yearly frames, in year order, into one dask DataFrame.
ddf = dd.concat(
    [
        d2000,
        d2001,
        d2002,
        d2003,
        d2004,
    ]
)

In [6]:
# Row count of the concatenated frame.
len(ddf)

157248000

In [7]:
# Partition count of the concatenated frame, before any set_index/repartition.
ddf.npartitions

260

## pyarrow write setup

In [8]:
import pyarrow as pa

In [9]:
# Explicit pyarrow column types, handed to the to_parquet calls that pass
# `schema=schema`.
schema = {
    "id": pa.uint32(),
    "name": pa.string(),
    "x": pa.float64(),
    "y": pa.float64(),
}

## noop write time

In [10]:
%%time
ddf.to_parquet("s3://coiled-datasets/tmp/shuffle/dask/noop3", write_index=False)

CPU times: user 245 ms, sys: 59.9 ms, total: 305 ms
Wall time: 59.5 s


In [38]:
%%time
ddf.to_parquet("s3://coiled-datasets/tmp/shuffle/dask/noop2", write_index=False, schema=schema)

CPU times: user 238 ms, sys: 21.4 ms, total: 259 ms
Wall time: 48.2 s


## shuffle times without writes

In [32]:
%%time
# Re-index the frame by "id"; nothing is written to parquet in this cell.
# NOTE(review): dask's set_index is generally lazy apart from computing the new
# divisions, so this wall time may not cover the full shuffle — confirm (e.g.
# persist the result and wait on it) before comparing with the write timings.
sorted_ddf_id = ddf.set_index("id")

CPU times: user 754 ms, sys: 73 ms, total: 827 ms
Wall time: 28.2 s


In [33]:
# Show the first rows of the id-indexed frame.
sorted_ddf_id.head()

id,name,x,y
832,Oliver,-0.015893,0.678568
832,Oliver,-0.015893,0.678568
832,Oliver,-0.015893,0.678568
832,Oliver,-0.015893,0.678568
832,Oliver,-0.015893,0.678568


In [34]:
# Partition count after set_index, for comparison with the count before it.
sorted_ddf_id.npartitions

260

## shuffle write time

In [11]:
%%time
with coiled.performance_report():
    ddf.set_index("id").to_parquet("s3://coiled-datasets/tmp/shuffle/dask/shuffled3", schema=schema)



CPU times: user 1.57 s, sys: 221 ms, total: 1.79 s
Wall time: 1min 47s


## shuffle and repartition write time

In [37]:
%%time
ddf.set_index("id").repartition(50).to_parquet("s3://coiled-datasets/tmp/shuffle/dask/shuf_and_part", schema=schema)

CPU times: user 1.64 s, sys: 137 ms, total: 1.77 s
Wall time: 1min 57s
