In [1]:
import coiled
import dask.distributed
import dask.dataframe as dd

## Cluster setup

In [2]:
# Start a Coiled cluster with five workers using the "coiled/default" configuration.
cluster = coiled.Cluster(
    configuration="coiled/default",
    n_workers=5,
)

Output()

Found software environment build


In [3]:
# Attach a Dask distributed client to the Coiled cluster created above.
client = dask.distributed.Client(cluster)

## CSV files

In [24]:
# Open one Dask DataFrame per year (2000-2005) from the CSV copy of the dataset.
# The paths differ only in the year, so they are generated instead of being
# written out six times. The per-year names are kept because the next cell
# concatenates them by name.
ddf2000, ddf2001, ddf2002, ddf2003, ddf2004, ddf2005 = (
    dd.read_csv(f"s3://coiled-datasets/timeseries/7d/csv/{year}/*.csv")
    for year in range(2000, 2006)
)

In [25]:
# Combine the six per-year CSV frames into a single Dask DataFrame.
ddf = dd.concat(
    [ddf2000, ddf2001, ddf2002, ddf2003, ddf2004, ddf2005],
)

In [26]:
# Total number of rows across all six years of CSV data.
len(ddf)

188697600

In [27]:
%%time

# Baseline timing: count the rows whose id exceeds 1170 in the CSV-backed frame.
len(ddf[ddf.id > 1170])

CPU times: user 268 ms, sys: 20.3 ms, total: 289 ms
Wall time: 57.6 s


6

## Parquet files

In [28]:
# Open one Dask DataFrame per year (2000-2005) from the Parquet copy of the
# dataset, with no read-time filters. The paths differ only in the year, so they
# are generated instead of being written out six times. The per-year names are
# kept because the next cell concatenates them by name.
ddf2000, ddf2001, ddf2002, ddf2003, ddf2004, ddf2005 = (
    dd.read_parquet(f"s3://coiled-datasets/timeseries/7d/parquet/{year}")
    for year in range(2000, 2006)
)

In [29]:
# Combine the six per-year Parquet frames into a single Dask DataFrame.
ddf = dd.concat(
    [ddf2000, ddf2001, ddf2002, ddf2003, ddf2004, ddf2005],
)

In [30]:
%%time

# Same count as the CSV benchmark, now against the Parquet-backed frame
# (read without any read-time filters).
len(ddf[ddf.id > 1170])

CPU times: user 173 ms, sys: 22.2 ms, total: 195 ms
Wall time: 32.9 s


6

## Predicate pushdown filtering

In [31]:
# Open one Dask DataFrame per year (2000-2005) from Parquet, passing the
# `id > 1170` predicate to the reader via `filters`. The calls differ only in
# the year, so they are generated instead of being written out six times. The
# per-year names are kept because the next cell concatenates them by name.
ddf2000, ddf2001, ddf2002, ddf2003, ddf2004, ddf2005 = (
    dd.read_parquet(
        f"s3://coiled-datasets/timeseries/7d/parquet/{year}",
        filters=[[('id', '>', 1170)]],
    )
    for year in range(2000, 2006)
)

In [32]:
# Combine the six per-year frames that were read with the `id > 1170` filter.
ddf = dd.concat(
    [ddf2000, ddf2001, ddf2002, ddf2003, ddf2004, ddf2005],
)

In [33]:
# Rows left after the read-time `filters` were applied; compare with the
# unfiltered total counted earlier.
len(ddf)

3628800

In [34]:
%%time

# The row-wise filter is still applied on top of the read-time filters to get
# the exact count; only the amount of data it has to scan has changed.
len(ddf[ddf.id > 1170])

CPU times: user 29.7 ms, sys: 3.1 ms, total: 32.8 ms
Wall time: 1.3 s


6

## Predicate pushdown filtering and column pruning

In [35]:
# Open one Dask DataFrame per year (2000-2005) from Parquet, passing the
# `id > 1170` predicate via `filters` and requesting only the `id` column via
# `columns`. The calls differ only in the year, so they are generated instead
# of being written out six times. The per-year names are kept because the next
# cell concatenates them by name.
ddf2000, ddf2001, ddf2002, ddf2003, ddf2004, ddf2005 = (
    dd.read_parquet(
        f"s3://coiled-datasets/timeseries/7d/parquet/{year}",
        filters=[[('id', '>', 1170)]],
        columns=["id"],
    )
    for year in range(2000, 2006)
)

In [36]:
# Combine the six per-year frames that were read with the `id > 1170` filter
# and only the `id` column.
ddf = dd.concat(
    [ddf2000, ddf2001, ddf2002, ddf2003, ddf2004, ddf2005],
)

In [37]:
# Rows left after the read-time `filters`; selecting only the `id` column does
# not change the row count.
len(ddf)

3628800

In [38]:
%%time

# Same exact count as before, now with read-time filters and only the `id`
# column loaded.
len(ddf[ddf.id > 1170])

CPU times: user 25.1 ms, sys: 2.94 ms, total: 28 ms
Wall time: 764 ms


6

## Understanding predicate pushdowns

Predicate pushdowns are applied at the row-group level. In our example, they skip every row group that contains no ids greater than 1170. Note that a row group that does contain ids greater than 1170 will usually also contain ids of 1170 or less, and those rows are still read. You therefore still need to apply the "regular filtering" after applying the "predicate filters" to get the final result.

In [17]:
# Read only the year-2000 Parquet data, passing the `id > 1170` predicate to
# the reader so its effect can be inspected on a single year.
ddf2000 = dd.read_parquet(
    "s3://coiled-datasets/timeseries/7d/parquet/2000",
    filters=[[('id', '>', 1170)]],
)

In [19]:
# Preview the first rows of the filtered read. The ids shown can still be at or
# below 1170, because the read-time filter does not drop individual rows.
ddf2000.head()

Unnamed: 0_level_0,id,name,x,y
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-29 00:00:00,1081,Edith,0.050667,-0.556958
2000-01-29 00:00:01,1022,Ursula,-0.642827,0.659931
2000-01-29 00:00:02,984,Jerry,0.449249,0.782695
2000-01-29 00:00:03,996,Alice,-0.124976,0.327127
2000-01-29 00:00:04,992,Victor,0.274238,-0.320963


In [14]:
# Rows read for the year 2000 with the predicate filter in place.
len(ddf2000)

604800

In [39]:
# Exact count for 2000: apply the row-wise filter to what the predicate filter
# let through.
len(ddf2000[ddf2000.id > 1170])

1

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError


With the predicate filters applied, our "regular filtering" only needs to process 604,800 rows of the year-2000 data. Without the predicate filters, Dask has to run "regular filtering" on all 31,449,600 rows (about 31 million) for that year, as shown below.

In [15]:
# Re-read the year-2000 Parquet data without any filters for comparison.
# Note: this rebinds `ddf2000`, replacing the filtered frame from earlier cells.
ddf2000 = dd.read_parquet("s3://coiled-datasets/timeseries/7d/parquet/2000")

In [16]:
# Row count of the unfiltered year-2000 data.
len(ddf2000)

31449600