# How to speed up a Dask query 10x

This notebook requires you to download a dataset from S3 to your local machine using the AWS CLI.

## Localhost data setup

In [2]:
!mkdir data

In [4]:
# Copy the dataset CSV from the coiled-datasets S3 bucket into the local data/ directory.
# NOTE(review): the saved output reports a 404 for a key under h2o/G1_1e7_1e2_0_0/, which
# does not match this command's G1_1e8 path; the output looks stale. Confirm the download works.
!aws s3 cp s3://coiled-datasets/h2o/G1_1e8_1e2_0_0/csv/G1_1e8_1e2_0_0.csv data/G1_1e8_1e2_0_0.csv

fatal error: An error occurred (404) when calling the HeadObject operation: Key "h2o/G1_1e7_1e2_0_0/csv/G1_1e8_1e2_0_0.csv" does not exist


## Query baselines

In [1]:
import dask.dataframe as dd

In [15]:
# Show the dtypes Dask assigns when no explicit dtype mapping is passed.
dd.read_csv("data/G1_1e8_1e2_0_0.csv").dtypes

id1     object
id2     object
id3     object
id4      int64
id5      int64
id6      int64
v1       int64
v2       int64
v3     float64
dtype: object

### Use horribly inefficient dtypes

In [9]:
# Worst-case baseline: map every column to the generic "object" dtype.
dtypes = dict.fromkeys(
    ["id1", "id2", "id3", "id4", "id5", "id6", "v1", "v2", "v3"],
    "object",
)

In [10]:
# Read the full CSV with every column forced to the object dtype mapping defined in `dtypes`.
ddf = dd.read_csv("data/G1_1e8_1e2_0_0.csv", dtype=dtypes)

In [11]:
# Preview the first rows of the DataFrame.
ddf.head()

Unnamed: 0,id1,id2,id3,id4,id5,id6,v1,v2,v3
0,id016,id046,id0000109363,88,13,146094,4,6,18.837686
1,id039,id087,id0000466766,14,30,111330,4,14,46.797328
2,id047,id098,id0000307804,85,23,187639,3,5,47.577311
3,id043,id017,id0000344864,87,76,256509,2,5,80.462924
4,id054,id027,id0000433679,99,67,32736,1,7,15.796662


In [12]:
# Confirm that every column was read with the object dtype.
ddf.dtypes

id1    object
id2    object
id3    object
id4    object
id5    object
id6    object
v1     object
v2     object
v3     object
dtype: object

In [13]:
# v1 was read as object; cast it to int64 so it can be summed numerically in the query below.
ddf["v1"] = ddf["v1"].astype("int64")

In [14]:
%%time
# Baseline query: sum v1 per id1 group, with id1 held as an object column.
ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute()

CPU times: user 1min 55s, sys: 23.6 s, total: 2min 18s
Wall time: 2min 2s


Unnamed: 0_level_0,v1
id1,Unnamed: 1_level_1
id001,2997617
id002,2997194
id003,3001204
id004,3000945
id005,3001731
...,...
id096,2997704
id097,3004476
id098,2996139
id099,3001163


### Use better dtypes

In [15]:
# Column-appropriate dtypes: pyarrow-backed strings for the text id columns,
# int64 for the integer columns and float64 for v3.
better_dtypes = {
    **dict.fromkeys(["id1", "id2", "id3"], "string[pyarrow]"),
    **dict.fromkeys(["id4", "id5", "id6", "v1", "v2"], "int64"),
    "v3": "float64",
}

In [16]:
# Re-read the same CSV, this time with the column-appropriate dtypes from `better_dtypes`.
ddf = dd.read_csv("data/G1_1e8_1e2_0_0.csv", dtype=better_dtypes)

In [17]:
%%time
# Sum v1 per id1 group on the single CSV file read with the better dtypes.
ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute()

CPU times: user 1min 28s, sys: 14 s, total: 1min 43s
Wall time: 1min 7s


Unnamed: 0_level_0,v1
id1,Unnamed: 1_level_1
id001,2997617
id002,2997194
id003,3001204
id004,3000945
id005,3001731
...,...
id096,2997704
id097,3004476
id098,2996139
id099,3001163


## Split files

In [19]:
# Repartition to roughly 100MB partitions and write each one as its own CSV part file.
# index=False keeps the pandas index out of the files; without it every part carries an
# extra unnamed leading column that shows up as "Unnamed: 0" when the parts are read back.
ddf.repartition(partition_size="100MB").to_csv("data/csvs", index=False)

['/Users/powers/Documents/code/coiled/coiled-resources/local/data/csvs/000.part',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/data/csvs/001.part',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/data/csvs/002.part',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/data/csvs/003.part',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/data/csvs/004.part',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/data/csvs/005.part',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/data/csvs/006.part',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/data/csvs/007.part',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/data/csvs/008.part',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/data/csvs/009.part',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/data/csvs/010.part',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/data/csvs/011.part',
 '/Users/powers/

In [18]:
# Read every part file under data/csvs as a single Dask DataFrame, using the better dtypes.
ddf = dd.read_csv("data/csvs/*.part", dtype=better_dtypes)

In [19]:
%%time
# Sum v1 per id1 group on the split CSV part files.
ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute()

CPU times: user 1min 36s, sys: 8.27 s, total: 1min 44s
Wall time: 1min 1s


Unnamed: 0_level_0,v1
id1,Unnamed: 1_level_1
id001,2997617
id002,2997194
id003,3001204
id004,3000945
id005,3001731
...,...
id096,2997704
id097,3004476
id098,2996139
id099,3001163


## Using Parquet

In [23]:
# Write the current DataFrame to data/parquet as Parquet with compression disabled.
ddf.to_parquet("data/parquet", engine="pyarrow", compression=None)

(None,)

In [20]:
# Load the uncompressed Parquet dataset back with the pyarrow engine.
ddf = dd.read_parquet("data/parquet", engine="pyarrow")

In [21]:
%%time
# Sum v1 per id1 group on the uncompressed Parquet dataset.
ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute()

CPU times: user 42.8 s, sys: 10.1 s, total: 52.9 s
Wall time: 39.1 s


Unnamed: 0_level_0,v1
id1,Unnamed: 1_level_1
id001,2997617
id002,2997194
id003,3001204
id004,3000945
id005,3001731
...,...
id096,2997704
id097,3004476
id098,2996139
id099,3001163


## Using Snappy compressed Parquet

In [4]:
# Write the current DataFrame to data/snappy-parquet as Snappy-compressed Parquet.
ddf.to_parquet("data/snappy-parquet", engine="pyarrow", compression="snappy")

(None,)

In [22]:
# Load the Snappy-compressed Parquet dataset back with the pyarrow engine.
ddf = dd.read_parquet("data/snappy-parquet", engine="pyarrow")

In [23]:
%%time
# Sum v1 per id1 group on the Snappy-compressed Parquet dataset.
ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute()

CPU times: user 46.4 s, sys: 6.68 s, total: 53.1 s
Wall time: 37.8 s


Unnamed: 0_level_0,v1
id1,Unnamed: 1_level_1
id001,2997617
id002,2997194
id003,3001204
id004,3000945
id005,3001731
...,...
id096,2997704
id097,3004476
id098,2996139
id099,3001163


## Leveraging column pruning

In [36]:
# Request only id1 and v1, the two columns the groupby query uses, from the Parquet dataset.
ddf = dd.read_parquet("data/snappy-parquet", engine="pyarrow", columns=["id1", "v1"])

In [37]:
%%time
# Sum v1 per id1 group when only the id1 and v1 columns are read from Parquet.
ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute()

CPU times: user 23.7 s, sys: 685 ms, total: 24.3 s
Wall time: 18.6 s


Unnamed: 0_level_0,v1
id1,Unnamed: 1_level_1
id001,2997617
id002,2997194
id003,3001204
id004,3000945
id005,3001731
...,...
id096,2997704
id097,3004476
id098,2996139
id099,3001163


## Pandas comparison

In [30]:
import pandas as pd

In [33]:
%%time
# pandas baseline: the timing covers both reading the whole CSV into memory and the
# same sum-of-v1-per-id1 aggregation, using pandas' default dtypes.
df = pd.read_csv("data/G1_1e8_1e2_0_0.csv")
df.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"})

CPU times: user 1min 1s, sys: 33 s, total: 1min 34s
Wall time: 3min 2s


Unnamed: 0_level_0,v1
id1,Unnamed: 1_level_1
id001,2997617
id002,2997194
id003,3001204
id004,3000945
id005,3001731
...,...
id096,2997704
id097,3004476
id098,2996139
id099,3001163


In [34]:
# Show the dtypes pandas chose by default for the CSV columns.
df.dtypes

id1     object
id2     object
id3     object
id4      int64
id5      int64
id6      int64
v1       int64
v2       int64
v3     float64
dtype: object