# h2o benchmarks - query 7

In [1]:
import coiled
import dask
import dask.dataframe as dd
from dask.distributed import Client

In [2]:
# Provision the Coiled cluster the benchmark runs on: 16 workers on
# t3.2xlarge instances, using the "crt-004" software environment.
cluster_options = {
    "name": "powers-crt-004",
    "software": "crt-004",
    "n_workers": 16,
    "worker_vm_types": ["t3.2xlarge"],
}
cluster = coiled.Cluster(**cluster_options)

Output()

In [3]:
# Attach a distributed client to the Coiled cluster. `Client` is imported
# explicitly at the top of the notebook instead of being reached through the
# `dask.distributed` attribute, which only exists if some other import has
# already loaded that submodule.
client = Client(cluster)

## 5 GB dataset

In [4]:
# Read the 5 GB h2o benchmark data with anonymous S3 access, keeping only
# the three columns that query 7 uses.
ddf = dd.read_parquet(
    "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2_parquet",
    columns=["id3", "v1", "v2"],
    storage_options={"anon": True, "use_ssl": True},
)

In [5]:
%%time
ddf.groupby("id3", dropna=False, observed=True).agg({"v1": "max", "v2": "min"}).assign(
    range_v1_v2=lambda x: x["v1"] - x["v2"]
)[["range_v1_v2"]].compute()

CPU times: user 621 ms, sys: 261 ms, total: 882 ms
Wall time: 49.1 s


Unnamed: 0_level_0,range_v1_v2
id3,Unnamed: 1_level_1
id0000608844,4
id0000466449,4
id0000573987,4
id0000776204,4
id0000608718,4
...,...
id0000821599,4
id0000937302,4
id0000248458,4
id0000428431,4


In [6]:
# Row count of the 5 GB dataset (sanity check on the data size).
len(ddf)

100000000

In [7]:
# Number of partitions the 5 GB dataset was read into.
ddf.npartitions

100

In [8]:
%%time
ddf.groupby("id3", dropna=False, observed=True).agg(
    {"v1": "max", "v2": "min"}, split_out=2
).assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["range_v1_v2"]].compute()

CPU times: user 1.35 s, sys: 271 ms, total: 1.62 s
Wall time: 57.5 s


Unnamed: 0_level_0,range_v1_v2
id3,Unnamed: 1_level_1
id0000573987,4
id0000776204,4
id0000545596,4
id0000339296,4
id0000226214,4
...,...
id0000710890,4
id0000549495,4
id0000937302,4
id0000248458,4


In [9]:
%%time
ddf.groupby("id3", dropna=False, observed=True).agg(
    {"v1": "max", "v2": "min"}, split_out=20
).assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["range_v1_v2"]].compute()

CPU times: user 13.8 s, sys: 2.37 s, total: 16.2 s
Wall time: 3min 48s


Unnamed: 0_level_0,range_v1_v2
id3,Unnamed: 1_level_1
id0000280302,4
id0000285456,4
id0000276808,4
id0000997868,4
id0000476252,4
...,...
id0000236113,4
id0000867213,4
id0000801592,4
id0000192565,4


## 50 GB dataset

In [9]:
# Read the 50 GB h2o benchmark data with anonymous S3 access, keeping only
# the three columns that query 7 uses, then rebalance it into 200 partitions.
# The partition count is passed as `npartitions=`: the first positional
# parameter of `repartition` is `divisions`, so spelling it out avoids relying
# on how a bare integer is interpreted.
ddf = dd.read_parquet(
    "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2_parquet",
    storage_options={"anon": True, "use_ssl": True},
    columns=["id3", "v1", "v2"],
).repartition(npartitions=200)

In [10]:
# In-memory size of each partition after repartitioning; deep=True also
# counts the contents of object-like columns.
ddf.memory_usage_per_partition(deep=True).compute()

0      516761748
1      516709446
2      516752157
3      516725454
4      516729111
         ...    
195    516726213
196    516708756
197    516726627
198    516774099
199    516718209
Length: 200, dtype: int64

In [11]:
# Total in-memory size of the whole frame: per-column usage, summed.
ddf.memory_usage(deep=True).compute().sum()

102810430763

In [12]:
# Row count of the 50 GB dataset (sanity check on the data size).
len(ddf)

1000000000

In [13]:
# Confirm the partition count after the repartition step.
ddf.npartitions

200

In [14]:
%%time
ddf.groupby("id3", dropna=False, observed=True).agg({"v1": "max", "v2": "min"}).assign(
    range_v1_v2=lambda x: x["v1"] - x["v2"]
)[["range_v1_v2"]].compute()

CPU times: user 10.5 s, sys: 2.47 s, total: 13 s
Wall time: 19min 2s


Unnamed: 0_level_0,range_v1_v2
id3,Unnamed: 1_level_1
id0001128815,4
id0005623825,4
id0003397739,4
id0003179275,4
id0009018447,4
...,...
id0005766523,4
id0004568305,4
id0006240462,4
id0003527892,4


In [13]:
%%time
ddf.groupby("id3", dropna=False, observed=True).agg(
    {"v1": "max", "v2": "min"}, split_out=4
).assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["range_v1_v2"]].compute()

KilledWorker: ("('aggregate-combine-c9139dd3b2368cb17d16b5da9fea5749', 0, 3, 0)", <WorkerState 'tls://10.4.0.137:40037', name: powers-crt-004-worker-4f85fa2e51, status: closed, memory: 0, processing: 7>)

## Try with a `string[pyarrow]` column

In [15]:
# Column dtypes before the conversion; id3 is read as a categorical.
ddf.dtypes

id3    category
v1        Int32
v2        Int32
dtype: object

In [17]:
# Swap the categorical id3 column for a pyarrow-backed string column,
# leaving v1 and v2 untouched.
ddf = ddf.assign(id3=ddf["id3"].astype("string[pyarrow]"))

In [18]:
# Re-measure the total in-memory size now that id3 is a pyarrow-backed string.
ddf.memory_usage(deep=True).compute().sum()

34000000000

In [19]:
# Confirm id3 now has the string dtype.
ddf.dtypes

id3    string
v1      Int32
v2      Int32
dtype: object

In [20]:
%%time
ddf.groupby("id3", dropna=False, observed=True).agg({"v1": "max", "v2": "min"}).assign(
    range_v1_v2=lambda x: x["v1"] - x["v2"]
)[["range_v1_v2"]].compute()

CPU times: user 5.09 s, sys: 1.73 s, total: 6.82 s
Wall time: 12min 35s


Unnamed: 0_level_0,range_v1_v2
id3,Unnamed: 1_level_1
id0000000001,4
id0000000002,4
id0000000003,4
id0000000004,4
id0000000005,4
...,...
id0009999996,4
id0009999997,4
id0009999998,4
id0009999999,4
