## Groupby queries from h2o-benchmarks (parquet)

In [None]:
from dask.distributed import Client
import coiled
import dask.dataframe as dd
from dask.distributed import performance_report
import pandas as pd

In [None]:
# Start (or attach to) the Coiled cluster used for every query below.
cluster = coiled.Cluster(
    name="h2o-benchmarks",  # plain literal: there is nothing to interpolate
    n_workers=10,
    worker_vm_types=["t3.large"],  # 2CPU, 8GiB
    scheduler_vm_types=["t3.large"],
    package_sync=True,
)

In [None]:
# Attach a Dask client to the cluster created above; subsequent Dask
# computations in this notebook go through this client.
client = Client(cluster)
# Bare expression so the notebook cell renders the client as its output.
client

In [None]:
# Maps a dataset-size label to the glob of parquet files holding that dataset.
data_size = {
    "05GB": "s3://coiled-datasets/h2o-benchmark/N_1e7_K_1e2_parquet/*.parquet",
    "5GB": "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2_parquet/*.parquet",
    "50GB": "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2_parquet/*.parquet",
}

In [None]:
def read_data(data_uri):
    """Return a Dask DataFrame over the parquet files matched by ``data_uri``.

    The files are read with the pyarrow engine, and ``{"anon": True}`` is
    passed as storage options to the underlying filesystem.
    """
    parquet_kwargs = {
        "engine": "pyarrow",
        "storage_options": {"anon": True},
    }
    return dd.read_parquet(data_uri, **parquet_kwargs)

In [None]:
# Dataset to benchmark: choose "05GB", "5GB" or "50GB" (a key of data_size).
ds = "50GB"
ddf = read_data(data_size[ds])

### Q1

In [None]:
with performance_report(filename=f"performance-reports/q1_data_{ds}.html"):
    # Q1: sum of v1 per id1 group. The result is discarded; only the run
    # recorded in the performance report matters here.
    ddf_q1 = ddf[["id1", "v1"]]
    q1_grouped = ddf_q1.groupby("id1", dropna=False, observed=True)
    q1_grouped.agg({"v1": "sum"}).compute()

### Q2

In [None]:
with performance_report(filename=f"performance-reports/q2_data_{ds}.html"):
    # Q2: sum of v1 for every (id1, id2) combination.
    ddf_q2 = ddf[["id1", "id2", "v1"]]
    q2_grouped = ddf_q2.groupby(["id1", "id2"], dropna=False, observed=True)
    q2_grouped.agg({"v1": "sum"}).compute()

### Q3 

With the "50GB" dataset, this query failed with:

```python
KilledWorker: ("('aggregate-chunk-1352eeaf67172ec25f4661a39965a066-d11c92fa9cd76cfe076460d4406ad2c6', 1736)", <WorkerState 'tls://10.0.11.26:40507', name: h2o-benchmarks-worker-e7b4d2ced4, status: closed, memory: 0, processing: 45>)

WARNING:root:error sending AWS credentials to cluster: Timed out trying to connect to 
tls://10.0.10.223:38937 after 30 s


```

In [None]:
with performance_report(filename=f"performance-reports/q3_data_{ds}.html"):
    # Q3: per id3 group, the sum of v1 and the mean of v3.
    ddf_q3 = ddf[["id3", "v1", "v3"]]
    q3_grouped = ddf_q3.groupby("id3", dropna=False, observed=True)
    q3_grouped.agg({"v1": "sum", "v3": "mean"}).compute()

### Q4

In [None]:
with performance_report(filename=f"performance-reports/q4_data_{ds}.html"):
    # Q4: per id4 group, the mean of each of v1, v2 and v3.
    ddf_q4 = ddf[["id4", "v1", "v2", "v3"]]
    q4_grouped = ddf_q4.groupby("id4", dropna=False, observed=True)
    q4_grouped.agg({"v1": "mean", "v2": "mean", "v3": "mean"}).compute()

### Q5
With the 50GB dataset we see about 5GB of spilling and some workers dying but restarting; the run got very close to finishing, until we got a:

```python
KilledWorker: ("('aggregate-combine-0c4783a5d5d1ec1968c74fb8baec15e9', 0, 3)", <WorkerState 'tls://10.0.15.17:36179', name: h2o-benchmarks-worker-7d835dad59, status: closed, memory: 0, processing: 1>)
```

In [None]:
with performance_report(filename=f"performance-reports/q5_data_{ds}.html"):
    # Q5: per id6 group, the sum of each of v1, v2 and v3.
    ddf_q5 = ddf[["id6", "v1", "v2", "v3"]]
    q5_grouped = ddf_q5.groupby("id6", dropna=False, observed=True)
    q5_grouped.agg({"v1": "sum", "v2": "sum", "v3": "sum"}).compute()

### Q7 
With the 50GB dataset, this query failed with:
```python
KilledWorker: ("('aggregate-chunk-351c6302b956751c118f4e66ba112f29-25ffa2e10908bad5aab7bf2896ccaca1', 1742)", <WorkerState 'tls://10.0.10.247:40875', name: h2o-benchmarks-worker-fa206d90a7, status: closed, memory: 0, processing: 40>)
```

In [None]:
with performance_report(filename=f"performance-reports/q7_data_{ds}.html"):
    # Q7: per id3 group, max(v1) - min(v2), kept as the single column
    # range_v1_v2.
    ddf_q7 = ddf[["id3", "v1", "v2"]]
    q7_extremes = ddf_q7.groupby("id3", dropna=False, observed=True).agg(
        {"v1": "max", "v2": "min"}
    )
    q7_ranges = q7_extremes.assign(
        range_v1_v2=lambda agg: agg["v1"] - agg["v2"]
    )
    q7_ranges[["range_v1_v2"]].compute()

### Q8
Takes a long time in general. 

In [None]:
with performance_report(filename=f"performance-reports/q8_data_{ds}.html"):
    # Q8: the two largest non-null v3 values within each id6 group.
    #
    # Only id6 and v3 take part in this query, so project just those two
    # columns up front instead of also carrying v1 and v2 through the
    # null filter.
    ddf_q8 = ddf[["id6", "v3"]]
    q8_non_null = ddf_q8[ddf_q8["v3"].notna()]
    (
        q8_non_null.groupby("id6", dropna=False, observed=True)
        .apply(
            lambda x: x.nlargest(2, columns="v3"),
            # Output schema declared up front so Dask does not have to
            # infer it by running the function on sample data.
            meta={"id6": "Int64", "v3": "float64"},
        )[["v3"]]
        .compute()
    )

### Q9

In [None]:
with performance_report(filename=f"performance-reports/q9_data_{ds}.html"):
    # Q9: squared Pearson correlation (r^2) between v1 and v2 for every
    # (id2, id4) group.
    ddf_q9 = ddf[["id2", "id4", "v1", "v2"]]
    (
        # ddf_q9 already holds exactly the needed columns, so it is grouped
        # directly rather than re-selecting the same columns.
        ddf_q9.groupby(["id2", "id4"], dropna=False, observed=True)
        .apply(
            # Restrict the correlation to v1/v2: the group frame may still
            # carry the grouping keys, and DataFrame.corr() raises on
            # non-numeric columns when numeric_only is False (the default
            # from pandas 2.0 on).
            lambda x: pd.Series(
                {"r2": x[["v1", "v2"]].corr().loc["v2", "v1"] ** 2}
            ),
            meta={"r2": "float64"},
        )
        .compute()
    )