In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import coiled
from dask.distributed import Client
import dask.dataframe as dd
import pandas as pd
import numpy as np

from src.run_ms_queries import run

In [2]:
# Provision the Coiled cluster used for the benchmark run. Workers and the
# scheduler use the same instance type.
instance_type = "t3.large"  # 2CPU, 8GiB
cluster = coiled.Cluster(
    name="h2o-benchmarks",
    n_workers=10,
    worker_vm_types=[instance_type],
    scheduler_vm_types=[instance_type],
    package_sync=True,
)

Dropped Package - PyYAML, Wheel built from local egg
Dropped Package - appnope, 0.1.2 has no install candidate for linux-64


Output()

ClusterCreationError: Cluster status is error (reason: Scheduler Stopped -> Software environment exited with error code 1) (cluster_id: 80064)

In [None]:
# Connect a distributed client to the cluster created above.
client = Client(cluster)
# Start the Active Memory Manager (AMM) via the client before running queries.
client.amm.start()
# Bare expression: displays the client summary as the cell output.
client

In [None]:
# Parquet glob patterns for the H2O benchmark datasets, keyed by size label.
# NOTE(review): "05GB" presumably stands for 0.5 GB -- confirm.
data_size = dict(
    [
        ("05GB", "s3://coiled-datasets/h2o-benchmark/N_1e7_K_1e2_parquet/*.parquet"),
        ("5GB", "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2_parquet/*.parquet"),
        ("50GB", "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2_parquet/*.parquet"),
    ]
)

# Select a data_size: choose "05GB" , "5GB" or "50GB"
ds = "50GB"

In [None]:
# Open the selected parquet dataset from S3 with anonymous access.
ddf = dd.read_parquet(
    data_size[ds],
    engine="pyarrow",
    storage_options={"anon": True},
)
# Cast id3 to pyarrow-backed strings. The assignment must go through
# item access: `ddf.ids = ...` only set a plain Python attribute named `ids`
# (there is no such column), so the cast never reached the dataframe.
ddf["id3"] = ddf["id3"].astype("string[pyarrow]")

In [None]:
# Run the benchmark queries with the connected client against `ddf`.
# `run` comes from src.run_ms_queries (imported in the first cell).
# NOTE(review): the CSV name is presumably the file `run` writes its
# measurements to -- confirm in src/run_ms_queries.py.
run(client, ddf, "sept_run_with_p2p_shuffle_arrow_nightly.csv")

In [None]:
# Shut the client down once the benchmark run has finished.
# NOTE(review): presumably this also stops the scheduler and workers, so the
# cluster cell must be re-run before benchmarking again -- confirm.
client.shutdown()

## Analyze the results once both benchmark result files have been collected

In [None]:
def format_df(fname, release_name):
    """Load one benchmark result CSV and index it by elapsed seconds.

    Parameters
    ----------
    fname : path or file-like
        CSV source handed to ``pd.read_csv``. Its column named ``'0'`` must
        hold values that ``pd.to_timedelta`` can parse.
    release_name : str
        Label appended (as ``_<release_name>``) to every remaining column so
        that frames from different releases can be merged side by side.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``"time (sec)"`` (timedeltas converted to float
        seconds) whose columns all carry the release suffix.
    """
    elapsed = "time (sec)"
    frame = pd.read_csv(fname).rename(columns={'0': elapsed})
    # Convert the timedelta strings into plain float seconds.
    frame[elapsed] = pd.to_timedelta(frame[elapsed]).dt.total_seconds()
    return frame.set_index(elapsed).rename(
        columns=lambda name: f"{name}_{release_name}"
    )

In [None]:
# Result CSVs to compare, paired positionally with the release labels below.
files = ["june_run_20_workers_1.csv", "sept_run_20_workers.csv"]

# Load each file with its release suffix: df -> june, df2 -> sept.
df, df2 = (
    format_df(path, release)
    for path, release in zip(files, ('june', 'sept'))
)

In [None]:
# Align both releases on the shared "time (sec)" index; the outer join keeps
# timestamps that occur in only one of the two runs.
total_df = df.merge(df2, how="outer", left_index=True, right_index=True)
total_df.head()

In [None]:
# Blank out (or create) these columns with NaN so they plot as empty series.
# NOTE(review): presumably these are the queries that failed or are not
# implemented; query_7_june is blanked here although the "Query 7" plot title
# carries no failure note -- confirm the list.
total_df = total_df.assign(
    **dict.fromkeys(
        ["query_6_june", "query_7_june", "query_3_june", "query_6_sept"],
        np.nan,
    )
)

In [None]:
# Preview the merged frame after the NaN placeholder columns were assigned.
total_df.head()

In [None]:
# List the current column names to check which query/release series exist.
cols = list(total_df.columns)
cols

In [None]:
# Put the columns into a stable query-by-query order (june then sept for each
# query). The previous empty `cols` list selected zero columns, which left
# `total_df` with nothing to plot and made every column lookup in the plotting
# cell raise KeyError.
cols = [
    f"query_{query}_{release}"
    for query in range(1, 10)
    for release in ("june", "sept")
]
# Strict selection on purpose: a missing series fails here, before plotting.
total_df = total_df[cols]

In [None]:
# 3x3 grid with one panel per query; all panels share the y axis.
fig, axes = plt.subplots(3,3, figsize=(12,12), sharey=True)
fig.suptitle('H2O Benchmarks on 50GB Parquet Dataset For Naive Query --  Memory against Runtime (sec)\n20 Workers AWS t3.large instances - 144GiB Total Memory\nAMM On For Sept Release')

# Panel titles keyed by query number; insertion order matches the row-major
# order of `axes.flat`.
panel_titles = {
    1: "Query 1",
    2: "Query 2",
    3: "Query 3 - June Release Fails",
    4: "Query 4",
    5: "Query 5",
    6: "Query 6 - Not Implemented",
    7: "Query 7",
    8: "Query 8 - June Release Fails",
    9: "Query 9",
}
for (query, title), ax in zip(panel_titles.items(), axes.flat):
    # Query 6 has no data in either release, so its legend is suppressed.
    extra = {"legend": False} if query == 6 else {}
    total_df[[f"query_{query}_june", f"query_{query}_sept"]].plot(ax=ax, title=title, **extra)

# NOTE(review): plt.text targets the current axes at data coordinates (0, 0),
# presumably intended as a figure-level footnote -- confirm placement.
plt.text(x=0, y=0, s="Queries 8 & 9 Do Not Implement Column Projection")
plt.tight_layout()

In [None]:
# Write the comparison figure to a PNG under data/.
# NOTE(review): presumably the data/ directory must already exist -- confirm.
fig.savefig("data/H2O_50GB_June_vs_Sept_20_Workers.png")