## Groupby data to S3 - coiled

Use this notebook to write the groupby data to S3 for the cases N = 1e7, 1e8, and 1e9.

In [None]:
import numpy as np
import pandas as pd
from dask.distributed import Client, wait

In [None]:
import coiled

In [None]:
def create_single_df(N, K, nfiles, dir, i):
    """
    Build one chunk of the groupby dataset and write it to a CSV file.

    The N rows are split across ``nfiles`` files. Every file gets
    ``N // nfiles`` rows and the first ``N % nfiles`` files (by index ``i``)
    get one extra row, so the files together hold exactly N rows. When N is
    a multiple of nfiles every file has ``N / nfiles`` rows.

    The chunk is written to ``{dir}/groupby-N_{N}_K_{K}_file_{i}.csv``
    without the index and with floats formatted to six decimal places.
    Nothing is returned.

    Parameters
    ----------
    N: int,
     Total number of rows
    K: int,
     Number of groups
    nfiles: int,
     Number of output files
    dir: str,
     Output directory
    i: int,
     Integer to assign to the multiple files e.g. range(nfiles)
    """

    # Hand the remainder out one row at a time to the lowest-numbered files
    # so no rows are lost when N is not a multiple of nfiles.
    base_rows, leftover = divmod(int(N), int(nfiles))
    nrows = base_rows + (1 if 0 <= i < leftover else 0)

    # id3 and id6 are drawn from N / K distinct values.
    n_id3 = int(N / K)

    # String labels: id1/id2 share K labels, id3 has its own N / K labels.
    sample_id12 = [f"id{x:03d}" for x in range(1, K + 1)]
    sample_id3 = [f"id{x:010d}" for x in range(1, n_id3 + 1)]

    # The columns are drawn in this fixed order (id1..id6, v1..v3) so a
    # caller that seeds np.random gets a reproducible frame.
    df = pd.DataFrame(
        {
            "id1": np.random.choice(sample_id12, size=nrows, replace=True),
            "id2": np.random.choice(sample_id12, size=nrows, replace=True),
            "id3": np.random.choice(sample_id3, size=nrows, replace=True),
            "id4": np.random.choice(K, size=nrows, replace=True),
            "id5": np.random.choice(K, size=nrows, replace=True),
            "id6": np.random.choice(n_id3, size=nrows, replace=True),
            "v1": np.random.choice(5, size=nrows, replace=True),
            "v2": np.random.choice(15, size=nrows, replace=True),
            "v3": np.random.uniform(0, 100, size=nrows),
        }
    )

    df.to_csv(
        f"{dir}/groupby-N_{N}_K_{K}_file_{i}.csv",
        index=False,
        float_format="{:.6f}".format,
    )

In [None]:
# Start a Coiled cluster with 50 workers.
# NOTE(review): package_sync=True presumably replicates the local Python
# environment on the cluster — confirm against the Coiled documentation.
cluster = coiled.Cluster(n_workers=50, package_sync=True)

In [None]:
# Alternative to the previous cell: request the cluster by name.
# NOTE(review): presumably this re-attaches to an already running cluster with
# that name, and idle_timeout makes the scheduler shut down after two idle
# hours — confirm against the Coiled / Dask documentation.
cluster = coiled.Cluster(
    name="ncclementi-26ca1857-6",
    scheduler_options={"idle_timeout": "2 hour"},
)

In [None]:
# Connect a Dask distributed client to the cluster created above.
client = Client(cluster)
# Bare expression so the notebook displays the client summary.
client

In [None]:
# Problem size: N is the total number of rows, K the number of groups.
# Change N to the case being generated: 1e7, 1e8 or 1e9.
N, K = 1e9, 1e2

In [None]:
# Base S3 prefix for the generated data. The trailing slash is needed because
# the per-case directory name is appended to it by plain string concatenation.
s3_dir = "s3://coiled-datasets/h2o-benchmark/"

In [None]:
# N and K were entered as floats (e.g. 1e9); the generator needs integers.
N = int(N)
K = int(K)
nfiles = 1000


def _sci_label(value):
    """Format a number in compact scientific notation, e.g. 1000000000 -> '1e9'."""
    mantissa, exponent = f"{value:e}".split("e")
    return f"{mantissa.rstrip('0').rstrip('.')}e{int(exponent)}"


# Build the output prefix from N and K (e.g. "N_1e9_K_1e2") so it always
# matches the values being generated instead of being edited by hand per case.
dir = s3_dir + f"N_{_sci_label(N)}_K_{_sci_label(K)}"

In [None]:
# Submit one task per output file; each task builds its chunk and writes the
# CSV. pure=False because create_single_df draws random data and writes a
# file, so Dask must not treat a repeated identical call as already computed.
futures = client.map(
    lambda i: create_single_df(N, K, nfiles, dir, i),
    range(nfiles),
    pure=False,
)