In [None]:
import coiled
from dask.distributed import Client
import dask

In [None]:
import dask.dataframe as dd

In [None]:
cluster = coiled.Cluster(n_workers=100)
client = cluster.get_client()

In [None]:
client

In [None]:
dask.config.set({"dataframe.convert-string": True})

In [None]:
df = dd.read_parquet(
    "s3://coiled-datasets/uber-lyft-tlc/",
    storage_options={'anon': True}
)

In [None]:
df_full = df

In [None]:
df.dtypes

In [None]:
df.head()

In [None]:
dask.utils.format_bytes(
    df.memory_usage(deep=True).sum().compute()
)

In [None]:
# Draw a 20% row sample. The fixed seed makes the sample -- and therefore every
# size and timing figure derived from it below -- repeatable across
# "Restart Kernel -> Run All" instead of changing on each run.
df_sample = df.sample(frac=0.2, random_state=42)

In [None]:
df_sample = df_sample.persist()

In [None]:
dask.utils.format_bytes(
    df_sample.memory_usage(deep=True).sum().compute()
)

In [None]:
from dask.sizeof import sizeof

In [None]:
partitions_mem_stats = df_sample.map_partitions(sizeof).compute()

In [None]:
(partitions_mem_stats / 1024**2).describe() #in MiB

In [None]:
dask.utils.format_bytes(df_sample.partitions[0].memory_usage(deep=True).compute().sum())

In [None]:
%%time
#passanger fare
df_sample.base_passenger_fare.sum().compute() / 1e9

In [None]:
%%time
#tip
df_sample.tips.sum().compute() / 1e6

In [None]:
%%time
df_sample.loc[lambda x: x.tips > 0].groupby("hvfhs_license_num").tips.agg(["sum", "mean"]).compute()

## Partition size 10MB

Runs are ~11x slower than with 13MB partitions.

In [None]:
df_sample = df_sample.repartition(partition_size="10MB").persist()

In [None]:
dask.utils.format_bytes(df_sample.partitions[0].memory_usage(deep=True).compute().sum())

In [None]:
%%time
#passanger fare
df_sample.base_passenger_fare.sum().compute() / 1e9

In [None]:
%%time
#tip
df_sample.tips.sum().compute() / 1e6

In [None]:
%%time
df_sample.loc[lambda x: x.tips > 0].groupby("hvfhs_license_num").tips.agg(["sum", "mean"]).compute()

## Write 10MB partition data to Parquet and CSV

In [None]:
df_sample.to_parquet("s3://coiled-datasets/uber-lyft-tlc-sample/parquet-0.2-10/");

In [None]:
df_sample.to_csv("s3://coiled-datasets/uber-lyft-tlc-sample/csv-0.2-10/");

In [None]:
ddf = dd.read_csv(
    "s3://coiled-datasets/uber-lyft-tlc-sample/csv-0.2-10/*", 
    dtype={"wav_match_flag": "category"},
)

In [None]:
dask.utils.format_bytes(
    ddf.memory_usage(deep=True).sum().compute()
)