## Pandas memory usage

In [55]:
import pandas as pd
import dask

In [56]:
df = pd.DataFrame(
    {"nums": [1, 2, 3, 4, 5, 6], "letters": ["a", "b", "c", "d", "e", "f"]}
)

In [57]:
df

Unnamed: 0,nums,letters
0,1,a
1,2,b
2,3,c
3,4,d
4,5,e
5,6,f


In [58]:
df.memory_usage(deep=True)

Index      128
nums        48
letters    348
dtype: int64

In [59]:
df.memory_usage(deep=True).sum()

524

In [60]:
dask.sizeof.sizeof(df)

2524

## Localhost Dask Memory Usage

In [61]:
from dask import dataframe as dd 

In [62]:
ddf = dd.from_pandas(df, npartitions=2)

In [63]:
for i in range(ddf.npartitions):
    print(ddf.partitions[i].compute())

   nums letters
0     1       a
1     2       b
2     3       c
   nums letters
3     4       d
4     5       e
5     6       f


In [64]:
ddf.partitions[0].memory_usage(deep=True).compute()

Index      128
letters    174
nums        24
dtype: int64

In [69]:
(ddf
    .map_partitions(lambda x: x.memory_usage(deep=True).sum())
    .compute())

0    326
1    330
dtype: int64

In [67]:
(ddf
   .map_partitions(lambda x: x.memory_usage(deep=False).sum())
   .compute())

0    176
1    180
dtype: int64

## Dask memory usage

In [70]:
import coiled
import dask.dataframe as dd
import dask
from dask.utils import format_bytes
from dask.distributed import Client

In [71]:
cluster = coiled.Cluster(name="demo-parquet", n_workers=5)

Output()

In [3]:
client = Client(cluster)


+---------+--------+-----------+---------+
| Package | client | scheduler | workers |
+---------+--------+-----------+---------+
| numpy   | 1.21.1 | 1.21.2    | 1.21.2  |
| pandas  | 1.3.1  | 1.3.2     | 1.3.2   |
+---------+--------+-----------+---------+


In [72]:
ddf = dd.read_parquet(
    "s3://coiled-datasets/timeseries/20-years/parquet",
    storage_options={"anon": True, 'use_ssl': True}
)

In [73]:
%%time

ddf.map_partitions(lambda x: x.memory_usage(deep=True).sum()).compute()

KeyboardInterrupt: 

In [74]:
(ddf
    .map_partitions(lambda x: format_bytes(x.memory_usage(deep=True)).sum())
    .compute())

ValueError: Metadata inference failed in `lambda`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
ValueError('The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().')

Traceback:
---------
  File "/Users/powers/opt/miniconda3/envs/standard-coiled/lib/python3.9/site-packages/dask/dataframe/utils.py", line 176, in raise_on_meta_error
    yield
  File "/Users/powers/opt/miniconda3/envs/standard-coiled/lib/python3.9/site-packages/dask/dataframe/core.py", line 5672, in _emulate
    return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
  File "/var/folders/d2/116lnkgd0l7f51xr7msb2jnh0000gn/T/ipykernel_1844/1939325964.py", line 2, in <lambda>
    .map_partitions(lambda x: format_bytes(x.memory_usage(deep=True)).sum())
  File "/Users/powers/opt/miniconda3/envs/standard-coiled/lib/python3.9/site-packages/dask/utils.py", line 1483, in format_bytes
    if n >= k * 0.9:
  File "/Users/powers/opt/miniconda3/envs/standard-coiled/lib/python3.9/site-packages/pandas/core/generic.py", line 1534, in __nonzero__
    raise ValueError(


In [11]:
%%time

ddf.map_partitions(lambda x: dask.sizeof.sizeof(x)).compute()

CPU times: user 2.06 s, sys: 197 ms, total: 2.26 s
Wall time: 1min 32s


0       56822960
1       57125360
2       56822960
3       57246320
4       57306800
          ...   
1090    56974160
1091    57004400
1092    57337040
1093    56822960
1094    57004400
Length: 1095, dtype: int64

In [34]:
%%time

ddf.map_partitions(lambda x: x.memory_usage(deep=True)).compute()

CPU times: user 1.96 s, sys: 212 ms, total: 2.17 s
Wall time: 1min 15s


Index     4838400
id        4838400
name     37707427
x         4838400
y         4838400
           ...   
Index     4838400
id        4838400
name     37708389
x         4838400
y         4838400
Length: 5475, dtype: int64

## Filter DataFrame

In [29]:
filtered_ddf = ddf.loc[ddf["id"] > 1150]

In [37]:
%%time

filtered_ddf.map_partitions(lambda x: x.memory_usage(deep=True).sum()).compute()

CPU times: user 1.8 s, sys: 236 ms, total: 2.04 s
Wall time: 1min 3s


0         0
1        94
2         0
3         0
4       187
       ... 
1090      0
1091    189
1092      0
1093      0
1094      0
Length: 1095, dtype: int64

In [30]:
%%time

filtered_ddf.map_partitions(lambda x: dask.sizeof.sizeof(x)).compute()

CPU times: user 1.94 s, sys: 218 ms, total: 2.16 s
Wall time: 1min 32s


0       2000
1       2094
2       2000
3       2000
4       2187
        ... 
1090    2000
1091    2189
1092    2000
1093    2000
1094    2000
Length: 1095, dtype: int64

In [32]:
filtered_ddf.map_partitions(len).compute()

0       0
1       1
2       0
3       0
4       2
       ..
1090    0
1091    2
1092    0
1093    0
1094    0
Length: 1095, dtype: int64

In [33]:
filtered_ddf.repartition(2).map_partitions(lambda x: dask.sizeof.sizeof(x)).compute()

0    56889
1    50854
dtype: int64

## Assessing imbalance

In [None]:
def partition_report(ddf):
    series = memory_by_partition(ddf)
    lt_1kb = series.where(lambda x : x < 1000).count()
    lt_1mb = series.where(lambda x : x < 1000000).count()
    gt_1gb = series.where(lambda x : x > 1000000000).count()
    total = series.count()
    print(f"Total number of partitions: {total}")
    print(f"Num partitions < 1 KB: {lt_1kb} (YY%)")
    print(f"Num partitions < 1 MB: XX (YY%)")
    print(f"Num partitions > 1 GB: XX (YY%)")

## Single partition

In [31]:
df = ddf.partitions[0]

In [9]:
df

Unnamed: 0_level_0,id,name,x,y
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-01 00:00:00,1008,Dan,-0.259374,-0.118314
2000-01-01 00:00:01,987,Patricia,0.069601,0.755351
2000-01-01 00:00:02,980,Zelda,-0.281843,-0.510507
2000-01-01 00:00:03,1020,Ursula,-0.569904,0.523132
2000-01-01 00:00:04,967,Michael,-0.251460,0.810930
...,...,...,...,...
2000-01-07 23:59:55,984,Michael,-0.147591,0.265310
2000-01-07 23:59:56,1030,Yvonne,-0.817655,0.345774
2000-01-07 23:59:57,979,Ingrid,0.653402,-0.366750
2000-01-07 23:59:58,970,Charlie,0.423712,0.533407


In [11]:
df.memory_usage(deep=True)

Index     4838400
id        4838400
name     37707427
x         4838400
y         4838400
dtype: int64

In [12]:
df.memory_usage(deep=True).sum()

57061027

In [10]:
format_bytes(df["id"].memory_usage(deep=True))

'9.23 MiB'