# Merge Dask DataFrames

## Merge a Large Dask DataFrame with a Small pandas DataFrame

In [1]:
import coiled

In [2]:
import pandas as pd
import dask.dataframe as dd
import dask

In [3]:
# Launch the Coiled cluster used by the large-to-small merge example.
cluster = coiled.Cluster(
    name="dask-merge",
    n_workers=50,
    # `spot` is an on/off flag: pass a real boolean rather than the string
    # 'True' (any non-empty string, including 'False', would be truthy).
    backend_options={"spot": True},
    software="coiled-examples/numpy-zarr",
)

In [4]:
from distributed import Client
# Attach this session to the cluster created above so Dask work runs there.
client = Client(cluster)
# Trailing expression: render the client/cluster summary as the cell output.
client


+-------------+---------------+---------------+---------------+
| Package     | client        | scheduler     | workers       |
+-------------+---------------+---------------+---------------+
| dask        | 2021.11.2     | 2021.11.1     | 2021.11.1     |
| distributed | 2021.11.2     | 2021.11.1     | 2021.11.1     |
| lz4         | None          | 3.1.3         | 3.1.3         |
| msgpack     | 1.0.3         | 1.0.2         | 1.0.2         |
| numpy       | 1.21.4        | 1.21.2        | 1.21.2        |
| pandas      | 1.3.5         | 1.3.4         | 1.3.4         |
| python      | 3.9.9.final.0 | 3.9.6.final.0 | 3.9.6.final.0 |
+-------------+---------------+---------------+---------------+
Notes: 
-  msgpack: Variation is ok, as long as everything is above 0.6


0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: http://44.198.171.216:8787,

0,1
Dashboard: http://44.198.171.216:8787,Workers: 50
Total threads: 100,Total memory: 383.35 GiB

0,1
Comm: tls://10.4.4.132:8786,Workers: 50
Dashboard: http://10.4.4.132:8787/status,Total threads: 100
Started: 12 minutes ago,Total memory: 383.35 GiB

0,1
Comm: tls://10.4.2.235:41121,Total threads: 2
Dashboard: http://10.4.2.235:45615/status,Memory: 7.67 GiB
Nanny: tls://10.4.2.235:32879,
Local directory: /dask-worker-space/worker-gh9vzp03,Local directory: /dask-worker-space/worker-gh9vzp03

0,1
Comm: tls://10.4.3.234:39331,Total threads: 2
Dashboard: http://10.4.3.234:43721/status,Memory: 7.67 GiB
Nanny: tls://10.4.3.234:41509,
Local directory: /dask-worker-space/worker-fsxm2b4t,Local directory: /dask-worker-space/worker-fsxm2b4t

0,1
Comm: tls://10.4.1.96:41995,Total threads: 2
Dashboard: http://10.4.1.96:43671/status,Memory: 7.67 GiB
Nanny: tls://10.4.1.96:33617,
Local directory: /dask-worker-space/worker-_b_nszsj,Local directory: /dask-worker-space/worker-_b_nszsj

0,1
Comm: tls://10.4.14.195:41747,Total threads: 2
Dashboard: http://10.4.14.195:43475/status,Memory: 7.67 GiB
Nanny: tls://10.4.14.195:35527,
Local directory: /dask-worker-space/worker-3muph73q,Local directory: /dask-worker-space/worker-3muph73q

0,1
Comm: tls://10.4.15.131:35905,Total threads: 2
Dashboard: http://10.4.15.131:45377/status,Memory: 7.67 GiB
Nanny: tls://10.4.15.131:45307,
Local directory: /dask-worker-space/worker-_mhzbvii,Local directory: /dask-worker-space/worker-_mhzbvii

0,1
Comm: tls://10.4.4.148:36011,Total threads: 2
Dashboard: http://10.4.4.148:43557/status,Memory: 7.67 GiB
Nanny: tls://10.4.4.148:43845,
Local directory: /dask-worker-space/worker-ghrrqop3,Local directory: /dask-worker-space/worker-ghrrqop3

0,1
Comm: tls://10.4.15.3:43799,Total threads: 2
Dashboard: http://10.4.15.3:33771/status,Memory: 7.67 GiB
Nanny: tls://10.4.15.3:33443,
Local directory: /dask-worker-space/worker-0lprs0ab,Local directory: /dask-worker-space/worker-0lprs0ab

0,1
Comm: tls://10.4.3.184:42237,Total threads: 2
Dashboard: http://10.4.3.184:43623/status,Memory: 7.67 GiB
Nanny: tls://10.4.3.184:36677,
Local directory: /dask-worker-space/worker-14uxz7ki,Local directory: /dask-worker-space/worker-14uxz7ki

0,1
Comm: tls://10.4.5.186:36053,Total threads: 2
Dashboard: http://10.4.5.186:38841/status,Memory: 7.67 GiB
Nanny: tls://10.4.5.186:34287,
Local directory: /dask-worker-space/worker-ub21q1l_,Local directory: /dask-worker-space/worker-ub21q1l_

0,1
Comm: tls://10.4.10.229:39203,Total threads: 2
Dashboard: http://10.4.10.229:43891/status,Memory: 7.67 GiB
Nanny: tls://10.4.10.229:36083,
Local directory: /dask-worker-space/worker-h7hp9osv,Local directory: /dask-worker-space/worker-h7hp9osv

0,1
Comm: tls://10.4.1.91:36631,Total threads: 2
Dashboard: http://10.4.1.91:33227/status,Memory: 7.67 GiB
Nanny: tls://10.4.1.91:41259,
Local directory: /dask-worker-space/worker-mv469wdl,Local directory: /dask-worker-space/worker-mv469wdl

0,1
Comm: tls://10.4.13.129:34631,Total threads: 2
Dashboard: http://10.4.13.129:45595/status,Memory: 7.67 GiB
Nanny: tls://10.4.13.129:33505,
Local directory: /dask-worker-space/worker-7r3a52du,Local directory: /dask-worker-space/worker-7r3a52du

0,1
Comm: tls://10.4.2.59:42543,Total threads: 2
Dashboard: http://10.4.2.59:46071/status,Memory: 7.67 GiB
Nanny: tls://10.4.2.59:42429,
Local directory: /dask-worker-space/worker-o9_jhgwi,Local directory: /dask-worker-space/worker-o9_jhgwi

0,1
Comm: tls://10.4.3.181:41687,Total threads: 2
Dashboard: http://10.4.3.181:44655/status,Memory: 7.67 GiB
Nanny: tls://10.4.3.181:38741,
Local directory: /dask-worker-space/worker-tyf4pa2t,Local directory: /dask-worker-space/worker-tyf4pa2t

0,1
Comm: tls://10.4.12.26:42745,Total threads: 2
Dashboard: http://10.4.12.26:41639/status,Memory: 7.67 GiB
Nanny: tls://10.4.12.26:42683,
Local directory: /dask-worker-space/worker-aob5zw5o,Local directory: /dask-worker-space/worker-aob5zw5o

0,1
Comm: tls://10.4.11.152:46391,Total threads: 2
Dashboard: http://10.4.11.152:33405/status,Memory: 7.67 GiB
Nanny: tls://10.4.11.152:33525,
Local directory: /dask-worker-space/worker-ndel1obp,Local directory: /dask-worker-space/worker-ndel1obp

0,1
Comm: tls://10.4.2.36:34405,Total threads: 2
Dashboard: http://10.4.2.36:43489/status,Memory: 7.67 GiB
Nanny: tls://10.4.2.36:35079,
Local directory: /dask-worker-space/worker-c0i7nghk,Local directory: /dask-worker-space/worker-c0i7nghk

0,1
Comm: tls://10.4.4.17:36569,Total threads: 2
Dashboard: http://10.4.4.17:37059/status,Memory: 7.67 GiB
Nanny: tls://10.4.4.17:39757,
Local directory: /dask-worker-space/worker-osrl0x01,Local directory: /dask-worker-space/worker-osrl0x01

0,1
Comm: tls://10.4.0.164:44351,Total threads: 2
Dashboard: http://10.4.0.164:39043/status,Memory: 7.67 GiB
Nanny: tls://10.4.0.164:37111,
Local directory: /dask-worker-space/worker-fd9ko7uo,Local directory: /dask-worker-space/worker-fd9ko7uo

0,1
Comm: tls://10.4.12.79:44305,Total threads: 2
Dashboard: http://10.4.12.79:35627/status,Memory: 7.67 GiB
Nanny: tls://10.4.12.79:34155,
Local directory: /dask-worker-space/worker-ij5zm6iq,Local directory: /dask-worker-space/worker-ij5zm6iq

0,1
Comm: tls://10.4.12.38:35927,Total threads: 2
Dashboard: http://10.4.12.38:35169/status,Memory: 7.67 GiB
Nanny: tls://10.4.12.38:34883,
Local directory: /dask-worker-space/worker-nnc4ni18,Local directory: /dask-worker-space/worker-nnc4ni18

0,1
Comm: tls://10.4.6.146:41311,Total threads: 2
Dashboard: http://10.4.6.146:41241/status,Memory: 7.67 GiB
Nanny: tls://10.4.6.146:36919,
Local directory: /dask-worker-space/worker-olsmp4kl,Local directory: /dask-worker-space/worker-olsmp4kl

0,1
Comm: tls://10.4.1.236:42063,Total threads: 2
Dashboard: http://10.4.1.236:34269/status,Memory: 7.67 GiB
Nanny: tls://10.4.1.236:42365,
Local directory: /dask-worker-space/worker-1yhvr1uy,Local directory: /dask-worker-space/worker-1yhvr1uy

0,1
Comm: tls://10.4.8.83:44483,Total threads: 2
Dashboard: http://10.4.8.83:46781/status,Memory: 7.67 GiB
Nanny: tls://10.4.8.83:42417,
Local directory: /dask-worker-space/worker-q3x49_8y,Local directory: /dask-worker-space/worker-q3x49_8y

0,1
Comm: tls://10.4.10.32:44533,Total threads: 2
Dashboard: http://10.4.10.32:36625/status,Memory: 7.67 GiB
Nanny: tls://10.4.10.32:41183,
Local directory: /dask-worker-space/worker-mguseh9e,Local directory: /dask-worker-space/worker-mguseh9e

0,1
Comm: tls://10.4.13.93:34745,Total threads: 2
Dashboard: http://10.4.13.93:39715/status,Memory: 7.67 GiB
Nanny: tls://10.4.13.93:46007,
Local directory: /dask-worker-space/worker-0n_rvl6p,Local directory: /dask-worker-space/worker-0n_rvl6p

0,1
Comm: tls://10.4.15.105:39813,Total threads: 2
Dashboard: http://10.4.15.105:36135/status,Memory: 7.67 GiB
Nanny: tls://10.4.15.105:46435,
Local directory: /dask-worker-space/worker-r829oq2u,Local directory: /dask-worker-space/worker-r829oq2u

0,1
Comm: tls://10.4.14.202:43549,Total threads: 2
Dashboard: http://10.4.14.202:34601/status,Memory: 7.67 GiB
Nanny: tls://10.4.14.202:43877,
Local directory: /dask-worker-space/worker-bkkgbkz9,Local directory: /dask-worker-space/worker-bkkgbkz9

0,1
Comm: tls://10.4.5.219:37053,Total threads: 2
Dashboard: http://10.4.5.219:43119/status,Memory: 7.67 GiB
Nanny: tls://10.4.5.219:43661,
Local directory: /dask-worker-space/worker-dr4b78c_,Local directory: /dask-worker-space/worker-dr4b78c_

0,1
Comm: tls://10.4.3.154:38019,Total threads: 2
Dashboard: http://10.4.3.154:46541/status,Memory: 7.67 GiB
Nanny: tls://10.4.3.154:34089,
Local directory: /dask-worker-space/worker-3323ka6j,Local directory: /dask-worker-space/worker-3323ka6j

0,1
Comm: tls://10.4.14.16:43071,Total threads: 2
Dashboard: http://10.4.14.16:33185/status,Memory: 7.67 GiB
Nanny: tls://10.4.14.16:45427,
Local directory: /dask-worker-space/worker-v2omn951,Local directory: /dask-worker-space/worker-v2omn951

0,1
Comm: tls://10.4.11.40:44249,Total threads: 2
Dashboard: http://10.4.11.40:45719/status,Memory: 7.67 GiB
Nanny: tls://10.4.11.40:43995,
Local directory: /dask-worker-space/worker-4cf9ch5x,Local directory: /dask-worker-space/worker-4cf9ch5x

0,1
Comm: tls://10.4.9.251:45873,Total threads: 2
Dashboard: http://10.4.9.251:38081/status,Memory: 7.67 GiB
Nanny: tls://10.4.9.251:46641,
Local directory: /dask-worker-space/worker-ix33w8tx,Local directory: /dask-worker-space/worker-ix33w8tx

0,1
Comm: tls://10.4.8.220:35935,Total threads: 2
Dashboard: http://10.4.8.220:46359/status,Memory: 7.67 GiB
Nanny: tls://10.4.8.220:46387,
Local directory: /dask-worker-space/worker-rgdpe8vh,Local directory: /dask-worker-space/worker-rgdpe8vh

0,1
Comm: tls://10.4.14.246:34023,Total threads: 2
Dashboard: http://10.4.14.246:34109/status,Memory: 7.67 GiB
Nanny: tls://10.4.14.246:42125,
Local directory: /dask-worker-space/worker-q32oqr16,Local directory: /dask-worker-space/worker-q32oqr16

0,1
Comm: tls://10.4.9.137:36161,Total threads: 2
Dashboard: http://10.4.9.137:43097/status,Memory: 7.67 GiB
Nanny: tls://10.4.9.137:39155,
Local directory: /dask-worker-space/worker-hj1194jq,Local directory: /dask-worker-space/worker-hj1194jq

0,1
Comm: tls://10.4.13.32:35483,Total threads: 2
Dashboard: http://10.4.13.32:35271/status,Memory: 7.67 GiB
Nanny: tls://10.4.13.32:35229,
Local directory: /dask-worker-space/worker-nwyhpi1a,Local directory: /dask-worker-space/worker-nwyhpi1a

0,1
Comm: tls://10.4.15.179:38773,Total threads: 2
Dashboard: http://10.4.15.179:46049/status,Memory: 7.67 GiB
Nanny: tls://10.4.15.179:34377,
Local directory: /dask-worker-space/worker-_4sr860l,Local directory: /dask-worker-space/worker-_4sr860l

0,1
Comm: tls://10.4.11.234:37815,Total threads: 2
Dashboard: http://10.4.11.234:45587/status,Memory: 7.67 GiB
Nanny: tls://10.4.11.234:36255,
Local directory: /dask-worker-space/worker-y_650ate,Local directory: /dask-worker-space/worker-y_650ate

0,1
Comm: tls://10.4.5.223:39711,Total threads: 2
Dashboard: http://10.4.5.223:41139/status,Memory: 7.67 GiB
Nanny: tls://10.4.5.223:38531,
Local directory: /dask-worker-space/worker-gmdnh04e,Local directory: /dask-worker-space/worker-gmdnh04e

0,1
Comm: tls://10.4.3.24:37541,Total threads: 2
Dashboard: http://10.4.3.24:46027/status,Memory: 7.67 GiB
Nanny: tls://10.4.3.24:39513,
Local directory: /dask-worker-space/worker-7rguggto,Local directory: /dask-worker-space/worker-7rguggto

0,1
Comm: tls://10.4.13.250:41839,Total threads: 2
Dashboard: http://10.4.13.250:39545/status,Memory: 7.67 GiB
Nanny: tls://10.4.13.250:45323,
Local directory: /dask-worker-space/worker-m6n1vi62,Local directory: /dask-worker-space/worker-m6n1vi62

0,1
Comm: tls://10.4.9.220:42215,Total threads: 2
Dashboard: http://10.4.9.220:32943/status,Memory: 7.67 GiB
Nanny: tls://10.4.9.220:38079,
Local directory: /dask-worker-space/worker-0fh06h7w,Local directory: /dask-worker-space/worker-0fh06h7w

0,1
Comm: tls://10.4.13.204:33231,Total threads: 2
Dashboard: http://10.4.13.204:44229/status,Memory: 7.67 GiB
Nanny: tls://10.4.13.204:43709,
Local directory: /dask-worker-space/worker-vb6h9ehf,Local directory: /dask-worker-space/worker-vb6h9ehf

0,1
Comm: tls://10.4.11.246:38141,Total threads: 2
Dashboard: http://10.4.11.246:37371/status,Memory: 7.67 GiB
Nanny: tls://10.4.11.246:41487,
Local directory: /dask-worker-space/worker-0elm3odz,Local directory: /dask-worker-space/worker-0elm3odz

0,1
Comm: tls://10.4.12.183:45165,Total threads: 2
Dashboard: http://10.4.12.183:38571/status,Memory: 7.67 GiB
Nanny: tls://10.4.12.183:38071,
Local directory: /dask-worker-space/worker-l22hlp4m,Local directory: /dask-worker-space/worker-l22hlp4m

0,1
Comm: tls://10.4.15.21:40013,Total threads: 2
Dashboard: http://10.4.15.21:43913/status,Memory: 7.67 GiB
Nanny: tls://10.4.15.21:36491,
Local directory: /dask-worker-space/worker-rh52tazc,Local directory: /dask-worker-space/worker-rh52tazc

0,1
Comm: tls://10.4.4.253:46161,Total threads: 2
Dashboard: http://10.4.4.253:45113/status,Memory: 7.67 GiB
Nanny: tls://10.4.4.253:41011,
Local directory: /dask-worker-space/worker-jmwygsf0,Local directory: /dask-worker-space/worker-jmwygsf0

0,1
Comm: tls://10.4.14.72:32877,Total threads: 2
Dashboard: http://10.4.14.72:38835/status,Memory: 7.67 GiB
Nanny: tls://10.4.14.72:36887,
Local directory: /dask-worker-space/worker-sj5dmhec,Local directory: /dask-worker-space/worker-sj5dmhec

0,1
Comm: tls://10.4.15.50:40803,Total threads: 2
Dashboard: http://10.4.15.50:44843/status,Memory: 7.67 GiB
Nanny: tls://10.4.15.50:41889,
Local directory: /dask-worker-space/worker-5bqunu74,Local directory: /dask-worker-space/worker-5bqunu74


In [5]:
# Restart the cluster's workers.
# NOTE(review): in the recorded run this call logged a KeyError from
# distributed's worker-status watcher and the summary afterwards listed a
# single worker -- confirm the restart is actually needed here.
client.restart()

distributed.utils - ERROR - 'tls://10.4.1.96:41995'
Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/numpy-zarr/lib/python3.9/site-packages/distributed/utils.py", line 653, in log_errors
    yield
  File "/Users/rpelgrim/mambaforge/envs/numpy-zarr/lib/python3.9/site-packages/distributed/deploy/cluster.py", line 196, in _watch_worker_status
    self._update_worker_status(op, msg)
  File "/Users/rpelgrim/mambaforge/envs/numpy-zarr/lib/python3.9/site-packages/distributed/deploy/cluster.py", line 206, in _update_worker_status
    del self.scheduler_info["workers"][msg]
KeyError: 'tls://10.4.1.96:41995'


0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: http://44.198.171.216:8787,

0,1
Dashboard: http://44.198.171.216:8787,Workers: 1
Total threads: 2,Total memory: 7.67 GiB

0,1
Comm: tls://10.4.4.132:8786,Workers: 1
Dashboard: http://10.4.4.132:8787/status,Total threads: 2
Started: 13 minutes ago,Total memory: 7.67 GiB

0,1
Comm: tls://10.4.0.164:44351,Total threads: 2
Dashboard: http://10.4.0.164:39043/status,Memory: 7.67 GiB
Nanny: tls://10.4.0.164:37111,
Local directory: /dask-worker-space/worker-fd9ko7uo,Local directory: /dask-worker-space/worker-fd9ko7uo


In [9]:
# Synthetic hourly time series covering 2000 to 2005 (the "large" side of the merge).
large = dask.datasets.timeseries(start="2000", end="2005", freq="1H")
# Synthetic daily series over the same span with a single integer column "z".
small = dask.datasets.timeseries(start="2000", end="2005", freq="1D", dtypes={"z": int})

In [10]:
# Column dtypes of the large frame (metadata only).
large.dtypes

id        int64
name     object
x       float64
y       float64
dtype: object

In [11]:
# Display the Dask DataFrame summary: column dtypes and partition layout.
large

Unnamed: 0_level_0,id,name,x,y
npartitions=1827,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-01,int64,object,float64,float64
2000-01-02,...,...,...,...
...,...,...,...,...
2004-12-31,...,...,...,...
2005-01-01,...,...,...,...


In [13]:
# Per-column memory usage of the large frame; .compute() evaluates it.
large.memory_usage().compute()

Index    350784
id       350784
name     350784
x        350784
y        350784
dtype: int64

In [14]:
# Write the large frame to Parquet on S3 using the pyarrow engine.
# NOTE(review): the recorded run of this cell ended in KilledWorker --
# confirm the cluster has healthy workers before re-running.
large.to_parquet(
    's3://coiled-datasets/dask-merge/large.parquet',
    engine="pyarrow"
)

KilledWorker: ("('reset_index-f62e4e433c74fe34223d80ef584137e5', 388)", <WorkerState 'tls://10.4.10.229:39433', name: coiled-dask-rrpelgr71-97896-worker-4a9861ed52, status: closed, memory: 0, processing: 9>)

In [4]:
# Materialise the small daily frame as an in-memory pandas DataFrame.
# dask.compute passes non-Dask objects through unchanged, so re-running this
# cell once `small` is already pandas is a no-op instead of an AttributeError
# (pandas DataFrames have no .compute()).
(small,) = dask.compute(small)

In [5]:
# Display the materialised small frame.
small

Unnamed: 0_level_0,z
timestamp,Unnamed: 1_level_1
2000-01-01,984
2000-01-02,1021
2000-01-03,979
2000-01-04,1050
2000-01-05,1003
2000-01-06,1009
2000-01-07,984
2000-01-08,979
2000-01-09,977
2000-01-10,971


In [6]:
# Left join: every row of `large` is kept and "z" is attached where a
# "timestamp" value also exists in `small`; unmatched rows get a missing "z".
# This only builds the lazy result; nothing is computed yet.
result = large.merge(small, how="left", on=["timestamp"])

In [7]:
# Compute and show the first rows of the merged frame.
result.head()

Unnamed: 0_level_0,id,name,x,y,z
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-01 00:00:00,1021,Laura,-0.198011,-0.835659,984.0
2000-01-01 00:00:10,993,George,-0.711381,0.904073,
2000-01-01 00:00:20,915,Laura,0.807961,-0.47337,
2000-01-01 00:00:30,953,Ray,0.12,0.743861,
2000-01-01 00:00:40,979,Tim,-0.104925,0.134858,


## Merge Two Dask DataFrames

In [8]:
# Three synthetic time series, each carrying one integer column
# ("foo", "bar" and "baz" respectively), created in declaration order.
left, right_one, right_two = (
    dask.datasets.timeseries(dtypes={column: int})
    for column in ("foo", "bar", "baz")
)

In [9]:
# dask.datasets.timeseries already returns a dataframe indexed by timestamp,
# so there is no need to call set_index before an index-based merge:
#
#   left.set_index("timestamp")
#
# Left disabled: writing the frame to Parquet and reading it back.
#
#   left.to_parquet("left", overwrite=True)
#   left = dd.read_parquet("left")
#
# If the dataframe can fit in RAM, persisting it is another option:
#
#   left = left.persist()


# Left join of `left` and `right_one`, matching rows on the index of both frames.
result1 = left.merge(
    right_one, how="left", left_index=True, right_index=True)

In [13]:
# Chain a second index-on-index left join to add the columns of `right_two`.
result2 = result1.merge(
    right_two, how="left", left_index=True, right_index=True)

In [11]:
# First rows of the left input, for comparison with the merged results.
left.head()

Unnamed: 0_level_0,foo
timestamp,Unnamed: 1_level_1
2000-01-01 00:00:00,1032
2000-01-01 00:00:01,993
2000-01-01 00:00:02,993
2000-01-01 00:00:03,995
2000-01-01 00:00:04,995


In [10]:
# First rows after the first merge (left + right_one).
result1.head()

Unnamed: 0_level_0,foo,bar
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-01-01 00:00:00,1032,1016
2000-01-01 00:00:01,993,949
2000-01-01 00:00:02,993,1004
2000-01-01 00:00:03,995,1023
2000-01-01 00:00:04,995,1042


In [14]:
# First rows after the second merge (result1 + right_two).
result2.head()

Unnamed: 0_level_0,foo,bar,baz
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 00:00:00,1032,1016,1014
2000-01-01 00:00:01,993,949,1010
2000-01-01 00:00:02,993,1004,984
2000-01-01 00:00:03,995,1023,1005
2000-01-01 00:00:04,995,1042,981


## Simple Example

In [35]:
import dask.dataframe as dd
import pandas as pd
# Toy "large" table: one (Name, Age) record per person.
df_large = pd.DataFrame(
    [
        ("Azza", 29),
        ("Brandon", 30),
        ("Cedric", 21),
        ("Devonte", 57),
        ("Eli", 32),
        ("Fabio", 19),
    ],
    columns=["Name", "Age"],
)
# Wrap the pandas frame as a Dask DataFrame split into two partitions.
large = dd.from_pandas(df_large, npartitions=2)

In [36]:
# Toy "small" lookup table: the home city for a subset of the names.
small = pd.DataFrame(
    [
        ("Azza", "Beirut"),
        ("Cedric", "Dublin"),
        ("Fabio", "Rosario"),
    ],
    columns=["Name", "City"],
)

In [40]:
# Left join on the "Name" column: every row of `large` is kept, and "City"
# is filled in only for names that also appear in `small`.
join = large.merge(small, how="left", on=["Name"])

In [41]:
# Evaluate the lazy join and display the result as a pandas DataFrame.
join.compute()

Unnamed: 0,Name,Age,City
0,Azza,29,Beirut
1,Brandon,30,
2,Cedric,21,Dublin
0,Devonte,57,
1,Eli,32,
2,Fabio,19,Rosario
