In [5]:
import dask.dataframe as dd
import pandas as pd

## Vertical concatenation

In [63]:
# First input: six rows on the default integer index (0-5), split into two
# Dask partitions.
df = pd.DataFrame(
    data={
        "nums": [1, 2, 3, 4, 5, 6],
        "letters": ["a", "b", "c", "d", "e", "f"],
    }
)
ddf1 = dd.from_pandas(df, npartitions=2)

In [None]:
# Second input: two rows with the same columns as ddf1, held in a single
# partition.
df = pd.DataFrame(
    data={
        "nums": [88, 99],
        "letters": ["xx", "yy"],
    }
)
ddf2 = dd.from_pandas(df, npartitions=1)

In [None]:
# Vertical concatenation: ddf2's rows are stacked beneath ddf1's.
ddf3 = dd.concat([ddf1, ddf2])

In [None]:
# Materialize the concatenated result and print it. Each input keeps its own
# index labels, so 0 and 1 appear twice.
print(ddf3.compute())

   nums letters
0     1       a
1     2       b
2     3       c
3     4       d
4     5       e
5     6       f
0    88      xx
1    99      yy


In [None]:
# Partition count after concatenation: 2 from ddf1 plus 1 from ddf2.
ddf3.npartitions

3

## Divisions

In [21]:
def print_partitions(ddf):
    """Print the computed contents of every partition of ``ddf``, in order.

    Parameters
    ----------
    ddf
        Object exposing ``npartitions`` and an indexable ``partitions``
        accessor whose items provide ``compute()`` (e.g. a Dask DataFrame).

    Returns
    -------
    None
        The partitions are written to stdout, one ``print`` call each.
    """
    for position in range(ddf.npartitions):
        # Select one partition lazily, then materialize only that piece.
        partition = ddf.partitions[position]
        print(partition.compute())

In [78]:
# Rebuild the six-row input for the divisions walkthrough: default integer
# index, two partitions.
df = pd.DataFrame(
    data={
        "nums": [1, 2, 3, 4, 5, 6],
        "letters": ["a", "b", "c", "d", "e", "f"],
    }
)
ddf1 = dd.from_pandas(df, npartitions=2)

In [79]:
# Show which rows landed in each of ddf1's two partitions.
print_partitions(ddf1)

   nums letters
0     1       a
1     2       b
2     3       c
   nums letters
3     4       d
4     5       e
5     6       f


In [66]:
# Divisions: the index values that bound ddf1's partitions.
ddf1.divisions

(0, 3, 5)

In [67]:
# Rebuild the two-row input (single partition); its index labels 0 and 1
# also occur in ddf1.
df = pd.DataFrame(
    data={
        "nums": [88, 99],
        "letters": ["xx", "yy"],
    }
)
ddf2 = dd.from_pandas(df, npartitions=1)

In [80]:
# ddf2 has a single partition holding both rows.
print_partitions(ddf2)

   nums letters
0    88      xx
1    99      yy


In [68]:
# Divisions of ddf2; its index range (0-1) falls inside ddf1's (0-5).
ddf2.divisions

(0, 1)

In [69]:
# Plain concat of two frames whose index ranges overlap (0-5 and 0-1).
ddf3 = dd.concat([ddf1, ddf2])

In [70]:
# After the plain concat the divisions are unknown: every entry is None.
ddf3.divisions

(None, None, None, None)

In [81]:
# Same inputs, but with interleave_partitions=True the result keeps known
# divisions even though the inputs' index ranges overlap.
ddf3_interleave = dd.concat([ddf1, ddf2], interleave_partitions=True)

In [82]:
# Rows from both inputs now come out ordered by index label, with ddf2's
# rows (88, 99) interleaved among ddf1's.
print(ddf3_interleave.compute())

   nums letters
0     1       a
0    88      xx
1     2       b
2     3       c
1    99      yy
3     4       d
4     5       e
5     6       f


In [83]:
# Divisions are known for the interleaved result (no None entries).
ddf3_interleave.divisions

(0, 1, 3, 5)

In [84]:
# Partition count of the interleaved result.
ddf3_interleave.npartitions

3

In [86]:
# Inspect how rows from both inputs were redistributed across partitions.
print_partitions(ddf3_interleave)

   nums letters
0     1       a
0    88      xx
   nums letters
1     2       b
2     3       c
1    99      yy
   nums letters
3     4       d
4     5       e
5     6       f


## Interleaving isn't necessary if divisions don't overlap

In [22]:
# Four-row input carrying an explicit "some_index" column (values 4-7),
# split into two partitions.
df = pd.DataFrame(
    data={
        "nums": [1, 2, 3, 4],
        "letters": ["a", "b", "c", "d"],
        "some_index": [4, 5, 6, 7],
    }
)
ddf1 = dd.from_pandas(df, npartitions=2)

In [23]:
# Make "some_index" the index of ddf1.
# NOTE(review): this rebinds ddf1, so re-running this cell on its own
# presumably fails once the column has become the index — confirm.
ddf1 = ddf1.set_index("some_index")

In [24]:
# Show the re-indexed partitions: index labels 4-5 and 6-7.
print_partitions(ddf1)

            nums letters
some_index              
4              1       a
5              2       b
            nums letters
some_index              
6              3       c
7              4       d


In [25]:
# Two-row input whose "some_index" values (10, 20) lie beyond ddf1's 4-7
# range; kept in a single partition.
df = pd.DataFrame(
    data={
        "nums": [88, 99],
        "letters": ["xx", "yy"],
        "some_index": [10, 20],
    }
)
ddf2 = dd.from_pandas(df, npartitions=1)

In [26]:
# Make "some_index" the index of ddf2.
# NOTE(review): this rebinds ddf2, so re-running this cell on its own
# presumably fails once the column has become the index — confirm.
ddf2 = ddf2.set_index("some_index")

In [27]:
# Plain concat again; this time the index ranges (4-7 and 10-20) do not
# overlap.
ddf3 = dd.concat([ddf1, ddf2])

In [28]:
# The three input partitions appear unchanged, one after another.
print_partitions(ddf3)

            nums letters
some_index              
4              1       a
5              2       b
            nums letters
some_index              
6              3       c
7              4       d
            nums letters
some_index              
10            88      xx
20            99      yy


In [29]:
# Divisions stay known here without interleave_partitions=True.
ddf3.divisions

(4, 6, 10, 20)

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError


## Mismatched schemas

In [87]:
# First frame for the schema-mismatch example: animals with an "is_mammal"
# flag, split into two partitions.
df = pd.DataFrame(
    data={
        "animal": ["cat", "dolphin", "shark", "starfish"],
        "is_mammal": [True, True, False, False],
    }
)
ddf1 = dd.from_pandas(df, npartitions=2)

In [88]:
# Second frame: shares only the "animal" column with the first and carries
# "likes_water" instead of "is_mammal"; single partition.
df = pd.DataFrame(
    data={
        "animal": ["hippo", "lion"],
        "likes_water": [True, False],
    }
)
ddf2 = dd.from_pandas(df, npartitions=1)

In [89]:
# Concatenate frames whose columns differ ("is_mammal" vs "likes_water").
ddf3 = dd.concat([ddf1, ddf2])

In [90]:
# The result has the union of the columns; values a frame did not provide
# show up as NaN.
print(ddf3.compute())

     animal is_mammal likes_water
0       cat      True         NaN
1   dolphin      True         NaN
2     shark     False         NaN
3  starfish     False         NaN
0     hippo       NaN        True
1      lion       NaN       False


In [91]:
# Check the resulting dtypes: the boolean flag columns are reported as
# object once they contain missing values.
ddf3.dtypes

animal         object
is_mammal      object
likes_water    object
dtype: object

## Vertically concatenating large DataFrames

In [1]:
import coiled
import dask

In [2]:
# Request a Coiled-managed cluster named "concat-cluster" with five workers.
cluster = coiled.Cluster(
    n_workers=5,
    name="concat-cluster",
)



Found software environment build
Created FW rules: coiled-dask-matthew24-46567-firewall
Created scheduler VM: coiled-dask-matthew24-46567-scheduler (type: t3.medium, ip: ['34.205.81.18'])


In [3]:
# Import Client from its submodule explicitly: a bare `import dask` does not
# guarantee that `dask.distributed` is available as a package attribute, so
# `dask.distributed.Client` only works if something else already imported it.
from dask.distributed import Client

# Connect this session to the Coiled cluster created above.
# NOTE(review): the captured output of this cell reports client/scheduler/
# worker version mismatches (e.g. dask 2021.08.1 locally vs 2021.09.0 on the
# cluster) — align the local environment with the cluster's.
client = Client(cluster)


+-------------+-----------+-----------+-----------+
| Package     | client    | scheduler | workers   |
+-------------+-----------+-----------+-----------+
| cloudpickle | 1.6.0     | 2.0.0     | 2.0.0     |
| dask        | 2021.08.1 | 2021.09.0 | 2021.09.0 |
| distributed | 2021.08.1 | 2021.09.0 | 2021.09.0 |
| pandas      | 1.3.2     | 1.3.3     | 1.3.3     |
+-------------+-----------+-----------+-----------+


In [6]:
# Read the year-2000 timeseries Parquet dataset from S3 with the pyarrow
# engine; storage_options request anonymous access over SSL.
ddf2000 = dd.read_parquet(
    "s3://coiled-datasets/timeseries/7d/parquet/2000",
    engine="pyarrow",
    storage_options={"anon": True, "use_ssl": True},
)

In [7]:
# Peek at the first rows: timestamp index with id, name, x and y columns.
ddf2000.head()

Unnamed: 0_level_0,id,name,x,y
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-01 00:00:00,1008,Dan,-0.259374,-0.118314
2000-01-01 00:00:01,987,Patricia,0.069601,0.755351
2000-01-01 00:00:02,980,Zelda,-0.281843,-0.510507
2000-01-01 00:00:03,1020,Ursula,-0.569904,0.523132
2000-01-01 00:00:04,967,Michael,-0.25146,0.81093


In [8]:
# Row count of the year-2000 dataset.
len(ddf2000)

31449600

In [9]:
# Number of partitions in the year-2000 dataset.
ddf2000.npartitions

52

In [10]:
# Memory used by the index and by each column; deep=True also measures the
# contents of object columns such as "name".
ddf2000.memory_usage(deep=True).compute()

Index     251596800
id        251596800
name     1960759872
x         251596800
y         251596800
dtype: int64

In [11]:
# Read the year-2001 timeseries Parquet dataset with the same engine and
# storage options as the year-2000 load.
ddf2001 = dd.read_parquet(
    "s3://coiled-datasets/timeseries/7d/parquet/2001",
    engine="pyarrow",
    storage_options={"anon": True, "use_ssl": True},
)

In [12]:
# Row count of the year-2001 dataset.
len(ddf2001)

31449600

In [14]:
# Number of partitions in the year-2001 dataset.
ddf2001.npartitions

52

In [15]:
# Vertically concatenate the two yearly datasets with a plain concat.
ddf = dd.concat([ddf2000, ddf2001])

In [16]:
%%time

# Row count of the plain-concatenated frame (timed by the %%time magic).
len(ddf)

CPU times: user 110 ms, sys: 27.3 ms, total: 138 ms
Wall time: 8.53 s


62899200

In [17]:
# Partition count after concatenation: 52 from each yearly dataset.
ddf.npartitions

104

In [110]:
# Divisions of the plain-concatenated frame; they are known (timestamps,
# not None).
ddf.divisions

(Timestamp('2000-01-01 00:00:00'),
 Timestamp('2000-01-08 00:00:00'),
 Timestamp('2000-01-15 00:00:00'),
 Timestamp('2000-01-22 00:00:00'),
 Timestamp('2000-01-29 00:00:00'),
 Timestamp('2000-02-05 00:00:00'),
 Timestamp('2000-02-12 00:00:00'),
 Timestamp('2000-02-19 00:00:00'),
 Timestamp('2000-02-26 00:00:00'),
 Timestamp('2000-03-04 00:00:00'),
 Timestamp('2000-03-11 00:00:00'),
 Timestamp('2000-03-18 00:00:00'),
 Timestamp('2000-03-25 00:00:00'),
 Timestamp('2000-04-01 00:00:00'),
 Timestamp('2000-04-08 00:00:00'),
 Timestamp('2000-04-15 00:00:00'),
 Timestamp('2000-04-22 00:00:00'),
 Timestamp('2000-04-29 00:00:00'),
 Timestamp('2000-05-06 00:00:00'),
 Timestamp('2000-05-13 00:00:00'),
 Timestamp('2000-05-20 00:00:00'),
 Timestamp('2000-05-27 00:00:00'),
 Timestamp('2000-06-03 00:00:00'),
 Timestamp('2000-06-10 00:00:00'),
 Timestamp('2000-06-17 00:00:00'),
 Timestamp('2000-06-24 00:00:00'),
 Timestamp('2000-07-01 00:00:00'),
 Timestamp('2000-07-08 00:00:00'),
 Timestamp('2000-07-

In [18]:
# Repeat the concatenation with interleave_partitions=True for comparison.
# Note that this rebinds the name ddf used by the plain concat.
ddf = dd.concat([ddf2000, ddf2001], interleave_partitions=True)

In [19]:
%%time

# Row count of the interleaved frame (timed by the %%time magic).
len(ddf)

CPU times: user 70.6 ms, sys: 7.41 ms, total: 78 ms
Wall time: 7.08 s


62899200

In [20]:
# Divisions of the interleaved frame, for comparison with the plain concat.
ddf.divisions

(Timestamp('2000-01-01 00:00:00'),
 Timestamp('2000-01-08 00:00:00'),
 Timestamp('2000-01-15 00:00:00'),
 Timestamp('2000-01-22 00:00:00'),
 Timestamp('2000-01-29 00:00:00'),
 Timestamp('2000-02-05 00:00:00'),
 Timestamp('2000-02-12 00:00:00'),
 Timestamp('2000-02-19 00:00:00'),
 Timestamp('2000-02-26 00:00:00'),
 Timestamp('2000-03-04 00:00:00'),
 Timestamp('2000-03-11 00:00:00'),
 Timestamp('2000-03-18 00:00:00'),
 Timestamp('2000-03-25 00:00:00'),
 Timestamp('2000-04-01 00:00:00'),
 Timestamp('2000-04-08 00:00:00'),
 Timestamp('2000-04-15 00:00:00'),
 Timestamp('2000-04-22 00:00:00'),
 Timestamp('2000-04-29 00:00:00'),
 Timestamp('2000-05-06 00:00:00'),
 Timestamp('2000-05-13 00:00:00'),
 Timestamp('2000-05-20 00:00:00'),
 Timestamp('2000-05-27 00:00:00'),
 Timestamp('2000-06-03 00:00:00'),
 Timestamp('2000-06-10 00:00:00'),
 Timestamp('2000-06-17 00:00:00'),
 Timestamp('2000-06-24 00:00:00'),
 Timestamp('2000-07-01 00:00:00'),
 Timestamp('2000-07-08 00:00:00'),
 Timestamp('2000-07-

In [114]:
# Peek at the first rows of the concatenated frame.
ddf.head()

Unnamed: 0_level_0,id,name,x,y
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-01 00:00:00,1008,Dan,-0.259374,-0.118314
2000-01-01 00:00:01,987,Patricia,0.069601,0.755351
2000-01-01 00:00:02,980,Zelda,-0.281843,-0.510507
2000-01-01 00:00:03,1020,Ursula,-0.569904,0.523132
2000-01-01 00:00:04,967,Michael,-0.25146,0.81093
