# Converting Dask DataFrame to Dask Array

In [15]:
import dask.dataframe as dd

import numpy as np
import pandas as pd

In [2]:
pdf = pd.DataFrame(
    {"num1": [1, 2, 3, 4], "num2": [7, 8, 9, 10]},
)

In [3]:
ddf = dd.from_pandas(pdf, npartitions=2)

In [4]:
my_arr = ddf.to_dask_array()

In [5]:
my_arr

Unnamed: 0,Array,Chunk
Bytes,unknown,unknown
Shape,"(nan, 2)","(nan, 2)"
Count,4 Tasks,2 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes unknown unknown Shape (nan, 2) (nan, 2) Count 4 Tasks 2 Chunks Type int64 numpy.ndarray",,

Unnamed: 0,Array,Chunk
Bytes,unknown,unknown
Shape,"(nan, 2)","(nan, 2)"
Count,4 Tasks,2 Chunks
Type,int64,numpy.ndarray


In [6]:
my_arr.compute()

array([[ 1,  7],
       [ 2,  8],
       [ 3,  9],
       [ 4, 10]])

In [9]:
my_arr = ddf.to_dask_array(lengths=True)

In [10]:
my_arr

Unnamed: 0,Array,Chunk
Bytes,64 B,32 B
Shape,"(4, 2)","(2, 2)"
Count,4 Tasks,2 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 64 B 32 B Shape (4, 2) (2, 2) Count 4 Tasks 2 Chunks Type int64 numpy.ndarray",2  4,

Unnamed: 0,Array,Chunk
Bytes,64 B,32 B
Shape,"(4, 2)","(2, 2)"
Count,4 Tasks,2 Chunks
Type,int64,numpy.ndarray


In [23]:
my_arr = ddf.to_dask_array(lengths=[3, 1])

In [24]:
my_arr

Unnamed: 0,Array,Chunk
Bytes,64 B,48 B
Shape,"(4, 2)","(3, 2)"
Count,4 Tasks,2 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 64 B 48 B Shape (4, 2) (3, 2) Count 4 Tasks 2 Chunks Type int64 numpy.ndarray",2  4,

Unnamed: 0,Array,Chunk
Bytes,64 B,48 B
Shape,"(4, 2)","(3, 2)"
Count,4 Tasks,2 Chunks
Type,int64,numpy.ndarray


In [16]:
my_arr = ddf.to_dask_array(lengths=True, meta=np.float64)

In [17]:
my_arr

AttributeError: 'getset_descriptor' object has no attribute 'itemsize'

dask.array<values, shape=(nan, 2), dtype=<attribute 'dtype' of 'numpy.generic' objects>, chunksize=(nan, 2), chunktype=builtins.type>

## heterogeneous data

In [25]:
pdf = pd.DataFrame(
    {"num": [1, 2, 3, 4], "letter": ["a", "b", "c", "d"]},
)

In [26]:
ddf = dd.from_pandas(pdf, npartitions=2)

In [31]:
my_arr = ddf.to_dask_array(lengths=True)

In [32]:
my_arr

Unnamed: 0,Array,Chunk
Bytes,64 B,32 B
Shape,"(4, 2)","(2, 2)"
Count,4 Tasks,2 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 64 B 32 B Shape (4, 2) (2, 2) Count 4 Tasks 2 Chunks Type object numpy.ndarray",2  4,

Unnamed: 0,Array,Chunk
Bytes,64 B,32 B
Shape,"(4, 2)","(2, 2)"
Count,4 Tasks,2 Chunks
Type,object,numpy.ndarray


In [33]:
my_arr.compute()

array([[1, 'a'],
       [2, 'b'],
       [3, 'c'],
       [4, 'd']], dtype=object)

## Large dataset

In [2]:
import coiled
import dask
import dask.dataframe as dd

In [3]:
cluster = coiled.Cluster(name="powers-crt-003", software="crt-003", n_workers=5)

Output()

In [4]:
client = dask.distributed.Client(cluster)

In [5]:
ddf = dd.read_parquet(
    "s3://coiled-datasets/timeseries/20-years/parquet",
    storage_options={"anon": True, "use_ssl": True},
)

In [6]:
ddf.dtypes

id        int64
name     object
x       float64
y       float64
dtype: object

In [7]:
ddf = dd.read_parquet(
    "s3://coiled-datasets/timeseries/20-years/parquet",
    storage_options={"anon": True, "use_ssl": True},
    columns=["x", "y"],
)

In [8]:
some_arr = ddf.to_dask_array(lengths=True)

In [9]:
some_arr

Unnamed: 0,Array,Chunk
Bytes,9.87 GiB,9.23 MiB
Shape,"(662256000, 2)","(604800, 2)"
Count,2190 Tasks,1095 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 9.87 GiB 9.23 MiB Shape (662256000, 2) (604800, 2) Count 2190 Tasks 1095 Chunks Type float64 numpy.ndarray",2  662256000,

Unnamed: 0,Array,Chunk
Bytes,9.87 GiB,9.23 MiB
Shape,"(662256000, 2)","(604800, 2)"
Count,2190 Tasks,1095 Chunks
Type,float64,numpy.ndarray


distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError
Traceback (most recent call last):
  File "/Users/powers/opt/miniconda3/envs/crt-003/lib/python3.9/site-packages/distributed/comm/tcp.py", line 409, in connect
    stream = await self.client.connect(
  File "/Users/powers/opt/miniconda3/envs/crt-003/lib/python3.9/site-packages/tornado/tcpclient.py", line 275, in connect
    af, addr, stream = await connector.start(connect_timeout=timeout)
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/powers/opt/miniconda3/envs/crt-003/lib/python3.9/asyncio/tasks.py", line 490, in wait_for
    return fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of the following excep