# Dask

In [1]:
from dask.distributed import Client

In [2]:
# Start a local, in-process Dask cluster (processes=False): one worker with
# eight threads and a 4GB memory limit. The bare `client` on the last line
# renders the cluster summary and dashboard link.
client = Client(
    n_workers=1,
    threads_per_worker=8,
    processes=False,
    memory_limit='4GB',
)
client

0,1
Client  Scheduler: inproc://192.168.0.183/150300/1  Dashboard: http://192.168.0.183:8787/status,Cluster  Workers: 1  Cores: 8  Memory: 4.00 GB


In [3]:
import dask

In [4]:
df = dask.datasets.timeseries()

In [5]:
df

Unnamed: 0_level_0,id,name,x,y
npartitions=30,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-01,int64,object,float64,float64
2000-01-02,...,...,...,...
...,...,...,...,...
2000-01-30,...,...,...,...
2000-01-31,...,...,...,...


In [6]:
import os
import datetime

In [7]:
# Make sure the output directory for the generated files exists.
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of calling os.path.exists() followed by os.mkdir().
os.makedirs('data', exist_ok=True)

In [8]:
def name(i):
    """Return the date string used as a filename for partition index ``i``.

    Index 0 corresponds to 2000-01-01 and every following index advances
    the date by one day.

    Parameters
    ----------
    i : int
        Zero-based index of the file/partition.

    Returns
    -------
    str
        The date formatted as ``YYYY-MM-DD``.

    Examples
    --------
    >>> name(0)
    '2000-01-01'
    >>> name(10)
    '2000-01-11'
    """
    first_day = datetime.date(2000, 1, 1)
    offset = i * datetime.timedelta(days=1)
    return str(first_day + offset)

In [9]:
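# Write the timeseries out as a set of CSV files; name() supplies the date
# that replaces '*' in each filename (e.g. data/2000-01-01.csv).
# Already done in an earlier run, so the call is left commented out;
# uncomment it when the data/ directory is still empty.
#df.to_csv('data/*.csv', name_function=name);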

In [10]:
import dask.dataframe as dd

# Load all of the daily CSV files as a single Dask DataFrame, parsing the
# 'timestamp' column as datetimes.
# Note: a DataFrame created by read_csv has the Dask name 'from-delayed'.
df = dd.read_csv(
    'data/2000-*-*.csv',
    parse_dates=['timestamp'],
)
df



Unnamed: 0_level_0,timestamp,id,name,x,y
npartitions=30,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,datetime64[ns],int64,object,float64,float64
,...,...,...,...,...
...,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


In [11]:
df.head()

Unnamed: 0,timestamp,id,name,x,y
0,2000-01-01 00:00:00,975,Frank,0.214951,-0.066426
1,2000-01-01 00:00:01,1039,Oliver,-0.547168,0.230221
2,2000-01-01 00:00:02,990,Alice,-0.777825,-0.867442
3,2000-01-01 00:00:03,972,Ursula,0.065803,-0.22457
4,2000-01-01 00:00:04,1035,Kevin,0.130403,0.604034


In [12]:
df.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 5 entries, timestamp to y
dtypes: datetime64[ns](1), object(1), float64(2), int64(1)

In [13]:
df.info

<bound method DataFrame.info of Dask DataFrame Structure:
                     timestamp     id    name        x        y
npartitions=30                                                 
                datetime64[ns]  int64  object  float64  float64
                           ...    ...     ...      ...      ...
...                        ...    ...     ...      ...      ...
                           ...    ...     ...      ...      ...
                           ...    ...     ...      ...      ...
Dask Name: from-delayed, 90 tasks>

In [14]:
df.describe

<bound method _Frame.describe of Dask DataFrame Structure:
                     timestamp     id    name        x        y
npartitions=30                                                 
                datetime64[ns]  int64  object  float64  float64
                           ...    ...     ...      ...      ...
...                        ...    ...     ...      ...      ...
                           ...    ...     ...      ...      ...
                           ...    ...     ...      ...      ...
Dask Name: from-delayed, 90 tasks>

In [15]:
df.describe()

Unnamed: 0_level_0,id,x,y
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,float64,float64,float64
,...,...,...


In [16]:
# memory efficient but 'slow'
%time
df.groupby('name').x.mean().compute()

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.34 µs


name
Alice      -0.000451
Bob        -0.003964
Charlie    -0.003123
Dan         0.003863
Edith      -0.000875
Frank       0.001855
George     -0.000439
Hannah      0.000771
Ingrid      0.000342
Jerry       0.000512
Kevin      -0.001705
Laura       0.001074
Michael     0.000081
Norbert    -0.002193
Oliver     -0.001725
Patricia    0.000033
Quinn       0.000675
Ray        -0.000274
Sarah       0.000381
Tim        -0.002564
Ursula     -0.000375
Victor     -0.000221
Wendy       0.000914
Xavier      0.000232
Yvonne     -0.002146
Zelda       0.001200
Name: x, dtype: float64

In [17]:
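# Write the data to Parquet using the pyarrow engine.
# Already done in an earlier run, so the call is left commented out;
# uncomment it if data/2000-01.parquet does not exist yet.
#df.to_parquet('data/2000-01.parquet', engine='pyarrow')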

In [18]:
# Load the Parquet data with the pyarrow engine as a Dask DataFrame; the bare
# `df` on the last line displays its structure.
df = dd.read_parquet(
    'data/2000-01.parquet',
    engine='pyarrow',
)
df

Unnamed: 0_level_0,timestamp,id,name,x,y
npartitions=30,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,datetime64[ns],int64,object,float64,float64
,...,...,...,...,...
...,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


In [19]:
df.head()

Unnamed: 0_level_0,timestamp,id,name,x,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2000-01-01 00:00:00,975,Frank,0.214951,-0.066426
1,2000-01-01 00:00:01,1039,Oliver,-0.547168,0.230221
2,2000-01-01 00:00:02,990,Alice,-0.777825,-0.867442
3,2000-01-01 00:00:03,972,Ursula,0.065803,-0.22457
4,2000-01-01 00:00:04,1035,Kevin,0.130403,0.604034


In [20]:
df.info

<bound method DataFrame.info of Dask DataFrame Structure:
                     timestamp     id    name        x        y
npartitions=30                                                 
                datetime64[ns]  int64  object  float64  float64
                           ...    ...     ...      ...      ...
...                        ...    ...     ...      ...      ...
                           ...    ...     ...      ...      ...
                           ...    ...     ...      ...      ...
Dask Name: read-parquet, 30 tasks>

In [21]:
df.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 5 entries, timestamp to y
dtypes: datetime64[ns](1), object(1), float64(2), int64(1)

In [22]:
df.describe

<bound method _Frame.describe of Dask DataFrame Structure:
                     timestamp     id    name        x        y
npartitions=30                                                 
                datetime64[ns]  int64  object  float64  float64
                           ...    ...     ...      ...      ...
...                        ...    ...     ...      ...      ...
                           ...    ...     ...      ...      ...
                           ...    ...     ...      ...      ...
Dask Name: read-parquet, 30 tasks>

In [23]:
df.describe()

Unnamed: 0_level_0,id,x,y
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,float64,float64,float64
,...,...,...


In [24]:
# same command as before, but much faster
%time
df.groupby('name').x.mean().compute()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs


name
Alice      -0.000451
Bob        -0.003964
Charlie    -0.003123
Dan         0.003863
Edith      -0.000875
Frank       0.001855
George     -0.000439
Hannah      0.000771
Ingrid      0.000342
Jerry       0.000512
Kevin      -0.001705
Laura       0.001074
Michael     0.000081
Norbert    -0.002193
Oliver     -0.001725
Patricia    0.000033
Quinn       0.000675
Ray        -0.000274
Sarah       0.000381
Tim        -0.002564
Ursula     -0.000375
Victor     -0.000221
Wendy       0.000914
Xavier      0.000232
Yvonne     -0.002146
Zelda       0.001200
Name: x, dtype: float64

In [25]:
# can pull out only the columns you need to save time
# woohoo look at the time difference here to read the file and compute!
%time
df = dd.read_parquet('data/2000-01.parquet', columns=['name', 'x'], engine='pyarrow')
df.groupby('name').x.mean().compute()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.68 µs


name
Alice      -0.000451
Bob        -0.003964
Charlie    -0.003123
Dan         0.003863
Edith      -0.000875
Frank       0.001855
George     -0.000439
Hannah      0.000771
Ingrid      0.000342
Jerry       0.000512
Kevin      -0.001705
Laura       0.001074
Michael     0.000081
Norbert    -0.002193
Oliver     -0.001725
Patricia    0.000033
Quinn       0.000675
Ray        -0.000274
Sarah       0.000381
Tim        -0.002564
Ursula     -0.000375
Victor     -0.000221
Wendy       0.000914
Xavier      0.000232
Yvonne     -0.002146
Zelda       0.001200
Name: x, dtype: float64

In [26]:
client.close()