# Dask

In [1]:
from dask.distributed import Client

In [2]:
# Start an in-process Dask cluster (processes=False): a single worker with
# 8 threads and a 4 GB memory limit.
client = Client(n_workers=1, threads_per_worker=8, processes=False, memory_limit='4GB')
# Display the client summary (scheduler address, dashboard URL, worker resources).
client

0,1
Client  Scheduler: inproc://192.168.0.183/46172/1  Dashboard: http://192.168.0.183:8787/status,Cluster  Workers: 1  Cores: 8  Memory: 4.00 GB


In [3]:
import dask

In [4]:
# Build a lazy demo time-series DataFrame (columns: id, name, x, y).
# The values are randomly generated, so pin the seed: any files regenerated
# from this frame will then contain the same data on every run.
df = dask.datasets.timeseries(seed=42)

In [5]:
# Show the lazy frame's structure (column dtypes and partition boundaries).
df

Unnamed: 0_level_0,id,name,x,y
npartitions=30,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-01,int64,object,float64,float64
2000-01-02,...,...,...,...
...,...,...,...,...
2000-01-30,...,...,...,...
2000-01-31,...,...,...,...


In [6]:
import os
import datetime

In [7]:
# Ensure the output directory exists. makedirs(exist_ok=True) is idempotent and
# avoids the check-then-create race of os.path.exists() followed by os.mkdir().
# It also fails loudly (FileExistsError) if 'data' exists but is not a directory,
# instead of silently skipping and breaking later writes.
os.makedirs('data', exist_ok=True)

In [8]:
def name(i):
    """Return the ISO date string used as the filename for index ``i``.

    Index 0 maps to 2000-01-01; each following index advances the date
    by one day.

    Examples
    --------
    >>> name(0)
    '2000-01-01'
    >>> name(10)
    '2000-01-11'
    """
    first_day = datetime.date(2000, 1, 1)
    offset = i * datetime.timedelta(days=1)
    return (first_day + offset).isoformat()

In [9]:
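# Write the data out as a set of date-named CSV files in data/ (name() supplies each filename).
# Already done once, so the call is left commented out; uncomment to regenerate the files.
#df.to_csv('data/*.csv', name_function=name);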

In [10]:
import dask.dataframe as dd

# Load every CSV matching the glob into a single lazy Dask DataFrame,
# parsing the 'timestamp' column as datetimes.
# Note: with read_csv, the Dask name is 'from-delayed'.
df = dd.read_csv('data/2000-*-*.csv', parse_dates=['timestamp'])
# Display the frame's structure (column dtypes, number of partitions).
df

Unnamed: 0_level_0,timestamp,id,name,x,y
npartitions=30,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,datetime64[ns],int64,object,float64,float64
,...,...,...,...,...
...,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


In [11]:
# Preview the first five rows.
df.head()

Unnamed: 0,timestamp,id,name,x,y
0,2000-01-01 00:00:00,975,Frank,0.214951,-0.066426
1,2000-01-01 00:00:01,1039,Oliver,-0.547168,0.230221
2,2000-01-01 00:00:02,990,Alice,-0.777825,-0.867442
3,2000-01-01 00:00:03,972,Ursula,0.065803,-0.22457
4,2000-01-01 00:00:04,1035,Kevin,0.130403,0.604034


In [12]:
# With a Dask DataFrame, you must call compute() to get the actual pandas DataFrame.
# NOTE: compute() materializes every partition in memory at once. That is fine
# here (~119 MB per the info() output), but avoid it on larger-than-memory data.
df.compute().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2592000 entries, 0 to 86399
Data columns (total 5 columns):
 #   Column     Dtype         
---  ------     -----         
 0   timestamp  datetime64[ns]
 1   id         int64         
 2   name       object        
 3   x          float64       
 4   y          float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 118.7+ MB


In [13]:
# Materialize the full frame as pandas and summarize its numeric columns.
# NOTE(review): df.describe().compute() would let Dask aggregate per partition
# instead of loading everything first; confirm its percentile output is
# acceptable before switching.
df.compute().describe()

Unnamed: 0,id,x,y
count,2592000.0,2592000.0,2592000.0
mean,1000.023,-0.0003122162,7.878137e-05
std,31.62326,0.5773607,0.5773813
min,850.0,-0.9999995,-0.9999983
25%,979.0,-0.5006739,-0.4999062
50%,1000.0,-0.0005362927,9.349257e-05
75%,1021.0,0.4994839,0.4998504
max,1182.0,0.9999991,0.9999994


In [14]:
# Mean of x per name on the CSV-backed frame, timed with %time.
# Memory efficient but 'slow': the CSV files are read and parsed as part of
# this computation.
%time df.groupby('name').x.mean().compute()

CPU times: user 3.24 s, sys: 186 ms, total: 3.42 s
Wall time: 1.77 s


name
Alice      -0.000451
Bob        -0.003964
Charlie    -0.003123
Dan         0.003863
Edith      -0.000875
Frank       0.001855
George     -0.000439
Hannah      0.000771
Ingrid      0.000342
Jerry       0.000512
Kevin      -0.001705
Laura       0.001074
Michael     0.000081
Norbert    -0.002193
Oliver     -0.001725
Patricia    0.000033
Quinn       0.000675
Ray        -0.000274
Sarah       0.000381
Tim        -0.002564
Ursula     -0.000375
Victor     -0.000221
Wendy       0.000914
Xavier      0.000232
Yvonne     -0.002146
Zelda       0.001200
Name: x, dtype: float64

In [15]:
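# Write the data to a Parquet dataset using the pyarrow engine.
# Already done once, so the call is left commented out; uncomment to regenerate it.
#df.to_parquet('data/2000-01.parquet', engine='pyarrow')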

In [16]:
# Read the Parquet dataset back in as a Dask DataFrame. This rebinds `df`,
# replacing the CSV-backed frame used in the cells above.
df = dd.read_parquet('data/2000-01.parquet', engine='pyarrow')
# Preview the first five rows to confirm the contents match the CSV version.
df.head()

Unnamed: 0_level_0,timestamp,id,name,x,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2000-01-01 00:00:00,975,Frank,0.214951,-0.066426
1,2000-01-01 00:00:01,1039,Oliver,-0.547168,0.230221
2,2000-01-01 00:00:02,990,Alice,-0.777825,-0.867442
3,2000-01-01 00:00:03,972,Ursula,0.065803,-0.22457
4,2000-01-01 00:00:04,1035,Kevin,0.130403,0.604034


In [17]:
# Materialize the Parquet-backed frame as pandas and report its dtypes,
# row count, and memory usage.
df.compute().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2592000 entries, 0 to 86399
Data columns (total 5 columns):
 #   Column     Dtype         
---  ------     -----         
 0   timestamp  datetime64[ns]
 1   id         int64         
 2   name       object        
 3   x          float64       
 4   y          float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 118.7+ MB


In [18]:
# Summary statistics for the numeric columns of the Parquet-backed frame.
df.compute().describe()

Unnamed: 0,id,x,y
count,2592000.0,2592000.0,2592000.0
mean,1000.023,-0.0003122162,7.878137e-05
std,31.62326,0.5773607,0.5773813
min,850.0,-0.9999995,-0.9999983
25%,979.0,-0.5006739,-0.4999062
50%,1000.0,-0.0005362927,9.349257e-05
75%,1021.0,0.4994839,0.4998504
max,1182.0,0.9999991,0.9999994


In [19]:
# Same command as before, but much faster now that `df` is backed by Parquet
# instead of CSV (wall time 727 ms vs. 1.77 s in the recorded runs).
%time df.groupby('name').x.mean().compute()

CPU times: user 1.16 s, sys: 101 ms, total: 1.26 s
Wall time: 727 ms


name
Alice      -0.000451
Bob        -0.003964
Charlie    -0.003123
Dan         0.003863
Edith      -0.000875
Frank       0.001855
George     -0.000439
Hannah      0.000771
Ingrid      0.000342
Jerry       0.000512
Kevin      -0.001705
Laura       0.001074
Michael     0.000081
Norbert    -0.002193
Oliver     -0.001725
Patricia    0.000033
Quinn       0.000675
Ray        -0.000274
Sarah       0.000381
Tim        -0.002564
Ursula     -0.000375
Victor     -0.000221
Wendy       0.000914
Xavier      0.000232
Yvonne     -0.002146
Zelda       0.001200
Name: x, dtype: float64

In [20]:
# You can read only the columns you need to save time.
# Compare the timing below (which covers both reading the file and computing) with the previous run.

In [21]:
%%time
# %%time must stay on the first line of the cell; it times everything below.
# Load only the two columns the aggregation needs (this rebinds `df`), then
# compute the per-name mean of x.
df = dd.read_parquet('data/2000-01.parquet', columns=['name', 'x'], engine='pyarrow')
df.groupby('name').x.mean().compute()

CPU times: user 1.02 s, sys: 42.5 ms, total: 1.06 s
Wall time: 644 ms


name
Alice      -0.000451
Bob        -0.003964
Charlie    -0.003123
Dan         0.003863
Edith      -0.000875
Frank       0.001855
George     -0.000439
Hannah      0.000771
Ingrid      0.000342
Jerry       0.000512
Kevin      -0.001705
Laura       0.001074
Michael     0.000081
Norbert    -0.002193
Oliver     -0.001725
Patricia    0.000033
Quinn       0.000675
Ray        -0.000274
Sarah       0.000381
Tim        -0.002564
Ursula     -0.000375
Victor     -0.000221
Wendy       0.000914
Xavier      0.000232
Yvonne     -0.002146
Zelda       0.001200
Name: x, dtype: float64

In [22]:
# Shut down the Dask client now that the analysis is finished.
client.close()