In [1]:
import dask.dataframe as dd
import numpy as np

# Dask Schedulers

## TL;DR: Use `distributed`

The `distributed` scheduler is recommended by the Dask developers even when working on a single machine:


* Diagnostic UI
* Takes account of data locality
* Futures only work with `distributed`

## Firing up distributed

In [2]:
from dask.distributed import Client
# Client() with no arguments; the summary displayed further down reports a
# scheduler on 127.0.0.1 with 4 workers.
client = Client()

In [3]:
# Display the client summary (scheduler address, dashboard link, workers).
client

0,1
Client  Scheduler: tcp://127.0.0.1:53861  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 17.18 GB


In [4]:
# Close this client before a new one is created with custom worker settings.
client.close()

### Adjust the number of workers

In [5]:
from dask.distributed import LocalCluster

In [6]:
# Local cluster with two single-threaded workers; the memory limit applies
# per worker (the client summary reports 8 GB in total for the two workers).
cluster = LocalCluster(
    n_workers=2,
    threads_per_worker=1,
    memory_limit='4GB',
)

In [7]:
# Attach a client to the explicitly configured cluster and display its summary.
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:53882  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 8.00 GB


## Managing distributed

## Understanding the task graph

## Creating `dask` `DataFrames`

* read_csv
* read_table
* read_parquet
* read_json
* from_pandas
* from_delayed

## Selecting data

In [None]:
# Row selection with a boolean mask (example only; kept commented out).
# df[df.x > 0]

In [None]:
# Label-based slice on the index (example only; kept commented out).
# df.loc[4.0:10.5]

## Using the `index`

In [None]:
# fast: rows of both frames are matched on their index
dd.merge(
    df1,
    df2,
    left_index=True,
    right_index=True,
)

In [None]:
# fast: `idx` is the index of both frames, `x` is an additional key
dd.merge(
    df1,
    df2,
    on=['idx', 'x'],
)

## Querying the data

## Merge/Join

In [None]:
# fast: rows of both frames are matched on their index
dd.merge(
    df1,
    df2,
    left_index=True,
    right_index=True,
)

In [None]:
# fast: `idx` is the index of both frames, `x` is an additional key
dd.merge(
    df1,
    df2,
    on=['idx', 'x'],
)

In [None]:
# join against another DataFrame, matching rows on the `id` column
dd.merge(
    df1,
    df2,
    on='id',
)

## Groupby/Apply

In [None]:
# Maximum of column `y` within each group of `x`.
df.groupby('x')['y'].max()

In [None]:
# apply() needs a callable that receives each group; calling it with no
# arguments raises TypeError. Illustrative callable: first row of each group.
df.groupby(['idx', 'x']).apply(lambda group: group.head(1))

## General computations

In [None]:
# Element-wise sum of column `x` of df1 and column `y` of df2.
df1.x + df2.y # fast

In [None]:
# rolling() requires a window size; chain an aggregation such as .mean()
# on the result to compute a rolling statistic.
df.rolling(window=3)

In [None]:
# where() keeps values where the condition holds and replaces the rest.
df.where(df.x > 5, np.nan)  # keep > 5 only, others nan
# mask() is the inverse: values where the condition holds are replaced.
# The condition uses the same column `x` as the line before it.
df.mask(df.x < 5)  # < 5 is nan

## Optimizing

Use categorical dtypes (`category`) for low-cardinality columns where possible.

## Don't do the shuffle

In [None]:
# Re-indexes the frame on column `x`; shown in this section as an operation
# to avoid.
df.set_index(df.x)

In [None]:
# Merge on a column that is not the index; shown in this section as an
# operation to avoid.
dd.merge(df1, df2, on='not_index')

## Reshape/Pivot

* get_dummies
* pivot_table
* melt

## Dask Specific

`df.map_partitions(func)`: apply `func` to every partition of the DataFrame.

In [None]:
# repartition is a DataFrame method. Pass exactly one of the keyword options:
# divisions=, npartitions=, freq=, or partition_size=.
df.repartition(npartitions=4)

In [None]:
# Split the rows pseudo-randomly into two frames with fractions 0.8 and 0.2.
# A fixed random_state makes the split reproducible across runs.
df.random_split([0.8, 0.2], random_state=42)

In [2]:
# map_overlap is reached directly on the DataFrame; `df.rolling` is itself a
# method and has no `map_overlap` attribute.
df.map_overlap

# https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.rolling.map_overlap

NameError: name 'df' is not defined

### Series

In [None]:
# Qualified with the Series class: a bare `map_overlap` is an undefined name.
dd.Series.map_overlap

In [None]:
# Qualified with the Series class: a bare `nunique_approx` is an undefined name.
dd.Series.nunique_approx

`split_every`: Group partitions into groups of this size while performing a tree-reduction. If set to False, no tree-reduction will be used. Default is 8.

## Save Dataframes

* to_csv
* to_parquet
* to_hdf
* to_json