# Dask Array in 3 minutes

// From https://www.youtube.com/watch?v=9h_61hXCDuI)

In [36]:
def print_obj(obj, tag=None):
    """
    Print an object in terms of type and then display is.
    """
    if tag:
        print(tag)
    print("type=", type(obj))
    display(obj)
    

def print_dask(obj, visualize_graph=True, compute=True):
    """
    Print information about a dask task graph.
    """
    print("type=", type(obj))
    print("obj=", obj)
    #
    print("# display")
    display(obj)
    #
    print("# dask")
    display(obj.dask)
    #
    if visualize_graph:
        print("# visualize")
        display(obj.visualize())
    #
    if compute:
        print("# compute")
        res = obj.compute()
        print(type(res))
        print(res)

In [25]:
# https://stackoverflow.com/questions/59070260/dask-client-detect-local-default-cluster-already-running
import os
os.environ['DASK_SCHEDULER_ADDRESS'] = 'tcp://localhost:8787'

if not ("cluster" in globals() and "client" in globals()):
    from dask.distributed import Client, LocalCluster
    cluster = LocalCluster(dashboard_address=':8787')
    client = Client(cluster)
    print(client, client.dashboard_link)

<Client: 'tcp://127.0.0.1:35637' processes=4 threads=8, memory=31.01 GiB> http://127.0.0.1:8787/status


## Small array

In [None]:
import numpy as np

x = np.ones(15)
x

In [None]:
import dask.array as da

x = da.ones(15, chunks=(5, ))
x

In [None]:
# The return type is a scalar.
x.sum()

In [None]:
# Dask is lazy by default.
x.sum().compute()

## Medium array

In [None]:
x = da.ones((10_000, 10_000), chunks=(5000, 5000))
print_dask(x)

In [None]:
y = x + x.T

print_dask(y)

In [None]:
y.compute()

## Larger array

In [None]:
x = da.ones((10_000, 10_000), chunks=(1000, 1000))
x

In [None]:
y = x + x.T
print_dask(y)

In [None]:
y.compute()

# Dask DataFrame: An introduction

// https://www.youtube.com/watch?v=AT2XtFehFSQ&t=37s

In [None]:
import dask
import dask.dataframe as dd

# Get an example large dataset.
df_orig = dask.datasets.timeseries()
print_obj(df_orig)

# It has 30 partitions.

In [None]:
# Save to disk in chunks.
df_orig.to_csv("data")
!ls -lh data

In [None]:
# Load one chunk.
import pandas as pd
df = pd.read_csv("data/00.part", parse_dates=["timestamp"])
df

In [None]:
df.x.mean()

In [None]:
df.groupby("name").x.std()

In [None]:
import dask.dataframe as dd

# Read one partition with Dask.
#df = dd.read_csv("data/00.part", parse_dates=["timestamp"])

# Read all partitions with Dask.
df = dd.read_csv("data/*.part", parse_dates=["timestamp"])

print_obj(df)

# head() materializes the data.
df.head()

In [None]:
# We get a "lazy result". Dask reads from disk only when one asks for a result.
obj = df.x.mean()

print_obj(obj)

In [None]:
print_dask(obj, compute=False)

In [None]:
df.x.mean().compute()

In [None]:
obj = df.groupby("name").x.std()

obj.compute()

## Index, partitions, and sorting

In [None]:
# The original data read is made of 30 partitions.
# Each partition can be read in parallel and independently.
df

In [None]:
df.partitions[5]

In [None]:
obj = df.partitions[5].compute()

print_obj(obj)

In [None]:
# Apply a function across all the partitions.
#df.map_partitions(type).compute()
df.map_partitions(len).compute()

In [None]:
# Read the first partition.
df.head()

In [None]:
# Read the last partition.
df.tail()

In [None]:
# This forces Dask to read the data but it doesn't compute.
df = df.set_index("timestamp")
print_obj(df)

# The partitions host data between two different timestamps.
# In this way Dask knows in which file chunks of data.

In [None]:
# Save files to Parquet.
df.to_parquet("myfile.parquet")

In [None]:
!ls myfile.parquet

# Dask Bag

From https://www.youtube.com/watch?v=-qIiJ1XtSv0

In [None]:
import dask.bag as db

# Create a bag storing 10 elements.
b = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], npartitions=4)
print_obj(b)

In [None]:
# This produces a new bag.
obj = b.map(lambda x: x ** 2)

print_obj(b)

In [None]:
# Execute.
obj.compute()

In [None]:
# One can chain computations: e.g., filter, square and sum.
obj = b.filter(lambda x : x % 2 == 0).map(lambda x: x ** 2).sum()
print_obj(obj)

In [None]:
print_dask(obj)

In [None]:
obj.compute()

## An example with JSON data

In [None]:
#!wget https://archive.analytics.mybinder.org/events-2019-06-17.jsonl

In [None]:
!pip install aiohttp requests

In [18]:
import os
import requests

#os.system("rm -rf data_json")
os.system("mkdir data_json")

for month in range(6, 7):
    for day in range(1, 30):
        file = "events-2019-%02d-%02d.jsonl" % (month, day)
        dst_file = f'data_json/{file}'
        print(dst_file)
        if os.path.exists(dst_file):
            continue
        url = "https://archive.analytics.mybinder.org/%s" % file
        print(url)
        r = requests.get(url, allow_redirects=True)
        open(dst_file, 'wb').write(r.content)


mkdir: cannot create directory ‘data_json’: File exists


data_json/events-2019-06-01.jsonl
data_json/events-2019-06-02.jsonl
data_json/events-2019-06-03.jsonl
data_json/events-2019-06-04.jsonl
data_json/events-2019-06-05.jsonl
data_json/events-2019-06-06.jsonl
data_json/events-2019-06-07.jsonl
data_json/events-2019-06-08.jsonl
data_json/events-2019-06-09.jsonl
data_json/events-2019-06-10.jsonl
data_json/events-2019-06-11.jsonl
data_json/events-2019-06-12.jsonl
data_json/events-2019-06-13.jsonl
data_json/events-2019-06-14.jsonl
data_json/events-2019-06-15.jsonl
https://archive.analytics.mybinder.org/events-2019-06-15.jsonl
data_json/events-2019-06-16.jsonl
https://archive.analytics.mybinder.org/events-2019-06-16.jsonl
data_json/events-2019-06-17.jsonl
https://archive.analytics.mybinder.org/events-2019-06-17.jsonl
data_json/events-2019-06-18.jsonl
https://archive.analytics.mybinder.org/events-2019-06-18.jsonl
data_json/events-2019-06-19.jsonl
https://archive.analytics.mybinder.org/events-2019-06-19.jsonl
data_json/events-2019-06-20.jsonl
https

In [19]:
!ls data_json/*
!du -h data_json

data_json/events-2019-06-01.jsonl  data_json/events-2019-06-16.jsonl
data_json/events-2019-06-02.jsonl  data_json/events-2019-06-17.jsonl
data_json/events-2019-06-03.jsonl  data_json/events-2019-06-18.jsonl
data_json/events-2019-06-04.jsonl  data_json/events-2019-06-19.jsonl
data_json/events-2019-06-05.jsonl  data_json/events-2019-06-20.jsonl
data_json/events-2019-06-06.jsonl  data_json/events-2019-06-21.jsonl
data_json/events-2019-06-07.jsonl  data_json/events-2019-06-22.jsonl
data_json/events-2019-06-08.jsonl  data_json/events-2019-06-23.jsonl
data_json/events-2019-06-09.jsonl  data_json/events-2019-06-24.jsonl
data_json/events-2019-06-10.jsonl  data_json/events-2019-06-25.jsonl
data_json/events-2019-06-11.jsonl  data_json/events-2019-06-26.jsonl
data_json/events-2019-06-12.jsonl  data_json/events-2019-06-27.jsonl
data_json/events-2019-06-13.jsonl  data_json/events-2019-06-28.jsonl
data_json/events-2019-06-14.jsonl  data_json/events-2019-06-29.jsonl
data_json/events-2019-06-15.jsonl


In [3]:
!head data_json/events-2019-06-14.jsonl

!du -h data_json/events-2019-06-14.jsonl

{"timestamp": "2019-06-14T00:00:00+00:00", "schema": "binderhub.jupyter.org/launch", "version": 3, "provider": "GitHub", "spec": "QuantStack/xeus-cling/stable", "status": "success", "origin": "gke.mybinder.org"}
{"timestamp": "2019-06-14T00:00:00+00:00", "schema": "binderhub.jupyter.org/launch", "version": 3, "provider": "GitHub", "spec": "TheZetner/jupyter-examples-2019/master", "status": "success", "origin": "gke.mybinder.org"}
{"timestamp": "2019-06-14T00:00:00+00:00", "schema": "binderhub.jupyter.org/launch", "version": 3, "provider": "GitHub", "spec": "binder-examples/r/master", "status": "success", "origin": "gke.mybinder.org"}
{"timestamp": "2019-06-14T00:00:00+00:00", "schema": "binderhub.jupyter.org/launch", "version": 3, "provider": "GitHub", "spec": "binder-examples/r/master", "status": "success", "origin": "gke.mybinder.org"}
{"timestamp": "2019-06-14T00:00:00+00:00", "schema": "binderhub.jupyter.org/launch", "version": 3, "provider": "GitHub", "spec": "DS-100/textbook/mast

In [21]:
import dask.bag as db

# Read a single file.
#lines = db.read_text("data_json/events-2019-06-14.jsonl")

# Read all files.
lines = db.read_text("data_json/events-*.jsonl")

# Read the first 2 lines.
lines.take(2)

('{"timestamp": "2019-06-01T00:00:00+00:00", "schema": "binderhub.jupyter.org/launch", "version": 2, "provider": "GitHub", "spec": "sbl-sdsc/mmtf-workshop-2018/master", "status": "success"}\n',
 '{"timestamp": "2019-06-01T00:01:00+00:00", "schema": "binderhub.jupyter.org/launch", "version": 2, "provider": "GitHub", "spec": "DS-100/textbook/master", "status": "success"}\n')

In [22]:
# It has a certain number of partitions, one per original file.
lines

dask.bag<bag-from-delayed, npartitions=29>

In [23]:
# Transform the JSON lines into structured data.
import json

records = lines.map(json.loads)
records.take(2)

({'timestamp': '2019-06-01T00:00:00+00:00',
  'schema': 'binderhub.jupyter.org/launch',
  'version': 2,
  'provider': 'GitHub',
  'spec': 'sbl-sdsc/mmtf-workshop-2018/master',
  'status': 'success'},
 {'timestamp': '2019-06-01T00:01:00+00:00',
  'schema': 'binderhub.jupyter.org/launch',
  'version': 2,
  'provider': 'GitHub',
  'spec': 'DS-100/textbook/master',
  'status': 'success'})

In [26]:
# Do a frequency count to find binders that run the most often.
records.map(lambda d: d["spec"]).frequencies(sort=True).compute()

[('ipython/ipython-in-depth/master', 163735),
 ('jupyterlab/jupyterlab-demo/master', 33724),
 ('ines/spacy-io-binder/live', 16367),
 ('DS-100/textbook/master', 11167),
 ('bokeh/bokeh-notebooks/master', 7834),
 ('ines/spacy-course/binder', 4778),
 ('binder-examples/requirements/master', 4754),
 ('rationalmatter/juno-demo-notebooks/master', 4553),
 ('binder-examples/r/master', 4547),
 ('QuantStack/xeus-cling/stable', 3602),
 ('numba/numba-examples/master', 2179),
 ('binder-examples/julia-python/master', 1988),
 ('dask/dask-examples/master', 1852),
 ('noamross/gams-in-r-course/master', 1850),
 ('ELC/8fdc0f490b3058872a7014f01416dfb6/master', 1697),
 ('ComputoCienciasUniandes/FISI2028-201910/master', 1230),
 ('wshuyi/demo-spacy-text-processing/master', 1224),
 ('data-8/textbook/gh-pages', 1223),
 ('jmsevillam/Calculadoras/master', 1220),
 ('jupyter/jupyter-book/gh-pages', 1009),
 ('freechipsproject/chisel-bootcamp/master', 964),
 ('rasahq/docs-binder/master', 937),
 ('ELC/380e584b87227b1572

In [30]:
# Look for records that have "dask" in the specs.
obj = records.filter(lambda d: "dask" in d["spec"])

# Convert to strings and saves.
obj = obj.map(json.dumps).to_textfiles("data/analysis/*.json")

In [31]:
!ls -l data/analysis

total 440
-rw-r--r-- 1 root root  8233 Apr  4 15:14 00.json
-rw-r--r-- 1 root root  6815 Apr  4 15:14 01.json
-rw-r--r-- 1 root root  6801 Apr  4 15:14 02.json
-rw-r--r-- 1 root root 10753 Apr  4 15:14 03.json
-rw-r--r-- 1 root root 12211 Apr  4 15:14 04.json
-rw-r--r-- 1 root root 14101 Apr  4 15:14 05.json
-rw-r--r-- 1 root root 12530 Apr  4 15:14 06.json
-rw-r--r-- 1 root root  5071 Apr  4 15:14 07.json
-rw-r--r-- 1 root root  6622 Apr  4 15:14 08.json
-rw-r--r-- 1 root root 12887 Apr  4 15:14 09.json
-rw-r--r-- 1 root root 20549 Apr  4 15:14 10.json
-rw-r--r-- 1 root root 21964 Apr  4 15:14 11.json
-rw-r--r-- 1 root root 15047 Apr  4 15:14 12.json
-rw-r--r-- 1 root root 23266 Apr  4 15:14 13.json
-rw-r--r-- 1 root root  9631 Apr  4 15:14 14.json
-rw-r--r-- 1 root root 10921 Apr  4 15:14 15.json
-rw-r--r-- 1 root root 18209 Apr  4 15:14 16.json
-rw-r--r-- 1 root root 21744 Apr  4 15:14 17.json
-rw-r--r-- 1 root root 18538 Apr  4 15:14 18.json
-rw-r--r-- 1 root ro

In [33]:
!head -20 data/analysis/00.json

{"timestamp": "2019-06-01T00:11:00+00:00", "schema": "binderhub.jupyter.org/launch", "version": 2, "provider": "GitHub", "spec": "dask/dask-examples/master", "status": "success"}
{"timestamp": "2019-06-01T01:09:00+00:00", "schema": "binderhub.jupyter.org/launch", "version": 2, "provider": "GitHub", "spec": "dask/dask-examples/master", "status": "success"}
{"timestamp": "2019-06-01T03:37:00+00:00", "schema": "binderhub.jupyter.org/launch", "version": 2, "provider": "GitHub", "spec": "dask/dask-examples/master", "status": "success"}
{"timestamp": "2019-06-01T04:17:00+00:00", "schema": "binderhub.jupyter.org/launch", "version": 2, "provider": "GitHub", "spec": "dask/dask-examples/master", "status": "success"}
{"timestamp": "2019-06-01T05:03:00+00:00", "schema": "binderhub.jupyter.org/launch", "version": 2, "provider": "GitHub", "spec": "dask/dask-examples/master", "status": "success"}
{"timestamp": "2019-06-01T05:28:00+00:00", "schema": "binderhub.jupyter.org/launch", "version": 2, "

In [37]:
# Instead of using Bag, one can use DataFrame.

df = records.to_dataframe()
print_obj(df)

# It still a lazy result.

type= <class 'dask.dataframe.core.DataFrame'>


Unnamed: 0_level_0,timestamp,schema,version,provider,spec,status
npartitions=29,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,object,object,int64,object,object,object
,...,...,...,...,...,...
...,...,...,...,...,...,...
,...,...,...,...,...,...
,...,...,...,...,...,...


In [39]:
df.spec.value_counts().nlargest(20).to_frame().compute()

Unnamed: 0,spec
ipython/ipython-in-depth/master,163735
jupyterlab/jupyterlab-demo/master,33724
ines/spacy-io-binder/live,16367
DS-100/textbook/master,11167
bokeh/bokeh-notebooks/master,7834
ines/spacy-course/binder,4778
binder-examples/requirements/master,4754
rationalmatter/juno-demo-notebooks/master,4553
binder-examples/r/master,4547
QuantStack/xeus-cling/stable,3602
