# How to Check Progress of Dask Computations

## Local Computations

In [1]:
import dask.dataframe as dd # Uses threaded scheduler

In [2]:
# Lazily point a Dask DataFrame at the January 2019 yellow-taxi CSV on S3.
# The four small integer-coded columns are read with pandas' nullable
# unsigned 8-bit dtype ("UInt8").
df = dd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2019-01.csv",
    dtype=dict.fromkeys(
        ("payment_type", "VendorID", "passenger_count", "RatecodeID"),
        "UInt8",
    ),
)

### ProgressBar

In [3]:
from dask.diagnostics import ProgressBar

#### 1. Global Registration

In [4]:
# Create a single progress bar and register it globally, so it stays active
# for every computation until it is explicitly unregistered.
pbar = ProgressBar()
pbar.register()

In [5]:
# Average tip per passenger count; compute() runs the computation and the
# registered progress bar reports on it.
(
    df.groupby("passenger_count")
    .tip_amount.mean()
    .compute()
)

[########################################] | 100% Completed |  2min 36.8s


passenger_count
0    1.786901
1    1.828308
2    1.833877
3    1.795579
4    1.702710
5    1.869868
6    1.856830
7    6.542632
8    6.480690
9    3.116667
Name: tip_amount, dtype: float64

In [6]:
# Duration of the most recent computation tracked by the bar (the value shown
# below matches the 2min 36.8s reported above, so the unit is seconds).
pbar.last_duration 

156.777657374

In [7]:
# Undo the global registration so the bar is no longer attached to every
# computation.
pbar.unregister()

#### 2. Context Managers

In [8]:
# Scoped alternative to global registration: the progress bar is used as a
# context manager, so it applies only to the computation inside the block.
with pbar:
    (
        df.groupby("passenger_count")
        .tip_amount.mean()
        .compute()
    )

[########################################] | 100% Completed |  2min 36.0s


### Profilers

In [9]:
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler

In [10]:
# One instance of each local-scheduler profiler: task-level, resource-level
# and cache-level.
prof, rprof, cprof = Profiler(), ResourceProfiler(), CacheProfiler()

In [12]:
# Run the computation with all three profilers active at once; each one
# collects its own `results` for inspection afterwards.
with prof, rprof, cprof:
    (
        df.groupby("passenger_count")
        .tip_amount.mean()
        .compute()
    )

In [11]:
# Configure Bokeh to render plots inline in the notebook; presumably this is
# what lets the profilers' visualize() calls display here.
from bokeh.plotting import output_notebook
output_notebook()

In [13]:
# Peek at the first two task records: each TaskData holds the task key, the
# task itself, start/end times and the id of the worker that ran it.
prof.results[:2]

[TaskData(key=('read-csv-19b96a3dea475d91fd0eedeb908a6a93', 0), task=(subgraph_callable-e53d7957-2f8a-44ae-897e-6cbc0f56df26, [(<function read_block_from_file at 0x7f878de603a0>, <OpenFile 'nyc-tlc/trip data/yellow_tripdata_2019-01.csv'>, 0, 64000000, b'\n'), None, True, False]), start_time=729.439709938, end_time=846.541354457, worker_id=123145707692032),
 TaskData(key=('read-csv-19b96a3dea475d91fd0eedeb908a6a93', 1), task=(subgraph_callable-e53d7957-2f8a-44ae-897e-6cbc0f56df26, [(<function read_block_from_file at 0x7f878de603a0>, <OpenFile 'nyc-tlc/trip data/yellow_tripdata_2019-01.csv'>, 64000000, 64000000, b'\n'), None, False, False]), start_time=729.439725731, end_time=853.16867582, worker_id=123145724481536)]

In [14]:
# Plot the task-level profile collected during the profiled run.
prof.visualize()

In [15]:
# Peek at the first five resource samples: each ResourceData holds a
# timestamp plus memory and CPU readings.
rprof.results[:5]

[ResourceData(time=0.230385191, mem=306.028544, cpu=0.0),
 ResourceData(time=1.242952088, mem=306.270208, cpu=4.4),
 ResourceData(time=2.244945291, mem=306.364416, cpu=5.5),
 ResourceData(time=3.246121416, mem=306.823168, cpu=7.4),
 ResourceData(time=4.246472866, mem=307.613696, cpu=9.0)]

In [16]:
# Plot the resource samples collected during the profiled run.
rprof.visualize()

In [17]:
# Peek at the first two cache records: each CacheData holds the task key, the
# task, a metric value, and the times the result was cached and freed.
cprof.results[:2]

[CacheData(key=('read-csv-19b96a3dea475d91fd0eedeb908a6a93', 2), task=(subgraph_callable-e53d7957-2f8a-44ae-897e-6cbc0f56df26, [(<function read_block_from_file at 0x7f878de603a0>, <OpenFile 'nyc-tlc/trip data/yellow_tripdata_2019-01.csv'>, 128000000, 64000000, b'\n'), None, False, False]), metric=1, cache_time=809.216166928, free_time=809.266780843),
 CacheData(key=('read-csv-19b96a3dea475d91fd0eedeb908a6a93', 4), task=(subgraph_callable-e53d7957-2f8a-44ae-897e-6cbc0f56df26, [(<function read_block_from_file at 0x7f878de603a0>, <OpenFile 'nyc-tlc/trip data/yellow_tripdata_2019-01.csv'>, 256000000, 64000000, b'\n'), None, False, False]), metric=1, cache_time=827.4490766, free_time=827.505900753)]

In [18]:
# Plot the cache records collected during the profiled run.
cprof.visualize()

## Distributed Computations

In [19]:
# Create a distributed client. No scheduler address is passed, so this
# presumably starts a local cluster (the dashboard link shown further down
# points at 127.0.0.1).
from dask.distributed import Client
client = Client()

### 1. Progress Bar

In [20]:
# progress() displays a progress widget for work submitted to the
# distributed scheduler.
from dask.distributed import progress

# persist() is used instead of compute(): it hands back a Dask collection
# whose work is expected to run on the cluster in the background, which the
# progress widget then tracks.
result = df.groupby("passenger_count").tip_amount.mean().persist()
progress(result)

VBox()

### 2. Diagnostic Dashboard

In [25]:
# URL of the diagnostic dashboard served for this client's cluster.
client.dashboard_link

'http://127.0.0.1:8787/status'

### 3. Dask JupyterLab Extension

In [22]:
# Total tip amount per passenger count, run while the distributed client is
# active so there is work to watch in the dashboard panes.
(
    df.groupby("passenger_count")
    .tip_amount.sum()
    .compute()
)

passenger_count
0     209748.20
1    9975467.18
2    2043133.07
3     565106.33
4     239661.51
5     605541.83
6     372871.93
7        124.31
8        187.94
9         28.05
Name: tip_amount, dtype: float64

In [None]:
# Close the distributed client now that the demonstration is finished.
client.close()