# Running Dask on AzureML

This notebook shows how to run a batch job on a Dask cluster hosted on an AzureML Compute cluster.
For instructions on setting up your Python environment, see the [README](../README.md).

In [1]:
from azureml.core import Workspace, Experiment
from azureml.train.estimator import Estimator
from azureml.widgets import RunDetails
from azureml.core.runconfig import MpiConfiguration
from azureml.core import VERSION
import uuid
import time
VERSION


'1.12.0'

In [2]:
# Obtain the AzureML Workspace handle; later cells use `ws` to reach the
# default datastore, the compute targets and the 'dask' experiment.
ws = Workspace.from_config()

## Download the NYC Taxi dataset and upload to the workspace default blob storage

In [9]:
import io
import os
import sys
import urllib.request
from tqdm import tqdm
from time import sleep

# Local staging area for the dataset: <current working directory>/data/nyctaxi.
cwd = os.getcwd()
data_dir = os.path.abspath(os.path.join(cwd, 'data'))
taxidir = os.path.join(data_dir, 'nyctaxi')

# One makedirs call creates data_dir and taxidir together. exist_ok=True keeps
# the cell safe to re-run and removes the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(taxidir, exist_ok=True)

# One CSV per month of 2015, plus a parallel list of local destination paths.
filenames = [
    "yellow_tripdata_2015-{month:02d}.csv".format(month=month)
    for month in range(1, 13)
]
local_paths = [os.path.join(taxidir, name) for name in filenames]

# Fetch every monthly CSV that is not already present locally.
for filename, local_path in zip(filenames, local_paths):
    url = "http://dask-data.s3.amazonaws.com/nyc-taxi/2015/" + filename
    if os.path.exists(local_path):
        print("- File already exists locally: " + local_path)
        continue

    print("- Downloading " + url)
    # Stream into a sibling ".part" file and rename it only after the whole
    # body has been written. Otherwise a failed or interrupted transfer would
    # leave a truncated file that the exists() check above skips on the next run.
    part_path = local_path + ".part"
    try:
        with urllib.request.urlopen(url) as resp, open(part_path, 'wb') as file:
            # The Content-Length header may be absent; tqdm accepts total=None.
            content_length = resp.getheader('content-length')
            length = int(content_length) if content_length is not None else None
            # Aim for roughly 100 progress updates, never reading less than
            # 4 KiB at a time; fall back to 1 MiB reads when the size is unknown.
            blocksize = max(4096, length // 100) if length is not None else 1024 * 1024
            received = 0
            with tqdm(total=length, file=sys.stdout) as pbar:
                while True:
                    buff = resp.read(blocksize)
                    if not buff:
                        break
                    file.write(buff)
                    received += len(buff)
                    pbar.update(len(buff))
        # Refuse to keep a body that is shorter (or longer) than announced.
        if length is not None and received != length:
            raise IOError(
                "Incomplete download of {}: got {} of {} bytes".format(url, received, length)
            )
        # Rename within the same directory so the final path only ever holds
        # a complete file.
        os.replace(part_path, local_path)
    except BaseException:
        # BaseException also covers a kernel interrupt (KeyboardInterrupt):
        # remove the partial file, then let the original error propagate.
        if os.path.exists(part_path):
            os.remove(part_path)
        raise

# Publish the staged CSV directory to the workspace's default datastore.
print("- Uploading taxi data... ")

ws = Workspace.from_config()
ds = ws.get_default_datastore()

# taxidir is the upload source and 'nyctaxi' the target path on the datastore.
ds.upload(src_dir=taxidir, target_path='nyctaxi', show_progress=True)

print("- Data transfer complete")

- Downloading http://dask-data.s3.amazonaws.com/nyc-taxi/2015/yellow_tripdata_2015-01.csv
100%|██████████| 1985964692/1985964692 [00:30<00:00, 65604283.41it/s] 
- Downloading http://dask-data.s3.amazonaws.com/nyc-taxi/2015/yellow_tripdata_2015-02.csv
100%|██████████| 1945357622/1945357622 [00:29<00:00, 65506177.65it/s]
- Downloading http://dask-data.s3.amazonaws.com/nyc-taxi/2015/yellow_tripdata_2015-03.csv
100%|██████████| 2087971794/2087971794 [00:33<00:00, 62180625.55it/s] 
- Downloading http://dask-data.s3.amazonaws.com/nyc-taxi/2015/yellow_tripdata_2015-04.csv
100%|██████████| 2046225765/2046225765 [00:31<00:00, 65746019.73it/s]
- Downloading http://dask-data.s3.amazonaws.com/nyc-taxi/2015/yellow_tripdata_2015-05.csv
100%|██████████| 2061869121/2061869121 [00:27<00:00, 73939136.66it/s] 
- Downloading http://dask-data.s3.amazonaws.com/nyc-taxi/2015/yellow_tripdata_2015-06.csv
100%|██████████| 1932049357/1932049357 [00:29<00:00, 64596156.85it/s]
- Downloading http://dask-data.s3.ama

## Starting the cluster

In [10]:
# We assume the AML compute training cluster is already created.
# Set this to the name of a compute cluster that exists in your workspace.
compute_target_name = 'daniel-big'

# Read the compute-target mapping once, then fail with an actionable message
# (same KeyError type as a plain lookup) when the name is not present.
available_targets = ws.compute_targets
if compute_target_name not in available_targets:
    raise KeyError(
        f"Compute target {compute_target_name!r} not found in this workspace; "
        f"available targets: {sorted(available_targets)}"
    )
dask_cluster = available_targets[compute_target_name]

Start the Dask cluster using an `Estimator` with `MpiConfiguration`. Make sure the compute cluster can scale up to 10 nodes, or change `node_count` below.

In [14]:
# Arguments handed to the entry script: the workspace's default datastore
# and the name of the batch script.
script_params = {
    '--datastore': ws.get_default_datastore(),
    '--script': 'batch.py',
}

# Estimator for 'startDask.py', configured for 10 nodes with an MPI
# distributed-training configuration on the chosen compute target.
est = Estimator(
    'dask',
    compute_target=dask_cluster,
    entry_script='startDask.py',
    conda_dependencies_file='environment.yml',
    script_params=script_params,
    node_count=10,
    distributed_training=MpiConfiguration(),
)

# Submit under the 'dask' experiment; `run` is the handle used by later cells.
experiment = Experiment(ws, 'dask')
run = experiment.submit(est)

In [15]:
# Show the notebook widget for the run held in `run`.
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

## Shut cluster down

In [50]:
# Cancel every run of the 'dask' experiment that currently reports the status
# "Running". The loop variable is deliberately not called `run`, so the `run`
# handle created when the job was submitted is not overwritten by this cell.
# NOTE(review): runs in other non-terminal states (e.g. still queued) are not
# cancelled here - confirm whether they should be.
for active_run in ws.experiments['dask'].get_runs():
    if active_run.get_status() == "Running":
        print(f'cancelling run {active_run.id}')
        active_run.cancel()

cancelling run dask_1575974502_b8643732
cancelling run dask_1575973181_99433e88


### Just for convenience, get the latest running Run

In [87]:
# Pick the first run of the 'dask' experiment whose status is "Running".
# Presumably get_runs() yields newest first, which is what makes the first
# match the "latest" one - confirm against the SDK documentation.
latest_running = next(
    (candidate for candidate in ws.experiments['dask'].get_runs()
     if candidate.get_status() == "Running"),
    None,
)

# Rebind `run` only when a match exists; without a match the previous `run`
# is kept instead of silently ending up as whichever run was iterated last.
if latest_running is None:
    print('no running run found')
else:
    run = latest_running
    print(f'latest running run is {run.id}')

latest running run is dask_1574792066_49c85fe4
