In [1]:
from dask.distributed import Client
import os

import dask

If you already have a Dask cluster running, set its scheduler address below. Otherwise, leave it as `None` and a local cluster will be created.

In [2]:
# Set to an address such as "tcp://10.2.168.161:8786" to reuse a running cluster.
scheduler_address = None

if scheduler_address is not None:
    # Attach to the scheduler that is already running.
    c = Client(scheduler_address)
else:
    # No scheduler given: start a local CUDA cluster and connect to it.
    from dask_cuda import LocalCUDACluster
    cluster = LocalCUDACluster()
    c = Client(cluster)
c

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:37488  Dashboard: http://127.0.0.1:36592/status,Cluster  Workers: 8  Cores: 8  Memory: 540.95 GB


In [3]:
from nccl_example import nccl, inject_comms_on_handle, SimpleReduce
from cuml.common.handle import Handle

import random
from dask.distributed import wait

In [4]:
from dask import delayed
import dask.dataframe as dd
import numba.cuda
import cudf
import numpy as np

In [5]:
from tornado import gen
from dask.distributed import default_client
from toolz import first
import logging
import dask.dataframe as dd

import dask_cudf
import numpy as np
import cudf
import pandas as pd

from dask.distributed import wait


def parse_host_port(address):
    """
    Split a network address into a ``(host, port)`` tuple.

    Parameters
    ----------
    address : str
        Address of the form ``"host:port"``, optionally prefixed with a
        scheme such as ``"tcp://"``.

    Returns
    -------
    tuple of (str, int)
        The host portion (returned verbatim) and the port converted to int.

    Raises
    ------
    ValueError
        If the address has no ``:`` separating host and port, or if the
        port is not a valid integer.
    """
    if '://' in address:
        address = address.rsplit('://', 1)[1]
    # Split on the LAST colon only, so a host that itself contains colons
    # (e.g. a bracketed IPv6 literal such as "[::1]") still unpacks into
    # exactly two parts.
    host, port = address.rsplit(':', 1)
    return host, int(port)

import dask_cudf


@gen.coroutine
def extract_ddf_partitions(ddf):
    """
    Compute every partition of a Dask cuDF and report where each one lives.

    Returns (via ``gen.Return``) a list of ``((host, port), future)`` tuples,
    one per partition key reported by the scheduler. When a key is held by
    several workers, the first one listed is used.
    """
    client = default_client()

    # Start computing all partitions and block until they are finished.
    futures = client.compute(ddf.to_delayed())
    yield wait(futures)

    future_by_key = {str(fut.key): fut for fut in futures}
    locations = yield client.who_has(futures)

    gpu_data = []
    for key, holders in locations.items():
        gpu_data.append((parse_host_port(first(holders)), future_by_key[key]))

    yield wait(gpu_data)

    raise gen.Return(gpu_data)
    
    
def create_df(f, m, n, c):
    """
    Generate one cuDF partition of synthetic clustered data.

    The values come from scikit-learn's ``make_blobs``; the labels it
    returns are discarded. ``random_state`` is fixed at 0, so calls with the
    same ``m``, ``n`` and ``c`` produce the same values and differ only in
    their index range.

    Parameters
    ----------
    f : int
        Partition number. It offsets the index so that the partition covers
        rows ``f * m`` up to (but excluding) ``f * m + m``.
    m : int
        Number of rows (``n_samples``).
    n : int
        Number of feature columns (``n_features``).
    c : int
        Passed to ``make_blobs`` as ``centers``.

    Returns
    -------
    cudf.DataFrame
        Frame with ``n`` float64 columns named ``0 .. n-1``.
    """
    # ``sklearn.datasets.samples_generator`` no longer exists in newer
    # scikit-learn releases; ``make_blobs`` is importable from
    # ``sklearn.datasets`` in old and new versions alike.
    from sklearn.datasets import make_blobs
    X, _ = make_blobs(n_samples=m, centers=c, n_features=n, random_state=0)
    ret = cudf.DataFrame([(i,
                           X[:, i].astype(np.float64)) for i in range(n)],
                         index=cudf.dataframe.RangeIndex(f * m,
                                                         f * m + m, 1))
    return ret

def get_meta(df):
    """Return a zero-row slice of ``df`` (``df.iloc[:0]``) for use as metadata."""
    return df.iloc[:0]

def gen_dask_cudf(nrows, ncols, clusters):
    """
    Build a Dask cuDF with one ``create_df`` partition per worker.

    Uses the notebook-level client ``c``. Each worker known to the scheduler
    gets one task pinned to it; the worker's position in the listing is
    passed to ``create_df`` as the partition number.
    """
    worker_addresses = c.has_what().keys()

    # One pinned task per worker (gpu)
    partition_futures = []
    for partition_id, address in enumerate(worker_addresses):
        partition_futures.append(
            c.submit(create_df, partition_id, nrows, ncols, clusters,
                     workers=[address]))
    # Block until every partition exists
    wait(partition_futures)

    # An empty slice of the first partition serves as the collection's meta
    empty_frame = c.submit(get_meta, partition_futures[0]).result()
    return dask_cudf.from_delayed(partition_futures, meta=empty_frame)

In [12]:
from cuml.cluster import KMeans as cumlKMeans

class KMeans:
    """
    Client-side wrapper that drives one cuML ``KMeans`` model per Dask worker.

    On construction, one task is submitted per worker. Each task initializes
    an ``nccl`` communicator, injects it into a cuML ``Handle`` and builds a
    ``cumlKMeans`` model using that handle. The futures of those tasks are
    kept in ``self.clique`` as ``(rank, (host, port), future)`` triples.

    All scheduling goes through ``self.client`` (the default Dask client at
    construction time) rather than through a notebook-level global.
    """

    def __init__(self, n_clusters=8, init_method="random", verbose=0):
        self.client = default_client()

        self.init_(n_clusters=n_clusters, init_method=init_method, verbose=verbose)

    @staticmethod
    def func_init_(workerId, nWorkers, uniqueId, n_clusters, init, verbose=0):
        """
        Initialize the nccl communicator and the local model on a worker.

        Returns a ``(nccl, cumlKMeans)`` tuple.
        """
        # 1. initialize any necessary comms on each worker
        n = nccl()
        n.init(nWorkers, uniqueId, workerId)

        # 2. Initialize cumlCommunicator and inject into cumlHandle
        handle = Handle()
        inject_comms_on_handle(handle, n, nWorkers, workerId)

        # 3. Use the cumlHandle w/ cumlCommunicator for model
        a = cumlKMeans(handle=handle, init=init, n_clusters=n_clusters, verbose=verbose)

        return (n, a)

    @staticmethod
    def get_meta(df):
        """Return a zero-row slice of ``df`` for use as Dask metadata."""
        return df.iloc[:0]

    @staticmethod
    def func_fit(model, df, r):
        """
        Executed on a worker: call ``fit`` of the local model on one partition.

        Parameters
        ----------
        model : tuple
            The ``(nccl, cumlKMeans)`` pair produced by ``func_init_``.
        df : the partition to fit on.
        r : unused inside the function; callers pass a random number.
            NOTE(review): presumably this keeps task keys unique so repeated
            calls are not deduplicated by Dask - confirm.
        """
        nccl_comm, model = model

        return model.fit(df)

    @staticmethod
    def func_predict(model, df, r):
        """
        Executed on a worker: call ``predict`` of the local model on one
        partition and return its result.

        Parameters are the same as for ``func_fit``; ``r`` is unused.
        """
        nccl_comm, model = model

        return model.predict(df)

    def _run_on_partitions(self, func, X):
        """
        Submit ``func(model, partition, random_number)`` once per partition of
        ``X``, wait for all tasks and return their futures.
        """
        # Keep the futures around so the GPU memory doesn't get
        # deallocated on the workers.
        gpu_futures = self.client.sync(extract_ddf_partitions, X)

        # (host, port) -> future of the (nccl, model) tuple on that worker
        worker_model_map = {worker: model for _, worker, model in self.clique}

        futures = [self.client.submit(func,                        # Function to run on worker
                                      worker_model_map[worker],    # tuple(nccl_comm, KMeans instance)
                                      part,                        # Input DataFrame partition
                                      random.random())             # Distinct argument per task
                   for worker, part in gpu_futures]
        wait(futures)
        return futures

    def fit(self, X):
        """
        Fit the per-worker models on the partitions of the Dask cuDF ``X``.

        Executed on the client. Each partition is paired with the model that
        lives on the worker holding it, and ``func_fit`` is submitted for
        every pair. Blocks until all tasks have finished; returns nothing.
        """
        self._run_on_partitions(KMeans.func_fit, X)

    def predict(self, X):
        """
        Run ``predict`` of the per-worker models on the partitions of ``X``.

        Returns a Dask collection built with ``dd.from_delayed`` from every
        task whose result is not ``None``; an empty slice of the first such
        result is used as metadata.
        """
        futures = self._run_on_partitions(KMeans.func_predict, X)

        # Convert result back into a dask_cudf
        dfs = [d for d in futures if d.type is not type(None)]
        meta = self.client.submit(KMeans.get_meta, dfs[0]).result()
        ddf = dd.from_delayed(dfs, meta=meta)

        return ddf

    def worker_ranks(self):
        """
        Builds a dictionary of { (worker_address, worker_port) : worker_rank }
        """
        return {worker: rank for rank, worker, _ in self.clique}

    def run_func_on_workers(self, func):
        """
        Simple helper function to schedule a function on all workers
        of a clique. ``func`` is called with the worker's model future and a
        random number; the list of results is returned.
        """
        f = [self.client.submit(func, a, random.random()) for i, w, a in self.clique]
        wait(f)
        return [a.result() for a in f]

    def get_workers_(self):
        """
        Return the list of workers parsed as [(address, port)]
        """
        return [parse_host_port(address) for address in self.client.has_what().keys()]

    def init_(self, n_clusters, init_method, verbose=0):
        """
        Use nccl-py to initialize ncclComm_t on each worker and
        store the futures for this instance.
        """
        uniqueId = nccl.get_unique_id()

        workers = self.get_workers_()

        # The position of a worker in the list is its rank in the clique.
        self.clique = [(idx, worker, self.client.submit(KMeans.func_init_,
                                                        idx,
                                                        len(workers),
                                                        uniqueId,
                                                        n_clusters,
                                                        init_method,
                                                        verbose,
                                                        workers=[worker]))
                       for idx, worker in enumerate(workers)]
        

In [13]:
# Number of clusters passed to the KMeans constructor in the next code cell.
n_clusters = 10

First, a Dask-cuML `KMeans` instance is created, which initializes its own NCCL clique.

In [11]:
demo = KMeans(n_clusters=n_clusters, init_method="random", verbose=1)

NameError: name 'Dask_NCCL_Demo' is not defined

Print out the ranks assigned to the workers in the NCCL clique

In [None]:
# Dictionary of {(worker_address, worker_port): worker_rank} for the clique.
demo.worker_ranks()

Create a Dask cuDF using sklearn's `make_blobs` for testing

In [None]:
X = gen_dask_cudf(nrows=10, ncols=50, clusters=8)

Demonstrate we have one cuDF partition per worker

In [35]:
# Mapping of worker address -> task keys held on that worker.
c.has_what()

{'tcp://127.0.0.1:33214': ('func_predict-71015d670d5c778aa1c954b4d6ce123e',
  'create_df-c6a8bfc80639c2fbdb9cbd02de40084a'),
 'tcp://127.0.0.1:39329': ('func_predict-ad7748343b8119d547580ba4b129c22d',
  'create_df-b5596daa0e9d4e1b90ebd3350d7c694a'),
 'tcp://127.0.0.1:39363': ('func_predict-ea3166be694179ae83ebad35561e40e1',
  'create_df-b5d2222a97e4f5d2fe6366a99349ebb3'),
 'tcp://127.0.0.1:39822': ('create_df-de921811da61b39207148eaa08a3679a',
  'func_predict-5410147a2215d9d513bfc0a57ab180fd'),
 'tcp://127.0.0.1:40041': ('func_predict-dbc507e01bb580ea8f49a4fe532fce6e',
  'create_df-d1136d784920bf0bf7b42b8eee43c2bf'),
 'tcp://127.0.0.1:41081': ('create_df-960aa22cb1b56865a10f6f2a1ca47eda',
  'func_predict-6e49a2d1dd9eb2654c655e22a4c391e0'),
 'tcp://127.0.0.1:45709': ('create_df-e0c3cedc252d8426008c0808533a7dc3',
  'func_predict-de99e03ddeb6e459f60f8d2fca81140f'),
 'tcp://127.0.0.1:46718': ('create_df-ef885c3a54df7aa085194e450fe7df64',
  'func_predict-2dbe4e05edb0ea47c16d5dcb250edb34')}

Fit the KMeans MNMG model

In [36]:
# fit() waits for the per-partition tasks and has no return value,
# so this cell displays nothing.
demo.fit(X)

Predict labels for the same inputs we trained on

In [37]:
# predict() returns a Dask collection assembled from the per-partition
# result futures.
result = demo.predict(X)

TypeError: expected bytes, int found

In [28]:
print(result)

<dask_cudf.Series | 16 tasks | 8 npartitions>


In [29]:
print(result.compute())

0    0
1    2
2    2
3    2
4    2
5    2
6    5
7    2
8    0
9    1
[70 more rows]
dtype: int32
