In [1]:
from dask.distributed import Client
import os

import dask

If you already have a Dask cluster running, set the scheduler address below. Otherwise, leave it as `None` and a local cluster will be created.

In [2]:
# Address of an existing Dask scheduler, e.g. "tcp://10.2.168.161:8786".
# When left as None, a LocalCUDACluster is started and used instead.
scheduler_address = None

if scheduler_address is not None:
    # Attach to the cluster that is already running.
    c = Client(scheduler_address)
else:
    from dask_cuda import LocalCUDACluster
    cluster = LocalCUDACluster()
    c = Client(cluster)
c

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:36097  Dashboard: http://127.0.0.1:45869/status,Cluster  Workers: 8  Cores: 8  Memory: 540.95 GB


In [3]:
from nccl_example import nccl, inject_comms_on_handle
from cuml.common.handle import Handle

In [4]:
from dask import delayed
import dask.dataframe as dd
from dask.distributed import wait

import numba.cuda
import cudf
import numpy as np

In [5]:
from tornado import gen
from dask.distributed import default_client
from toolz import first
import logging
import dask.dataframe as dd

import dask_cudf
import numpy as np
import cudf
import pandas as pd

from dask.distributed import wait


def parse_host_port(address):
    """
    Split an address such as "tcp://127.0.0.1:8786" into (host, port).

    An optional "<scheme>://" prefix is dropped. The port is the text after
    the *last* colon, so a host that itself contains colons (for example a
    bracketed IPv6 literal such as "[::1]") is returned intact.

    Parameters
    ----------
    address : str
        "host:port", optionally prefixed by "<scheme>://".

    Returns
    -------
    (str, int)
        The host exactly as written in the address, and the port as an int.

    Raises
    ------
    ValueError
        If the address has no ":" separator or the port is not an integer.
    """
    if '://' in address:
        address = address.rsplit('://', 1)[1]
    # Split only on the last colon: a plain split(':') fails with
    # "too many values to unpack" when the host part contains a colon.
    host, port = address.rsplit(':', 1)
    return host, int(port)

import dask_cudf


@gen.coroutine
def extract_ddf_partitions(ddf):
    """
    Compute every partition of a Dask cuDF and report which worker holds it.

    Tornado coroutine: resolves to a list of ((host, port), future) pairs,
    one per key returned by the client's ``who_has``.
    """
    client = default_client()

    # One future per partition; block until all of them have finished.
    part_futures = client.compute(ddf.to_delayed())
    yield wait(part_futures)

    future_by_key = {str(fut.key): fut for fut in part_futures}
    locations = yield client.who_has(part_futures)

    # Pair each partition with the first worker listed as holding it.
    gpu_data = [(parse_host_port(first(holders)), future_by_key[key])
                for key, holders in locations.items()]

    yield wait(gpu_data)

    raise gen.Return(gpu_data)
    
    
def create_df(f, m, n, c):
    """
    Generates a cudf of the given size with sklearn's make_blobs

    Parameters
    ----------
    f : int
        Partition number. Only used to offset the index, which runs from
        f * m (inclusive) to f * m + m (exclusive).
    m : int
        Number of rows, passed to make_blobs as ``n_samples``.
    n : int
        Number of feature columns, passed to make_blobs as ``n_features``;
        the columns are named 0 .. n - 1.
    c : int
        Passed to make_blobs as ``centers``.

    Returns
    -------
    cudf.DataFrame
        m rows by n float64 columns.
    """
    # Import from the public package path: the private
    # sklearn.datasets.samples_generator module was deprecated in
    # scikit-learn 0.22 and removed in 0.24.
    from sklearn.datasets import make_blobs

    # NOTE(review): random_state is fixed, so calls with the same (m, n, c)
    # produce the same values whatever f is -- confirm this is intended.
    # The labels returned by make_blobs are not needed here.
    X, _ = make_blobs(n_samples=m, centers=c, n_features=n, random_state=0)
    ret = cudf.DataFrame([(i,
                           X[:, i].astype(np.float64)) for i in range(n)],
                         index=cudf.dataframe.RangeIndex(f * m,
                                                         f * m + m, 1))
    return ret

def get_meta(df):
    """
    Return the zero-row slice ``df.iloc[:0]`` of the given frame.
    """
    return df.iloc[:0]

def gen_dask_cudf(nrows, ncols, clusters):
    """
    Build a Dask cuDF made of one ``create_df`` partition per worker.

    Uses the notebook-level client ``c``. ``nrows``, ``ncols`` and
    ``clusters`` are forwarded to ``create_df`` for every partition.
    """
    worker_addresses = c.has_what().keys()

    # Submit partition number i to the i-th worker.
    partition_futures = []
    for part_no, address in enumerate(worker_addresses):
        partition_futures.append(
            c.submit(create_df, part_no, nrows, ncols, clusters,
                     workers=[address]))

    # Block until every partition has been created.
    wait(partition_futures)

    # Zero-row frame taken from the first partition, used as the Dask meta.
    meta = c.submit(get_meta, partition_futures[0]).result()
    return dask_cudf.from_delayed(partition_futures, meta=meta)

In [6]:
class NCCLBase:
    """
    Base class that initializes an NCCL communicator on every Dask worker.

    Construction looks up the default Dask client and calls ``init_comms``,
    which fills ``self.clique`` with one (rank, worker, future) entry per
    worker.
    """

    def __init__(self):
        self.client = default_client()
        self.init_comms()

    def get_workers_(self):
        """
        Return the list of workers parsed as [(address, port)]
        """
        return [parse_host_port(address)
                for address in self.client.has_what().keys()]

    @staticmethod
    def func_init_comms(workerId, nWorkers, uniqueId):
        """
        Initialize ncclComm_t on worker

        Parameters
        ----------
        workerId : int
            Rank of this worker within the clique.
        nWorkers : int
            Total number of workers in the clique.
        uniqueId
            Value obtained from ``nccl.get_unique_id()`` on the client.

        Returns
        -------
        tuple
            (nccl instance, Handle) for this worker.
        """
        # 1. initialize any necessary comms on each worker
        n = nccl()
        n.init(nWorkers, uniqueId, workerId)

        # 2. Initialize cumlCommunicator and inject into cumlHandle
        handle = Handle()
        inject_comms_on_handle(handle, n, nWorkers, workerId)

        # Each worker will hold onto (nccl, handle) in self.clique
        return (n, handle)

    def init_comms(self):
        """
        Use nccl-py to initialize ncclComm_t on each worker and
        store the futures for this instance.

        Sets ``self.clique`` to a list of (rank, worker, future) tuples,
        where each future resolves to the (nccl, handle) pair returned by
        ``func_init_comms`` on that worker.
        """
        uniqueId = nccl.get_unique_id()

        workers = self.get_workers_()
        n_workers = len(workers)

        # Look the function up through self rather than through a subclass
        # name: the base class must not depend on a subclass that is only
        # defined later.
        self.clique = [(rank, worker, self.client.submit(self.func_init_comms,
                                                         rank,
                                                         n_workers,
                                                         uniqueId,
                                                         workers=[worker]))
                       for rank, worker in enumerate(workers)]
        

In [7]:
from cuml.cluster import KMeans as cumlKMeans

class KMeans(NCCLBase):
    """
    Demo multi-GPU KMeans: one cuML KMeans model per Dask worker, each built
    on the handle that ``NCCLBase.init_comms`` created for that worker.
    """

    def __init__(self, n_clusters = 8, init_method = "random", verbose = 0):
        """
        Set up the NCCL clique (via NCCLBase) and create the per-worker models.

        ``n_clusters``, ``init_method`` and ``verbose`` are forwarded to
        cuML's KMeans as ``n_clusters``, ``init`` and ``verbose``.
        """
        super().__init__()
        self.init_(n_clusters = n_clusters, init_method = init_method, verbose = verbose)

    @staticmethod
    def func_build_kmeans_(a, n_clusters, init_method, verbose, wid):
        """
        Create local KMeans instance on worker

        ``a`` is the (nccl, handle) pair produced by ``func_init_comms``;
        only the handle is used. ``wid`` is not used in the body.
        """
        _, handle = a

        return cumlKMeans(handle = handle, init = init_method, n_clusters = n_clusters, verbose = verbose)

    @staticmethod
    def get_meta(df):
        """
        Return the zero-row slice ``df.iloc[:0]``, used as the Dask meta.
        """
        return df.iloc[:0]

    @staticmethod
    def func_fit(model, df, wid):
        """
        Fit the worker-local model on one partition; ``wid`` is unused here.
        """
        return model.fit(df)

    @staticmethod
    def func_predict(model, df, wid):
        """
        Predict with the worker-local model on one partition; ``wid`` is unused here.
        """
        return model.predict(df)

    def _run_on_partitions(self, func, X):
        """
        Submit ``func(model, partition, rank)`` for each partition of X and
        wait for all of the submitted tasks. Returns the list of futures.
        """
        # Keep the futures around so the GPU memory doesn't get
        # deallocated on the workers.
        gpu_futures = self.client.sync(extract_ddf_partitions, X)

        # self.kmeans holds (worker, model future) pairs.
        worker_model_map = dict(self.kmeans)
        worker_rank_map = self.worker_ranks()

        # Pin each task to the worker that holds both the model and the
        # partition, matching how init_comms and init_ place their tasks.
        results = [self.client.submit(func,                      # Function to run on worker
                                      worker_model_map[worker],  # Future of the worker's KMeans model
                                      part,                      # Input DataFrame partition
                                      worker_rank_map[worker],   # Rank of the worker in the clique
                                      workers=[worker])
                   for worker, part in gpu_futures]
        wait(results)
        return results

    def fit(self, X):
        """
        Fit the per-worker models on the partitions of a Dask cuDF.
        This function is executed on the client.

        The following steps are taken with Dask cuDF as input:
        1. Compute the partitions of X and find the worker holding each one.
        2. On each such worker, call ``fit`` on that worker's model with the
           partition.
        3. Block until every fit task has finished.
        """
        self._run_on_partitions(KMeans.func_fit, X)

    def predict(self, X):
        """
        Run ``predict`` of each worker's model on the partitions of X and
        return the results assembled with ``dd.from_delayed``.
        """
        results = self._run_on_partitions(KMeans.func_predict, X)

        # Convert result back into a dask_cudf, skipping tasks that returned None
        dfs = [d for d in results if d.type is not type(None)]
        meta = self.client.submit(KMeans.get_meta, dfs[0]).result()
        ddf = dd.from_delayed(dfs, meta=meta)

        return ddf

    def fit_predict(self, X):
        """
        Fit on X, then return the predictions for the same X.
        """
        self.fit(X)
        return self.predict(X)

    def worker_ranks(self):
        """
        Builds a dictionary of { (worker_address, worker_port) : worker_rank }
        """
        return {worker: rank for rank, worker, _ in self.clique}

    def init_(self, n_clusters, init_method, verbose = 0):
        """
        Creates local kmeans instance on each worker

        Sets ``self.kmeans`` to a list of (worker, future) pairs, one per
        clique entry, and waits for all of them.
        """
        self.kmeans = [(w, self.client.submit(KMeans.func_build_kmeans_,
                                              a, n_clusters, init_method, verbose, i,
                                              workers=[w])) for i, w, a in self.clique]
        wait(self.kmeans)

In [8]:
# Number of clusters requested from the KMeans model created below.
# NOTE(review): the test data is later generated with 8 blob centers
# (gen_dask_cudf(10, 50, 8)) -- confirm the mismatch with 10 is intended.
n_clusters = 10

First, a Dask-cuML `KMeans` instance is created, which initializes its own NCCL clique.

In [9]:
demo = KMeans(n_clusters=n_clusters, init_method="random", verbose=1)

Print out the ranks assigned to the workers in the NCCL clique

In [10]:
# Mapping of (host, port) -> rank for every worker in the clique.
demo.worker_ranks()

{('127.0.0.1', 34235): 0,
 ('127.0.0.1', 34498): 1,
 ('127.0.0.1', 34687): 2,
 ('127.0.0.1', 36540): 3,
 ('127.0.0.1', 40261): 4,
 ('127.0.0.1', 41791): 5,
 ('127.0.0.1', 42878): 6,
 ('127.0.0.1', 46195): 7}

Create a Dask cuDF using sklearn's `make_blobs` for testing

In [11]:
# Per worker: 10 rows, 50 feature columns, drawn from 8 blob centers.
X = gen_dask_cudf(nrows=10, ncols=50, clusters=8)

Demonstrate we have one cuDF partition per worker

In [12]:
# Maps each worker address to the keys of the task results it holds.
c.has_what()

{'tcp://127.0.0.1:34235': ('func_build_kmeans_-ddb2496c0110f3968fbf4c1a14b356fe',
  'func_init_comms-73fcd2d0a0cfebe72d1df3d4ed12a1d9',
  'create_df-9cb693d1c092136f9c8843b4d3341508'),
 'tcp://127.0.0.1:34498': ('func_init_comms-f6caf1095f47cb275cf8e8c0a330ac3c',
  'func_build_kmeans_-34bc88c7706bbaa413af2c167904c8b5',
  'create_df-46e12f8f08f37c3a28b4f105d8540e24'),
 'tcp://127.0.0.1:34687': ('func_init_comms-0ba14a7e060cf642473a6206e34ef645',
  'create_df-843d18bf29374c0399dc7b65fdff838c',
  'func_build_kmeans_-043ba23bf430e9cdd7b3fd25a3d703dc'),
 'tcp://127.0.0.1:36540': ('func_build_kmeans_-5bae078907f5d2bcf8a8db67e3009feb',
  'create_df-1b191bffd30470a97402e6826fce58a4',
  'func_init_comms-70629aac16736b6494c23e88a169c3a1'),
 'tcp://127.0.0.1:40261': ('create_df-7120cd2ba58cc66818613cf9dfd9642d',
  'func_build_kmeans_-7fdf1e9798aa57ea660e3485d97e6a0b',
  'func_init_comms-6e2857051a8e193201308bfd145bfed8'),
 'tcp://127.0.0.1:41791': ('func_build_kmeans_-7d1f5c01ef241f792b00391fbd92

Fit the KMeans MNMG model

In [13]:
# NOTE(review): the saved output of this cell is
# "NameError: name 'random' is not defined" -- re-run on a fresh kernel and
# confirm the cause before sharing the notebook.
demo.fit(X)

NameError: name 'random' is not defined

Predict labels for the same inputs we trained on

In [14]:
# Lazy Dask collection of predictions; nothing is gathered to the client yet.
result = demo.predict(X)

In [15]:
# print() already applies str() to its argument.
print(result)

<dask_cudf.Series | 16 tasks | 8 npartitions>


In [16]:
# Gather the predictions to the client and show them.
labels = result.compute()
print(labels)

0    0
1    2
2    2
3    2
4    2
5    2
6    5
7    2
8    0
9    1
[70 more rows]
dtype: int32
