In [1]:
from dask.distributed import Client
import os

import dask

## Build Dask Cluster

If you already have a Dask cluster running, set its scheduler address below. Otherwise, leave it as `None` and a local cluster will be created.

In [2]:
# Address of an already-running scheduler, e.g. "tcp://10.2.168.161:8786".
# Leave as None to start a local CUDA cluster instead.
scheduler_address = None

if scheduler_address is not None:
    # Attach to the existing scheduler.
    c = Client(scheduler_address)
else:
    # No scheduler given: create a LocalCUDACluster and connect to it.
    from dask_cuda import LocalCUDACluster
    cluster = LocalCUDACluster()
    c = Client(cluster)
c

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:33499  Dashboard: http://127.0.0.1:42206/status,Cluster  Workers: 8  Cores: 8  Memory: 540.95 GB


## Imports

In [3]:
from nccl_example import nccl, inject_comms_on_handle
from cuml.common.handle import Handle

In [4]:
from dask import delayed
import dask.dataframe as dd
from dask.distributed import wait

import numba.cuda
import cudf
import numpy as np

## Helper functions

In [5]:
from tornado import gen
from dask.distributed import default_client
from toolz import first
import logging
import dask.dataframe as dd

import dask_cudf
import numpy as np
import cudf
import pandas as pd

from dask.distributed import wait


def parse_host_port(address):
    """
    Split a worker address into its host and integer port.

    An optional scheme prefix (everything up to and including the last
    ``://``) is dropped first. The remainder is split on its *last* colon, so
    a host that itself contains colons (e.g. a bracketed IPv6 literal such as
    ``[::1]``) is kept intact instead of failing on tuple unpacking.

    Parameters
    ----------
    address : str
        Address such as ``"tcp://127.0.0.1:8786"`` or ``"127.0.0.1:8786"``.

    Returns
    -------
    tuple
        ``(host, port)`` where ``host`` is a ``str`` and ``port`` an ``int``.

    Raises
    ------
    ValueError
        If the address has no ``:port`` suffix or the port is not an integer.
    """
    if '://' in address:
        address = address.rsplit('://', 1)[1]
    # Split on the right-most colon only: the port is always the last field.
    host, port = address.rsplit(':', 1)
    return host, int(port)

import dask_cudf


@gen.coroutine
def extract_ddf_partitions(ddf):
    """
    Coroutine that computes the partitions of a Dask cuDF and pairs each
    resulting future with the (host, port) of a worker that holds it.

    Produces a list of (worker, future) tuples, one per key reported by
    ``client.who_has``.
    """
    client = default_client()

    # Start computing every partition and wait for all of them to finish.
    parts = client.compute(ddf.to_delayed())
    yield wait(parts)

    # Look-up table from a future's key (as a string) back to the future.
    futures_by_key = {str(part.key): part for part in parts}
    who_has = yield client.who_has(parts)

    # Only the first worker listed for each key is used.
    gpu_data = [
        (parse_host_port(first(holders)), futures_by_key[key])
        for key, holders in who_has.items()
    ]

    yield wait(gpu_data)

    raise gen.Return(gpu_data)
    
    
def create_df(f, m, n, c):
    """
    Generates a cudf of the given size with sklearn's make_blobs

    Parameters
    ----------
    f : int
        Partition number. Only used to offset the index, which spans
        ``f * m`` (inclusive) to ``f * m + m`` (exclusive).
    m : int
        Number of rows; passed to ``make_blobs`` as ``n_samples``.
    n : int
        Number of columns; passed to ``make_blobs`` as ``n_features``.
        Columns are named ``0 .. n - 1``.
    c : int or array-like
        Passed to ``make_blobs`` as ``centers``.

    Returns
    -------
    cudf.DataFrame
        ``m`` rows by ``n`` float64 columns. The blob labels returned by
        ``make_blobs`` are discarded.

    Notes
    -----
    ``random_state`` is fixed at 0, so calls that differ only in ``f`` draw
    the same samples and differ only in their index range.
    """
    # Import from the public ``sklearn.datasets`` namespace; the
    # ``sklearn.datasets.samples_generator`` path is no longer available in
    # newer scikit-learn releases.
    from sklearn.datasets import make_blobs
    X, _ = make_blobs(n_samples=m, centers=c, n_features=n, random_state=0)
    ret = cudf.DataFrame([(i,
                           X[:, i].astype(np.float64)) for i in range(n)],
                         index=cudf.dataframe.RangeIndex(f * m,
                                                         f * m + m, 1))
    return ret

def get_meta(df):
    """
    Return the zero-row slice ``df.iloc[:0]`` of ``df``.

    The callers in this notebook pass the result as the ``meta`` argument
    of ``from_delayed``.
    """
    return df.iloc[:0]

def gen_dask_cudf(nrows, ncols, clusters):
    """
    Build a Dask cuDF with one ``create_df`` partition per worker.

    Uses the notebook-level client ``c``. Every worker listed by
    ``c.has_what()`` gets one ``create_df`` task restricted to it; the
    worker's position in that listing is passed as the partition number.
    """
    worker_addresses = c.has_what().keys()

    # Create dfs on each worker (gpu)
    partitions = []
    for part_id, address in enumerate(worker_addresses):
        partitions.append(
            c.submit(create_df, part_id, nrows, ncols, clusters,
                     workers=[address]))
    # Wait for completion
    wait(partitions)

    # Derive the metadata from the first partition.
    meta = c.submit(get_meta, partitions[0]).result()
    return dask_cudf.from_delayed(partitions, meta=meta)

def to_dask_cudf(futures):
    """
    Convert a list of futures containing dfs back into a dask_cudf.

    Futures whose ``type`` is ``NoneType`` are dropped; the metadata is
    derived from the first remaining future via ``get_meta``.
    """
    none_type = type(None)
    dfs = []
    for fut in futures:
        if fut.type != none_type:
            dfs.append(fut)
    meta = c.submit(get_meta, dfs[0]).result()
    return dd.from_delayed(dfs, meta=meta)


## Reusable NCCL Base Class

In [6]:
class NCCLBase:
    """
    Base class that initializes NCCL comms on every Dask worker.

    On construction it looks up the default Dask client and submits
    ``func_init_comms`` once per worker. The results are stored in
    ``self.clique`` as a list of ``(rank, (host, port), future)`` tuples,
    where each future holds the ``(nccl, Handle)`` pair returned by
    ``func_init_comms``.
    """

    def __init__(self):
        self.client = default_client()
        self.init_comms()

    def get_workers_(self):
        """
        Return the list of workers parsed as [(address, port)]
        """
        return [parse_host_port(address)
                for address in self.client.has_what().keys()]

    def worker_ranks(self):
        """
        Builds a dictionary of { (worker_address, worker_port) : worker_rank }
        """
        return {worker: rank for rank, worker, _ in self.clique}

    @staticmethod
    def func_init_comms(workerId, nWorkers, uniqueId):
        """
        Initialize ncclComm_t on worker

        Parameters
        ----------
        workerId : int
            Rank passed to ``nccl.init`` and ``inject_comms_on_handle``.
        nWorkers : int
            Total number of workers taking part.
        uniqueId
            Value obtained from ``nccl.get_unique_id()`` by the caller.

        Returns
        -------
        tuple
            ``(nccl instance, Handle)``.
        """
        # 1. initialize any necessary comms on each worker
        n = nccl()
        n.init(nWorkers, uniqueId, workerId)

        # 2. Initialize cumlCommunicator and inject into cumlHandle
        handle = Handle()
        inject_comms_on_handle(handle, n, nWorkers, workerId)

        # Each worker will hold onto (nccl, handle) in self.clique
        return (n, handle)

    def init_comms(self):
        """
        Use nccl-py to initialize ncclComm_t on each worker and
        store the futures for this instance.
        """
        uniqueId = nccl.get_unique_id()

        workers = self.get_workers_()

        # Resolve the static method through the instance's own class so the
        # base class does not depend on the name of any particular subclass.
        init_func = type(self).func_init_comms

        # A worker's position in the list is its rank.
        self.clique = [(idx, worker, self.client.submit(init_func,
                                                        idx,
                                                        len(workers),
                                                        uniqueId,
                                                        workers=[worker]))
                       for idx, worker in enumerate(workers)]

## Dask-cuML OPG KMeans Implementation

In [7]:
from cuml.cluster import KMeans as cumlKMeans

class KMeans(NCCLBase):
    """
    KMeans estimator that keeps one local cuML KMeans instance per worker.

    The base-class constructor initializes the NCCL comms first;
    ``init_`` then builds a ``cumlKMeans`` on every worker from that
    worker's handle. ``self.kmeans`` is a list of ``(worker, future)`` pairs,
    each future holding the worker's local model.

    All work is submitted through ``self.client`` (set by ``NCCLBase``), so
    the class does not depend on a notebook-level client variable.
    """

    def __init__(self, n_clusters = 8, init_method = "random", verbose = 0):
        super(KMeans, self).__init__()
        self.init_(n_clusters = n_clusters, init_method = init_method, verbose = verbose)

    def init_(self, n_clusters, init_method, verbose = 0):
        """
        Creates local kmeans instance on each worker
        """
        self.kmeans = [(worker, self.client.submit(KMeans.func_build_kmeans_,
                                                   comms, n_clusters, init_method, verbose, rank,
                                                   workers=[worker]))
                       for rank, worker, comms in self.clique]
        wait(self.kmeans)

    @staticmethod
    def func_build_kmeans_(a, n_clusters, init_method, verbose, wid):
        """
        Create local KMeans instance on worker

        ``a`` is the ``(nccl, handle)`` pair produced by ``func_init_comms``;
        only the handle is used. ``wid`` is accepted but not used.
        """
        _, handle = a
        return cumlKMeans(handle = handle, init = init_method, n_clusters = n_clusters, verbose = verbose)

    @staticmethod
    def func_fit(model, df, wid):
        """Fit ``model`` on ``df`` and return the result of ``model.fit``."""
        return model.fit(df)

    @staticmethod
    def func_predict(model, df, wid):
        """Return ``model.predict(df)``."""
        return model.predict(df)

    def run_model_func_on_dask_cudf(self, func, X):
        """
        Run ``func(model, partition, rank)`` once for every partition of ``X``.

        The model and rank are looked up from the worker that
        ``extract_ddf_partitions`` reports for the partition. Returns the
        list of resulting futures after waiting for them to finish.
        """
        gpu_futures = self.client.sync(extract_ddf_partitions, X)

        worker_model_map = dict(self.kmeans)
        worker_rank_map = self.worker_ranks()

        # NOTE: no `workers=` restriction is passed here; task placement is
        # left to the scheduler.
        results = [self.client.submit(func,                      # Function to run on worker
                                      worker_model_map[worker],  # Model instance
                                      part,                      # Input DataFrame partition
                                      worker_rank_map[worker])   # Worker ID
                   for worker, part in gpu_futures]
        wait(results)
        return results

    def fit(self, X):
        """Fit the per-worker models on the partitions of ``X``; returns ``self``."""
        self.run_model_func_on_dask_cudf(KMeans.func_fit, X)
        return self

    def predict(self, X):
        """Predict on every partition of ``X`` and return ``to_dask_cudf`` of the results."""
        f = self.run_model_func_on_dask_cudf(KMeans.func_predict, X)
        return to_dask_cudf(f)

    def fit_predict(self, X):
        """Equivalent to ``self.fit(X).predict(X)``."""
        return self.fit(X).predict(X)

## Execute End-To-End Example

In [8]:
n_samples_per_worker = 10  # rows generated on each worker (one partition per worker)
n_features = 50  # number of columns in the generated data
n_clusters = 10  # used both as the make_blobs centers and as the KMeans n_clusters

Create a Dask cuDF using sklearn's `make_blobs` for testing

In [9]:
# Each worker builds its own partition of n_samples_per_worker rows by n_features columns.
X = gen_dask_cudf(n_samples_per_worker, n_features, n_clusters)

First, a Dask-cuML `KMeans` instance is created, which initializes its own NCCL clique

In [10]:
# The constructor first initializes NCCL comms on every worker, then creates one local cuML KMeans per worker.
demo = KMeans(n_clusters, init_method = "random", verbose = 1)

Print out the ranks assigned to the workers in the NCCL clique

In [11]:
demo.worker_ranks()

{('127.0.0.1', 35583): 0,
 ('127.0.0.1', 37540): 1,
 ('127.0.0.1', 39280): 2,
 ('127.0.0.1', 40647): 3,
 ('127.0.0.1', 40649): 4,
 ('127.0.0.1', 41089): 5,
 ('127.0.0.1', 41503): 6,
 ('127.0.0.1', 41944): 7}

Verify we have one cuDF partition per worker

In [12]:
c.has_what()

{'tcp://127.0.0.1:35583': ('func_init_comms-88bf1f691ee635a9c9ecee0ae29494e2',
  'create_df-0f5e4119970d28ce56c3ac689ddf264f',
  'func_build_kmeans_-f5fcf6604239dd67712a8abc8eb5db62'),
 'tcp://127.0.0.1:37540': ('create_df-c8a1f8155fe4f975062ada1f40b78566',
  'func_init_comms-9c68f7f5562823a47d87cbf764ca962e',
  'func_build_kmeans_-ab243dab8e81b7b9a7ffe2ec03b20a83'),
 'tcp://127.0.0.1:39280': ('create_df-e5ecc725e850cb87669ebfd9f3b85972',
  'func_build_kmeans_-d5b6edf689e90a6d1d276754577e530f',
  'func_init_comms-b4cb5ffbb1446ab06756b8ad4bef3160'),
 'tcp://127.0.0.1:40647': ('func_init_comms-cc6a31bbe133c6716922eb64d31ee709',
  'func_build_kmeans_-48c0808fb6d29065b2c49f91d17dfa27',
  'create_df-1e09b354095c0f8997ffa5c7177e6d2d'),
 'tcp://127.0.0.1:40649': ('func_build_kmeans_-03d06d0512ecc30a4943216bc7f95c56',
  'func_init_comms-8852cb899c0f742c1c9f47cbc41deb44',
  'create_df-c6eb6de6bdc9f9c6a38da9321c958ad2'),
 'tcp://127.0.0.1:41089': ('func_init_comms-3b78048187699cf9d942ac7f3ec5795

Fit the KMeans MNMG model

In [13]:
demo.fit(X)

<__main__.KMeans at 0x7fd12a0d3a20>

Predict labels for the same inputs we trained on

In [14]:
result = demo.predict(X)

In [15]:
print(str(result))

<dask_cudf.Series | 16 tasks | 8 npartitions>


In [16]:
print(str(result.compute()))

0    1
1    0
2    0
3    1
4    1
5    2
6    3
7    2
8    0
9    1
[70 more rows]
dtype: int32
