In [19]:
from dask.distributed import Client
import os

import dask

In [20]:
from dask_cuda import LocalCUDACluster
# Start a LocalCUDACluster with its default arguments; the resulting
# cluster object is handed to the Dask Client in the next cell.
cluster = LocalCUDACluster()

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


In [21]:
# Connect a Dask client to the cluster created above. The bare `c` on the
# last line makes the client the cell's displayed output.
c = Client(cluster)
c

0,1
Client  Scheduler: tcp://127.0.0.1:39400  Dashboard: http://127.0.0.1:35883/status,Cluster  Workers: 2  Cores: 2  Memory: 50.39 GB


In [22]:
from nccl_example import NCCL_Clique, unique_id
import random
from dask.distributed import wait

In [23]:
from dask import delayed
from helpers import _get_mg_info, gen_dask_cudf

In [24]:
class Dask_NCCL_Demo:
    
    def __init__(self, client):
        self.client = client
    
    @staticmethod
    def func_parse_host_port_(address):
        if '://' in address:
            address = address.rsplit('://', 1)[1]
        host, port = address.split(':')
        port = int(port)
        return host, port
    
    @staticmethod
    def func_init_(workerId, nWorkers, uniqueId):
        w = dask.distributed.get_worker()
        
        print("UNIQUEID: "  + str(uniqueId))
        
        print("Hello World! from ip=%s worker=%s/%d" % \
              (w.address, w.name, nWorkers))
        a = NCCL_Clique(workerId, nWorkers)
        a.create_clique(uniqueId)

        return a
    
    @staticmethod
    def func_get_size_(world, r):
        return world.get_clique_size()
    
    @staticmethod
    def func_get_rank_(world, r):
        return world.get_rank()
    
    @staticmethod
    def func_get_device_(world, r):
        return world.get_device()

    @staticmethod
    def func_test_all_reduce_(world, r):
        return world.test_all_reduce()
    
    @staticmethod
    def func_test_reduce_on_partition(world_df, root_rank, r):
        world, df = world_df
        world.test_on_partition(df, root_rank)
        
        
    def test_on_dask_cudf(self, dask_cudf):
        # Keep the futures around so the GPU memory doesn't get
        # deallocated on the workers.
        gpu_futures, cols = c.sync(_get_mg_info, X)
        
        # Combine dask_cudf partitions with their local "world" instance. 
        worker_world_map = dict(map(lambda x: (x[1], x[2]), self.clique))
        root_rank = demo.get_rank()[0]
        
        combined = list(map(delayed, [(worker_world_map[w], f) for w, f in gpu_futures]))
        combined = c.compute(combined)
        wait(combined)
        
        print(str(combined))

        f = [c.submit(Dask_NCCL_Demo.func_test_reduce_on_partition, f, root_rank, random.random()) for f in combined]
        wait(f)
        
    
    def worker_ranks(self):
        return dict(list(map(lambda x: (x[1], x[0]), self.clique)))
    
    
    def run_func_on_workers(self, func):
        f = [c.submit(func, a, random.random()) for i, w, a in self.clique]
        wait(f)
        return [a.result() for a in f]

    def get_workers_(self):
        return list(map(lambda x: Dask_NCCL_Demo.func_parse_host_port_(x), self.client.has_what().keys()))
    
    def init(self, uniqueId):
        workers = self.get_workers_()
        workers_indices = list(zip(workers, range(len(workers))))

        self.clique = [(idx, worker, self.client.submit(Dask_NCCL_Demo.func_init_, 
                                           idx, 
                                           len(workers), 
                                           uniqueId,
                                           workers=[worker]))
             for worker, idx in workers_indices]
        
    def get_clique_size(self):
        return self.run_func_on_workers(Dask_NCCL_Demo.func_get_size_)

    def get_rank(self):
        return self.run_func_on_workers(Dask_NCCL_Demo.func_get_rank_)
    
    def test_all_reduce(self):
        return self.run_func_on_workers(Dask_NCCL_Demo.func_test_all_reduce_)

The first demonstration shows that NCCL works within Dask as expected. This includes:
1. Creating an NCCL clique, using Dask workers to broadcast the ncclUniqueId
2. Demonstrating that the NCCL clique successfully performs collective comms
3. Making all calls to the underlying NCCL comm through the cuML comms facade

In [25]:
# Obtain the id that is later passed to demo.init(), which forwards it to
# every worker's create_clique() call.
world_id = unique_id()

b'\x02\x00\xd9\x81\xc0\xa8\x01\xcf\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00%\x7f\x00\x00\xc0Wi\xcb\xdaU\x00\x00p\x80\x11\xcd\xdaU\x00\x00>\xe2\xe8\x1a%\x7f\x00\x00`w\x15#%\x7f\x00\x000a\xff\xe2\xdaU\x00\x00\xcd\xbf\xba\x12\xcc\xb6\xf7\xc8\x8a\xd4\x00\x00\x00\x00\x00\x00\x00\xa7\xd1\xec#\x7f\x00\x00\x908\x1b\xec$\x7f\x00\x00\xaca\xe1"%\x7f\x00\x00x\xb6f\xf0#\x7f\x00\x00\x04\x00\x00\x00\x00\x00\x00'

In [26]:
# Wrap the client in the demo helper, then submit the clique-initialisation
# task to every worker using the id generated above.
demo = Dask_NCCL_Demo(c)
demo.init(world_id)

In [27]:
# Ask each clique member for the clique size it reports (one entry per worker).
demo.get_clique_size()

[2, 2]

In [28]:
# Ask each clique member for its rank (one entry per worker).
demo.get_rank()

[0, 1]

In [29]:
# Show which index init() assigned to each worker (host, port).
demo.worker_ranks()

{('127.0.0.1', 36376): 0, ('127.0.0.1', 43562): 1}

In [30]:
# Run test_all_reduce() on every clique member and collect the per-worker results.
demo.test_all_reduce()

[True, True]

In [31]:
# List the (host, port) pairs parsed from the worker addresses the client reports.
demo.get_workers_()

[('127.0.0.1', 36376), ('127.0.0.1', 43562)]

The second demonstration shows that we can take a Dask cuDF and use NCCL on the workers hosting its partitions. This includes:
1. Initializing an NCCL clique with the workers that host the partitions underlying the Dask cuDF
2. Performing a collective comm operation across the partitions, using only the cuML comms facade to communicate with the NCCL comm
3. Outputting a Dask cuDF with the results of the collective comm

In [33]:
# Build the input with the gen_dask_cudf helper, then run the
# partition-level test on it.
# NOTE(review): 2000 and 50 are presumably the row and column counts --
# confirm against helpers.gen_dask_cudf.
X = gen_dask_cudf(2000, 50, demo)
demo.test_on_dask_cudf(X)

[<Future: status: finished, type: tuple, key: tuple-5c283559-d1c9-416a-9c91-032aa81c49d6>, <Future: status: finished, type: tuple, key: tuple-859463ca-47ed-4225-a6b7-374b343a7e61>]
