In [1]:
from dask.distributed import Client
import subprocess as subp
import os

import dask

In [2]:
from dask_cuda import LocalCUDACluster
# Create a LocalCUDACluster with its default arguments.
cluster = LocalCUDACluster()

In [3]:
# Connect a client to the cluster; the bare `c` on the last line makes
# the notebook display the client summary.
c = Client(cluster)
c

0,1
Client  Scheduler: tcp://127.0.0.1:37140  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 50.39 GB


The first demonstration shows that NCCL works within Dask as expected. This includes:
1. Creating a NCCL clique, using Dask workers to broadcast the ncclUniqueId
2. Demonstrating that the NCCL clique successfully performs collective comms
3. Making all calls to the underlying NCCL comm through the cuML comms facade

In [4]:
from nccl_example import NCCL_Clique, unique_id
import random
from dask.distributed import wait

The second demonstration shows that we can take a Dask cuDF and utilize NCCL on the workers hosting its partitions. This includes:
1. Initializing a NCCL clique with the workers that host the partitions underlying the Dask cuDF
2. Performing a collective comm operation across the partitions, using only the cuML comms facade to communicate with the NCCL comm
3. Outputting a Dask cuDF with the results of the collective comm

In [5]:
import dask_cudf
import numpy as np
import cudf
import pandas as pd
from tornado import gen
from dask.distributed import default_client
import dask.dataframe as dd
from toolz import first
import logging

In [6]:
def parse_host_port(address):
    """
    Split a worker address into its host and port.

    An optional scheme prefix (everything up to and including the last
    ``://``) is discarded first. The remainder is split on its *last*
    colon, so a host that itself contains colons (for example a bracketed
    IPv6 literal such as ``[::1]``) is kept intact instead of raising.

    :param address: address string such as ``tcp://127.0.0.1:37140``
                    or ``127.0.0.1:37140``
    :return: ``(host, port)`` tuple with ``port`` converted to ``int``
    :raises ValueError: if there is no ``:`` separator or the port part
                        is not an integer
    """
    if '://' in address:
        address = address.rsplit('://', 1)[1]
    # maxsplit=1 from the right: only the final colon separates the port.
    host, port = address.rsplit(':', 1)
    return host, int(port)


def build_host_dict(workers):
    """
    Group worker addresses by host.

    Each address is parsed with ``parse_host_port``; duplicate
    ``(host, port)`` pairs are collapsed before grouping.

    :param workers: iterable of worker address strings
    :return: dict mapping each host to the set of ports seen for it
    """
    ports_by_host = {}
    unique_endpoints = {parse_host_port(addr) for addr in workers}
    for hostname, port_number in unique_endpoints:
        # setdefault creates the per-host set the first time a host appears.
        ports_by_host.setdefault(hostname, set()).add(port_number)
    return ports_by_host

def _build_host_dict(gpu_futures, client):
    """
    Build a mapping from host to the set of worker ports that hold the
    given futures.

    ``client.who_has`` is queried for the futures; for every key it
    reports, the first listed worker address is taken, and those
    addresses are grouped by host through ``build_host_dict``.

    :param gpu_futures: futures whose hosting workers should be looked up
    :param client: object exposing ``who_has(futures)``; its return value
                   is used directly as a mapping of key -> worker addresses
    :return: dict mapping host -> set of ports
    """
    who_has = client.who_has(gpu_futures)

    # Only the first worker listed for each key is considered.
    workers = [holders[0] for holders in who_has.values()]
    return build_host_dict(workers)

In [7]:
def gen_dask_cudf(m, n, demo):
    """
    Build a Dask cuDF filled with float32 ones (for verifying a reduction op).

    The frame has ``m * k`` rows and ``n`` columns named ``fea0 .. fea<n-1>``
    and is split into ``k`` partitions, where ``k`` is the first entry
    returned by ``demo.get_clique_size()``.

    :param m: row multiplier (rows = ``m * k``)
    :param n: number of columns
    :param demo: object exposing ``get_clique_size()``
    :return: result of ``dask_cudf.from_cudf`` with ``k`` partitions
    """
    worker_count = demo.get_clique_size()[0]
    ones = np.ones((m * worker_count, n), dtype=np.float32)

    # One pandas column per matrix column, keyed fea0, fea1, ...
    columns = {}
    for col_idx in range(ones.shape[1]):
        columns['fea%d' % col_idx] = ones[0:, col_idx]

    host_frame = pd.DataFrame(columns)
    gpu_frame = cudf.DataFrame.from_pandas(host_frame)
    return dask_cudf.from_cudf(gpu_frame, npartitions=worker_count)

In [8]:
def to_gpu_matrix(i):
    """Return the argument unchanged (identity function)."""
    return i

@gen.coroutine
def _get_mg_info(ddf):
    """
    Compute the partitions of a Dask DataFrame on the cluster and pair
    each one with the worker that holds it.

    :param ddf: a Dask DataFrame
    :return: (via ``gen.Return``) a tuple ``(gpu_data, cols)``.
             ``gpu_data`` is a list of ``(worker, future)`` pairs, where
             ``worker`` is the parsed ``(host, port)`` of the first worker
             reported for the partition and ``future`` is the result of
             submitting ``to_gpu_matrix`` on that worker. ``cols`` is the
             number of columns of ``ddf``.
    :raises Exception: if ``ddf`` is not a Dask DataFrame
    """
    client = default_client()

    # Guard clause: anything other than a Dask DataFrame is rejected.
    if not isinstance(ddf, dd.DataFrame):
        raise Exception("Input should be a Dask DataFrame")

    cols = len(ddf.columns)
    parts = client.compute(ddf.to_delayed())
    yield wait(parts)

    futures_by_key = {str(fut.key): fut for fut in parts}
    who_has = yield client.who_has(parts)

    # (parsed address of the first holder, partition future) for every key.
    placements = [
        (parse_host_port(first(holders)), futures_by_key[key])
        for key, holders in who_has.items()
    ]

    # Submit each partition pinned to the worker that already holds it.
    gpu_data = [
        (address, client.submit(to_gpu_matrix, fut, workers=[address]))
        for address, fut in placements
    ]

    yield wait(gpu_data)

    raise gen.Return((gpu_data, cols))

In [9]:
def on_worker(rank, gpu_matrix):
    """
    Function to run on cudf partitions which:
    1. extracts the underlying ctypes pointer,
    2. calls reduce on it in C++, and
    3. provides a single result array on the root worker (lowest rank) that
       can be used to construct an output cudf.

    NOTE(review): the steps above describe the intended behaviour only.
    The body is currently a placeholder that ignores both arguments and
    returns the constant 1.
    """
    # Placeholder result; neither `rank` nor `gpu_matrix` is read.
    return 1

In [10]:
from dask import delayed

In [5]:
class Dask_NCCL_Demo:
    """
    Drives a NCCL clique from a Dask client.

    ``init`` creates one ``NCCL_Clique`` object per worker; the remaining
    methods submit small tasks that call into those objects.

    Several task functions take a trailing argument ``r`` that they never
    read. Callers pass ``random.random()`` for it -- presumably so every
    submission gets a distinct task key and is executed again rather than
    answered from an earlier identical task (TODO confirm).

    All cluster interaction goes through ``self.client``; no notebook
    globals are read.
    """

    def __init__(self, client):
        """
        :param client: Dask distributed client used for every submission
        """
        self.client = client

    @staticmethod
    def func_parse_host_port_(address):
        """
        Split a worker address into ``(host, port)``.

        A scheme prefix ending in ``://`` is dropped, then the remainder is
        split on its last colon so hosts containing colons (e.g. ``[::1]``)
        are accepted.

        :param address: e.g. ``tcp://127.0.0.1:37140``
        :return: ``(host, port)`` with ``port`` as ``int``
        :raises ValueError: if no port separator is present or the port is
                            not an integer
        """
        if '://' in address:
            address = address.rsplit('://', 1)[1]
        host, port = address.rsplit(':', 1)
        return host, int(port)

    @staticmethod
    def func_init_(workerId, nWorkers, uniqueId):
        """
        Task run on a worker: create its ``NCCL_Clique`` and join the clique.

        :param workerId: index assigned to this worker by ``init``
        :param nWorkers: total number of workers taking part
        :param uniqueId: id passed to ``NCCL_Clique.create_clique``
        :return: the created ``NCCL_Clique`` object
        """
        w = dask.distributed.get_worker()

        print("UNIQUEID: " + str(uniqueId))

        print("Hello World! from ip=%s worker=%s/%d" %
              (w.address, w.name, nWorkers))
        a = NCCL_Clique(workerId, nWorkers)
        a.create_clique(uniqueId)

        return a

    @staticmethod
    def func_get_size_(world, r):
        """Task: return ``world.get_clique_size()``; ``r`` is unused."""
        return world.get_clique_size()

    @staticmethod
    def func_get_rank_(world, r):
        """Task: return ``world.get_rank()``; ``r`` is unused."""
        return world.get_rank()

    @staticmethod
    def func_get_device_(world, r):
        """Task: return ``world.get_device()``; ``r`` is unused."""
        return world.get_device()

    @staticmethod
    def func_test_all_reduce_(world, r):
        """Task: return ``world.test_all_reduce()``; ``r`` is unused."""
        return world.test_all_reduce()

    @staticmethod
    def func_test_reduce_on_partition(world_df, root_rank, r):
        """
        Task: call ``world.test_on_partition(df, root_rank)``.

        :param world_df: ``(world, df)`` pair
        :param root_rank: rank forwarded to ``test_on_partition``
        :param r: unused
        :return: ``None`` (the result of ``test_on_partition`` is discarded)
        """
        world, df = world_df
        world.test_on_partition(df, root_rank)

    def test_on_dask_cudf(self, dask_cudf):
        """
        Run ``func_test_reduce_on_partition`` over the partitions of a
        Dask cuDF, pairing each partition with the clique object that lives
        on the same worker.

        ``init`` must have been called first so that ``self.clique`` exists.

        :param dask_cudf: the Dask cuDF to process (inside this method the
                          name shadows the ``dask_cudf`` module, which is
                          not needed here)
        """
        # Keep the futures around so the GPU memory doesn't get
        # deallocated on the workers.
        gpu_futures, _cols = self.client.sync(_get_mg_info, dask_cudf)

        # Combine dask_cudf partitions with their local "world" instance.
        worker_world_map = {worker: world for _, worker, world in self.clique}
        root_rank = self.get_rank()[0]

        combined = [delayed((worker_world_map[w], f)) for w, f in gpu_futures]
        combined = self.client.compute(combined)
        wait(combined)

        print(str(combined))

        f = [self.client.submit(Dask_NCCL_Demo.func_test_reduce_on_partition,
                                pair, root_rank, random.random())
             for pair in combined]
        wait(f)

    def worker_ranks(self):
        """
        :return: dict mapping each worker ``(host, port)`` in the clique to
                 the index it was given by ``init``
        """
        return {worker: idx for idx, worker, _ in self.clique}

    def run_func_on_workers(self, func):
        """
        Submit ``func(world, random_number)`` once per clique member and
        block until all submissions finish.

        :param func: callable taking ``(world, r)``
        :return: list with one result per clique member, in clique order
        """
        futures = [self.client.submit(func, world, random.random())
                   for _, _, world in self.clique]
        wait(futures)
        return [fut.result() for fut in futures]

    def get_workers_(self):
        """
        :return: list of ``(host, port)`` tuples, one per key of
                 ``self.client.has_what()``
        """
        return [Dask_NCCL_Demo.func_parse_host_port_(address)
                for address in self.client.has_what().keys()]

    def init(self, uniqueId):
        """
        Create one ``NCCL_Clique`` per worker and remember the resulting
        futures in ``self.clique`` as ``(index, worker, future)`` tuples.

        :param uniqueId: id handed to every worker's ``func_init_`` call
        """
        workers = self.get_workers_()
        n_workers = len(workers)

        # Each submission is pinned to the worker it initialises.
        self.clique = [(idx, worker,
                        self.client.submit(Dask_NCCL_Demo.func_init_,
                                           idx,
                                           n_workers,
                                           uniqueId,
                                           workers=[worker]))
                       for idx, worker in enumerate(workers)]

    def get_clique_size(self):
        """:return: list of ``get_clique_size()`` results, one per worker"""
        return self.run_func_on_workers(Dask_NCCL_Demo.func_get_size_)

    def get_rank(self):
        """:return: list of ``get_rank()`` results, one per worker"""
        return self.run_func_on_workers(Dask_NCCL_Demo.func_get_rank_)

    def test_all_reduce(self):
        """:return: list of ``test_all_reduce()`` results, one per worker"""
        return self.run_func_on_workers(Dask_NCCL_Demo.func_test_all_reduce_)
    
    

In [6]:
# Obtain an id from nccl_example.unique_id(); it is later passed to
# demo.init(), which forwards it to every worker.
world_id = unique_id()
world_id

b'\xe49\xc5\xd5d\x7f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00_exec_result_changed-46a\x00\x00\x00\x00\x00\x00\x00\x00\x12\x011aaf29\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x0b\x00\x00\x00\x01\x00\x00\x00e\x7f\x00\x00\xa08\xc5\xd5d\x7f\x00\x00\xd8 \xe7\x05e\x7f\x00\x00  0     0       0       \x00\x00 0\x00\x00}'

In [7]:
# Wrap client `c` in the demo helper, then create one NCCL_Clique per
# worker using the shared id.
demo = Dask_NCCL_Demo(c)
demo.init(world_id)



In [None]:
# One get_clique_size() result per worker in the clique.
demo.get_clique_size()

In [None]:
# One get_rank() result per worker in the clique.
demo.get_rank()

In [None]:
# Mapping of worker (host, port) -> index assigned in demo.init().
demo.worker_ranks()

In [None]:
# Run test_all_reduce() on every clique member and collect the results.
demo.test_all_reduce()

In [None]:
# Parsed (host, port) of every worker reported by client.has_what().
demo.get_workers_()

In [None]:
# The (index, worker, future) tuples stored by demo.init().
demo.clique

In [122]:
# m=2000, n=50: gen_dask_cudf builds (2000 * k) rows x 50 columns of ones in
# k partitions, where k = demo.get_clique_size()[0]; then run the
# partition-reduction test on it.
X = gen_dask_cudf(2000, 50, demo)
demo.test_on_dask_cudf(X)

[<Future: status: finished, type: tuple, key: tuple-769090da-7ad2-4935-a8e6-5060fadd03a8>, <Future: status: finished, type: tuple, key: tuple-d3537952-7048-4033-a6a0-d20d91794467>]
