In [1]:
from dask.distributed import Client
import os

import dask

If you already have a Dask cluster running, set the scheduler address below. Otherwise, leave it as `None` and a local cluster will be created.

In [2]:
# Address of an already-running scheduler, e.g. "tcp://10.2.168.161:8786".
# None means: start a local CUDA cluster and connect to that instead.
scheduler_address = None

if scheduler_address is not None:
    c = Client(scheduler_address)
else:
    # Imported here so dask_cuda is only needed on the local-cluster path.
    from dask_cuda import LocalCUDACluster
    cluster = LocalCUDACluster()
    c = Client(cluster)
c

0,1
Client  Scheduler: tcp://127.0.0.1:35150  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 50.39 GB


In [3]:
from nccl_example import nccl, NCCL_Clique
import random
from dask.distributed import wait

In [4]:
from dask import delayed
import dask.dataframe as dd
import numba.cuda
import cudf
import numpy as np

In [5]:
from tornado import gen
from dask.distributed import default_client
from toolz import first
import logging
import dask.dataframe as dd

import dask_cudf
import numpy as np
import cudf
import pandas as pd

from dask.distributed import wait


def parse_host_port(address):
    """
    Split a worker/scheduler address into its host and port.

    An optional ``scheme://`` prefix (e.g. ``tcp://``) is stripped first.
    The port is taken from the text after the *last* colon, so hosts that
    contain colons themselves (bracketed IPv6 literals such as
    ``tcp://[::1]:8786``) are parsed instead of failing on unpacking.

    :param address: address string such as ``"tcp://127.0.0.1:8786"``
    :return: ``(host, port)`` tuple, with ``port`` converted to ``int``
    :raises ValueError: if the address has no ``:port`` suffix or the
        port is not an integer
    """
    if '://' in address:
        address = address.rsplit('://', 1)[1]
    # rsplit with maxsplit=1: only the final colon separates host and port.
    host, port = address.rsplit(':', 1)
    return host, int(port)


def build_host_dict(workers):
    """
    Group worker ports by host.

    :param workers: iterable of worker address strings
    :return: dict mapping each host to the set of ports seen for that host
    """
    ports_by_host = {}
    # Parsing into a set first collapses repeated addresses.
    for hostname, port_number in {parse_host_port(addr) for addr in workers}:
        ports_by_host.setdefault(hostname, set()).add(port_number)
    return ports_by_host

def _build_host_dict(gpu_futures, client):
    """
    Build a host -> ports dictionary for the workers holding the futures.

    For every key reported by ``client.who_has(gpu_futures)`` only the
    first listed worker is considered; those worker addresses are then
    grouped by host via ``build_host_dict``.

    :param gpu_futures: futures whose locations should be looked up
    :param client: object exposing ``who_has(futures)`` that returns a
        mapping of key -> sequence of worker addresses
    :return: dict mapping host -> set of ports
    :raises IndexError: if a key is reported with no workers
    """
    who_has = client.who_has(gpu_futures)

    # The original also built key->host and host->keys dictionaries that
    # were never used; only the worker list feeds the result.
    workers = [holders[0] for holders in who_has.values()]
    return build_host_dict(workers)

def to_gpu_matrix(i):
    """Return ``i`` unchanged (placeholder task body; no conversion is done)."""
    return i

@gen.coroutine
def _get_mg_info(ddf):
    """
    Compute the partitions of a Dask DataFrame and pair each one with the
    worker that holds it.

    Each partition is computed on the cluster, its location is looked up
    with ``who_has``, and a ``to_gpu_matrix`` task is submitted for it
    restricted to that worker.

    NOTE(review): ``to_gpu_matrix`` currently returns its input unchanged,
    so no conversion of the partitions takes place here.

    :param ddf: a ``dask.dataframe.DataFrame``
    :return: (via ``gen.Return``) a tuple ``(gpu_data, cols)`` where
        ``gpu_data`` is a list of ``(worker, future)`` pairs, ``worker``
        being the ``(host, port)`` tuple from ``parse_host_port``, and
        ``cols`` is ``len(ddf.columns)``
    :raises Exception: if ``ddf`` is not a Dask DataFrame
    """

    client = default_client()

    if isinstance(ddf, dd.DataFrame):
        cols = len(ddf.columns)
        # One delayed object per partition, turned into futures and awaited.
        parts = ddf.to_delayed()
        parts = client.compute(parts)
        yield wait(parts)
    else:
        raise Exception("Input should be a Dask DataFrame")

    # who_has is keyed by task key, so keep a key -> future lookup.
    key_to_part_dict = dict([(str(part.key), part) for part in parts])
    who_has = yield client.who_has(parts)

    worker_map = []
    for key, workers in who_has.items():
        # Only the first worker listed for a key is used.
        worker = parse_host_port(first(workers))
        worker_map.append((worker, key_to_part_dict[key]))

    # Submit one task per partition, restricted to the worker found above.
    gpu_data = [(worker, client.submit(to_gpu_matrix, part,
                                       workers=[worker]))
                for worker, part in worker_map]

    yield wait(gpu_data)

    # tornado coroutines return their value by raising gen.Return.
    raise gen.Return((gpu_data, cols))
    
import dask_cudf
    
def create_df(f, m, n):
    """
    Build one cuDF partition of ones.

    :param f: partition number; the row index starts at ``f * m``
    :param m: number of rows
    :param n: number of columns (named ``0`` .. ``n - 1``)
    :return: ``cudf.DataFrame`` of float32 ones with a RangeIndex covering
        ``[f * m, f * m + m)``
    """
    print("In function")
    X = np.ones((m, n), dtype = np.float32)

    print("Creatin df")
    # Each column is copied out of the host matrix as its own float32 array.
    columns = [(col, X[:, col].astype(np.float32)) for col in range(n)]
    first_row = f * m
    row_index = cudf.dataframe.RangeIndex(first_row, first_row + m, 1)
    ret = cudf.DataFrame(columns, index=row_index)

    print("done")
    return ret

def get_meta(df):
    """Return the zero-row slice ``df.iloc[:0]`` (same columns, no rows)."""
    return df.iloc[:0]

def gen_dask_cudf(nrows, ncols):
    """
    Create a Dask cuDF with one ``create_df`` partition per worker.

    Uses the notebook-level client ``c``.

    :param nrows: rows per partition
    :param ncols: number of columns
    :return: result of ``dask_cudf.from_delayed`` over the partition futures
    """
    worker_addresses = c.has_what().keys()

    # One task per worker, restricted to that worker; the worker's position
    # in the listing is passed as the partition number.
    partitions = [c.submit(create_df, part_no, nrows, ncols, workers=[address])
                  for part_no, address in enumerate(worker_addresses)]
    # Block until every partition has been built.
    wait(partitions)

    meta = c.submit(get_meta, partitions[0]).result()
    return dask_cudf.from_delayed(partitions, meta=meta)
    


In [6]:
class Dask_NCCL_Demo:
    """
    Client-side driver that sets up one NCCL clique member per Dask worker
    and runs small test functions against them.

    ``init`` must be called before any method that uses the clique. All
    cluster calls go through the client passed to the constructor.
    """

    def __init__(self, client):
        """
        :param client: Dask distributed client used for every submission
        """
        self.client = client
        # Filled by init(): list of (rank, (host, port), future) triples,
        # where the future resolves to the tuple returned by func_init_.
        self.clique = []

    @staticmethod
    def func_parse_host_port_(address):
        """
        Split an address such as ``"tcp://127.0.0.1:8786"`` into
        ``(host, port)`` with an integer port.

        The port is taken after the last colon so that hosts containing
        colons (bracketed IPv6 literals) do not break the unpacking.

        :raises ValueError: if there is no ``:port`` suffix or the port is
            not an integer
        """
        if '://' in address:
            address = address.rsplit('://', 1)[1]
        host, port = address.rsplit(':', 1)
        return host, int(port)

    @staticmethod
    def func_init_(workerId, nWorkers, uniqueId):
        """
        Worker-side: initialise NCCL for this rank and wrap the resulting
        comm in an ``NCCL_Clique``.

        :param workerId: rank assigned to this worker
        :param nWorkers: total number of workers in the clique
        :param uniqueId: NCCL unique id shared by all workers
        :return: tuple ``(nccl instance, NCCL_Clique instance)``
        """
        w = dask.distributed.get_worker()

        print("UNIQUEID: "  + str(uniqueId))

        print("Hello World! from ip=%s worker=%s/%d" %
              (w.address, w.name, nWorkers))

        n = nccl()
        n.init(nWorkers, uniqueId, workerId)

        print("Rank in python: " + str(n.user_rank()))

        a = NCCL_Clique(workerId, nWorkers)
        a.create_clique(n.get_comm())

        return (n, a)

    # The worker-side helpers below take the (nccl, NCCL_Clique) tuple
    # produced by func_init_ as ``world``. ``r`` is an unused value; callers
    # pass random.random() so every submission has distinct arguments.

    @staticmethod
    def func_get_size_(world, r):
        """Worker-side: return ``get_clique_size()`` of the clique object."""
        return world[1].get_clique_size()

    @staticmethod
    def func_get_rank_(world, r):
        """Worker-side: return ``get_rank()`` of the clique object."""
        return world[1].get_rank()

    @staticmethod
    def func_get_device_(world, r):
        """Worker-side: return ``get_device()`` of the clique object."""
        return world[1].get_device()

    @staticmethod
    def func_test_all_reduce_(world, r):
        """Worker-side: return ``test_all_reduce()`` of the clique object."""
        return world[1].test_all_reduce()

    @staticmethod
    def get_meta(df):
        """Return the zero-row slice ``df.iloc[:0]``."""
        return df.iloc[:0]

    @staticmethod
    def p(i, r):
        """Worker-side debug helper: print the combined (world, df) value."""
        print("COMB: " + str(i))

    @staticmethod
    def func_test_reduce_on_partition(world_df, root_rank, r):
        """
        This function is executed on the workers and performs the necessary
        data preparation, as well as calling the cython-wrapped C++
        "algorithm" function(s), returning a cuDF with results, if necessary.

        The client will construct a Dask cuDF out of the cuDFs returned from
        this function, if necessary.

        :param world_df: tuple ``((nccl, NCCL_Clique), df)``
        :param root_rank: rank that receives the result
        :param r: unused; see the note above the worker-side helpers
        :return: a cuDF with one column per output column on the root rank,
            ``None`` on every other rank
        """
        nccl_world, df = world_df
        _, world = nccl_world
        is_root = world.get_rank() == root_rank

        # Build the full-size output array only for the root rank; other
        # ranks pass a 1x1 placeholder.
        if is_root:
            out_gpu_mat = numba.cuda.to_device(
                np.zeros((df.shape[0], df.shape[1]),
                         dtype=np.float32, order = "F"))
            out_df = cudf.DataFrame(
                index=cudf.dataframe.RangeIndex(0, df.shape[0]))
        else:
            out_gpu_mat = numba.cuda.device_array((1, 1), dtype=np.float32)
            out_df = None

        # Execute our cython-wrapped C++ "algorithm"
        world.test_on_partition(df, root_rank, out_gpu_mat)

        # Build cudf with results
        if is_root:
            for i in range(0, out_gpu_mat.shape[1]):
                out_df[str(i)] = out_gpu_mat[:, i]
        return out_df

    def test_end_to_end(self, dask_cudf):
        """
        An end to end example to mimic a typical Dask cuML algorithm using
        OPG semantics. This function is executed on the client.

        The following steps are taken with Dask cuDF as input:
        1. Co-locate Dask cuDF partitions with cuml NCCL clique object on
           each worker
        2. Run the algorithm, extracting the Numba device memory pointer for
           each partition and allocating necessary output memory on device
           for constructing the cuDF partition(s) that will be returned to
           the user.
        3. Construct Dask cuDF from the futures containing the cuDFs
           returned from local "algorithm" functions on each worker.

        :param dask_cudf: the Dask cuDF to run on (previously ignored in
            favour of the notebook global ``X``)
        :return: Dask DataFrame built from the non-``None`` results
        :raises RuntimeError: if ``init`` has not been called
        """
        if not self.clique:
            raise RuntimeError(
                "init() must be called before test_end_to_end()")

        # Keep the futures around so the GPU memory doesn't get
        # deallocated on the workers.
        gpu_futures, cols = self.client.sync(_get_mg_info, dask_cudf)

        worker_world_map = {worker: world for _, worker, world in self.clique}

        # Combine dask_cudf partitions with their local "world" instance.
        combined = [delayed((worker_world_map[w], part))
                    for w, part in gpu_futures]
        combined = self.client.compute(combined)
        wait(combined)

        debug = [self.client.submit(Dask_NCCL_Demo.p, pair, random.random())
                 for pair in combined]
        wait(debug)

        # Run our "algorithm" to perform reduce
        results = [self.client.submit(
                       Dask_NCCL_Demo.func_test_reduce_on_partition,
                       pair, 0, random.random())
                   for pair in combined]
        wait(results)

        # Convert result back into a dask_cudf; non-root ranks returned None.
        dfs = [d for d in results if d.type is not type(None)]
        meta = self.client.submit(Dask_NCCL_Demo.get_meta, dfs[0]).result()
        ddf = dd.from_delayed(dfs, meta=meta)

        return ddf

    def worker_ranks(self):
        """Return a dict mapping each worker ``(host, port)`` to its rank."""
        return {worker: rank for rank, worker, _ in self.clique}

    def run_func_on_workers(self, func):
        """
        Submit ``func(world, random_value)`` once per clique member and
        return the results in clique order.

        :param func: callable taking ``(world, r)``
        :return: list of results, one per clique member
        """
        futures = [self.client.submit(func, world, random.random())
                   for _, _, world in self.clique]
        wait(futures)
        return [fut.result() for fut in futures]

    def get_workers_(self):
        """Return ``(host, port)`` for every worker in ``client.has_what()``."""
        return [Dask_NCCL_Demo.func_parse_host_port_(address)
                for address in self.client.has_what().keys()]

    def init(self, uniqueId):
        """
        Submit one ``func_init_`` task per worker and record the clique.

        Ranks are assigned from each worker's position in the worker list.

        :param uniqueId: NCCL unique id passed to every worker
        """
        workers = self.get_workers_()
        n_workers = len(workers)

        self.clique = [(rank, worker,
                        self.client.submit(Dask_NCCL_Demo.func_init_,
                                           rank,
                                           n_workers,
                                           uniqueId,
                                           workers=[worker]))
                       for rank, worker in enumerate(workers)]

    def get_clique_size(self):
        """Return the clique size reported by each clique member."""
        return self.run_func_on_workers(Dask_NCCL_Demo.func_get_size_)

    def get_rank(self):
        """Return the rank reported by each clique member."""
        return self.run_func_on_workers(Dask_NCCL_Demo.func_get_rank_)

    def test_all_reduce(self):
        """Return each clique member's ``test_all_reduce()`` result."""
        return self.run_func_on_workers(Dask_NCCL_Demo.func_test_all_reduce_)

The first demonstration shows that NCCL works within Dask as expected. This includes:
1. Creating an NCCL clique, using Dask workers to broadcast the ncclUniqueId
2. Demonstrating that the NCCL clique successfully performs collective comms
3. Making all calls to the underlying NCCL comm through the cuML comms facade.

In [7]:
# Obtain the NCCL unique id in the notebook process; it is passed to demo.init() below.
world_id = nccl.get_unique_id()

In [8]:
# Create the driver and submit one func_init_ task per worker with the shared id.
demo = Dask_NCCL_Demo(c)
demo.init(world_id)

In [9]:
# Clique size as reported by each worker's clique object.
demo.get_clique_size()

[2, 2]

In [10]:
# Rank as reported by each worker's clique object.
demo.get_rank()

[0, 1]

In [11]:
# Mapping of worker (host, port) to the rank index assigned in init().
demo.worker_ranks()

{('127.0.0.1', 39929): 0, ('127.0.0.1', 42830): 1}

In [12]:
# Every worker's test_all_reduce() result must be True.
for result in demo.test_all_reduce():
    assert result == True

In [13]:
# (host, port) of every worker listed by the client's has_what().
demo.get_workers_()

[('127.0.0.1', 39929), ('127.0.0.1', 42830)]

The second demonstration shows that we can take a Dask cuDF and utilize NCCL on the workers hosting its partitions. This includes:
1. Initializing an NCCL clique with the workers that host the partitions underlying the Dask cuDF
2. Performing a collective comm operation across the partitions, using only the cuML comms facade to communicate with the NCCL comm
3. Outputting a Dask cuDF with the results of the collective comm.

In [14]:
# One partition per worker, each with 10 rows and 50 columns.
X = gen_dask_cudf(10, 50)

In [15]:
# Show which task results each worker currently holds.
c.has_what()

{'tcp://127.0.0.1:39929': ('func_init_-b34ab2fa879a762bf5d150b8263d4d0c',
  'create_df-ff8074bc9c4621841eccc530ea22e510'),
 'tcp://127.0.0.1:42830': ('func_init_-ad11642d41827c695844bea66b71114c',
  'create_df-331847be44eed21b1abc59936bb6cc0a')}

In [16]:
# Run the reduce across X's partitions (root rank 0) and collect the output as a Dask DataFrame.
result = demo.test_end_to_end(X)

AttributeError: 'tuple' object has no attribute 'get_rank'

In [25]:
# print() already applies str() to its argument.
print(result)

<dask_cudf.DataFrame | 2 tasks | 1 npartitions>


In [26]:
# Materialise the result and print its text form.
print(result.compute())

      0     1     2     3     4     5     6 ...    49
0  10.0  10.0  10.0  10.0  10.0  10.0  10.0 ...  10.0
1  10.0  10.0  10.0  10.0  10.0  10.0  10.0 ...  10.0
2  10.0  10.0  10.0  10.0  10.0  10.0  10.0 ...  10.0
3  10.0  10.0  10.0  10.0  10.0  10.0  10.0 ...  10.0
4  10.0  10.0  10.0  10.0  10.0  10.0  10.0 ...  10.0
5  10.0  10.0  10.0  10.0  10.0  10.0  10.0 ...  10.0
6  10.0  10.0  10.0  10.0  10.0  10.0  10.0 ...  10.0
7  10.0  10.0  10.0  10.0  10.0  10.0  10.0 ...  10.0
8  10.0  10.0  10.0  10.0  10.0  10.0  10.0 ...  10.0
9  10.0  10.0  10.0  10.0  10.0  10.0  10.0 ...  10.0
[42 more columns]
