In [1]:
%store -r data_dir
%store -r filename
input_data_path = (data_dir + filename).rstrip('.gz')

In [2]:
import time
from dask.distributed import Client, wait
import dask_cudf
import cugraph
from dask_cuda import LocalCUDACluster
# https://github.com/rapidsai/cugraph/blob/branch-21.12/python/cugraph/cugraph/dask/link_analysis/pagerank.py
import cugraph.dask as dcg
# import cugraph.dask.link_analysis.pagerank as dcg
import cugraph.comms as Comms

In [3]:
# Bring up the multi-GPU environment used by the rest of the notebook.
# Statement order matters: the client wraps the cluster, and the comms
# initialisation presumably relies on the client created here -- confirm.
# NOTE(review): LocalCUDACluster presumably starts one Dask worker per visible
# GPU; each worker is limited to a single thread here.
cluster = LocalCUDACluster(threads_per_worker=1)
client = Client(cluster)
# Initialise the cuGraph comms layer with p2p enabled.
Comms.initialize(p2p=True)

In [4]:
# Helper function to set the reader chunk size to automatically get one partition per GPU
chunksize = dcg.get_chunksize(input_data_path)

# Start timer
t_start = time.time()

# Multi-GPU CSV reader (lazy: nothing is read until the frame is persisted below)
e_list = dask_cudf.read_csv(input_data_path,
                            chunksize=chunksize,
                            delimiter=' ',
                            names=['src', 'dst'],
                            dtype=['int32', 'int32'])

# Materialise the edge list exactly once and keep working with the persisted
# frame. Previously the partitions were computed through a throw-away `tmp`
# handle while `e_list` itself stayed lazy: the computed copy stayed pinned in
# GPU memory and the graph was still built on the lazy frame.
e_list = e_list.persist()
wait(e_list)

# Build the directed graph from the already-loaded edge list.
# NOTE(review): newer cuGraph releases spell this cugraph.Graph(directed=True);
# confirm against the installed version before switching.
G = cugraph.DiGraph()
G.from_dask_cudf_edgelist(e_list, source='src', destination='dst')

# Print time
print(time.time()-t_start, "s")

29.640702724456787 s


In [6]:
# Time the multi-GPU PageRank call on G
t_start = time.time()

# PageRank scores, capped at 20 iterations
# (alternative kept for reference: pr_ddf = dcg.pagerank(G, tol=1e-4))
pr_ddf = dcg.pagerank(G, max_iter=20)

# Report the elapsed wall-clock time
elapsed = time.time() - t_start
print(elapsed, "s")

MemoryError: std::bad_alloc: CUDA error at: /opt/conda/envs/rapids/include/rmm/mr/device/cuda_memory_resource.hpp:69: cudaErrorMemoryAllocation out of memory

In [None]:
# Time the gather-and-rank step
t_start = time.time()

# Collect the Dask DataFrame into a regular cuDF DataFrame
pr_df = pr_ddf.compute()

# Order rows from highest to lowest 'pagerank' value
pr_sorted_df = pr_df.sort_values(by='pagerank', ascending=False)

# Report the elapsed wall-clock time
elapsed = time.time() - t_start
print(elapsed, "s")

# Show the three highest-scoring rows
top_3 = pr_sorted_df.head(3)
print(top_3)

In [None]:
# Tear down the multi-GPU environment in reverse order of creation.
# The comms layer was set up with Comms.initialize() earlier in the notebook,
# so it is released first, while the client it was created against is still
# alive; then the client and finally the cluster are closed.
Comms.destroy()
client.close()
cluster.close()