# Benchmarking `graph-tool`

In [None]:
%%bash
# Register the graph-tool APT repository, import its signing key and install
# the Python 3 bindings.
SOURCE="deb [ arch=amd64 ] https://downloads.skewed.de/apt bionic main"
# Append the repository line only if it is not already present, so that
# re-running this cell does not pile up duplicate entries.
grep -qxF "$SOURCE" /etc/apt/sources.list 2>/dev/null || echo "$SOURCE" >> /etc/apt/sources.list
apt-key adv --keyserver keys.openpgp.org --recv-key 612DEFB798507F25
apt-get update
# -y: the cell has no interactive stdin, so the confirmation prompt has to be
# answered up front or the install aborts.
apt-get install -y python3-graph-tool

In [None]:
# networkx provides the DiGraph built by the 'nx' format of create_graph_from_json.
!pip3 install networkx

In [3]:
import time

import networkx as nx
import graph_tool.all as gt



In [4]:
from google.colab import drive

# Mount Google Drive so that files under '/content/drive/' (where the dataset
# lives) can be read like local files.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [5]:
import json

def create_graph_from_file(filename, format='nx'):
    """Load a JSON graph description from disk and build a graph from it.

    Args:
        filename: Path of the JSON file to read. Its parsed content is handed
            to create_graph_from_json unchanged.
        format: Forwarded to create_graph_from_json ('nx' or 'gt').

    Returns:
        Whatever create_graph_from_json returns for the parsed data.
    """
    with open(filename, 'r') as source:
        payload = json.load(source)
    return create_graph_from_json(payload, format=format)


def create_graph_from_json(data, format='nx'):
    """Build a directed graph from an already-parsed JSON graph description.

    Args:
        data: Mapping with a 'nodes' list (each node is a mapping with an
            'id' key) and a 'links' list (each link is a mapping with
            'source', 'target' and 'value' keys).
        format: 'nx' to build a networkx DiGraph, 'gt' to build a graph-tool
            Graph.

    Returns:
        For 'nx': an nx.DiGraph keyed by node id; every key of a node mapping
        is stored as a node attribute and link['value'] as the edge attribute
        'weight'.
        For 'gt': a directed gt.Graph with a string vertex property 'id' and
        an int edge property 'weight'.

    Raises:
        ValueError: if format is neither 'nx' nor 'gt'.
        KeyError: if a link refers to an id missing from 'nodes' ('gt' only).
    """
    nodes = data['nodes']
    links = data['links']

    if format == 'nx':
        g = nx.DiGraph()
        for node in nodes:
            g.add_node(node['id'], **node)
        for link in links:
            g.add_edge(link['source'], link['target'], weight=link['value'])
    elif format == 'gt':
        g = gt.Graph(directed=True)
        g.vertex_properties['id'] = g.new_vp('string')
        g.edge_properties['weight'] = g.new_ep('int')
        # Look the property maps up once rather than on every iteration.
        vertex_id = g.vertex_properties['id']
        edge_weight = g.edge_properties['weight']
        vertices = {}
        for node in nodes:
            node_id = node['id']
            # A repeated id must map to a single vertex (as it does in the
            # 'nx' branch); adding another one would leave an orphan vertex
            # in the graph that no edge can ever reach.
            if node_id in vertices:
                continue
            v = g.add_vertex()
            vertices[node_id] = v
            vertex_id[v] = node_id
        for link in links:
            e = g.add_edge(vertices[link['source']], vertices[link['target']])
            edge_weight[e] = link['value']
    else:
        raise ValueError("format must be nx or gt")
    return g


In [6]:
# Location of the input graph (JSON) on the mounted Google Drive.
dataset = (
    '/content/drive/My Drive/'
    'Linked Commons Colab Notebooks/fdg_input_file.json'
)

In [7]:
# Load the dataset as a graph-tool graph ('gt' format).
gtG = create_graph_from_file(dataset, format='gt')

In [8]:
# Time one weighted PageRank run on the graph-tool graph and print the
# elapsed seconds.
# perf_counter() is the high-resolution monotonic clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
tic = time.perf_counter()
_ = gt.pagerank(gtG, epsilon=1e-4, weight=gtG.ep.weight)
toc = time.perf_counter()
print(toc - tic)

1.3653359413146973


# Benchmarking `cugraph`

The following code is adapted from [this Medium post](https://medium.com/dropout-analytics/installing-rapids-ai-in-google-colab-87c247f2c468), which is itself a direct copy of the code in [RAPIDS AI's GitHub repository](https://github.com/rapidsai/rapidsai-csp-utils).

In [9]:
import pynvml

# Abort early unless GPU 0 is one of the models this notebook accepts.
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
device_name = pynvml.nvmlDeviceGetName(handle)
# Depending on the pynvml release the name is returned as bytes or as str;
# normalise to bytes so the comparison below works with either.
if isinstance(device_name, str):
  device_name = device_name.encode()
if device_name not in (b'Tesla T4', b'Tesla P4', b'Tesla P100-PCIE-16GB'):
  raise Exception("""
    Unfortunately this instance does not have a T4, P4 or P100 GPU.
    
    Please make sure you've configured Colab to request a GPU instance type.
    
    Sometimes Colab allocates a Tesla K80 instead of a T4, P4 or P100. Resetting the instance.
If you get a K80 GPU, try Runtime -> Reset all runtimes...
  """)
else:
  print('Woo! You got the right kind of GPU!')

Woo! You got the right kind of GPU!


In [None]:
# Install RAPIDS
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/rapids-colab.sh stable

import sys, os

dist_package_index = sys.path.index('/usr/local/lib/python3.6/dist-packages')
sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.6/site-packages'] + sys.path[dist_package_index:]
sys.path
exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

In [11]:
import time

import rmm
import cudf
import cugraph

In [12]:
from google.colab import drive

# Mount Google Drive; when it is already mounted this only prints a notice.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [13]:
# Check how cugraph treats parallel edges: the edge 0 -> 1 is listed twice.
src = [0, 0, 1]
dst = [1, 1, 3]
df = cudf.DataFrame({'src': src, 'dst': dst})
display(df)

G = cugraph.DiGraph()
G.from_cudf_edgelist(df, source='src', destination='dst', renumber=False)

# Both copies of 0 -> 1 show up in the degree counts, i.e. parallel edges
# work as expected.
display(G.degrees())

Unnamed: 0,src,dst
0,0,1
1,0,1
2,1,3


Unnamed: 0,vertex,in_degree,out_degree
0,0,0,2
1,1,2,1
2,2,0,0
3,3,1,0


In [14]:
# Input graph on Google Drive and the two edge-list files derived from it.
dataset = (
    '/content/drive/My Drive/'
    'Linked Commons Colab Notebooks/fdg_input_file.json'
)
csv, csv_multi_edges = 'edges.csv', 'multi_edges.csv'

In [15]:
import json
import os

def json_to_csv(in_file, out_file, parallel_edges=False):
    """Write the links of a JSON graph file as a headerless 'src,dst' edge list.

    Node ids are replaced by consecutive integers starting at 0, assigned in
    order of first appearance in data['nodes'].

    Args:
        in_file: Path of a JSON file containing 'nodes' (each with an 'id')
            and 'links' (each with 'source' and 'target', plus 'value' when
            parallel_edges is True).
        out_file: Path of the CSV file to write; an existing file is
            overwritten.
        parallel_edges: If True, every link is written link['value'] times;
            if False, exactly once regardless of its value.

    Raises:
        KeyError: if a link refers to an id that is not listed in 'nodes'.
    """
    with open(in_file, 'r') as f:
        data = json.load(f)
    nodes = data['nodes']
    links = data['links']

    # NOTE: only the id -> index mapping is built; no inverse mapping is kept.
    ids = {}
    for node in nodes:
        # setdefault keeps the index of the first occurrence of a repeated id.
        ids.setdefault(node['id'], len(ids))

    with open(out_file, 'w') as f:
        for link in links:
            line = '{},{}\n'.format(ids[link['source']], ids[link['target']])
            if parallel_edges:
                # One write of the repeated line instead of `value` separate
                # writes; a value <= 0 yields an empty string, i.e. no rows.
                line *= link['value']
            f.write(line)

In [16]:
# Generate each edge-list CSV only when it is not on disk yet (the conversion
# is slow) and print how many seconds the conversion took.
for out_path, with_parallel_edges in [(csv, False), (csv_multi_edges, True)]:
    if os.path.exists(out_path):
        continue
    # perf_counter() is the clock intended for measuring elapsed intervals.
    tic = time.perf_counter()
    json_to_csv(dataset, out_path, parallel_edges=with_parallel_edges)
    toc = time.perf_counter()
    print(toc - tic)

18.615588665008545
238.70451498031616


In [17]:
!du -h edges.csv
!du -h multi_edges.csv

52M	edges.csv
9.5G	multi_edges.csv


The cell below gives some very simple benchmarks for `cugraph`. PageRank is quite fast (36 ms on ~400k edges), but without oversubscription enabled it fails on the larger case (with ~200x the number of edges). Oversubscription is supposed to be easy to enable with the `rmm` module, but Colab crashes every time I try to run a computation with `rmm` managing memory.

In [18]:
# FIXME: Colab crashes when trying to use managed memory
# rmm.reinitialize(managed_memory=True)
# assert rmm.is_initialized()

In [19]:
# For each edge list, time the three stages separately: reading the CSV into
# a cudf DataFrame, building the cugraph DiGraph, and running PageRank.
# perf_counter() is used because it is the high-resolution clock intended for
# interval timing, which matters for the sub-millisecond stages.
for edge_list in [csv, csv_multi_edges]:
    print("Analyzing:", edge_list)
    tic = time.perf_counter()
    gdf = cudf.read_csv(edge_list, names=['src', 'dst'])
    toc = time.perf_counter()
    print("Reading CSV:", toc - tic)

    tic = time.perf_counter()
    G = cugraph.DiGraph()
    G.from_cudf_edgelist(gdf, source='src', destination='dst', renumber=False)
    toc = time.perf_counter()
    print("Constructing graph:", toc - tic)

    tic = time.perf_counter()
    ranks = cugraph.pagerank(G, tol=1e-4)
    toc = time.perf_counter()
    print("Computing PageRank:", toc - tic)

    # Empty the graph before the next edge list is processed.
    G.clear()

Analyzing: edges.csv
Reading CSV: 1.4415392875671387
Constructing graph: 0.0004985332489013672
Computing PageRank: 0.15007328987121582
Analyzing: multi_edges.csv


MemoryError: ignored