In [1]:
!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading https://files.pythonhosted.org/packages/e6/f9/0626bbdb322e3a078d968e87e3b01341e7890544de891d0cb613641220e6/ipython-autotime-0.1.tar.bz2
Building wheels for collected packages: ipython-autotime
  Building wheel for ipython-autotime (setup.py) ... [?25ldone
[?25h  Created wheel for ipython-autotime: filename=ipython_autotime-0.1-cp36-none-any.whl size=1832 sha256=cf8212ebd06854749da83ed8b72941dc45a3deb2d8e0f4696871fb4fa33b7b25
  Stored in directory: /root/.cache/pip/wheels/d2/df/81/2db1e54bc91002cec40334629bc39cfa86dff540b304ebcd6e
Successfully built ipython-autotime
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.1


In [2]:
#-- IMPORT packages
import cudf
import cugraph
import numpy as np
from collections import OrderedDict
import time

time: 1.09 s


In [3]:
datafile = '../data/400K_Input.csv'
raw_data  = cudf.read_csv(datafile, delimiter=",", names=['node_1', 'node_2'], dtype=['str', 'str'], skiprows=1 )
raw_data.rename(columns={'node_1':'src_str', 'node_2':'dst_str'}, inplace=True)

time: 1.31 s


In [4]:
type(raw_data)

cudf.core.dataframe.DataFrame

time: 10.1 ms


In [5]:
print(len(raw_data))

720768
time: 689 µs


In [6]:
raw_data['src_hash'] = raw_data['src_str'].str.hash()
raw_data['dst_hash'] = raw_data['dst_str'].str.hash()

time: 183 ms


In [7]:
raw_data['src_str'].unique().count()

164400

time: 26.6 ms


In [8]:
raw_data['src_hash'].unique().count()

164395

time: 738 ms


In [9]:
raw_data['dst_str'].unique().count()

400000

time: 37.7 ms


In [10]:
raw_data['dst_hash'].unique().count()

399979

time: 17.5 ms


In [None]:
### the hash function in cudf does not look right as the unique counts differ after hashing

In [11]:
# Renumber the hash values to a smaller contiguous range 
raw_data['src'], raw_data['dst'], mapping = cugraph.renumber(raw_data['src_hash'], raw_data['dst_hash'])

time: 11.3 ms


In [13]:
raw_data['src'].nunique()

164395

time: 5.17 ms


In [15]:
raw_data['dst'].unique().count()

399979

time: 183 ms


In [19]:
gpuG = cugraph.DiGraph() # cugraph.DiGraph is directed graph
gpuG.add_edge_list(raw_data['src'], raw_data['dst'])

time: 2.61 ms


  Use from_cudf_edgelist instead')


In [20]:
print("Gpu Graph")
print("\tNumber of Vertices: " + str(gpuG.number_of_vertices()))
print("\tNumber of Edges:    " + str(gpuG.number_of_edges()))

Gpu Graph
	Number of Vertices: 404379
	Number of Edges:    720768
time: 2.02 ms


In [21]:
gpuG1 = cugraph.Graph() # cugraph.Graph is undirected graph
gpuG1.add_edge_list(raw_data['src'], raw_data['dst'])

time: 20.7 ms


In [22]:
print("Gpu Graph")
print("\tNumber of Vertices: " + str(gpuG1.number_of_vertices()))
print("\tNumber of Edges:    " + str(gpuG1.number_of_edges()))

Gpu Graph
	Number of Vertices: 404379
	Number of Edges:    1281531
time: 2.45 ms


## why the number of edges are ODD number for undirected cugraph.Graph()?

In [23]:
wcc = cugraph.weakly_connected_components(gpuG1)

time: 72.4 ms


In [24]:
label_gby = wcc.groupby('labels')
label_count = label_gby.count()
print("Total number of components found : ", len(label_count))

Total number of components found :  2
time: 19.8 ms


**NETWORKX RUN**

In [31]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

time: 699 ms


In [32]:
df = pd.read_csv(datafile, delimiter=",", names=['node_1', 'node_2'], skiprows=1)
df.rename(columns={'node_1':'src', 'node_2':'dst'}, inplace=True)

time: 757 ms


In [33]:
df.head()

Unnamed: 0,src,dst
0,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100FG@@ESP_PSR_10DC
1,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100FG@@ESP_PSR_10PLANT
2,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100SEMIFG@@ESP_PSR_10PLANT
3,ESP_PSR_100FG@@ESP_PSR_11PLANT,ESP_PSR_100FG@@ESP_PSR_11DC
4,ESP_PSR_100FG@@ESP_PSR_11PLANT,ESP_PSR_100FG@@ESP_PSR_11PLANT


time: 6.4 ms


In [34]:
print(len(df))

720768
time: 620 µs


In [35]:
df['src_hash'] = df['src'].apply(hash)
df['dst_hash'] = df['dst'].apply(hash)

time: 633 ms


In [36]:
#df['src_hash'].value_counts()
df['src_hash'].head()

0   -8046491048877958886
1   -8046491048877958886
2   -8046491048877958886
3    2235457228028694153
4    2235457228028694153
Name: src_hash, dtype: int64

time: 4.41 ms


In [37]:
cpuG=nx.from_pandas_edgelist(df, source='src_hash', target='dst_hash',create_using=nx.DiGraph)

time: 3.31 s


In [38]:
print("cpu Graph")
print("\tNumber of Vertices: " + str(cpuG.number_of_nodes()))
print("\tNumber of Edges:    " + str(cpuG.number_of_edges()))

cpu Graph
	Number of Vertices: 404400
	Number of Edges:    720768
time: 340 ms


In [39]:
print("\tNumber weakly connected components: " + str(nx.number_weakly_connected_components(cpuG)))

	Number weakly connected components: 16
time: 1.71 s
