In [1]:
!pip install ipython-autotime
%load_ext autotime



In [2]:
#-- IMPORT packages
import cudf
import cugraph
import numpy as np
from collections import OrderedDict
import time

time: 1.04 s


### create graph using Cugraph

In [3]:
datafile = '../data/400K_Input.csv'
gdf  = cudf.read_csv(datafile, delimiter=",", names=['node_1', 'node_2'], dtype=['str', 'str'], skiprows=1 )
gdf.rename(columns={'node_1':'src_str', 'node_2':'dst_str'}, inplace=True)

time: 1.17 s


In [4]:
type(gdf)

cudf.core.dataframe.DataFrame

time: 9.86 ms


In [5]:
print(len(gdf))

720768
time: 785 µs


In [6]:
gdf['src_hash'] = gdf['src_str'].hash_values()
gdf['dst_hash'] = gdf['dst_str'].hash_values()
gdf.head()

Unnamed: 0,src_str,dst_str,src_hash,dst_hash
0,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100FG@@ESP_PSR_10DC,-1046842473,-1231339202
1,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100FG@@ESP_PSR_10PLANT,-1046842473,1828407819
2,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100SEMIFG@@ESP_PSR_10PLANT,-1046842473,1516831772
3,ESP_PSR_100FG@@ESP_PSR_11PLANT,ESP_PSR_100FG@@ESP_PSR_11DC,-1708607005,-625024697
4,ESP_PSR_100FG@@ESP_PSR_11PLANT,ESP_PSR_100FG@@ESP_PSR_11PLANT,-1708607005,232932757


time: 92.8 ms


#### note that different hash values were computed for the SAME string on different columns! For example, row 1 has the same source and dst strings, but end up with different hash values. maybe a CUDF bug

In [7]:
# workaround to combine to a single column, hash, and then seperate
new_series=cudf.concat([gdf['src_str'],gdf['dst_str']])
new_series.count()

1441536

time: 153 ms


In [8]:
temp = new_series.hash_values()
gdf['src_hash'] = temp[:len(gdf)]

time: 43.2 ms


In [9]:
gdf['dst_hash'] = temp[len(gdf):]

time: 976 µs


In [10]:
gdf.head()

Unnamed: 0,src_str,dst_str,src_hash,dst_hash
0,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100FG@@ESP_PSR_10DC,1828407819,-1231339202
1,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100FG@@ESP_PSR_10PLANT,1828407819,1828407819
2,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100SEMIFG@@ESP_PSR_10PLANT,1828407819,1516831772
3,ESP_PSR_100FG@@ESP_PSR_11PLANT,ESP_PSR_100FG@@ESP_PSR_11DC,232932757,-625024697
4,ESP_PSR_100FG@@ESP_PSR_11PLANT,ESP_PSR_100FG@@ESP_PSR_11PLANT,232932757,232932757


time: 39.8 ms


#### the hash values are now correct for both columns

In [11]:
gdf['src_str'].unique().count()

164400

time: 25.4 ms


In [12]:
gdf['src_hash'].unique().count()

164400

time: 357 ms


In [13]:
gdf['dst_str'].unique().count()

400000

time: 34.5 ms


In [14]:
gdf['dst_hash'].unique().count()

400000

time: 15.5 ms


In [15]:
# Renumber the hash values to a smaller contiguous range 
gdf['src_renumbered'], gdf['dst_renumbered'], mapping = cugraph.renumber(gdf['src_hash'], gdf['dst_hash'])

time: 10.1 ms


In [16]:
gdf['src_renumbered'].nunique()

164400

time: 5.23 ms


In [17]:
gdf['dst_renumbered'].unique().count()

400000

time: 16.2 ms


In [18]:
gdf.head()

Unnamed: 0,src_str,dst_str,src_hash,dst_hash,src_renumbered,dst_renumbered
0,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100FG@@ESP_PSR_10DC,1828407819,-1231339202,227673,281558
1,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100FG@@ESP_PSR_10PLANT,1828407819,1828407819,227673,227673
2,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100SEMIFG@@ESP_PSR_10PLANT,1828407819,1516831772,227673,296886
3,ESP_PSR_100FG@@ESP_PSR_11PLANT,ESP_PSR_100FG@@ESP_PSR_11DC,232932757,-625024697,261409,296547
4,ESP_PSR_100FG@@ESP_PSR_11PLANT,ESP_PSR_100FG@@ESP_PSR_11PLANT,232932757,232932757,261409,261409


time: 43.9 ms


In [19]:
gpuG = cugraph.DiGraph() # cugraph.DiGraph is directed graph
gpuG.from_cudf_edgelist(gdf, source='src_renumbered', target='dst_renumbered')

time: 1.79 ms


In [20]:
print("Gpu Graph")
print("\tNumber of Vertices: " + str(gpuG.number_of_vertices()))
print("\tNumber of Edges:    " + str(gpuG.number_of_edges()))

Gpu Graph
	Number of Vertices: 404400
	Number of Edges:    720768
time: 1.96 ms


In [21]:
gpuG_und = cugraph.Graph() # cugraph.Graph is undirected graph. Cugraph WCC currently only supports undirected graph
gpuG_und.from_cudf_edgelist(gdf, source='src_renumbered', target='dst_renumbered')

time: 21.6 ms


In [22]:
wcc = cugraph.weakly_connected_components(gpuG_und)

time: 42.4 ms


In [23]:
label_gby = wcc.groupby('labels')
label_count = label_gby.count()
print("Total number of components found : ", len(label_count))

Total number of components found :  16
time: 25.1 ms


### NETWORKX RUN

In [24]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

time: 394 ms


In [25]:
df = pd.read_csv(datafile, delimiter=",", names=['node_1', 'node_2'], skiprows=1)
df.rename(columns={'node_1':'src', 'node_2':'dst'}, inplace=True)

time: 781 ms


In [26]:
df.head()

Unnamed: 0,src,dst
0,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100FG@@ESP_PSR_10DC
1,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100FG@@ESP_PSR_10PLANT
2,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100SEMIFG@@ESP_PSR_10PLANT
3,ESP_PSR_100FG@@ESP_PSR_11PLANT,ESP_PSR_100FG@@ESP_PSR_11DC
4,ESP_PSR_100FG@@ESP_PSR_11PLANT,ESP_PSR_100FG@@ESP_PSR_11PLANT


time: 7.5 ms


In [27]:
print(len(df))

720768
time: 589 µs


In [28]:
df['src_hash'] = df['src'].apply(hash)
df['dst_hash'] = df['dst'].apply(hash)

time: 637 ms


In [29]:
#df['src_hash'].value_counts()
df['src_hash'].nunique()

164400

time: 25.2 ms


In [30]:
df['dst_hash'].nunique()

400000

time: 37.6 ms


In [31]:
df.head()

Unnamed: 0,src,dst,src_hash,dst_hash
0,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100FG@@ESP_PSR_10DC,-8120110218710317506,6493039885157609027
1,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100FG@@ESP_PSR_10PLANT,-8120110218710317506,-8120110218710317506
2,ESP_PSR_100FG@@ESP_PSR_10PLANT,ESP_PSR_100SEMIFG@@ESP_PSR_10PLANT,-8120110218710317506,-8155104890878717737
3,ESP_PSR_100FG@@ESP_PSR_11PLANT,ESP_PSR_100FG@@ESP_PSR_11DC,1087440908846034202,6489779059197334226
4,ESP_PSR_100FG@@ESP_PSR_11PLANT,ESP_PSR_100FG@@ESP_PSR_11PLANT,1087440908846034202,1087440908846034202


time: 9.45 ms


In [32]:
cpuG=nx.from_pandas_edgelist(df, source='src_hash', target='dst_hash',create_using=nx.DiGraph)

time: 3.43 s


In [33]:
print("cpu Graph")
print("\tNumber of Vertices: " + str(cpuG.number_of_nodes()))
print("\tNumber of Edges:    " + str(cpuG.number_of_edges()))

cpu Graph
	Number of Vertices: 404400
	Number of Edges:    720768
time: 358 ms


In [34]:
print("\tNumber weakly connected components: " + str(nx.number_weakly_connected_components(cpuG)))

	Number weakly connected components: 16
time: 1.77 s
