In [1]:
# Import needed libraries
import cugraph
import cudf
import numpy as np

In [2]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [3]:
# Relative path of the edge-list CSV that the rest of the notebook reads.
datafile = '../data/matrix210.prefix.LP.edgelist.csv'

In [4]:
# Load the same edge list twice: onto the GPU with cuDF and into host memory with pandas.
# Both reads skip the first row of the file and name the two columns 'src' and 'dst'.
gdf = cudf.read_csv(
    datafile,
    delimiter=",",
    names=['src', 'dst'],
    dtype=['int32', 'int32'],
    skiprows=1,
)
df = pd.read_csv(
    datafile,
    delimiter=",",
    names=['src', 'dst'],
    skiprows=1,
)

In [5]:
#gdf.to_pandas()


In [6]:
# Preview the first rows of the pandas edge list.
df.head()

Unnamed: 0,src,dst
0,0,312262
1,0,526142
2,1,526146
3,1,312266
4,2,312269


In [7]:
# Build a directed NetworkX graph on the CPU from the pandas edge list.
cpuG = nx.from_pandas_edgelist(
    df,
    source='src',
    target='dst',
    create_using=nx.DiGraph,
)
# Drawing is left disabled:
#nx.draw(cpuG, with_labels=True,pos=nx.circular_layout(cpuG), node_color='r', edge_color='b')
#plt.show()

In [8]:
# Report the size of the CPU (NetworkX) graph.
print("cpu Graph")
print("\tNumber of Vertices: ", cpuG.number_of_nodes(), sep="")
print("\tNumber of Edges:    ", cpuG.number_of_edges(), sep="")


cpu Graph
	Number of Vertices: 552975
	Number of Edges:    792665


In [9]:
# Check whether the directed graph is strongly connected (edge direction respected).
nx.is_strongly_connected(cpuG)


False

In [10]:
# Check whether the directed graph is weakly connected (edge direction ignored).
nx.is_weakly_connected(cpuG)

False

In [11]:
# Materialise the weakly connected components (one set of nodes each) and order them largest first.
connectedskus = list(nx.weakly_connected_components(cpuG))
connectedskus.sort(key=len, reverse=True)

In [12]:
# Report how many weakly connected components NetworkX found.
print("\tNumber weakly connected components: ", len(connectedskus), sep="")

	Number weakly connected components: 20464


In [13]:
# Generate WCCs of cpuG, returning a generator of sets of nodes, one for each weakly connected component of G
#[len(c) for c in sorted(nx.weakly_connected_components(cpuG), key=len, reverse=True)]

In [14]:
#nodeslist = []
#for consku in connectedskus:
#    nodeslist = []
#    for val in consku:
#        nodeslist.append(val)
#    print(nodeslist)

In [15]:
# Renumber the source/destination vertex ids with cuGraph; `mapping` is kept so the
# original ids can be looked up again from the renumbered ones.
# NOTE(review): the earlier note on this cell claimed renumbering is unnecessary because
# the node index starts from 0 and is contiguous, yet the call below does renumber and the
# renumbered columns are what the graph is built from — confirm which is intended.
gdf['renumbered_src'], gdf['renumbered_dst'], mapping = cugraph.renumber(gdf['src'], gdf['dst'])

In [16]:
# Note: cuGraph WCC currently supports only undirected graphs, so Graph() is used instead of DiGraph().
#gpuG = cugraph.DiGraph()
gpuG = cugraph.Graph()
# Populate the graph from the renumbered source/destination columns of the cuDF edge list.
gpuG.from_cudf_edgelist(gdf, source='renumbered_src', destination='renumbered_dst')

In [17]:
# Report the size of the GPU (cuGraph) graph.
print("Main Graph")
print("\tNumber of Vertices: ", gpuG.number_of_vertices(), sep="")
print("\tNumber of Edges:    ", gpuG.number_of_edges(), sep="")

Main Graph
	Number of Vertices: 552975
	Number of Edges:    1585330


In [18]:
# Compute the weakly connected components of gpuG; the result is a cuDF DataFrame.
# wcc['labels'][i] gives the component label of the i-th row and wcc['vertices'][i] gives that row's vertex id.
wcc = cugraph.weakly_connected_components(gpuG)

In [19]:
# Look up each renumbered vertex id in the renumbering map to recover the original vertex id.
wcc['org_vertices'] = mapping[wcc['vertices']]

In [20]:
# Display the WCC table: component label, renumbered vertex id, original vertex id.
wcc

Unnamed: 0,labels,vertices,org_vertices
0,1,0,0
1,2,1,8191
2,3,2,16382
3,4,3,24573
4,5,4,32764
...,...,...,...
552970,4552,552970,516032
552971,837,552971,524223
552972,4245,552972,532414
552973,4555,552973,540605


In [21]:
# List the distinct component labels.
wcc['labels'].unique()

0             1
1             2
2             3
3             4
4             5
          ...  
20459    539596
20460    539643
20461    542559
20462    544366
20463    544783
Name: labels, Length: 20464, dtype: int32

In [22]:
# Group the WCC rows by component label and take the row count of the largest group.
label_gby = wcc.groupby('labels')
maxNodesCountPerComponent = label_gby['org_vertices'].count().max()

# Report the number of distinct labels and the size of the biggest component.
print(
    "Total number of components found : ",
    wcc['labels'].unique().count(),
)
print(
    "Max # of nodes in any of the component : ",
    maxNodesCountPerComponent,
)

Total number of components found :  20464
Max # of nodes in any of the component :  146874


In [23]:
# Same largest-group size as above, this time counted on the renumbered 'vertices' column.
label_gby['vertices'].count().max()

146874

In [24]:
# create a Graph using the source (src) and destination (dst) vertex pairs from the Dataframe
#G = cugraph.Graph()
#G.from_cudf_edgelist(gdf, source='src', destination='dst')

In [25]:
# Call cugraph.weakly_connected_components on the dataframe
#df = cugraph.weakly_connected_components(G)
#df.head()

In [28]:
# Use groupby on the 'labels' column of the WCC output to get the counts of each connected component label
#label_gby = df.groupby('labels')
#label_count = label_gby.count()

#print("Total number of components found : ", len(label_count))

In [29]:
# Call nlargest on the groupby result to get the row where the component count is the largest
#largest_component = label_count.nlargest(n = 1, columns = 'vertices')
#print("Size of the largest component is found to be : ", largest_component['vertices'][0])

In [30]:
# Query the connected component output to display the vertex ids that belong to one
# component of interest. The label is defined once so the query expression and the
# printed heading cannot drift apart.
component_label = 12
expr = "labels == " + str(component_label)
component = wcc.query(expr)
print("Vertex Ids that belong to component label " + str(component_label) + ": ")
print(component)

Vertex Ids that belong to component label 12: 
        labels  vertices  org_vertices
11          12        11         90101
14          12        14        114674
20          12        20        163820
25          12        25        204775
28          12        28        229348
...        ...       ...           ...
552950      12    552950        352212
552953      12    552953        376785
552958      12    552958        417740
552966      12    552966        483268
552968      12    552968        499650

[146874 rows x 3 columns]


In [31]:
def print_components(_df, id, maxColumnLength):
    """Print and return the original vertex ids of one connected component.

    Parameters
    ----------
    _df : DataFrame
        Component table with at least a 'labels' and an 'org_vertices' column.
        Works with a cuDF DataFrame as well as a pandas DataFrame.
    id : int
        Component label to select. The name shadows the builtin ``id`` but is
        kept so existing callers are unaffected.
    maxColumnLength : int
        Currently unused (the padding code that read it is commented out);
        retained so existing callers keep working.

    Returns
    -------
    list
        The 'org_vertices' values of every row whose label equals ``id``, in
        row order. Empty when no row matches.
    """
    _f = _df.query('labels == @id')
    print(len(_f))

    vertices = _f['org_vertices']
    # query() keeps the row index of the unfiltered frame, so addressing the
    # filtered column with 0..len-1 is a label lookup that lands on the wrong
    # row or on no row at all. Converting the whole column in one go avoids
    # that and also replaces one device-to-host fetch per element by a single copy.
    if hasattr(vertices, 'to_pandas'):
        vertices = vertices.to_pandas()
    part = vertices.tolist()

    #for i in range(len(_f), maxColumnLength):
     #   part.append(None)
    print(part)

    return part

In [32]:
# Print the original vertex ids of every component and store each list, rendered as a
# string, under a 'WCC<label>' column of tempdf.
tempdf = cudf.DataFrame()
# Loop-invariant: the size of the largest component does not change between iterations,
# so compute it once instead of re-running the group-by for every label.
max_component_size = label_gby['org_vertices'].count().max()
# Copy the distinct labels to the host once and iterate there.
# NOTE(review): newer cuDF releases appear to reject direct iteration over a device
# Series, which is why the labels are converted first — confirm against the installed version.
for i, j in enumerate(wcc['labels'].unique().to_pandas()):
    print("Vertex Ids that belong to component label", j, "#",i, ": ")
    #tempdf['WCC'+str(j)] = print_components(wcc, j, max_component_size)
    tempdf['WCC'+str(j)] = str(print_components(wcc, j, max_component_size))


Vertex Ids that belong to component label 1 # 0 : 
1058
[0, 57358, 8218, 163853, 37, 131100, 483326, 90166, 68, 57426, 8286, 163921, 221260, 105, 131168, 483393, 90234, 136, 57494, 16542, 8354, 163989, 221328, 173, 131236, 352393, 483460, 204, 57562, 16610, 164057, 221396, 352460, 131304, 483527, 393463, 16678, 164125, 221464, 352527, 131372, 483594, 254249, 491807, 393530, 16746, 221532, 377, 352594, 131440, 483661, 254317, 491874, 393597, 16814, 41396, 221600, 352661, 164269, 131508, 483728, 491941, 393664, 16882, 41464, 221668, 352728, 164337, 131576, 483795, 492008, 393731, 16950, 320017, 41532, 221736, 352795, 164405, 131644, 483862, 492075, 393798, 320084, 41600, 221804, 352862, 164473, 131712, 483929, 492142, 393865, 320151, 41668, 221872, 352929, 164541, 131780, 483996, 492209, 393932, 320218, 17154, 41736, 221940, 352996, 295661, 164609, 131848, 484063, 492276, 393999, 320285, 17222, 41804, 353063, 222008, 295728, 164677, 131916, 484130, 492343, 394066, 320352, 17290, 41872, 3