# Supply chain partitioning example

In this notebook, we will use cuGraph to prototype partitioning of the JDA supply chain example graph.

* Created:   11/3/2019
* Last Edit: 05/07/2020

RAPIDS Versions: 0.13.0

Test Hardware
* GV100 32G, CUDA 10.1

Using docker container: rapidsai/rapidsai-dev:0.13-cuda10.1-devel-ubuntu18.04-py3.7

### Test Data
We will be using the example dataset Arijit provided.

### Prep

In [1]:
# Import needed libraries
import cugraph
import cudf
import numpy as np

In [2]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

### Read data using cuDF and pandas

In [3]:
# Input edge list (CSV). Uncomment the second path to run on the matrix210 dataset instead.
datafile='../data/matrix15.prefix.LP.edgelist.csv'
#datafile='../data/matrix210.prefix.LP.edgelist.csv'

In [4]:
# Load the same edge list twice: once on the GPU (cuDF) and once on the host (pandas).
# skiprows=1 drops the first line of the file (presumably a header row - confirm);
# the column names are supplied explicitly instead.
gdf = cudf.read_csv(
    datafile,
    delimiter=",",
    names=['src', 'dst'],
    dtype=['int32', 'int32'],
    skiprows=1,
)
df = pd.read_csv(
    datafile,
    delimiter=",",
    names=['src', 'dst'],
    skiprows=1,
)

In [5]:
# Preview the first rows of the cuDF frame (converted to pandas for display).
gdf.head().to_pandas()

Unnamed: 0,src,dst
0,0,2868076
1,0,2918950
2,0,2806750
3,1,2868077
4,1,2806752


In [6]:
# Preview the first rows of the pandas frame for comparison with the cuDF preview.
df.head()

Unnamed: 0,src,dst
0,0,2868076
1,0,2918950
2,0,2806750
3,1,2868077
4,1,2806752


### Create the directed graph using NetworkX

In [7]:
# Build a directed NetworkX graph on the CPU from the pandas edge list (src -> dst).
cpuG=nx.from_pandas_edgelist(df, source='src', target='dst',create_using=nx.DiGraph)
# Optional visualisation, left disabled:
#nx.draw(cpuG, with_labels=True,pos=nx.circular_layout(cpuG), node_color='r', edge_color='b')
#plt.show()

In [8]:
# Report the size of the CPU (NetworkX) graph, one line per item.
print(
    "cpu Graph",
    "\tNumber of Vertices: " + str(cpuG.number_of_nodes()),
    "\tNumber of Edges:    " + str(cpuG.number_of_edges()),
    sep="\n",
)

cpu Graph
	Number of Vertices: 2921703
	Number of Edges:    4903396


In [9]:
# Is the directed CPU graph strongly connected?
nx.is_strongly_connected(cpuG)

False

In [10]:
# Is the directed CPU graph weakly connected?
nx.is_weakly_connected(cpuG)

False

In [11]:
# Materialise every weakly connected component of cpuG, then order them largest first.
connectedskus = list(nx.weakly_connected_components(cpuG))
connectedskus.sort(key=len, reverse=True)

In [12]:
# connectedskus is sorted by size, largest first, so element 0 is the biggest component.
print("\tNumber weakly connected components: " + str(len(connectedskus)))
print("\tMax # of nodes in any of the component : ", len(connectedskus[0]))

	Number weakly connected components: 2967
	Max # of nodes in any of the component :  2850852


In [13]:
# Generate WCCs of cpuG, returning a generator of sets of nodes, one for each weakly connected component of G
#[len(c) for c in sorted(nx.weakly_connected_components(cpuG), key=len, reverse=True)]

In [14]:
#nodeslist = []
#for consku in connectedskus:
#  nodeslist = []
#  for val in consku:
#          nodeslist.append(val)
#  print(nodeslist)

### Create the directed graph using cugraph

In [15]:
# Build the same directed graph on the GPU from the cuDF edge list.
# NOTE(review): renumber=True is requested here - confirm whether downstream results
# (e.g. the WCC 'vertices' column) are reported in original or renumbered vertex ids.
gpuG = cugraph.DiGraph()
gpuG.from_cudf_edgelist(gdf, source='src', destination='dst', renumber=True)

In [16]:
# Report the size of the GPU (cuGraph) graph, one line per item.
print(
    "Main Graph",
    "\tNumber of Vertices: " + str(gpuG.number_of_vertices()),
    "\tNumber of Edges:    " + str(gpuG.number_of_edges()),
    sep="\n",
)

Main Graph
	Number of Vertices: 2921703
	Number of Edges:    4903396


In [17]:
# Compute the weakly connected components of gpuG; the result is stored in `wcc`.
# wcc['labels'][i] gives the component label of the i-th row and
# wcc['vertices'][i] gives the vertex id of that row.
wcc = cugraph.weakly_connected_components(gpuG)

In [18]:
# Preview the vertex -> component label assignment.
wcc.head()

Unnamed: 0,vertices,labels
0,0,1
1,8191,1
2,16382,1
3,24573,1
4,32764,1


In [19]:
# List the distinct component labels (one entry per component).
wcc['labels'].unique()

0             1
1            47
2            49
3            51
4           102
         ...   
2962    2672389
2963    2673101
2964    2673813
2965    2678085
2966    2729033
Name: labels, Length: 2967, dtype: int32

In [20]:
# Group the vertices by component label so component sizes can be measured.
label_gby = wcc.groupby('labels')
# Size of the largest component = maximum of the per-label vertex counts.
maxNodesCountPerComponent = label_gby['vertices'].count().max()  
print("Total number of components found : ", wcc['labels'].unique().count())
print("Max # of nodes in any of the component : ", maxNodesCountPerComponent)
# Disabled alternative; note that label_count is only defined in the next cell.
#largest_component = label_count.nlargest(n = 1, columns = 'vertices')
#print("Size of the largest component is found to be : ", largest_component['vertices'][0])

Total number of components found :  2967
Max # of nodes in any of the component :  2850852


In [21]:
# Count the vertices per component label, then sort descending so the largest components come first
label_count = label_gby.count()
label_count = label_count.sort_values('vertices',ascending=False)
label_count.head()

Unnamed: 0_level_0,vertices
labels,Unnamed: 1_level_1
1,2850852
787121,2049
618626,1938
343032,1903
306679,1898


In [22]:
# Select the vertices of the component labelled 1, which is the largest component
# in the size table shown above.
# NOTE(review): the label is hard-coded for this dataset; a different input file
# may assign a different label to its largest component.
expr = "labels == 1"
component = wcc.query(expr)
# Number of vertices in the selected component.
len(component)

2850852

In [36]:
# Extract from gpuG the subgraph on the vertices of the selected component,
# then pull out its edge list as a DataFrame so edges can be filtered later.
sverts = cudf.Series(component['vertices'])
subG = cugraph.subgraph(gpuG,  sverts )
subGDF = subG.view_edge_list()

In [37]:
# Compute the core number of every vertex in the extracted subgraph.
core = cugraph.core_number(subG) 

In [38]:
# Keep only the rows whose core number equals the maximum; these vertices are the
# candidates passed to the removal step further down.
remove = core[ core['core_number'] == core['core_number'].max()]
remove

Unnamed: 0,vertex,core_number
798809,798809,19
799155,799155,19
799501,799501,19
799846,799846,19
800193,800193,19
...,...,...
818165,818165,19
818510,818510,19
818856,818856,19
819200,819200,19


In [62]:
def extract_subgraph_vertex(input_edges_df, remove_vertices_list):
    """Return the edges of ``input_edges_df`` that do not touch any removed vertex.

    Parameters
    ----------
    input_edges_df : DataFrame
        Edge list with ``src`` and ``dst`` columns (a cuDF frame in this
        notebook; a pandas frame works as well). It is not modified.
    remove_vertices_list : Series or list-like
        Vertex ids to remove.

    Returns
    -------
    DataFrame
        The rows of ``input_edges_df`` whose ``src`` and ``dst`` are both absent
        from ``remove_vertices_list``.
    """
    # A single vectorised membership test per endpoint column replaces the
    # previous Python loop, which re-filtered the entire edge list once for
    # every removed vertex.
    touches_removed = (
        input_edges_df['src'].isin(remove_vertices_list)
        | input_edges_df['dst'].isin(remove_vertices_list)
    )
    return input_edges_df[~touches_removed]

In [57]:
# Sanity check: the first vertex id scheduled for removal.
remove['vertex'].iloc[0]


798809

In [63]:
# Drop every edge that touches one of the max-core vertices.
subGDF1 = extract_subgraph_vertex(subGDF, remove['vertex'])

In [64]:
# Preview the edges that remain after the removal.
subGDF1.head()

Unnamed: 0,src,dst
0,0,429825
1,0,1030984
2,0,1890436
3,1,943016
4,1,2360650


In [65]:
# Rebuild a directed cuGraph graph from the filtered edge list.
subG1 = cugraph.DiGraph()
subG1.from_cudf_edgelist(subGDF1, source='src', destination='dst', renumber=True)

In [66]:
# Report the size of subG1, the graph rebuilt after dropping the max-core vertices.
# The heading used to read "Main Graph", the label already printed for gpuG, which
# made the two reports indistinguishable.
print("Sub Graph (max-core vertices removed)")
print("\tNumber of Vertices: " + str(subG1.number_of_vertices()))
print("\tNumber of Edges:    " + str(subG1.number_of_edges()))

Main Graph
	Number of Vertices: 2850785
	Number of Edges:    4813061


In [59]:
def print_components(_df, id, maxColumnLength):
    """Print and return the vertex ids that belong to one component label.

    Parameters
    ----------
    _df : DataFrame
        Component assignment with ``vertices`` and ``labels`` columns.
    id : int
        Component label to select. (The name shadows the builtin ``id``; it is
        kept unchanged for existing callers.)
    maxColumnLength : int
        Unused; retained only so existing call sites keep working.

    Returns
    -------
    list
        Vertex ids whose label equals ``id``, in row order.
    """
    # A plain boolean mask selects the rows without relying on query()'s
    # lookup of the local variable ``id``.
    members = _df[_df['labels'] == id]['vertices']
    print(len(members))

    # The filtered rows keep their original index labels, so the previous
    # ``_f['vertices'][i]`` lookup for i in range(len(_f)) addressed rows by
    # label instead of by position. Convert the whole column in one step.
    if hasattr(members, 'to_pandas'):
        # Objects exposing to_pandas (e.g. a cuDF Series) are copied to the
        # host once rather than element by element.
        members = members.to_pandas()
    part = members.tolist()

    print(part)

    return part

In [67]:
# Recompute the weakly connected components on the graph with max-core vertices removed.
wcc1 = cugraph.weakly_connected_components(subG1)

In [69]:
# Same size summary as for the full graph, now on wcc1.
# Note: this rebinds label_gby and maxNodesCountPerComponent from the earlier cell.
label_gby = wcc1.groupby('labels')
maxNodesCountPerComponent = label_gby['vertices'].count().max()  
print("Total number of components found : ", wcc1['labels'].unique().count())
print("Max # of nodes in any of the component : ", maxNodesCountPerComponent)

Total number of components found :  3
Max # of nodes in any of the component :  2849092


In [72]:
# Distinct component labels after the removal.
wcc1['labels'].unique()

0        1
1     5153
2    16636
Name: labels, dtype: int32

In [74]:
# Per-component vertex counts for wcc1, then the three largest components.
label_count = label_gby.count()
largest_component = label_count.nlargest(n = 3, columns = 'vertices')
largest_component

Unnamed: 0_level_0,vertices
labels,Unnamed: 1_level_1
1,2849092
5153,1457
16636,236


In [60]:
#tempdf = cudf.DataFrame()

#for j in wcc['labels'].unique().head():
#        print("Vertex Ids that belong to component label ", j, ": ") 
        
#        tempdf['WCC'+str(j)] = str(print_components(wcc, j, label_gby['org_vertices'].count().max()))
#        #print_components(wcc, j, label_gby['org_vertices'].count().max())



2850852

0                0
1             8191
2            16382
3            24573
4            32764
            ...   
2921698    2883231
2921699    2891422
2921700    2899613
2921701    2907804
2921702    2915995
Name: vertices, Length: 2850852, dtype: int32