imports used throughout data processing

In [19]:
import pandas as pd # read from csv
import numpy as np  # preprocess data

read edge and node dataframes from their corresponding csv files

In [20]:
edge_df = pd.read_csv("./Datasets/edges.csv")

In [21]:
nodes_df = pd.read_csv("./Datasets/nodes.csv") # read edge dataset from our csv file

clean node dataset to remove all nodes where there is one edge or fewer leaving it

In [22]:
# Creates a copy of the nodes_df
cleaned_nodes_df = nodes_df.copy(deep=True)

In [23]:
# count number of edges leaving each node
node_edge_count = np.zeros(cleaned_nodes_df.shape[0])
for index, edge in edge_df.iterrows():
    node_edge_count[int(edge[1])] += 1 # increment counter for the corresponding node of the start node ID of the edge 

In [24]:
# convert the cleaned nodes dataframe to an np array of ints
cleaned_nodes_np = cleaned_nodes_df.to_numpy(dtype=np.int32)
# remove all rows where the number of edges leaving that node is less than 1
cleaned_nodes_np = np.delete(cleaned_nodes_np, np.where(node_edge_count > 1), axis = 0)

process the edge dataset to remove edges that no longer link to a valid node

In [25]:
# get overall list of node IDs
node_ids = cleaned_nodes_np[:, 0]
# convert the edge dataframe to an np array
cleaned_edges_np = edge_df.to_numpy(dtype=np.int32)
# create array that has values corresponding to whether the start and end node IDs are present in the overall list of IDs
present_start = np.isin(cleaned_edges_np[:, 1], node_ids)
present_end = np.isin(cleaned_edges_np[:, 2], node_ids)
present = np.logical_and(present_start, present_end)
# delete rows in which the start/end ID are not present in the overall list of node IDs
cleaned_edges_np = np.delete(cleaned_edges_np, ~present, axis = 0)

In [26]:
# print new shapes for cleaned nodes and edges (sanity check)
print(cleaned_nodes_np.shape)
print(nodes_df.shape)
print(cleaned_edges_np.shape)
print(edge_df.shape)

(169509, 3)
(175813, 3)
(158665, 4)
(179179, 4)


In [27]:
#export cleaned_nodes_np and cleaned_edges_np dataframe into own csv files
cleaned_nodes_df = pd.DataFrame(cleaned_nodes_np)
cleaned_nodes_df.to_csv('cleaned_nodes.csv')

cleaned_edges_df = pd.DataFrame(cleaned_edges_np)
cleaned_edges_df.to_csv('cleaned_edges.csv')
