In [None]:
import pandas as pd
import networkx as nx

In [None]:
# Locations of the trip and station CSV extracts, relative to this notebook.
trips_filename = '../data/201508_trip_data.csv'
stations_filename = '../data/201508_station_data.csv'

# Read the trips file first, then the stations file, each into its own DataFrame.
trips_pdf, stations_pdf = (
    pd.read_csv(csv_path) for csv_path in (trips_filename, stations_filename)
)

In [None]:
# Preview the first rows of the stations table.
display(stations_pdf.head())

In [None]:
# Prepare the stations table so that each station can become a graph node.
#
# networkx accepts any hashable value as a node, so a dedicated 'id' column is
# not strictly needed; the column is renamed anyway to keep the same layout as
# GraphFrame (PySpark).
#
stations_pdf = stations_pdf.rename({'station_id': 'id'}, axis='columns')

In [None]:
# Index the stations by station id (the column is called 'id' at this point).
#
# The assignment is non-inplace and guarded: once 'id' has become the index it
# is no longer a column, so re-running this cell is a no-op instead of raising
# KeyError.
if 'id' in stations_pdf.columns:
    stations_pdf = stations_pdf.set_index('id')
# display(stations_pdf.head())

In [None]:
# %%time
# Map each index value (station id) to a dict of that row's column values.
nodes_dict = stations_pdf.to_dict('index')
# (node, attribute-dict) pairs, the shape passed to add_nodes_from later on.
nodes_list = list(nodes_dict.items())

In [None]:
#display(nodes_list)

In [None]:
# Preview the last rows of the trips table.
display(trips_pdf.tail())

In [None]:
# Columns of the trips table that are kept to describe an edge.
edge_columns = ['Start Terminal', 'End Terminal', 'Trip ID', 'Start Date', 'End Date']

# Keep only those columns and rename the start / end terminal to 'src' / 'dst'.
# The renaming is not required; it reuses the same column names as the edge
# DataFrame used with GraphFrame (PySpark): "src" (source vertex ID of edge)
# and "dst" (destination vertex ID of edge).
trips_edges_pdf = (
    trips_pdf
    .loc[:, edge_columns]
    .rename(columns={'Start Terminal': 'src', 'End Terminal': 'dst'})
)
trips_edges_pdf.shape

In [None]:
# display(trips_edges_pdf.head())

In [None]:
# trips_edges_pdf.loc[0, 'src']

In [None]:
# %%time
# Build one (src, dst, attribute-dict) tuple per trip, the shape passed to
# add_edges_from later on.
#
# The columns are walked in lock-step with zip instead of looking up each cell
# with .loc[i, ...]: this avoids four label lookups per row and does not
# depend on the frame having a default 0..n-1 index.
edges_list = [
    (src, dst, {'Start Date': start_date, 'End Date': end_date})
    for src, dst, start_date, end_date in zip(
        trips_edges_pdf['src'],
        trips_edges_pdf['dst'],
        trips_edges_pdf['Start Date'],
        trips_edges_pdf['End Date'],
    )
]

In [None]:
# edges_list[0]

In [None]:
%%time
# Build the trip graph from the prepared node and edge lists.
# A MultiDiGraph is directed and keeps parallel edges, so every trip between
# the same pair of stations is stored as its own edge.
g = nx.MultiDiGraph()

# nodes_list holds (node, attribute-dict) pairs.
g.add_nodes_from(nodes_list)

# edges_list holds (src, dst, attribute-dict) triples.
g.add_edges_from(edges_list)

In [None]:
%%time
g = nx.MultiDiGraph()
g.add_nodes_from(nodes_list)
for row in trips_edges_pdf.itertuples():
    edge = [(row.src, row.dst, 
                       {'Start Date': row._4, 
                        'End Date': row._5})]
    print(row[3])
    break
    g.add_edges_from(edge)

In [None]:
# Second row (position 1) of the edge table, columns from position 2 onward
# (everything after 'src' and 'dst').
trips_edges_pdf.iloc[1,2:]

In [None]:
# Attributes stored on node 3 (node keys come from the station 'id' index).
g.nodes[3]

In [None]:
# Show the type of the graph's node accessor.
# `g.nodes` is used because the old `g.node` alias no longer exists in
# networkx 2.4 and later, where it raises AttributeError.
type(g.nodes)

In [None]:
# g.get_edge_data(3,4, default=0)

In [None]:
# g.edges(data=True)[0:5]

In [None]:
#import matplotlib.pyplot as plt

In [None]:
# plt.subplot(121)
# nx.draw(g, with_labels=True, font_weight='bold')
# plt.show()

In [None]:
# In-degree of every node; iterating the result yields (node, in_degree) pairs.
x = g.in_degree()

# Rank the nodes from highest to lowest in-degree.
sorted(x, key=lambda node_and_degree: node_and_degree[1], reverse=True)

In [None]:
# Report the in- and out-degree of a single node.
# The label typo "out;" is corrected to "out:" so both labels read the same.
node_id = 70
print('node {} degrees - in:{}, out:{}'.format(node_id, g.in_degree(node_id), g.out_degree(node_id)))

In [None]:
# from networkx.algorithms import community

# three_nodes_gen = community.k_clique_communities(g, 3)

# group = next(three_nodes_gen)
# count = 0
# while group:
#     group = next(three_nodes_gen)
#     count += 1

In [None]:
# Attempt at finding three-node motifs.
# import itertools

# # find 3 nodes subgraphs
# target = nx.MultiDiGraph()
# target.add_edge(1,2)
# target.add_edge(2,3)

# count = 0
# for sub_nodes in itertools.combinations(g.nodes(),len(target.nodes())):
#     subg = g.subgraph(sub_nodes)
#     if nx.is_weakly_connected(subg) and nx.is_isomorphic(subg, target):
#         print(subg.edges())
#         count += 1

In [None]:
# Export the graph to pandas: a node-by-node adjacency matrix ...
# NOTE(review): for a multigraph, parallel edges are presumably combined into
# one cell value here — confirm against the networkx documentation.
df_adjacency = nx.to_pandas_adjacency(g)
# ... and an edge list table.
df_edgelist = nx.to_pandas_edgelist(g)

In [None]:
# Preview the first rows of the adjacency matrix.
display(df_adjacency.head())

In [None]:
# Preview the first rows of the edge list table.
display(df_edgelist.head())

In [None]:
# (rows, columns) of the edge list table.
df_edgelist.shape

In [None]:
# Betweenness centrality of every node, as a {node: score} mapping.
centrality = nx.centrality.betweenness_centrality(g)

# List (node, score) pairs from the most to the least central node.
# Sorting the keys by their score is stable, so ties keep the mapping's order.
ranked_nodes = sorted(centrality, key=centrality.get, reverse=True)
[(node, centrality[node]) for node in ranked_nodes]