In [1]:
import pandas as pd
import networkx as nx

In [2]:
# Relative paths of the raw CSV extracts: one for stations, one for trips.
stations_filename = '../data/201508_station_data.csv'
trips_filename = '../data/201508_trip_data.csv'

# Load each extract into its own pandas DataFrame.
stations_pdf = pd.read_csv(stations_filename)
trips_pdf = pd.read_csv(trips_filename)

In [3]:
# Preview the first rows of the station table to check the columns that were loaded.
display(stations_pdf.head())

Unnamed: 0,station_id,name,lat,long,dockcount,landmark,installation
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013
1,3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
2,4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
3,5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,8/5/2013
4,6,San Pedro Square,37.336721,-121.894074,15,San Jose,8/7/2013


In [4]:
# Build the graph nodes from the stations.
#
# networkx accepts any hashable object as a node, so a dedicated 'id' column
# is not strictly required; the column is renamed anyway to keep the same
# vertex format as GraphFrame (PySpark).
#
stations_pdf = stations_pdf.rename(
    columns={'station_id': 'id'},
)

In [5]:
# Index the station table by the station id (the column is called 'id' at this point).
#
# The guard makes the cell safe to re-run: once 'id' has become the index it is
# no longer a column, and calling set_index('id') a second time raises KeyError.
# The non-inplace form is used so the result is an explicit rebinding.
if stations_pdf.index.name != 'id':
    stations_pdf = stations_pdf.set_index('id')
# display(stations_pdf.head())

In [6]:
# %%time
# Map each index value (station id) to a dict of that row's column values,
# then flatten the mapping into (id, attributes) pairs, the form accepted by
# Graph.add_nodes_from.
nodes_dict = stations_pdf.to_dict(orient='index')
nodes_list = list(nodes_dict.items())

In [7]:
#display(nodes_list)

In [7]:
# Preview the last rows of the trip table to check the columns that were loaded.
display(trips_pdf.tail())

Unnamed: 0,Trip ID,Duration,Start Date,Start Station,Start Terminal,End Date,End Station,End Terminal,Bike #,Subscriber Type,Zip Code
354147,432951,619,9/1/2014 4:21,Powell Street BART,39,9/1/2014 4:32,Townsend at 7th,65,335,Subscriber,94118
354148,432950,6712,9/1/2014 3:16,Harry Bridges Plaza (Ferry Building),50,9/1/2014 5:08,San Francisco Caltrain (Townsend at 4th),70,259,Customer,44100
354149,432949,538,9/1/2014 0:05,South Van Ness at Market,66,9/1/2014 0:14,5th at Howard,57,466,Customer,32
354150,432948,568,9/1/2014 0:05,South Van Ness at Market,66,9/1/2014 0:15,5th at Howard,57,461,Customer,32
354151,432947,569,9/1/2014 0:05,South Van Ness at Market,66,9/1/2014 0:15,5th at Howard,57,318,Customer,32


In [9]:
# Columns of the trip table that describe an edge.
edge_columns = ['Start Terminal', 'End Terminal', 'Trip ID', 'Start Date', 'End Date']

# Keep only the edge columns and rename the start/end terminals to 'src'/'dst'.
# Although not required, we use the same column names as GraphFrame (PySpark):
# "src" (source vertex ID of the edge) and "dst" (destination vertex ID of the edge).
trips_edges_pdf = (
    trips_pdf[edge_columns]
    .rename(columns={'Start Terminal': 'src', 'End Terminal': 'dst'})
)
trips_edges_pdf.shape

(354152, 5)

In [None]:
# display(trips_edges_pdf.head())

In [None]:
# trips_edges_pdf.loc[0, 'src']

In [13]:
# %%time
# Build one (src, dst, attributes) tuple per trip, the form accepted by
# Graph.add_edges_from.
#
# The four columns are zipped together instead of doing four scalar
# `.loc[i, col]` lookups per row: it is far faster on a frame of this size and,
# being positional, it does not depend on the index labels being exactly 0..n-1.
edges_list = [
    (src, dst, {'Start Date': start_date, 'End Date': end_date})
    for src, dst, start_date, end_date in zip(
        trips_edges_pdf['src'],
        trips_edges_pdf['dst'],
        trips_edges_pdf['Start Date'],
        trips_edges_pdf['End Date'],
    )
]

In [14]:
# edges_list[0]

In [15]:
%%time
# Directed multigraph: parallel edges are allowed, so every trip between the
# same pair of terminals is kept as its own edge.
g = nx.MultiDiGraph()

# Nodes are (station id, attribute dict) pairs built from the station table.
g.add_nodes_from(nodes_list)

# Edges are (src, dst, attribute dict) tuples, one per trip.
g.add_edges_from(edges_list)

CPU times: user 2.57 s, sys: 61.9 ms, total: 2.63 s
Wall time: 2.63 s


In [25]:
%%time
g = nx.MultiDiGraph()
g.add_nodes_from(nodes_list)
for row in trips_edges_pdf.itertuples():
    edge = [(row.src, row.dst, 
                       {'Start Date': row._4, 
                        'End Date': row._5})]
    print(row[3])
    break
    g.add_edges_from(edge)

913460
CPU times: user 1.98 ms, sys: 985 µs, total: 2.97 ms
Wall time: 2.66 ms


In [None]:
# Row at position 1, columns from position 2 onward (everything after 'src' and 'dst').
trips_edges_pdf.iloc[1,2:]

In [None]:
# Attribute dict stored on the node with id 3.
g.nodes[3]

In [None]:
# Inspect the type of the node view. `Graph.node` was a deprecated alias that
# was removed in networkx 2.4; `Graph.nodes` is the supported attribute and
# matches the `g.nodes[3]` lookup used elsewhere in this notebook.
type(g.nodes)

In [None]:
# g.get_edge_data(3,4, default=0)

In [None]:
# g.edges(data=True)[0:5]

In [None]:
#import matplotlib.pyplot as plt

In [None]:
# plt.subplot(121)
# nx.draw(g, with_labels=True, font_weight='bold')
# plt.show()

In [None]:
# (node, in-degree) pairs for every node, listed from highest to lowest in-degree.
in_degrees = g.in_degree()

sorted(in_degrees, key=lambda node_degree: node_degree[1], reverse=True)

In [None]:
# Report the in- and out-degree of one node. The id is named once instead of
# being repeated three times, and the "out;" typo in the message is corrected
# to "out:" so it matches the "in:" label.
node_id = 70
print('node {} degrees - in:{}, out:{}'.format(node_id, g.in_degree(node_id), g.out_degree(node_id)))

In [None]:
# from networkx.algorithms import community

# three_nodes_gen = community.k_clique_communities(g, 3)

# group = next(three_nodes_gen)
# count = 0
# while group:
#     group = next(three_nodes_gen)
#     count += 1

In [None]:
# attempt at finding motifs of three nodes.
# import itertools

# # find 3 nodes subgraphs
# target = nx.MultiDiGraph()
# target.add_edge(1,2)
# target.add_edge(2,3)

# count = 0
# for sub_nodes in itertools.combinations(g.nodes(),len(target.nodes())):
#     subg = g.subgraph(sub_nodes)
#     if nx.is_weakly_connected(subg) and nx.is_isomorphic(subg, target):
#         print(subg.edges())
#         count += 1

In [None]:
# Export the graph back to pandas in two forms:
# - an adjacency matrix as a DataFrame (presumably node ids on both axes, with
#   parallel edges aggregated per node pair; confirm against the networkx docs)
# - an edge list as a DataFrame (presumably one row per edge, with the edge
#   attributes as columns; confirm)
df_adjacency = nx.to_pandas_adjacency(g)
df_edgelist = nx.to_pandas_edgelist(g)

In [None]:
# Preview the first rows of the adjacency DataFrame.
display(df_adjacency.head())

In [None]:
# Preview the first rows of the edge-list DataFrame.
display(df_edgelist.head())

In [None]:
# (rows, columns) of the edge-list DataFrame.
df_edgelist.shape

In [None]:
# Betweenness centrality of every node, listed from most to least central.
centrality = nx.betweenness_centrality(g)
sorted(centrality.items(), key=lambda node_score: node_score[1], reverse=True)