In [1]:
import pandas as pd
import networkx as nx

In [2]:
trips_filename = '../data/201508_trip_data.csv'
stations_filename = '../data/201508_station_data.csv'

trips_pdf = pd.read_csv(trips_filename)
stations_pdf = pd.read_csv(stations_filename)

In [3]:
display(stations_pdf.head())

Unnamed: 0,station_id,name,lat,long,dockcount,landmark,installation
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013
1,3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
2,4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
3,5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,8/5/2013
4,6,San Pedro Square,37.336721,-121.894074,15,San Jose,8/7/2013


In [4]:
# create nodes from stations.
#
# although networkx used hashed values for nodes, i.e. we don't need to
# have 'id' column, we decided to keep the same format as GraphFrame (PySpark).
#
stations_pdf = stations_pdf.rename(columns={'station_id':'id'})

In [5]:
# index on the station id, now we call it 'id'
stations_pdf.set_index('id', inplace = True)
display(stations_pdf.head())

Unnamed: 0_level_0,name,lat,long,dockcount,landmark,installation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013
3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,8/5/2013
6,San Pedro Square,37.336721,-121.894074,15,San Jose,8/7/2013


In [6]:
nodes_dict = stations_pdf.to_dict('index')
#display(nodes_dict)
nodes_list = [x for x in nodes_dict.items()]

In [7]:
#display(nodes_list)

In [8]:
display(trips_pdf.head())

Unnamed: 0,Trip ID,Duration,Start Date,Start Station,Start Terminal,End Date,End Station,End Terminal,Bike #,Subscriber Type,Zip Code
0,913460,765,8/31/2015 23:26,Harry Bridges Plaza (Ferry Building),50,8/31/2015 23:39,San Francisco Caltrain (Townsend at 4th),70,288,Subscriber,2139
1,913459,1036,8/31/2015 23:11,San Antonio Shopping Center,31,8/31/2015 23:28,Mountain View City Hall,27,35,Subscriber,95032
2,913455,307,8/31/2015 23:13,Post at Kearny,47,8/31/2015 23:18,2nd at South Park,64,468,Subscriber,94107
3,913454,409,8/31/2015 23:10,San Jose City Hall,10,8/31/2015 23:17,San Salvador at 1st,8,68,Subscriber,95113
4,913453,789,8/31/2015 23:09,Embarcadero at Folsom,51,8/31/2015 23:22,Embarcadero at Sansome,60,487,Customer,9069


In [9]:
# pick our set of columns to represent the edges
edge_columns = ['Start Terminal', 'End Terminal', 'Trip ID', 'Start Date', 'End Date']

trips_edges_pdf = trips_pdf.loc[:,edge_columns]

# rename the start and end terminal to 'src' and 'dst'
# Although not reqired we'll use the sample column names as the ones used with DataFrame 
# (PySpark) "src" (source vertex ID of edge) and "dst" (destination vertex ID of edge).

trips_edges_pdf = trips_edges_pdf.rename(columns={'Start Terminal':'src', 'End Terminal':'dst'})

In [10]:
display(trips_edges_pdf.head())

Unnamed: 0,src,dst,Trip ID,Start Date,End Date
0,50,70,913460,8/31/2015 23:26,8/31/2015 23:39
1,31,27,913459,8/31/2015 23:11,8/31/2015 23:28
2,47,64,913455,8/31/2015 23:13,8/31/2015 23:18
3,10,8,913454,8/31/2015 23:10,8/31/2015 23:17
4,51,60,913453,8/31/2015 23:09,8/31/2015 23:22


In [11]:
trips_edges_pdf.loc[0, 'src']

50

In [12]:
edges_list = []
for i in range(trips_edges_pdf.shape[0]):
    edges_list.append((trips_edges_pdf.loc[i, 'src'], trips_edges_pdf.loc[i, 'dst'], 
                       {'Start Date': trips_edges_pdf.loc[i, 'Start Date'], 
                        'End Date': trips_edges_pdf.loc[i, 'End Date']}))

In [13]:
edges_list[0]

(50, 70, {'Start Date': '8/31/2015 23:26', 'End Date': '8/31/2015 23:39'})

In [14]:
g = nx.MultiDiGraph()

g.add_nodes_from(nodes_list)

g.add_edges_from(edges_list)

[0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 2,
 0,
 0,
 4,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 2,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 2,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 3,
 3,
 0,
 0,
 1,
 1,
 0,
 2,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 3,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 2,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 3,
 1,
 0,


In [15]:
g.nodes[3]

{'name': 'San Jose Civic Center',
 'lat': 37.330698,
 'long': -121.888979,
 'dockcount': 15,
 'landmark': 'San Jose',
 'installation': '8/5/2013'}

In [25]:
type(g.node)

networkx.classes.reportviews.NodeView

In [16]:
# g.get_edge_data(3,4, default=0)

In [33]:
# g.edges(data=True)[0:5]

TypeError: 'OutMultiEdgeDataView' object is not subscriptable

In [17]:
#import matplotlib.pyplot as plt

In [18]:
# plt.subplot(121)
# nx.draw(g, with_labels=True, font_weight='bold')
# plt.show()

In [19]:
x = g.in_degree()

sorted(x, key=lambda x:x[1], reverse=True)

[(70, 34810),
 (69, 22523),
 (50, 17810),
 (61, 15463),
 (65, 15422),
 (60, 15065),
 (77, 13916),
 (74, 13617),
 (55, 12966),
 (39, 10239),
 (67, 10220),
 (76, 9685),
 (64, 8253),
 (57, 8147),
 (72, 7714),
 (63, 7275),
 (51, 7229),
 (82, 7159),
 (54, 6687),
 (56, 6330),
 (68, 6288),
 (75, 6262),
 (48, 6146),
 (45, 5821),
 (42, 5182),
 (41, 5068),
 (2, 5045),
 (49, 4991),
 (66, 4879),
 (47, 4803),
 (62, 4727),
 (73, 4319),
 (71, 4134),
 (28, 3828),
 (46, 3481),
 (59, 2852),
 (4, 1878),
 (27, 1724),
 (58, 1627),
 (6, 1595),
 (32, 1116),
 (31, 1099),
 (7, 1073),
 (29, 1046),
 (9, 976),
 (11, 960),
 (3, 909),
 (34, 900),
 (84, 880),
 (22, 799),
 (13, 729),
 (30, 725),
 (35, 724),
 (10, 671),
 (12, 647),
 (14, 621),
 (37, 576),
 (8, 547),
 (36, 496),
 (5, 490),
 (80, 489),
 (16, 445),
 (33, 441),
 (25, 436),
 (38, 417),
 (26, 230),
 (23, 187),
 (83, 145),
 (21, 100),
 (24, 98)]

In [35]:
print('node {} degrees - in:{}, out;{}'.format(70, g.in_degree(70), g.out_degree(70)))

node 70 degrees - in:34810, out;26304


In [None]:
# from networkx.algorithms import community

# three_nodes_gen = community.k_clique_communities(g, 3)

# group = next(three_nodes_gen)
# count = 0
# while group:
#     group = next(three_nodes_gen)
#     count += 1

In [None]:
# attempt in finding motif of three nodes.
# import itertools

# # find 3 nodes subgraphs
# target = nx.MultiDiGraph()
# target.add_edge(1,2)
# target.add_edge(2,3)

# count = 0
# for sub_nodes in itertools.combinations(g.nodes(),len(target.nodes())):
#     subg = g.subgraph(sub_nodes)
#     if nx.is_weakly_connected(subg) and nx.is_isomorphic(subg, target):
#         print(subg.edges())
#         count += 1

In [26]:
df_adjacency = nx.to_pandas_adjacency(g)
df_edgelist = nx.to_pandas_edgelist(g)

In [29]:
display(df_adjacency.head())

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,72,73,74,75,76,77,80,82,83,84
2,77.0,282.0,1232.0,268.0,687.0,497.0,99.0,315.0,223.0,461.0,...,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,250.0
3,209.0,189.0,41.0,25.0,46.0,33.0,8.0,13.0,57.0,43.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,21.0
4,1214.0,22.0,105.0,12.0,16.0,35.0,26.0,11.0,14.0,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0
5,307.0,14.0,17.0,29.0,22.0,29.0,17.0,7.0,13.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,4.0
6,614.0,54.0,11.0,20.0,86.0,36.0,29.0,120.0,46.0,34.0,...,0.0,0.0,0.0,0.0,0.0,0.0,68.0,0.0,0.0,110.0


In [28]:
display(df_edgelist.head())

Unnamed: 0,source,target,Start Date,End Date
0,2,16,8/31/2015 18:43,8/31/2015 18:56
1,2,16,8/28/2015 18:25,8/28/2015 18:40
2,2,16,8/26/2015 18:31,8/26/2015 18:44
3,2,16,8/26/2015 17:14,8/26/2015 17:28
4,2,16,8/25/2015 18:27,8/25/2015 18:43


In [30]:
df_edgelist.shape

(354152, 4)

In [54]:
centrality = nx.centrality.betweenness_centrality(g)
sorted(centrality.items(), key=lambda kv: kv[1], reverse=True)

[(70, 0.23124883703914031),
 (29, 0.18967539891188076),
 (76, 0.18540203308921865),
 (11, 0.13594709732817148),
 (3, 0.12378678085199823),
 (28, 0.0784477642797861),
 (35, 0.06167391559030276),
 (31, 0.04291409584637638),
 (39, 0.042315440669021216),
 (25, 0.0377470460092567),
 (38, 0.03748715711809299),
 (56, 0.027338495784787344),
 (9, 0.01982246305590632),
 (34, 0.019805607402416357),
 (83, 0.019758001205706147),
 (63, 0.018490531917641894),
 (27, 0.016354493962850193),
 (32, 0.015152222760091623),
 (2, 0.014163049936707228),
 (30, 0.01295846511639349),
 (22, 0.012042619794118463),
 (80, 0.010346277290011304),
 (36, 0.009424994767719963),
 (14, 0.007629677914204769),
 (37, 0.006318154129567867),
 (21, 0.005563444107519195),
 (23, 0.0033877572538534243),
 (24, 0.0032880212765599025),
 (33, 0.0006081136144877494),
 (26, 0.00016644338894978284),
 (4, 1.5223480696626475e-05),
 (5, 1.5223480696626475e-05),
 (6, 1.5223480696626475e-05),
 (7, 1.5223480696626475e-05),
 (8, 1.522348069662647