In [1]:
## Import required libraries

import csv
import math
import os

import neo4j
import numpy as np
import pandas as pd
import psycopg2

In [2]:
## Create neo4j driver object and session

# Connection settings are read from environment variables so credentials do
# not have to live in the notebook. Each default is the value this notebook
# originally hardcoded, so it runs unchanged when no variable is set.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j://neo4j:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "ucb_mids_w205")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "neo4j")

# `session` is shared by every query helper and analysis cell below.
driver = neo4j.GraphDatabase.driver(uri=NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
session = driver.session(database=NEO4J_DATABASE)

In [3]:
## Function to run a query and return the results in a pandas dataframe

def my_neo4j_run_query_pandas(query, **kwargs):
    """Run a Cypher query on the shared session and return its rows as a DataFrame.

    Any keyword arguments are forwarded to ``session.run`` together with the
    query text. The DataFrame's column names are the keys of the result.
    """
    cursor = session.run(query, **kwargs)

    # Materialise every record before asking the result for its keys.
    rows = []
    for record in cursor:
        rows.append(record.values())

    return pd.DataFrame(rows, columns=cursor.keys())

In [4]:
## Function to find the shortest path between 2 stations
    
def my_neo4j_shortest_path(from_station, to_station):
    """Print the weighted shortest path between two Station nodes.

    The 'ds_graph' projection is dropped and re-created on every call, then
    Dijkstra is streamed from the node named ``from_station`` to the node
    named ``to_station``. For each returned path this prints the total cost,
    the total cost divided by 60 (labelled "Minutes"), and one line per node
    on the path: node name, cost of the hop into it, cumulative cost.

    Nothing is returned; nothing is printed when the query yields no rows.
    """
    # Start from a fresh projection of Station nodes and weighted LINK relationships.
    session.run("CALL gds.graph.drop('ds_graph', false) yield graphName")
    session.run("CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})")

    dijkstra_query = """

    MATCH (source:Station {name: $source}), (target:Station {name: $target})
    CALL gds.shortestPath.dijkstra.stream(
        'ds_graph', 
        { sourceNode: source, 
          targetNode: target, 
          relationshipWeightProperty: 'weight'
        }
    )
    YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
    RETURN
        gds.util.asNode(sourceNode).name AS from,
        gds.util.asNode(targetNode).name AS to,
        totalCost,
        [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodes,
        costs
    ORDER BY index

    """

    paths = session.run(dijkstra_query, source=from_station, target=to_station)

    for record in paths:
        total_cost = int(record['totalCost'])

        print("\n--------------------------------")
        print("   Total Cost: ", total_cost)
        print("   Minutes: ", round(total_cost / 60.0,1))
        print("--------------------------------")

        station_names = record['nodes']
        cumulative_costs = record['costs']

        # `costs` is cumulative along the path, so the cost of a single hop is
        # the difference from the previous node's cumulative value.
        cost_so_far = 0
        for position, station in enumerate(station_names):
            reached_at = int(cumulative_costs[position])
            print(station + ", " + str(reached_at - cost_so_far)  + ", " + str(reached_at))
            cost_so_far = reached_at

In [5]:
## Closeness centrality: rank stations by how close they are to all the other stations in the network
## (the full ranking is kept; nothing here truncates it to the top 10)

# Drop any existing 'ds_graph' projection, then project Station nodes and
# LINK relationships together with their 'weight' property.
for statement in (
    "CALL gds.graph.drop('ds_graph', false) yield graphName",
    "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})",
):
    session.run(statement)

query = """

CALL gds.closeness.harmonic.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS station_name, score as closeness
ORDER BY closeness DESC

"""

# Line-color words that prefix node names such as 'red West Oakland'
colors_to_remove = r'\b(red|yellow|green|blue|orange|gray)\b'

df = (
    my_neo4j_run_query_pandas(query)
    # Discard the 'arrive ...' / 'depart ...' nodes
    .loc[lambda frame: ~frame['station_name'].str.contains('arrive|depart', case=False, na=False)]
    # Strip the color so all of a station's colored nodes share one name
    .assign(station_name=lambda frame: frame['station_name'].str.replace(colors_to_remove, '', regex=True).str.strip())
)

# Mean closeness per station, highest first
df_closeness_centrality = (
    df.groupby('station_name', as_index=False)['closeness']
    .mean()
    .sort_values(by='closeness', ascending=False)
    .reset_index(drop=True)
)
df_closeness_centrality

Unnamed: 0,station_name,closeness
0,West Oakland,0.159924
1,Embarcadero,0.15447
2,Lake Merritt,0.154067
3,12th Street,0.153819
4,Montgomery Street,0.150407
5,Fruitvale,0.147292
6,Powell Street,0.146942
7,19th Street,0.145794
8,Civic Center,0.143685
9,Coliseum,0.142317


In [6]:
## Travel time from West Oakland station to end-points
for arrival in (
    'arrive Millbrae',   ## 43 mins
    'arrive Berryessa',  ## 61 mins
    'arrive Richmond',   ## 30 mins
    'arrive Antioch',    ## 63 mins
):
    my_neo4j_shortest_path('depart West Oakland', arrival)
    print('\n********************************')


--------------------------------
   Total Cost:  2580
   Minutes:  43.0
--------------------------------
depart West Oakland, 0, 0
red West Oakland, 0, 0
red Embarcadero, 420, 420
red Montgomery Street, 60, 480
red Powell Street, 120, 600
red Civic Center, 60, 660
red 16th Street Mission, 180, 840
red 24th Street Mission, 120, 960
red Glen Park, 180, 1140
red Balboa Park, 120, 1260
red Daly City, 240, 1500
red Colma, 240, 1740
red South San Francisco, 180, 1920
red San Bruno, 240, 2160
red Millbrae, 420, 2580
arrive Millbrae, 0, 2580

********************************

--------------------------------
   Total Cost:  3660
   Minutes:  61.0
--------------------------------
depart West Oakland, 0, 0
green West Oakland, 0, 0
green Lake Merritt, 360, 360
green Fruitvale, 300, 660
green Coliseum, 240, 900
green San Leandro, 240, 1140
green Bay Fair, 240, 1380
green Hayward, 240, 1620
green South Hayward, 240, 1860
green Union City, 300, 2160
green Fremont, 300, 2460
green Warm Springs, 360,

In [7]:
## Travel time from Embarcadero station to end-points
for arrival in (
    'arrive Millbrae',   ## 36 mins
    'arrive Berryessa',  ## 68 mins
    'arrive Richmond',   ## 37 mins
    'arrive Antioch',    ## 70 mins
):
    my_neo4j_shortest_path('depart Embarcadero', arrival)
    print('\n********************************')


--------------------------------
   Total Cost:  2160
   Minutes:  36.0
--------------------------------
depart Embarcadero, 0, 0
red Embarcadero, 0, 0
red Montgomery Street, 60, 60
red Powell Street, 120, 180
red Civic Center, 60, 240
red 16th Street Mission, 180, 420
red 24th Street Mission, 120, 540
red Glen Park, 180, 720
red Balboa Park, 120, 840
red Daly City, 240, 1080
red Colma, 240, 1320
red South San Francisco, 180, 1500
red San Bruno, 240, 1740
red Millbrae, 420, 2160
arrive Millbrae, 0, 2160

********************************

--------------------------------
   Total Cost:  4080
   Minutes:  68.0
--------------------------------
depart Embarcadero, 0, 0
green Embarcadero, 0, 0
green West Oakland, 420, 420
green Lake Merritt, 360, 780
green Fruitvale, 300, 1080
green Coliseum, 240, 1320
green San Leandro, 240, 1560
green Bay Fair, 240, 1800
green Hayward, 240, 2040
green South Hayward, 240, 2280
green Union City, 300, 2580
green Fremont, 300, 2880
green Warm Springs, 360, 3

In [8]:
## Travel time from Lake Merritt station to end-points
for arrival in (
    'arrive Millbrae',   ## 49.8 mins
    'arrive Berryessa',  ## 55 mins
    'arrive Richmond',   ## 28 mins
    'arrive Antioch',    ## 62 mins
):
    my_neo4j_shortest_path('depart Lake Merritt', arrival)
    print('\n********************************')


--------------------------------
   Total Cost:  2988
   Minutes:  49.8
--------------------------------
depart Lake Merritt, 0, 0
green Lake Merritt, 0, 0
green West Oakland, 360, 360
green Embarcadero, 420, 780
green Montgomery Street, 60, 840
green Powell Street, 120, 960
green Civic Center, 60, 1020
green 16th Street Mission, 180, 1200
green 24th Street Mission, 120, 1320
green Glen Park, 180, 1500
green Balboa Park, 120, 1620
red Balboa Park, 48, 1668
red Daly City, 240, 1908
red Colma, 240, 2148
red South San Francisco, 180, 2328
red San Bruno, 240, 2568
red Millbrae, 420, 2988
arrive Millbrae, 0, 2988

********************************

--------------------------------
   Total Cost:  3300
   Minutes:  55.0
--------------------------------
depart Lake Merritt, 0, 0
orange Lake Merritt, 0, 0
orange Fruitvale, 300, 300
orange Coliseum, 240, 540
orange San Leandro, 240, 780
orange Bay Fair, 240, 1020
orange Hayward, 240, 1260
orange South Hayward, 240, 1500
orange Union City, 300, 

In [9]:
## Betweenness centrality: measure how much influence each station has in the BART network

# Drop any existing 'ds_graph' projection, then project Station nodes and
# LINK relationships together with their 'weight' property.
for statement in (
    "CALL gds.graph.drop('ds_graph', false) yield graphName",
    "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})",
):
    session.run(statement)

query = """

CALL gds.betweenness.stream('ds_graph', {relationshipWeightProperty: 'weight'})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS station_name, score as betweenness
ORDER BY betweenness DESC

"""

# Line-color words that prefix node names such as 'red MacArthur'
colors_to_remove = r'\b(red|yellow|green|blue|orange|gray)\b'

df = (
    my_neo4j_run_query_pandas(query)
    # Discard the 'arrive ...' / 'depart ...' nodes
    .loc[lambda frame: ~frame['station_name'].str.contains('arrive|depart', case=False, na=False)]
    # Strip the color so all of a station's colored nodes share one name
    .assign(station_name=lambda frame: frame['station_name'].str.replace(colors_to_remove, '', regex=True).str.strip())
)

# Mean betweenness per station, highest first
df_betweenness_centrality = (
    df.groupby('station_name', as_index=False)['betweenness']
    .mean()
    .sort_values(by='betweenness', ascending=False)
    .reset_index(drop=True)
)
df_betweenness_centrality

Unnamed: 0,station_name,betweenness
0,Rockridge,5509.0
1,MacArthur,5239.0
2,Orinda,4997.0
3,Lafayette,4469.0
4,Walnut Creek,3925.0
5,12th Street,3752.333333
6,Lake Merritt,3723.0
7,West Oakland,3647.75
8,Coliseum,3603.75
9,19th Street,3535.0


In [21]:
## Travel time from MacArthur station to end-points
for arrival in (
    'arrive Millbrae',   ## 53 mins
    'arrive Berryessa',  ## 63 mins
    'arrive Richmond',   ## 20 mins
    'arrive Antioch',    ## 53 mins
):
    my_neo4j_shortest_path('depart MacArthur', arrival)
    print('\n********************************')


--------------------------------
   Total Cost:  3180
   Minutes:  53.0
--------------------------------
depart MacArthur, 0, 0
red MacArthur, 0, 0
red 19th Street, 180, 180
red 12th Street, 120, 300
red West Oakland, 300, 600
red Embarcadero, 420, 1020
red Montgomery Street, 60, 1080
red Powell Street, 120, 1200
red Civic Center, 60, 1260
red 16th Street Mission, 180, 1440
red 24th Street Mission, 120, 1560
red Glen Park, 180, 1740
red Balboa Park, 120, 1860
red Daly City, 240, 2100
red Colma, 240, 2340
red South San Francisco, 180, 2520
red San Bruno, 240, 2760
red Millbrae, 420, 3180
arrive Millbrae, 0, 3180

********************************

--------------------------------
   Total Cost:  3780
   Minutes:  63.0
--------------------------------
depart MacArthur, 0, 0
orange MacArthur, 0, 0
orange 19th Street, 180, 180
orange 12th Street, 120, 300
orange Lake Merritt, 180, 480
orange Fruitvale, 300, 780
orange Coliseum, 240, 1020
orange San Leandro, 240, 1260
orange Bay Fair, 240, 

In [10]:
## PageRank: measure how much influence each BART station has in the network

# Drop any existing 'ds_graph' projection, then project Station nodes and
# LINK relationships together with their 'weight' property.
for statement in (
    "CALL gds.graph.drop('ds_graph', false) yield graphName",
    "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})",
):
    session.run(statement)

# Algorithm settings, passed to the query as $max_iterations / $damping_factor.
# NOTE(review): 0.05 is far below the damping factor commonly used for
# PageRank (0.85) and yields scores that are all close to 1 -- confirm it is
# intentional.
max_iterations = 21
damping_factor = 0.05

query = """

CALL gds.pageRank.stream('ds_graph',
                         { maxIterations: $max_iterations,
                           dampingFactor: $damping_factor}
                         )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS station_name, score as page_rank
ORDER BY page_rank DESC, station_name ASC

"""

# Line-color words that prefix node names such as 'yellow Pittsburg Center'
colors_to_remove = r'\b(red|yellow|green|blue|orange|gray)\b'

df = (
    my_neo4j_run_query_pandas(query, max_iterations=max_iterations, damping_factor=damping_factor)
    # Discard the 'arrive ...' / 'depart ...' nodes
    .loc[lambda frame: ~frame['station_name'].str.contains('arrive|depart', case=False, na=False)]
    # Strip the color so all of a station's colored nodes share one name
    .assign(station_name=lambda frame: frame['station_name'].str.replace(colors_to_remove, '', regex=True).str.strip())
)

# Mean page rank per station, highest first
df_page_rank = (
    df.groupby('station_name', as_index=False)['page_rank']
    .mean()
    .sort_values(by='page_rank', ascending=False)
    .reset_index(drop=True)
)
df_page_rank

Unnamed: 0,station_name,page_rank
0,Pittsburg Center,1.040071
1,West Dublin,1.039954
2,Pittsburg,1.032033
3,North Concord,1.031899
4,Concord,1.031897
5,Pleasant Hill,1.031897
6,Walnut Creek,1.031897
7,Lafayette,1.031895
8,Orinda,1.031779
9,Millbrae,1.026996


In [11]:
## Travel time from Pittsburg Center station to end-points
for arrival in (
    'arrive Millbrae',   ## 99.8 mins
    'arrive Berryessa',  ## 110 mins
    'arrive Richmond',   ## 67 mins
    'arrive Antioch',    ## 7 mins
):
    my_neo4j_shortest_path('depart Pittsburg Center', arrival)
    print('\n********************************')


--------------------------------
   Total Cost:  5988
   Minutes:  99.8
--------------------------------
depart Pittsburg Center, 0, 0
yellow Pittsburg Center, 0, 0
yellow Pittsburg, 600, 600
yellow North Concord, 360, 960
yellow Concord, 180, 1140
yellow Pleasant Hill, 360, 1500
yellow Walnut Creek, 120, 1620
yellow Lafayette, 300, 1920
yellow Orinda, 300, 2220
yellow Rockridge, 300, 2520
yellow MacArthur, 240, 2760
yellow 19th Street, 180, 2940
yellow 12th Street, 120, 3060
yellow West Oakland, 300, 3360
yellow Embarcadero, 420, 3780
yellow Montgomery Street, 60, 3840
yellow Powell Street, 120, 3960
yellow Civic Center, 60, 4020
yellow 16th Street Mission, 180, 4200
yellow 24th Street Mission, 120, 4320
yellow Glen Park, 180, 4500
yellow Balboa Park, 120, 4620
red Balboa Park, 48, 4668
red Daly City, 240, 4908
red Colma, 240, 5148
red South San Francisco, 180, 5328
red San Bruno, 240, 5568
red Millbrae, 420, 5988
arrive Millbrae, 0, 5988

********************************

----------

In [12]:
## Travel time from West Dublin station to end-points
for arrival in (
    'arrive Millbrae',   ## 80.8 mins
    'arrive Berryessa',  ## 53 mins
    'arrive Richmond',   ## 59.9 mins
    'arrive Antioch',    ## 93.9 mins
):
    my_neo4j_shortest_path('depart West Dublin', arrival)
    print('\n********************************')


--------------------------------
   Total Cost:  4848
   Minutes:  80.8
--------------------------------
depart West Dublin, 0, 0
blue West Dublin, 0, 0
blue Castro Valley, 600, 600
blue Bay Fair, 240, 840
blue San Leandro, 240, 1080
blue Coliseum, 240, 1320
blue Fruitvale, 240, 1560
blue Lake Merritt, 300, 1860
blue West Oakland, 360, 2220
blue Embarcadero, 420, 2640
blue Montgomery Street, 60, 2700
blue Powell Street, 120, 2820
blue Civic Center, 60, 2880
blue 16th Street Mission, 180, 3060
blue 24th Street Mission, 120, 3180
blue Glen Park, 180, 3360
blue Balboa Park, 120, 3480
red Balboa Park, 48, 3528
red Daly City, 240, 3768
red Colma, 240, 4008
red South San Francisco, 180, 4188
red San Bruno, 240, 4428
red Millbrae, 420, 4848
arrive Millbrae, 0, 4848

********************************

--------------------------------
   Total Cost:  3183
   Minutes:  53.0
--------------------------------
depart West Dublin, 0, 0
blue West Dublin, 0, 0
blue Castro Valley, 600, 600
blue Bay Fair

In [13]:
## Triangle Count - number of triangles that pass through a station

# Drop any existing 'ds_graph' projection, then project Station nodes with
# LINK relationships treated as UNDIRECTED (no weight property this time).
for statement in (
    "CALL gds.graph.drop('ds_graph', false) yield graphName",
    "CALL gds.graph.project('ds_graph', 'Station', {LINK: {orientation: 'UNDIRECTED'}})",
):
    session.run(statement)

query = """

CALL gds.triangleCount.stream('ds_graph')
YIELD nodeId, triangleCount
RETURN gds.util.asNode(nodeId).name AS station_name, triangleCount as triangle_count
ORDER BY triangleCount DESC, station_name

"""

# Line-color words that prefix node names such as 'red Daly City'
colors_to_remove = r'\b(red|yellow|green|blue|orange|gray)\b'

df = (
    my_neo4j_run_query_pandas(query)
    # Discard the 'arrive ...' / 'depart ...' nodes
    .loc[lambda frame: ~frame['station_name'].str.contains('arrive|depart', case=False, na=False)]
    # Strip the color so all of a station's colored nodes share one name
    .assign(station_name=lambda frame: frame['station_name'].str.replace(colors_to_remove, '', regex=True).str.strip())
)

# Mean triangle count per station, highest first
df_triangle_count = (
    df.groupby('station_name', as_index=False)['triangle_count']
    .mean()
    .sort_values(by='triangle_count', ascending=False)
    .reset_index(drop=True)
)
df_triangle_count

Unnamed: 0,station_name,triangle_count
0,West Oakland,9.0
1,Daly City,9.0
2,24th Street Mission,9.0
3,Powell Street,9.0
4,Montgomery Street,9.0
5,Balboa Park,9.0
6,16th Street Mission,9.0
7,Glen Park,9.0
8,Civic Center,9.0
9,Coliseum,9.0


In [14]:
## Clustering Coefficient (local clustering coefficient per station)

# Drop any existing 'ds_graph' projection, then project Station nodes with
# LINK relationships treated as UNDIRECTED (no weight property this time).
for statement in (
    "CALL gds.graph.drop('ds_graph', false) yield graphName",
    "CALL gds.graph.project('ds_graph', 'Station', {LINK: {orientation: 'UNDIRECTED'}})",
):
    session.run(statement)

query = """

CALL gds.localClusteringCoefficient.stream('ds_graph')
YIELD nodeId, localClusteringCoefficient
RETURN gds.util.asNode(nodeId).name AS station_name, localClusteringCoefficient as clustering_coefficient
ORDER BY localClusteringCoefficient DESC, station_name

"""

# Line-color words that prefix node names such as 'red Daly City'
colors_to_remove = r'\b(red|yellow|green|blue|orange|gray)\b'

df = (
    my_neo4j_run_query_pandas(query)
    # Discard the 'arrive ...' / 'depart ...' nodes
    .loc[lambda frame: ~frame['station_name'].str.contains('arrive|depart', case=False, na=False)]
    # Strip the color so all of a station's colored nodes share one name
    .assign(station_name=lambda frame: frame['station_name'].str.replace(colors_to_remove, '', regex=True).str.strip())
)

# Mean clustering coefficient per station, highest first
df_clustering_coeff = (
    df.groupby('station_name', as_index=False)['clustering_coefficient']
    .mean()
    .sort_values(by='clustering_coefficient', ascending=False)
    .reset_index(drop=True)
)
df_clustering_coeff

Unnamed: 0,station_name,clustering_coefficient
0,Daly City,0.514286
1,Coliseum,0.471429
2,West Oakland,0.428571
3,Embarcadero,0.428571
4,24th Street Mission,0.428571
5,Balboa Park,0.428571
6,Powell Street,0.428571
7,Montgomery Street,0.428571
8,Civic Center,0.428571
9,16th Street Mission,0.428571


In [15]:
## Communities found by Louvain modularity

# Drop any existing 'ds_graph' projection before re-creating it below.
session.run("CALL gds.graph.drop('ds_graph', false) yield graphName")

# Project Station nodes and LINK relationships together with their 'weight' property.
query = """

CALL gds.graph.project('ds_graph', 'Station', 'LINK', 
                      {relationshipProperties: 'weight'})
"""
session.run(query)

query = """

CALL gds.louvain.stream('ds_graph', {includeIntermediateCommunities: true})
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).name AS station_name, communityId as community, intermediateCommunityIds as intermediate_community
ORDER BY community, station_name

"""

# Line-color words that prefix node names such as 'yellow Walnut Creek'
colors_to_remove = r'\b(red|yellow|green|blue|orange|gray)\b'

df = (
    my_neo4j_run_query_pandas(query)
    # Discard the 'arrive ...' / 'depart ...' nodes
    .loc[lambda frame: ~frame['station_name'].str.contains('arrive|depart', case=False, na=False)]
    # Strip the color so all of a station's colored nodes share one name
    .assign(station_name=lambda frame: frame['station_name'].str.replace(colors_to_remove, '', regex=True).str.strip())
)

# One row per station: keep the community values of the first row in each
# station's group, then sort by community and station name, both descending.
df_louvain = (
    df.groupby('station_name', as_index=False)
    .agg({'community': 'first', 'intermediate_community': 'first'})
    .sort_values(by=['community', 'station_name'], ascending=False)
    .reset_index(drop=True)
)
df_louvain

Unnamed: 0,station_name,community,intermediate_community
0,Walnut Creek,184,"[184, 184, 184]"
1,Rockridge,184,"[170, 158, 184]"
2,Pleasant Hill,184,"[164, 184, 184]"
3,Pittsburg Center,184,"[162, 162, 184]"
4,Pittsburg,184,"[160, 162, 184]"
5,Orinda,184,"[158, 158, 184]"
6,North Concord,184,"[154, 154, 184]"
7,Lafayette,184,"[140, 158, 184]"
8,Concord,184,"[118, 154, 184]"
9,Antioch,184,"[100, 162, 184]"


In [20]:
# Combine every per-station metric into one table. Each step is an inner join
# on station_name, so only stations present in all six tables are kept.
df_consolidated = (
    df_closeness_centrality
    .merge(df_betweenness_centrality, how='inner', on='station_name')
    .merge(df_page_rank, how='inner', on='station_name')
    .merge(df_triangle_count, how='inner', on='station_name')
    .merge(df_clustering_coeff, how='inner', on='station_name')
    .merge(df_louvain, how='inner', on='station_name')
)
df_consolidated

Unnamed: 0,station_name,closeness,betweenness,page_rank,triangle_count,clustering_coefficient,community,intermediate_community
0,West Oakland,0.159924,3647.75,1.005418,9.0,0.428571,150,"[190, 150, 150]"
1,Embarcadero,0.15447,2920.75,1.00371,9.0,0.428571,150,"[130, 150, 150]"
2,Lake Merritt,0.154067,3723.0,1.004915,5.0,0.333333,134,"[142, 134, 134]"
3,12th Street,0.153819,3752.333333,1.004932,5.0,0.333333,96,"[92, 96, 96]"
4,Montgomery Street,0.150407,2774.75,1.003696,9.0,0.428571,150,"[150, 150, 150]"
5,Fruitvale,0.147292,3521.666667,1.004378,5.0,0.333333,134,"[134, 134, 134]"
6,Powell Street,0.146942,2657.75,1.003696,9.0,0.428571,166,"[166, 166, 166]"
7,19th Street,0.145794,3535.0,1.006107,5.0,0.333333,96,"[96, 96, 96]"
8,Civic Center,0.143685,2419.75,1.003696,9.0,0.428571,166,"[112, 166, 166]"
9,Coliseum,0.142317,3603.75,1.009748,9.0,0.471429,134,"[114, 134, 134]"
