# In this notebook, we will create graphical algorithms using neo4j and view the graphical network on neo4j at https://xxxx:7473

## Requirements

1. To have loaded the tables in the load_tables notebook.
2. Have the BART Neo4J dataset 

In [1]:
#import the required libraries
import neo4j

import csv

import math
import numpy as np
import pandas as pd

import psycopg2

In [2]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#
def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    "function to run a select query and return rows in a pandas dataframe"
    
    if rollback_before_flag:
        connection.rollback()
    
    df = pd.read_sql_query(query, connection)
    
    if rollback_after_flag:
        connection.rollback()
    
    # fix the float columns that really should be integers
    
    for column in df:
    
        if df[column].dtype == "float64":

            fraction_flag = False

            for value in df[column].values:
                
                if not np.isnan(value):
                    if value - math.floor(value) != 0:
                        fraction_flag = True

            if not fraction_flag:
                df[column] = df[column].astype('Int64')
    
    return(df)
    

In [3]:
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [4]:
cursor = connection.cursor()

In [36]:
#call the neo4j driver
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [37]:
session = driver.session(database="neo4j")

In [38]:
#wipe out the neo4j database
def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"
    
    query = "match (node)-[relationship]->() delete node, relationship"
    session.run(query)
    
    query = "match (node) delete node"
    session.run(query)

In [39]:
#function to run a query and return the results in a pandas dataframe
def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    
    result = session.run(query, **kwargs)
    
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    
    return df

In [40]:
#function to print the number of nodes and relationships
def my_neo4j_number_nodes_relationships():
    "print the number of nodes and relationships"
   
    
    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_nodes = df.shape[0]
    
    
    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    
    df = my_neo4j_run_query_pandas(query)
    display(df)
    
    number_relationships = df.shape[0]
    
    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")

In [41]:
#function to create a node
def my_neo4j_create_node(station_name):
    "create a node with label Station"
    
    query = """
    
    CREATE (:Station {name: $station_name})
    
    """
    
    session.run(query, station_name=station_name)
    

In [42]:
#function to create a relationship one way
def my_neo4j_create_relationship_one_way(from_station, to_station, weight):
    "create a relationship one way between two stations with a weight"
    
    query = """
    
    MATCH (from:Station), 
          (to:Station)
    WHERE from.name = $from_station and to.name = $to_station
    CREATE (from)-[:LINK {weight: $weight}]->(to)
    
    """
    
    session.run(query, from_station=from_station, to_station=to_station, weight=weight)


In [43]:
#function to create graphical relationships two way
def my_neo4j_create_relationship_two_way(from_station, to_station, weight):
    "create relationships two way between two stations with a weight"
    
    query = """
    
    MATCH (from:Station), 
          (to:Station)
    WHERE from.name = $from_station and to.name = $to_station
    CREATE (from)-[:LINK {weight: $weight}]->(to),
           (to)-[:LINK {weight: $weight}]->(from)
    
    """
    
    session.run(query, from_station=from_station, to_station=to_station, weight=weight)


In [44]:
#function to connect to postgres
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [45]:
cursor = connection.cursor()

In [46]:
#wipe out the database initially 
# my_neo4j_wipe_out_database()

# Verify the number of nodes and relationships¶

In [47]:
my_neo4j_number_nodes_relationships()

Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels
0,blue 16th Street Mission,[Station],LINK,arrive 16th Street Mission,[Station]
1,blue 16th Street Mission,[Station],LINK,blue 24th Street Mission,[Station]
2,blue 16th Street Mission,[Station],LINK,blue Civic Center,[Station]
3,blue 16th Street Mission,[Station],LINK,green 16th Street Mission,[Station]
4,blue 16th Street Mission,[Station],LINK,red 16th Street Mission,[Station]
...,...,...,...,...,...
647,yellow West Oakland,[Station],LINK,blue West Oakland,[Station]
648,yellow West Oakland,[Station],LINK,green West Oakland,[Station]
649,yellow West Oakland,[Station],LINK,red West Oakland,[Station]
650,yellow West Oakland,[Station],LINK,yellow 12th Street,[Station]


-------------------------
  Nodes: 214
  Relationships: 652
-------------------------


# Use case 1.  Graph Algorithm based on Average Exit Counts and travel time (combined_weight) to the destination station. 
Query the list of all segments between each station and its adjoining stations, create a relationship for each segment one way and estimate the significance of a station through page rank algorithm by number of exits in the destination station¶¶

# 1a. Page Rank Algorithm

In [50]:
connection.rollback()

query = """

WITH q AS (
    SELECT DISTINCT a.line, a.station as from_station, b.station as to_station, t.travel_time,
    MIN(t.travel_time) OVER (PARTITION BY a.station, b.station) as min_travel_time,
    MAX(t.travel_time) OVER (PARTITION BY a.station, b.station) as max_travel_time
    FROM lines a
    JOIN lines b ON a.line = b.line AND b.sequence = (a.sequence + 1)
    JOIN travel_times t ON (a.station = t.station_1 AND b.station = t.station_2)
        OR (a.station = t.station_2 AND b.station = t.station_1)
),
be_agg AS (
    SELECT station, MIN(average_exit_count) as min_exit_count, MAX(average_exit_count) as max_exit_count
    FROM stage_bart_exits
    GROUP BY station
),
min_max_values AS (
    SELECT 
        MIN(q.travel_time) AS min_travel_time,
        MAX(q.travel_time) AS max_travel_time,
        MIN(be.average_exit_count) AS min_exit_count,
        MAX(be.average_exit_count) AS max_exit_count
    FROM q
    LEFT JOIN stage_bart_exits be ON q.to_station = be.station
)
SELECT 
    q.from_station,
    q.to_station,
    MAX(q.travel_time) AS travel_time,
    AVG(be.average_exit_count) AS average_exit_count,
    (0.6 * ((MAX(q.travel_time) - min_max_values.min_travel_time) / (min_max_values.max_travel_time - min_max_values.min_travel_time))) +
    (0.4 * ((AVG(be.average_exit_count) - min_max_values.min_exit_count) / (min_max_values.max_exit_count - min_max_values.min_exit_count))) AS combined_weight
FROM stage_bart_exits be
LEFT JOIN stations s ON be.station = s.station
JOIN q ON be.station = q.to_station
CROSS JOIN min_max_values
GROUP BY q.from_station, q.to_station, min_max_values.min_travel_time, min_max_values.max_travel_time, min_max_values.min_exit_count, min_max_values.max_exit_count
;
"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    from_station =  row[0]
    #lets create the node with from_station
    my_neo4j_create_node(from_station)
    to_station =  row[1]
    #lets create the node with to_station
    
    my_neo4j_create_node(to_station)
    combined_weight = float(row[4])
    my_neo4j_create_relationship_one_way(from_station, to_station,combined_weight)

In [51]:
#lets get the number of relationships and view them as a dataframe 
my_neo4j_number_nodes_relationships()

Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels
0,12th Street,[Station],LINK,Lake Merritt,[Station]
1,12th Street,[Station],LINK,West Oakland,[Station]
2,12th Street,[Station],LINK,West Oakland,[Station]
3,16th Street Mission,[Station],LINK,24th Street Mission,[Station]
4,24th Street Mission,[Station],LINK,Glen Park,[Station]
...,...,...,...,...,...
839,yellow West Oakland,[Station],LINK,blue West Oakland,[Station]
840,yellow West Oakland,[Station],LINK,green West Oakland,[Station]
841,yellow West Oakland,[Station],LINK,red West Oakland,[Station]
842,yellow West Oakland,[Station],LINK,yellow 12th Street,[Station]


-------------------------
  Nodes: 314
  Relationships: 844
-------------------------


# 1. Page Rank Benefits
1. When we use the average exit count of a station(mean of the exit counts for a station over the past five months) as the weight in the relationships between stations and then run the PageRank algorithm, it calculates the importance or significance of each station in the transportation network based on the number of exits, connectivity and popularity.Higher exit counts generally indicate that more passengers are using the station to end their journeys or exit the transit system. Stations with high exit counts are often well-connected to other stations and transportation modes.

2. Likewise Travel time is essential for understanding how accessible a station is to passengers. Stations with shorter travel times are likely to be more attractive to commuters, as they offer faster and more convenient journeys. A station with shorter travel time can be considered popular in terms of ease of use and convenience

3. By combining both travel time and exit counts, you can get a more comprehensive picture of a station's popularity. A station with both short travel time and high exit counts would likely be highly popular and significant in the transit network. On the other hand, a station with long travel time but high exit counts may be an essential transfer point or serve a unique purpose, making it popular for specific travel routes.

4. combined_weight = (alpha * normalized_travel_time) + (beta * normalized_exit_count)

5. Higher PageRank scores indicate stations that are more central or crucial for the overall flow of commuters and travelers in the network. These stations are likely major transfer hubs or popular destinations.

6. Lower PageRank scores indicate stations that have less impact on the overall network flow, suggesting that they might be less frequented or less critical in terms of transfer or destination points.

7. In summary, we can gain insights into which stations have the most significant impact on the overall system. This information can be valuable for various purposes, such as optimizing transportation routes, identifying potential congestion points, or improving overall network efficiency.

# Page Rank Algorithm

In [52]:
#page rank algorithm test
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
session.run(query)

query = """

CALL gds.pageRank.stream('ds_graph',
                         { maxIterations: $max_iterations,
                           dampingFactor: $damping_factor}
                         )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as page_rank
ORDER BY page_rank DESC, name ASC

"""

max_iterations = 20
damping_factor = 0.80

result = my_neo4j_run_query_pandas(query, max_iterations=max_iterations, damping_factor=damping_factor)


# Group by station name and calculate the sum of PageRank scores for each station
station_sum_page_rank = result.groupby('name')['page_rank'].sum().reset_index()

# Sort the DataFrame based on the PageRank score in descending order
station_sum_page_rank = station_sum_page_rank.sort_values(by='page_rank', ascending=False)

print(station_sum_page_rank)

                   name  page_rank
10             Coliseum   2.977097
39        South Hayward   2.588630
43         Warm Springs   2.575929
18              Fremont   2.492338
41           Union City   1.968441
..                  ...        ...
136    depart Glen Park   0.200000
135    depart Fruitvale   0.200000
134      depart Fremont   0.200000
133  depart Embarcadero   0.200000
130       depart Dublin   0.200000

[260 rows x 2 columns]


# 2.  Identify communities or clusters of stations based on their combined significance, which includes both average exit counts and travel time using Louvain modularity. 

# 2. Louvain Modularity

In [53]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7f82079a35e0>

In [54]:
query = """

CALL gds.louvain.stream('ds_graph')
YIELD nodeId, communityId, intermediateCommunityIds
RETURN DISTINCT gds.util.asNode(nodeId).name AS name, communityId as community, intermediateCommunityIds as intermediate_community
ORDER BY community, name ASC

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,name,community,intermediate_community
0,arrive 16th Street Mission,135,
1,arrive 24th Street Mission,135,
2,arrive Civic Center,135,
3,arrive Powell Street,135,
4,blue 16th Street Mission,135,
...,...,...,...
256,South Hayward,295,
257,Union City,295,
258,Warm Springs,295,
259,Castro Valley,311,


# 2 Louvain Modularity Benefits
1.  Louvain modularity algorithm with the above combined_weight was run to identify communities or clusters of stations based on their combined significance, which includes both average exit counts and travel time. 

2. By running Louvain modularity with the combined_weight, you can group stations that have similar combined significance, meaning they have comparable values in both average exit counts and travel time.

3. This will help us identify sets of stations that are closely related to each other in terms of both passenger usage and travel duration. 

4. Understanding these clusters or communities can provide insights into the structure and dynamics of the transportation network, helping us make informed decisions about efficient deliveries and optimizing your transport services.

# 3 Customer recomendation algorithm - Shortest path

Suggest a path from the customer closest BART station to the closest BART station to the Berkeley store.

First, merge AGM customer and zip tables to get each customer latitude and longitude

In [5]:
rollback_before_flag = True
rollback_after_flag = True


# Join customers and zip_codes tables
cust_zip_query = """

select distinct cu.customer_id
,cu.first_name
,cu.last_name
,zi.zip as customer_zip
,zi.latitude as zip_lat
,zi.longitude as zip_long
from customers cu
left join zip_codes zi
on cu.zip=zi.zip

"""

customer_location_df = my_select_query_pandas(cust_zip_query, rollback_before_flag, rollback_after_flag)

Then get the BART stations latitude and longitude from the BART stations table

In [6]:
rollback_before_flag = True
rollback_after_flag = True


# Query stations table
station_query = """

select distinct station, latitude as station_lat, longitude as station_long
from stations as st

"""

station_df = my_select_query_pandas(station_query, rollback_before_flag, rollback_after_flag)

## Use Geodesic distance to calculate each customer's nearest station

In [7]:
from geographiclib.geodesic import Geodesic
#
#  Given two points in (latitude, longitude) format, calculate the distance between them in miles
#

def my_calculate_distance(point_1, point_2):
    "Given two points in (latitude, longitude) format, calculate the distance between them in miles"
    
    geod = Geodesic.WGS84


    g = geod.Inverse(point_1[0], point_1[1], point_2[0], point_2[1])
    miles = g['s12'] / 1000 * 0.621371
    
    return miles

In [8]:
#
# Given row in customer dataframe, calculate the closest station to them via geodesic distance
#
def calculate_closest_station(x):
    
    # Get temporary closest station & distance
    closest_station = station_df.iloc[0]['station']
    closest_distance = my_calculate_distance((x['zip_lat'],x['zip_long']),(station_df.iloc[0]['station_lat'],station_df.iloc[0]['station_long']))
    
    # Iterate through each station
    for index,row in station_df.iterrows():
        station_point = (row['station_lat'],row['station_long'])
        
        # Calculate distance from customer to station
        current_dist = my_calculate_distance((x['zip_lat'],x['zip_long']),station_point)
        
        # If this distance is shorter than the current smallest distance, update it
        if current_dist<closest_distance:
            closest_station = station_df.iloc[index]['station']
            closest_distance = current_dist
            
    #print("{} : Final closest station is {} with a distance of {} miles".format(x['first_name'],closest_station,closest_distance))
    return closest_station
    

In [9]:
# Apply above function to customer dataframe to get closest station to each customer
customer_location_df['closest_station'] = customer_location_df.apply(lambda x: calculate_closest_station(x),axis=1) 

In [75]:
customer_location_df.to_csv('/user/projects/project-3-team2sec1/code/customer_location_station.csv',',')

## Use Geodesic distance to get the closest station from the Berkeley Store

In [77]:
rollback_before_flag, rollback_after_flag = True, True
connection.rollback()

query = """

select 
city, 
latitude,
longitude
from stores
where city = 'Berkeley'

"""
berkeley_lat_lon = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

In [81]:
def calculate_closest_station_lat_lon(lat, lon):
    
    # Get temporary closest station & distance
    closest_station = station_df.iloc[0]['station']
    closest_distance = my_calculate_distance((lat,lon),(station_df.iloc[0]['station_lat'],station_df.iloc[0]['station_long']))
    #print("Closest station is {} with a distance of {} miles".format(closest_station,closest_distance))
    
    # Iterate through each station
    for index,row in station_df.iterrows():
        station_point = (row['station_lat'],row['station_long'])
        #print(element['customer_id'],element['zip_lat'],element['zip_long'])
        
        # Calculate distance from customer to station
        current_dist = my_calculate_distance((lat,lon),station_point)
        
        # If this distance is shorter than the current smallest distance, update it
        if current_dist<closest_distance:
            closest_station = station_df.iloc[index]['station']
            closest_distance = current_dist
            
    #print("{} : Final closest station is {} with a distance of {} miles".format(x['first_name'],closest_station,closest_distance))
    return closest_station

In [108]:
lat, lon = float(berkeley_lat_lon["latitude"]), float(berkeley_lat_lon["longitude"])
#print(lat, lon)
print(f"Closest BART station to the Berkeley store is: {calculate_closest_station_lat_lon(lat, lon)}")
store_lat = lat
store_lon = lon

Closest BART station to the Berkeley store is: Ashby


## Calculate the shortest path from customer's closest station to Ashby station

In [84]:
def my_neo4j_shortest_path(from_station, to_station):
    "given a from station and to station, run and print the shortest path"
    
    query = "CALL gds.graph.drop('ds_graph', false)"
    session.run(query)

    query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
    session.run(query)

    query = """

    MATCH (source:Station {name: $source}), (target:Station {name: $target})
    CALL gds.shortestPath.dijkstra.stream(
        'ds_graph', 
        { sourceNode: source, 
          targetNode: target, 
          relationshipWeightProperty: 'weight'
        }
    )
    YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
    RETURN
        gds.util.asNode(sourceNode).name AS from,
        gds.util.asNode(targetNode).name AS to,
        totalCost,
        [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodes,
        costs
    ORDER BY index

    """

    result = session.run(query, source=from_station, target=to_station)
    
    for r in result:
        
        total_cost = int(r['totalCost'])
        
        #print("\n--------------------------------")
        #print("   Total Cost: ", total_cost)
        #print("   Minutes: ", round(total_cost / 60.0,1))
        #print("--------------------------------")
        
        nodes = r['nodes']
        costs = r['costs']
        
        i = 0
        previous = 0
        path = []
        
        for n in nodes:
            
            tram = n + ", " + str(int(costs[i]) - previous)  + ", " + str(int(costs[i]))
            line_station = n
            #print(tram)
            path.append(n)
            
            previous = int(costs[i])
            i += 1
        
        return path

In [85]:
def extract_line_and_station(s):
    line, station = s.split(" ", 1)
    return line, station

def path_to_str_instructions(path):
    depart_line, depart_station = extract_line_and_station(path[1])
    previous_line = depart_line
    _, next_station = extract_line_and_station(path[2])
    instructions = f"Depart at {depart_station} take the {depart_line} line to {next_station}. "
    for p in path[3:]:
        line, station = extract_line_and_station(p)
        if line == previous_line:
            instructions += f"Continue on line {line} to station {station}. "
        elif line == "arrive":
            instructions += "You'll arrive to Ashby station."
        else:
            #print(f"previous line {previous_line} Current line: {line}")
            instructions += f"change to line {line} in {station}, then "
        previous_line = line
    return instructions

In [88]:
paths = []
final_station = "arrive Ashby"
for row in customer_location_df.itertuples():
    closest_station = row.closest_station
    #print(closest_station)
    depart_station = f"depart {closest_station}"
    path = my_neo4j_shortest_path(depart_station, final_station)
    #print(path)
    instructions = path_to_str_instructions(path)
    paths.append(instructions)
    
customer_location_df["path_to_berkeley_bart"] = paths
customer_location_df

Unnamed: 0,customer_id,first_name,last_name,customer_zip,zip_lat,zip_long,closest_station,path_to_berkeley_bart
0,8020,Deva,Ainley,94949,38.0653,-122.5384,Richmond,Depart at Richmond take the red line to El Cer...
1,4717,Beitris,Matonin,94556,37.8406,-122.1149,Lafayette,Depart at Lafayette take the yellow line to Or...
2,2491,Consalve,Brignall,94606,37.7918,-122.2450,Lake Merritt,Depart at Lake Merritt take the orange line to...
3,18886,Wren,Buxsy,75229,32.8937,-96.8644,Antioch,Depart at Antioch take the yellow line to Pitt...
4,27546,Estel,Willacot,37203,36.1493,-86.7903,Antioch,Depart at Antioch take the yellow line to Pitt...
...,...,...,...,...,...,...,...,...
31077,30320,Salvatore,French,37138,36.2467,-86.6194,Antioch,Depart at Antioch take the yellow line to Pitt...
31078,31018,Guinna,Baddow,37167,35.9586,-86.5327,Antioch,Depart at Antioch take the yellow line to Pitt...
31079,29404,Bobby,Van den Velde,37209,36.1555,-86.9216,Antioch,Depart at Antioch take the yellow line to Pitt...
31080,29704,Korey,Guilayn,37218,36.2070,-86.8926,Antioch,Depart at Antioch take the yellow line to Pitt...


In [90]:
customer_location_df.to_csv("customer_station_path.csv")

## Visualize some customers paths

In [105]:
customers_df = customer_location_df

station_latlon_map = {}
for row in station_df.itertuples():
    station_latlon_map[row.station] = (row.station_lat, row.station_long)

def get_customer_first_last_name(name):
    return name.split(" ")

def get_customer_closest_station(first, last):
    return customers_df.loc[(customers_df['first_name'] == first) & (customers_df['last_name'] == last)]['closest_station'].values[0]

def get_customer_latlon(first, last):
    customer_row = customers_df.loc[(customers_df['first_name'] == first) & (customers_df['last_name'] == last)]
    lat = customer_row['zip_lat'].values[0]
    lon = customer_row['zip_long'].values[0]
    return lat, lon

def get_station_latlon(station):
    return station_latlon_map[station]

def get_customer_bart_map_pins(customer_full_name):
    first, last = get_customer_first_last_name(customer_full_name)
    closest_station = get_customer_closest_station(first, last)
    station_lat, station_lon = get_station_latlon(closest_station)
    ashby_lat, ashby_lon = get_station_latlon("Ashby")
    return [
        {"name": "Closest Station to You", "latitude": station_lat, "longitude": station_lon},
        {"name": "Closest Station to Store", "latitude": ashby_lat, "longitude": ashby_lon}
    ]

def get_customer_home_map_pins(customer_full_name):
    first, last = get_customer_first_last_name(customer_full_name)
    customer_lat, customer_lon = get_customer_latlon(first, last)
    return  {"name": "You", "latitude": customer_lat, "longitude": customer_lon}
    

def get_store_pins():
    return {"name": "Store", "latitude": store_lat, "longitude": store_lon}

In [92]:
# Just needed to run once
!pip3 install folium



In [109]:
import folium

def map_customer(customer_full_name):
    customer_bart_map_pins = get_customer_bart_map_pins(customer_full_name)
    customer_home_pin = get_customer_home_map_pins(customer_full_name)
    store_pins = get_store_pins()
    # Create the map
    mymap = folium.Map(location=[37.7749, -122.4194], zoom_start=10)

    # Add markers for stations near San Francisco
    for station in customer_bart_map_pins:
        folium.Marker(
            location=[station["latitude"], station["longitude"]],
            popup=station["name"],
            icon=folium.Icon(icon="train", prefix="fa"),  # You can use Font Awesome icons
        ).add_to(mymap)

    folium.Marker(
        location=[customer_home_pin["latitude"], customer_home_pin["longitude"]],
        popup=customer_home_pin["name"],
        icon=folium.Icon(color="red", icon="home", prefix="fa"),
    ).add_to(mymap)

    folium.Marker(
        location=[store_pins["latitude"], store_pins["longitude"]],
        popup=store_pins["name"],
        icon=folium.Icon(color="green", icon="location-dot", prefix="fa"),
    ).add_to(mymap)
    
    return mymap

# Save the map to an HTML file
#mymap.save("map_with_markers.html")

def get_directions(customer_full_name):
    first, last = get_customer_first_last_name(customer_full_name)
    return customers_df.loc[(customers_df['first_name'] == "Beitris") & (customers_df['last_name'] == "Matonin")]['path_to_berkeley_bart'].values[0]

In [111]:
customer_full_name = "Beitris Matonin"
print(get_directions(customer_full_name))
map_customer(customer_full_name)

Depart at Lafayette take the yellow line to Orinda. Continue on line yellow to station Rockridge. Continue on line yellow to station MacArthur. change to line red in MacArthur, then Continue on line red to station Ashby. You'll arrive to Ashby station.


# 4. Identify key stations that act as hubs in the transportation system using Betweenness Centrality

# 4. Betweenness Centrality

In [55]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
session.run(query)

query = """

CALL gds.betweenness.stream('ds_graph')
YIELD nodeId, score
RETURN  gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,name,betweenness
0,yellow MacArthur,5999.809223
1,yellow Rockridge,5509.000000
2,orange Lake Merritt,5155.831877
3,orange 12th Street,5139.715461
4,yellow Orinda,4997.000000
...,...,...
309,Milpitas,0.000000
310,West Dublin,0.000000
311,Castro Valley,0.000000
312,West Oakland,0.000000


# 3. Betweenness Centrality Benefits
1. Betweenness centrality algorithm identifies key stations that act as hubs in the transportation system. 

2. Betweenness centrality is  a suitable centrality measure to identify key stations that act as hubs in the transportation system. It quantifies the importance of a node by calculating the number of times it lies on the shortest path between other pairs of nodes in the network. 

3. Nodes with high betweenness centrality act as critical connectors and play a significant role in facilitating communication and movement within the network.

4. In the output, each row represents a station, and the "betweenness" column indicates its betweenness centrality score.Stations with non-zero betweenness centrality scores (e.g., Powell Street, Embarcadero, Montgomery Street) act as critical connectors and play a significant role in facilitating travel between other stations in the network. These stations are likely to be major hubs or transfer points where many passengers change lines or routes.

5. Stations with Zero Betweenness Centrality: Stations with a betweenness centrality score of 0.0 (e.g., 24th Street Mission, Antioch, Pittsburg Center) are less critical in terms of connecting other stations. These stations may have limited connections to other parts of the network or may not serve as significant transfer points.

# Use case 2. Graph Algorithm based on transfer time as the weight

In [86]:
#wipe out the database initially 
my_neo4j_wipe_out_database()

# Verfiy the number of nodes and relationships

In [87]:
my_neo4j_number_nodes_relationships()

Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels


-------------------------
  Nodes: 0
  Relationships: 0
-------------------------


# Query the list of all possible line transfers and the transfer times, create a relationship for each transfer with the transfer time as the weight 

In [88]:
# Add the list of specific stations to the SQL query
query = """
SELECT a.station, a.line as from_line, b.line as to_line, s.transfer_time
FROM lines a
JOIN lines b ON a.station = b.station AND a.line <> b.line
JOIN stations s ON a.station = s.station
ORDER BY 1, 2, 3
"""

# Rest of the code remains the same
cursor.execute(query)
connection.rollback()
rows = cursor.fetchall()

for row in rows:
    station = row[0]
    from_line = row[1]
    to_line = row[2]
    transfer_time = int(row[3])
    
    from_station = from_line + ' ' + station
    my_neo4j_create_node(from_station)
    to_station = to_line + ' ' + station
    my_neo4j_create_node(to_station)
    my_neo4j_create_relationship_one_way(from_station, to_station, transfer_time)


In [89]:
my_neo4j_number_nodes_relationships()

Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels
0,blue 16th Street Mission,[Station],LINK,green 16th Street Mission,[Station]
1,blue 16th Street Mission,[Station],LINK,red 16th Street Mission,[Station]
2,blue 16th Street Mission,[Station],LINK,red 16th Street Mission,[Station]
3,blue 16th Street Mission,[Station],LINK,yellow 16th Street Mission,[Station]
4,blue 16th Street Mission,[Station],LINK,yellow 16th Street Mission,[Station]
...,...,...,...,...,...
2229,yellow West Oakland,[Station],LINK,red West Oakland,[Station]
2230,yellow West Oakland,[Station],LINK,red West Oakland,[Station]
2231,yellow West Oakland,[Station],LINK,red West Oakland,[Station]
2232,yellow West Oakland,[Station],LINK,red West Oakland,[Station]


-------------------------
  Nodes: 416
  Relationships: 2234
-------------------------


# 1. Betweenness centrality 

In [90]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
session.run(query)

query = """

CALL gds.betweenness.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,name,betweenness
0,yellow West Oakland,79.307971
1,yellow Balboa Park,79.307971
2,yellow Civic Center,79.307971
3,orange Coliseum,79.307971
4,yellow Daly City,79.307971
...,...,...
411,green 24th Street Mission,0.000000
412,yellow 24th Street Mission,0.000000
413,red 24th Street Mission,0.000000
414,red Ashby,0.000000


# 1. Betwenness centrality benefits

1. Betweenness centrality analysis with respect to line transfer time provides crucial information about the effectiveness of stations in facilitating efficient transfers between different BART lines.

2. Shortest Transfer Paths: Stations with high betweenness centrality in the context of line transfer time indicate that they are frequently used as intermediate points for passengers to switch between different BART lines while minimizing their transfer time. These stations are crucial for ensuring smooth and quick transfers within the transportation system.

3. Transfer Efficiency: Stations with high betweenness centrality in line transfer time play a vital role in maintaining the overall efficiency of the BART network. Passengers are more likely to use these stations for transfers, as they offer better connectivity and reduced waiting times between different lines.

4. Optimal Transfer Routes: Stations with high betweenness centrality in line transfer time are potential candidates for optimizing transfer routes.

5. For instance, the station "yellow West Oakland" has a betweenness centrality score of 79.307971, indicating that it plays a significant role in facilitating transfers for passengers between other stations in the network. Similarly, "Yellow Balboa Park" and "yellow Civic Center" also have high betweenness centrality scores of 79.307971, indicating their importance as transfer hubs.

6. Some stations have a betweenness centrality score of 0.000000, which means they have low importance as transfer points. Stations with low betweenness centrality scores are less frequently used for transfers between other stations in the network.

# 2. Label Propagation 

In [92]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
session.run(query)

query = """

CALL gds.labelPropagation.stream('ds_graph')
YIELD nodeId, communityId AS Community
RETURN gds.util.asNode(nodeId).name AS Name, Community
ORDER BY Community, Name

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,Name,Community
0,blue Balboa Park,1
1,blue Balboa Park,1
2,blue Balboa Park,1
3,green Balboa Park,1
4,green Balboa Park,1
...,...,...
411,red 24th Street Mission,487
412,orange Ashby,489
413,red Ashby,489
414,red Ashby,489


# 2. Label Propagation Benefits

1. The output of the Label Propagation algorithm provides information about the communities or clusters of stations within the BART network. 

2. For example, the station "blue Balboa Park" has been assigned the community label "1" This means that "blue Balboa Park" is part of the community with the label "1" Similarly, "green Balboa Park" also belongs to the community with label "1". Likewise, red 24th Street Mission	is assigned to community 487, green Balboa Park is assigned to community 291. Similarly "orange Ashby" may have strong connections with nodes in both community 489 and community 491. This can cause it to switch between the communities during the label propagation process.  

3. Stations with the same community label are more interconnected and have similar neighbors, suggesting that they belong to the same functional group within the transportation system. This information can be valuable for understanding the structure and organization of the BART network and may be used for various purposes, such as network analysis, optimization, and decision-making related to station operations and passenger flow.


# 3. Closeness centrality

In [93]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
session.run(query)


query = """

CALL gds.beta.closeness.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as closeness
ORDER BY score DESC

"""

my_neo4j_run_query_pandas(query)



Unnamed: 0,name,closeness
0,green Berryessa,1.0
1,red Colma,1.0
2,orange Downtown Berkeley,1.0
3,orange El Cerrito del Norte,1.0
4,orange El Cerrito Plaza,1.0
...,...,...
411,yellow 19th Street,0.0
412,yellow 24th Street Mission,0.0
413,yellow 24th Street Mission,0.0
414,yellow 24th Street Mission,0.0


# Closeness centrality Benefits
1. Closeness centrality measures how close a station is to all other stations in the network in terms of the shortest path distance. A higher closeness centrality value indicates that the station is more central and has shorter average travel distances to reach other stations, making it more accessible and facilitating quick travel to and from other locations in the network.
2. For example, the station "green Berryessa" , "red Colma" and "orange El Cerrito del Norte" has a closeness centrality of 1.000000, which means it is the most central station in the BART network.
3. Some stations like "yellow 19th Street", "yellow 24th Street Mission" have a closeness centrality of 0.000000, which means they have low closeness centrality. These stations are relatively less central in the network and may be located in more peripheral or less interconnected areas.
4. Closeness centrality can help transportation planners optimize routes, improve transfer facilities, and enhance the overall accessibility