# Calculate customer BART path

Calculate the route from each customer's closest BART station to the BART station closest to the Berkeley store.

## Requirements

1. Have the BART Neo4j dataset loaded from: /projects/project-3-team2sec1/exercise
2. Have the AGM SQL database loaded
3. Requires the CSV named "customer_location_station.csv" created in notebook "geodesic_merge"

## Load libraries and util methods

In [1]:
import neo4j

import csv

import math
import numpy as np
import pandas as pd

import psycopg2

import gmaps
import gmaps.geojson_geometries

from geographiclib.geodesic import Geodesic

In [2]:
# Create the Neo4j driver for the server at neo4j:7687.
# NOTE(review): the credentials are hard-coded in the notebook; consider reading them from the environment.
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [3]:
# Open a session on the "neo4j" database; the Neo4j helper functions in this notebook use it as a global.
session = driver.session(database="neo4j")

In [4]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Run a Cypher query on the shared session and return the rows as a DataFrame.

    Any keyword arguments are forwarded unchanged to ``session.run``.
    Each returned record becomes one row; the result's keys become the
    column names.
    """
    result = session.run(query, **kwargs)

    # Materialise every record first, then read the column names from the result.
    rows = [record.values() for record in result]
    column_names = result.keys()

    return pd.DataFrame(rows, columns=column_names)

In [5]:
# Connect to the Postgres server; presumably this is the AGM SQL database listed in the requirements.
# NOTE(review): the credentials are hard-coded in the notebook; consider reading them from the environment.
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [6]:
# Cursor on the Postgres connection.
# NOTE(review): no visible cell in this notebook uses it; the queries go through pd.read_sql_query.
cursor = connection.cursor()

In [7]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas dataframe.

    Args:
        query: SQL text handed to ``pd.read_sql_query`` on the shared
            ``connection``.
        rollback_before_flag: when truthy, roll the connection back before
            running the query.
        rollback_after_flag: when truthy, roll the connection back after
            running the query.

    Returns:
        The query result. Every ``float64`` column whose non-null values are
        all whole numbers is converted to the nullable ``Int64`` dtype
        (nulls are kept as missing values); all other columns are left as
        they were read.
    """
    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers
    for column in df:
        if df[column].dtype != "float64":
            continue

        present = df[column].dropna()

        # Vectorised whole-number check. Columns holding +/-inf stay float,
        # because infinity cannot be stored in an integer column.
        if np.isfinite(present).all() and (present == np.floor(present)).all():
            df[column] = df[column].astype('Int64')

    return df
    

## Get the closest BART station to the Berkeley store

Get the latitude and longitude of the Berkeley store

In [8]:
# Roll the connection back both before and after the query (flags passed to my_select_query_pandas).
rollback_before_flag, rollback_after_flag = True, True
connection.rollback()

# Select the city and coordinates of the store(s) whose city is 'Berkeley'.
query = """

select 
city, 
latitude,
longitude
from stores
where city = 'Berkeley'

"""
berkeley_lat_lon = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
berkeley_lat_lon

Unnamed: 0,city,latitude,longitude
0,Berkeley,37.8555,-122.2604


Get the lat and lon for all the stations

In [9]:
# Roll the connection back both before and after the query (flags passed to my_select_query_pandas).
rollback_before_flag = True
rollback_after_flag = True


# Query the stations table: one row per distinct (station, latitude, longitude),
# with the coordinates renamed to station_lat / station_long.
station_query = """

select distinct station, latitude as station_lat, longitude as station_long
from stations as st

"""

# station_df is read as a global by calculate_closest_station.
station_df = my_select_query_pandas(station_query, rollback_before_flag, rollback_after_flag)
station_df

Unnamed: 0,station,station_lat,station_long
0,Concord,37.973745,-122.029127
1,Civic Center,37.779861,-122.413498
2,Downtown Berkeley,37.869799,-122.268197
3,Colma,37.684722,-122.466111
4,West Oakland,37.8049,-122.2951
5,Richmond,37.936811,-122.353095
6,Pittsburg Center,38.018227,-121.890178
7,South San Francisco,37.664264,-122.444043
8,Union City,37.590746,-122.017282
9,Pleasant Hill,37.928399,-122.055992


Get the closest BART station to the Berkeley Store

In [10]:
# calculate closest location to Berkeley store

# from Sumedh's code
def my_calculate_distance(point_1, point_2):
    """Return the WGS84 geodesic distance in miles between two points.

    Both points are indexable as (latitude, longitude).
    """
    lat_1, lon_1 = point_1[0], point_1[1]
    lat_2, lon_2 = point_2[0], point_2[1]

    inverse = Geodesic.WGS84.Inverse(lat_1, lon_1, lat_2, lon_2)

    # 's12' is divided by 1000 and then scaled by the km-to-miles factor.
    kilometers = inverse['s12'] / 1000
    return kilometers * 0.621371


def calculate_closest_station(lat, lon):
    """Return the name of the station in ``station_df`` nearest to (lat, lon).

    Distances are computed with ``my_calculate_distance`` (miles). When
    several stations are equally close, the first one in ``station_df``
    wins.

    Args:
        lat: latitude of the reference point.
        lon: longitude of the reference point.

    Returns:
        The ``station`` value of the nearest row.

    Raises:
        ValueError: if ``station_df`` has no rows.
    """
    origin = (lat, lon)
    closest_station = None
    closest_distance = None

    # Iterate through each station
    for _, row in station_df.iterrows():
        station_point = (row['station_lat'], row['station_long'])
        current_dist = my_calculate_distance(origin, station_point)

        # The first row seeds the minimum; later rows must be strictly closer.
        if closest_distance is None or current_dist < closest_distance:
            # Read the name from the row itself: the iterrows label is not a
            # position, so it must not be used with .iloc.
            closest_station = row['station']
            closest_distance = current_dist

    if closest_distance is None:
        raise ValueError("station_df is empty; cannot determine the closest station")

    return closest_station

# The store query yields a single-row frame; take the scalar from that row
# explicitly, because calling float() on a Series is deprecated in recent pandas.
lat = float(berkeley_lat_lon["latitude"].iloc[0])
lon = float(berkeley_lat_lon["longitude"].iloc[0])
print(f"Closest BART station to the Berkeley store is: {calculate_closest_station(lat, lon)}")

Closest BART station to the Berkeley store is: Ashby


## Calculate the shortest path from each customer's closest station to Ashby station

In [11]:
def my_neo4j_shortest_path(from_station, to_station, *, refresh_graph=True):
    """Return the cheapest path between two graph nodes as a list of node names.

    Runs GDS Dijkstra on the projected graph 'ds_graph' (``Station`` nodes,
    ``LINK`` relationships, weighted by ``weight``).

    Args:
        from_station: ``name`` of the source node, e.g. ``'depart Millbrae'``.
        to_station: ``name`` of the target node, e.g. ``'arrive Ashby'``.
        refresh_graph: when true (the default), drop and re-project
            'ds_graph' before searching. Pass ``False`` to reuse an existing
            projection when calling this many times in a row.

    Returns:
        The node names along the path, in order, or ``None`` when the query
        returns no row (no such nodes or no path between them).
    """
    if refresh_graph:
        # The second argument (false) presumably tells GDS not to fail when
        # the graph does not exist yet - confirm against the GDS docs.
        query = "CALL gds.graph.drop('ds_graph', false)"
        session.run(query)

        query = "CALL gds.graph.project('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
        session.run(query)

    query = """

    MATCH (source:Station {name: $source}), (target:Station {name: $target})
    CALL gds.shortestPath.dijkstra.stream(
        'ds_graph', 
        { sourceNode: source, 
          targetNode: target, 
          relationshipWeightProperty: 'weight'
        }
    )
    YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
    RETURN
        gds.util.asNode(sourceNode).name AS from,
        gds.util.asNode(targetNode).name AS to,
        totalCost,
        [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodes,
        costs
    ORDER BY index

    """

    result = session.run(query, source=from_station, target=to_station)

    # Only the first returned row is used.
    first_record = next(iter(result), None)
    if first_record is None:
        return None

    return list(first_record['nodes'])

Turn the path into rider instructions: which lines to take, and where to change, to reach Ashby

In [12]:
# Example path: node names along the cheapest route from Millbrae to Ashby.
path = my_neo4j_shortest_path('depart Millbrae', 'arrive Ashby')
#print(path)

def extract_line_and_station(s):
    """Split a graph node name into its line prefix and station name.

    Node names look like ``"red Ashby"``, ``"depart Millbrae"`` or
    ``"arrive Ashby"``: the first word is the line (or the pseudo-line
    ``depart`` / ``arrive``) and the remainder is the station name, which
    may itself contain spaces.

    Raises:
        ValueError: if ``s`` contains no space.
    """
    line, station = s.split(" ", 1)
    return line, station


def path_to_str_instructions(path):
    """Turn a shortest-path node list into one line of rider instructions.

    Args:
        path: node names in travel order. ``path[0]`` is the ``depart`` node,
            ``path[1]`` the first line/station node, and the remaining nodes
            lead up to the final ``arrive`` node.

    Returns:
        A single string: a "Depart at ..." sentence, then one fragment per
        remaining node ("Continue on line ...", "change to line ...", or the
        closing "You'll arrive to <station> station.").

    Raises:
        ValueError: if ``path`` is ``None`` or has fewer than three nodes, for
            example when no route was found.
    """
    if path is None or len(path) < 3:
        raise ValueError(f"path must contain at least 3 nodes, got {path!r}")

    depart_line, depart_station = extract_line_and_station(path[1])
    _, next_station = extract_line_and_station(path[2])

    # Collect the fragments and join once instead of repeated string +=.
    steps = [f"Depart at {depart_station} take the {depart_line} line to {next_station}. "]

    previous_line = depart_line
    for node in path[3:]:
        line, station = extract_line_and_station(node)
        if line == previous_line:
            steps.append(f"Continue on line {line} to station {station}. ")
        elif line == "arrive":
            # Name the actual destination rather than a hard-coded station.
            steps.append(f"You'll arrive to {station} station.")
        else:
            steps.append(f"change to line {line} in {station}, then ")
        previous_line = line

    return "".join(steps)

# Render the example path as instructions; the string is shown as the cell output.
path_to_str_instructions(path)
    

"Depart at Millbrae take the red line to San Bruno. Continue on line red to station South San Francisco. Continue on line red to station Colma. Continue on line red to station Daly City. Continue on line red to station Balboa Park. Continue on line red to station Glen Park. Continue on line red to station 24th Street Mission. Continue on line red to station 16th Street Mission. Continue on line red to station Civic Center. Continue on line red to station Powell Street. Continue on line red to station Montgomery Street. Continue on line red to station Embarcadero. Continue on line red to station West Oakland. Continue on line red to station 12th Street. Continue on line red to station 19th Street. Continue on line red to station MacArthur. Continue on line red to station Ashby. You'll arrive to Ashby station."

Load the CSV with each customer's closest station

In [13]:
# Load the customers with their closest station (see the Requirements section for where this CSV comes from).
# NOTE(review): the displayed frame has "Unnamed: 0.1" and "Unnamed: 0" columns, which suggests saved
# index columns are being re-read as data; consider index_col=0 after checking downstream consumers.
customers_df = pd.read_csv('/user/projects/project-3-team2sec1/code/customer_location_station.csv')
customers_df

Unnamed: 0.1,Unnamed: 0,customer_id,first_name,last_name,customer_zip,zip_lat,zip_long,closest_station
0,0,8020,Deva,Ainley,94949,38.0653,-122.5384,Richmond
1,1,4717,Beitris,Matonin,94556,37.8406,-122.1149,Lafayette
2,2,2491,Consalve,Brignall,94606,37.7918,-122.2450,Lake Merritt
3,3,18886,Wren,Buxsy,75229,32.8937,-96.8644,Antioch
4,4,27546,Estel,Willacot,37203,36.1493,-86.7903,Antioch
...,...,...,...,...,...,...,...,...
31077,31077,30320,Salvatore,French,37138,36.2467,-86.6194,Antioch
31078,31078,31018,Guinna,Baddow,37167,35.9586,-86.5327,Antioch
31079,31079,29404,Bobby,Van den Velde,37209,36.1555,-86.9216,Antioch
31080,31080,29704,Korey,Guilayn,37218,36.2070,-86.8926,Antioch


Create a new column in the customer df with the directions

In [14]:
paths = []
final_station = "arrive Ashby"

# The route depends only on the departure station, so compute the instructions
# once per distinct station instead of querying Neo4j again for every customer.
instructions_by_station = {}

for row in customers_df.itertuples():
    closest_station = row.closest_station
    if closest_station not in instructions_by_station:
        depart_station = f"depart {closest_station}"
        path = my_neo4j_shortest_path(depart_station, final_station)
        instructions_by_station[closest_station] = path_to_str_instructions(path)
    paths.append(instructions_by_station[closest_station])

customers_df["path_to_berkeley_bart"] = paths
customers_df

Unnamed: 0.1,Unnamed: 0,customer_id,first_name,last_name,customer_zip,zip_lat,zip_long,closest_station,path_to_berkeley_bart
0,0,8020,Deva,Ainley,94949,38.0653,-122.5384,Richmond,Depart at Richmond take the red line to El Cer...
1,1,4717,Beitris,Matonin,94556,37.8406,-122.1149,Lafayette,Depart at Lafayette take the yellow line to Or...
2,2,2491,Consalve,Brignall,94606,37.7918,-122.2450,Lake Merritt,Depart at Lake Merritt take the orange line to...
3,3,18886,Wren,Buxsy,75229,32.8937,-96.8644,Antioch,Depart at Antioch take the yellow line to Pitt...
4,4,27546,Estel,Willacot,37203,36.1493,-86.7903,Antioch,Depart at Antioch take the yellow line to Pitt...
...,...,...,...,...,...,...,...,...,...
31077,31077,30320,Salvatore,French,37138,36.2467,-86.6194,Antioch,Depart at Antioch take the yellow line to Pitt...
31078,31078,31018,Guinna,Baddow,37167,35.9586,-86.5327,Antioch,Depart at Antioch take the yellow line to Pitt...
31079,31079,29404,Bobby,Van den Velde,37209,36.1555,-86.9216,Antioch,Depart at Antioch take the yellow line to Pitt...
31080,31080,29704,Korey,Guilayn,37218,36.2070,-86.8926,Antioch,Depart at Antioch take the yellow line to Pitt...


Save the data to a CSV

In [15]:
# Write the customers and their directions to a CSV in the working directory.
# NOTE(review): the DataFrame index is written as an extra unnamed column; pass index=False
# if downstream readers do not need it.
customers_df.to_csv("customer_station_path.csv")