# Step 2: merge trip and address data

## Import dependencies

Import the libraries used throughout this notebook.

In [1]:
import ast
from pathlib import Path

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, Markdown


Import the trip data file.

In [2]:
# Read every recorded trip into `df`. The relative path is resolved against the
# current working directory, so run the notebook from the project root.
trips_csv = Path("data/all-trips.csv").resolve()
df = pd.read_csv(trips_csv)

# Preview five randomly chosen rows (the selection differs on every run).
df.sample(5)

Unnamed: 0,plate,Start Hub,End Hub,Date,Start Time,Trip Durration,Billing Zip Code
876,9140040,Cronton Ave & Pleasantville Road,Cronton Ave & Pleasantville Road,2024-09-20,2024-09-20 9:28:28,0:35:51,10562.0
1093,9140036,Market Square,Metro North - Plaza,2024-09-07,2024-09-07 12:11:14,1:25:31,10927.0
801,9140038,Wishnie Park,Cronton Ave & Pleasantville Road,2024-10-04,2024-10-04 23:40:19,1:02:09,10562.0
1002,9140034,Nelson park,Metro North - Plaza,2024-09-13,2024-09-13 18:32:56,1:18:17,10562.0
855,9140039,Municipal Parking Lot Lot# 7,Arcadian Shopping Center,2024-10-11,2024-10-11 15:38:01,0:14:41,10562.0


Import the hub locations file.

In [3]:
# Read the hub lookup table and discard the raw OpenStreetMap payload column,
# which is not used anywhere in this notebook.
hubs_csv = Path("data/hub-locations.csv").resolve()
addresses_df = pd.read_csv(hubs_csv).drop(columns=["OpenStreetMap Data"])

# Preview five randomly chosen hubs.
addresses_df.sample(5)

Unnamed: 0,Hub,Address,Geolocation
1,C Town Markets,"100 Croton Ave, Ossining, NY 10562","(41.1653387, -73.856297)"
9,"Spring St, Cofield","Spring St & Broad Ave, Ossining, NY 10562","(41.1570899, -73.8640002)"
0,Arcadian Shopping Center,"225-207 Albany Post Rd, Briarcliff Manor, NY 1...","(41.1394616, -73.861803)"
10,Wishnie Park,"145-149 Orchard Rd, Briarcliff Manor, NY 10510","(41.1607067, -73.8477673)"
7,Ossining Public Library,"53 Croton Ave, Ossining, NY 10562","(41.1642872, -73.8604165)"


## Merge data
Merge the trip data with the hub locations data to get a single data file with everything we need.

First, merge the Start Hub location.

In [4]:
# Attach each trip's start-hub coordinates as 'Start Hub Geolocation'.
#
# The lookup table is renamed up front so the join key and the new column already
# carry their final names; no helper 'Hub' column is left behind to drop.
start_hub_lookup = addresses_df[['Hub', 'Geolocation']].rename(
    columns={'Hub': 'Start Hub', 'Geolocation': 'Start Hub Geolocation'}
)

df = (
    df
    # Re-running this cell must not produce a second 'Start Hub Geolocation' column.
    .drop(columns=['Start Hub Geolocation'], errors='ignore')
    # Left join keeps every trip, even when its hub has no entry in the lookup table.
    # validate='many_to_one' raises MergeError if a hub name appears more than once in
    # the lookup, instead of silently duplicating trip rows and inflating later counts.
    .merge(start_hub_lookup, on='Start Hub', how='left', validate='many_to_one')
)

df.sample(5)

Unnamed: 0,plate,Start Hub,End Hub,Date,Start Time,Trip Durration,Billing Zip Code,Start Hub Geolocation
990,9140018,Cronton Ave & Pleasantville Road,Metro North - Plaza,2024-09-14,2024-09-14 17:28:22,0:28:15,10562.0,"(41.1669377, -73.8497735)"
332,9140037,Market Square,Arcadian Shopping Center,2024-10-18,2024-10-18 14:54:47,0:08:29,10562.0,"(41.1609922, -73.8627212)"
627,9140002,Arcadian Shopping Center,Municipal Parking Lot Lot# 7,2024-10-28,2024-10-28 19:30:13,0:10:10,10562.0,"(41.1394616, -73.861803)"
600,9140045,Ossining Public Library,Ossining Public Library,2024-09-27,2024-09-27 20:35:58,2:20:31,10562.0,"(41.1642872, -73.8604165)"
906,9140025,Ossining Public Library,Cronton Ave & Pleasantville Road,2024-09-12,2024-09-12 9:16:09,0:06:57,10562.0,"(41.1642872, -73.8604165)"


Second, merge the End Hub location.

In [5]:
# Attach each trip's end-hub coordinates as 'End Hub Geolocation'.
#
# The lookup table is renamed up front so the join key and the new column already
# carry their final names; no helper 'Hub' column is left behind to drop.
end_hub_lookup = addresses_df[['Hub', 'Geolocation']].rename(
    columns={'Hub': 'End Hub', 'Geolocation': 'End Hub Geolocation'}
)

df = (
    df
    # Re-running this cell must not produce a second 'End Hub Geolocation' column.
    .drop(columns=['End Hub Geolocation'], errors='ignore')
    # Left join keeps every trip, even when its hub has no entry in the lookup table.
    # validate='many_to_one' raises MergeError if a hub name appears more than once in
    # the lookup, instead of silently duplicating trip rows and inflating later counts.
    .merge(end_hub_lookup, on='End Hub', how='left', validate='many_to_one')
)

df.sample(5)

Unnamed: 0,plate,Start Hub,End Hub,Date,Start Time,Trip Durration,Billing Zip Code,Start Hub Geolocation,End Hub Geolocation
1041,9140029,Arcadian Shopping Center,Arcadian Shopping Center,2024-09-25,2024-09-25 10:50:45,0:26:54,10562.0,"(41.1394616, -73.861803)","(41.1394616, -73.861803)"
1174,9140019,Nelson park,Metro North - Plaza,2024-09-03,2024-09-03 13:30:09,0:07:34,10562.0,"(41.1521048, -73.8623897)","(41.157844, -73.868112)"
809,9140034,C Town Markets,Wishnie Park,2024-10-02,2024-10-02 0:22:16,1:50:18,10562.0,"(41.1653387, -73.856297)","(41.1607067, -73.8477673)"
1286,9140020,C Town Markets,C Town Markets,2024-09-06,2024-09-06 21:21:54,1:21:56,10562.0,"(41.1653387, -73.856297)","(41.1653387, -73.856297)"
745,9140016,Cronton Ave & Pleasantville Road,Cronton Ave & Pleasantville Road,2024-09-28,2024-09-28 20:42:55,0:56:27,10562.0,"(41.1669377, -73.8497735)","(41.1669377, -73.8497735)"


## Save
Save the merged trip and geolocation data to a new CSV file.

In [7]:
# Write the merged trips (with both geolocation columns) to disk.
# index=False keeps the DataFrame's row index out of the CSV.
file_path = Path('./data/all-trips-geo.csv').resolve()  # absolute, platform-agnostic path
df.to_csv(file_path, index=False)

## Quick map
Put together a quick map to visualize all the trip data.

Count how many trips start and end at each hub combination.

In [15]:
# Count trips for every directed (start, end) pair of hub coordinates.
geo_pair_columns = ['Start Hub Geolocation', 'End Hub Geolocation']
grouped_df = df.groupby(geo_pair_columns).size().rename('Count').reset_index()

# Reverse lookup from a coordinate string back to the hub's name.
geo_to_hub = dict(zip(addresses_df['Geolocation'], addresses_df['Hub']))

# Label both ends of each pair with the human-readable hub name.
grouped_df = grouped_df.assign(**{
    'Start Hub': grouped_df['Start Hub Geolocation'].map(geo_to_hub),
    'End Hub': grouped_df['End Hub Geolocation'].map(geo_to_hub),
})

grouped_df.head()

Unnamed: 0,Start Hub Geolocation,End Hub Geolocation,Count,Start Hub,End Hub
0,"(41.1394616, -73.861803)","(41.1394616, -73.861803)",40,Arcadian Shopping Center,Arcadian Shopping Center
1,"(41.1394616, -73.861803)","(41.1521048, -73.8623897)",8,Arcadian Shopping Center,Nelson park
2,"(41.1394616, -73.861803)","(41.1570899, -73.8640002)",4,Arcadian Shopping Center,"Spring St, Cofield"
3,"(41.1394616, -73.861803)","(41.157844, -73.868112)",8,Arcadian Shopping Center,Metro North - Plaza
4,"(41.1394616, -73.861803)","(41.1593892, -73.8638232)",16,Arcadian Shopping Center,Spring & Waller


In [31]:
# Flag pairs whose trips start and end at the same hub in a 'Single Hub' column,
# then order the pairs from most to least travelled.
starts_and_ends_at_same_hub = grouped_df['Start Hub'].eq(grouped_df['End Hub'])
grouped_df = (
    grouped_df
    .assign(**{'Single Hub': starts_and_ends_at_same_hub})
    .sort_values(by='Count', ascending=False)
)

grouped_df.head(50)

Unnamed: 0,Start Hub Geolocation,End Hub Geolocation,Count,Start Hub,End Hub,Single Hub
12,"(41.1521048, -73.8623897)","(41.1521048, -73.8623897)",159,Nelson park,Nelson park,True
70,"(41.1618223, -73.8629757)","(41.1394616, -73.861803)",86,Municipal Parking Lot Lot# 7,Arcadian Shopping Center,False
57,"(41.1607067, -73.8477673)","(41.1607067, -73.8477673)",86,Wishnie Park,Wishnie Park,True
34,"(41.157844, -73.868112)","(41.157844, -73.868112)",78,Metro North - Plaza,Metro North - Plaza,True
23,"(41.1570899, -73.8640002)","(41.1570899, -73.8640002)",59,"Spring St, Cofield","Spring St, Cofield",True
14,"(41.1521048, -73.8623897)","(41.157844, -73.868112)",58,Nelson park,Metro North - Plaza,False
100,"(41.1653387, -73.856297)","(41.1653387, -73.856297)",48,C Town Markets,C Town Markets,True
76,"(41.1618223, -73.8629757)","(41.1618223, -73.8629757)",47,Municipal Parking Lot Lot# 7,Municipal Parking Lot Lot# 7,True
111,"(41.1669377, -73.8497735)","(41.1669377, -73.8497735)",46,Cronton Ave & Pleasantville Road,Cronton Ave & Pleasantville Road,True
7,"(41.1394616, -73.861803)","(41.1618223, -73.8629757)",46,Arcadian Shopping Center,Municipal Parking Lot Lot# 7,False


Calculate the min and max number of trips in the grouped data.

In [28]:
# Smallest and largest trip count across all hub pairs.
pair_counts = grouped_df['Count']
min_count, max_count = pair_counts.min(), pair_counts.max()

(min_count, max_count)

(np.int64(1), np.int64(159))

In [27]:
# Build an interactive folium map of all trips:
#   * one circle per hub, sized by the number of trips that started there;
#   * one line plus one triangle marker per directed hub pair, with the line
#     weight scaled by the number of trips in that direction.
# `ast` and `folium` are imported in the notebook's import cell.


def _parse_geolocation(text):
    """Turn a '(lat, lon)' string from the CSV into a (float, float) tuple.

    ast.literal_eval only accepts Python literals, so (unlike eval) text loaded
    from a data file can never be executed as code.

    Raises:
        ValueError / SyntaxError: if `text` is not a literal two-item sequence.
    """
    lat, lon = ast.literal_eval(text)
    return float(lat), float(lon)


# Palette for the trip lines. Colours are drawn from a seeded generator so that
# re-running the notebook produces the same map.
LINE_COLORS = [
    'red', 'blue', 'green', 'purple', 'orange', 'yellow',
    'pink', 'brown', 'cyan', 'magenta', 'lime', 'teal',
    'indigo', 'violet', 'gold', 'silver', 'maroon', 'navy'
]
color_rng = np.random.default_rng(42)

# Create a map centered on Ossining, NY.
# NOTE: the name `map` shadows the builtin; it is kept because the cell that saves
# the map to HTML refers to it by this name.
map = folium.Map(location=[41.162, -73.861], zoom_start=14, tiles='CartoDB dark_matter')

# --- Hubs -------------------------------------------------------------------
start_geo = df['Start Hub Geolocation']
end_geo = df['End Hub Geolocation']

for _, hub in addresses_df.iterrows():
    geo = hub['Geolocation']
    if pd.isnull(geo):
        continue  # a hub without coordinates cannot be placed on the map

    hub_coords = _parse_geolocation(geo)
    hub_name = hub['Hub']

    starts_here = start_geo == geo
    ends_here = end_geo == geo
    start_hub_count = int(starts_here.sum())
    end_hub_count = int(ends_here.sum())
    same_hub_count = int((starts_here & ends_here).sum())

    # Distinct trips touching this hub. A trip that both starts and ends here is
    # present in both counts, so subtract it once to avoid counting it twice.
    total_hub_count = start_hub_count + end_hub_count - same_hub_count
    # A hub with no trips at all would otherwise divide by zero.
    percent_same_hub = round(same_hub_count / total_hub_count * 100) if total_hub_count else 0

    hub_popup = f"<h4>{hub_name}</h4><p><strong>{start_hub_count}</strong> trips started here</p><p><strong>{end_hub_count}</strong> trips ended here</p><p><strong>{percent_same_hub}%</strong> ({same_hub_count}) of these both started and ended here</p>"
    folium.CircleMarker(
        location=hub_coords,
        radius=5 + start_hub_count / 10,  # base radius 5, plus 1 per 10 departures
        color='gray',
        fill=True,
        fill_color='black',
        fill_opacity=0.7,
        popup=folium.Popup(hub_popup, max_width=300, parse_html=False)
    ).add_to(map)

# --- Trips ------------------------------------------------------------------
for _, pair in grouped_df.iterrows():
    if pd.isnull(pair['Start Hub Geolocation']) or pd.isnull(pair['End Hub Geolocation']):
        continue  # cannot draw a line without both endpoints

    start_coords = _parse_geolocation(pair['Start Hub Geolocation'])
    end_coords = _parse_geolocation(pair['End Hub Geolocation'])
    line_color = str(color_rng.choice(LINE_COLORS))
    line_weight = pair['Count'] / 10  # 1 unit of stroke weight per 10 trips

    line = folium.PolyLine([start_coords, end_coords], color=line_color, weight=line_weight)

    # Place the triangle a quarter of the way from start to end (halfway between
    # the start and the midpoint), so the markers for A -> B and B -> A, whose
    # lines overlap, do not sit on top of each other.
    mid_coords = ((start_coords[0] + end_coords[0]) / 2, (start_coords[1] + end_coords[1]) / 2)
    offset_mid_coords = ((mid_coords[0] + start_coords[0]) / 2, (mid_coords[1] + start_coords[1]) / 2)

    # NOTE(review): rotation is a constant 45 degrees, so the triangle marks which
    # end of the line is the start but does not point along the trip direction.
    arrow_head = folium.RegularPolygonMarker(
        location=offset_mid_coords,
        number_of_sides=3,
        radius=8,
        color=None,
        fill=True,
        fill_color=line_color,
        rotation=45
    ).add_to(map)

    # Same popup on the line and on its marker: hub names and the trip count in this direction.
    popup_text = f"<h4>{pair['Start Hub']} -> {pair['End Hub']}</h4><p><strong>{pair['Count']}</strong> trips</p>"
    folium.Popup(popup_text, max_width=300, parse_html=False).add_to(line)
    folium.Popup(popup_text, max_width=300, parse_html=False).add_to(arrow_head)

    line.add_to(map)

# Display the map
map

In [13]:
# Target file for the rendered map; its parent directory is created on demand.
map_file_path = Path('./renders').resolve() / 'all-trips-map.html'
renders_dir = map_file_path.parent
renders_dir.mkdir(parents=True, exist_ok=True)

# Write the interactive map as an HTML page.
map.save(str(map_file_path))