# Step 2: merge trip and address data

## Import dependencies

Import the libraries used throughout this notebook.

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Markdown


Import the trip data file.

In [3]:
# Read every recorded trip from the CSV export into a DataFrame
trips_csv = Path("data/all-trips.csv").resolve()
df = pd.read_csv(trips_csv)

df.sample(5)

Unnamed: 0,plate,Start Hub,End Hub,Date,Start Time,Trip Durration,Billing Zip Code
373,9140023,Nelson park,C Town Markets,2024-10-13,2024-10-13 20:09:21,0:14:14,10562.0
1120,9140022,Spring & Waller,Ossining Public Library,2024-09-05,2024-09-05 20:08:03,0:39:48,10601.0
1150,9140014,Arcadian Shopping Center,Market Square,2024-09-20,2024-09-20 18:14:18,0:13:18,10562.0
1308,9140012,Wishnie Park,Wishnie Park,2024-08-28,2024-08-28 11:20:56,0:28:35,12010.0
709,9140012,Arcadian Shopping Center,Arcadian Shopping Center,2024-10-21,2024-10-21 9:50:26,2:42:23,10562.0


Import the hub locations file.

In [4]:
# Read the hub table; its OpenStreetMap payload isn't needed in this step, so leave it out
hub_locations_csv = Path("data/hub-locations.csv").resolve()
addresses_df = pd.read_csv(hub_locations_csv).drop(columns=["OpenStreetMap Data"])

addresses_df.sample(5)

Unnamed: 0,Hub,Address,Geolocation
7,Ossining Public Library,"53 Croton Ave, Ossining, NY 10562","(41.1642872, -73.8604165)"
1,C Town Markets,"100 Croton Ave, Ossining, NY 10562","(41.1653387, -73.856297)"
6,Nelson park,"20 Madison Ave, Ossining, NY 10562","(41.1521048, -73.8623897)"
9,"Spring St, Cofield","Spring St & Broad Ave, Ossining, NY 10562","(41.1570899, -73.8640002)"
2,Cronton Ave & Pleasantville Road,"Croton Ave & Pleasantville Rd, Ossining, NY 10562","(41.1669377, -73.8497735)"


## Merge data
Merge the trip data with the hub locations data to get a single data file with everything we need.

First, merge the Start Hub location.

In [5]:
# Merge df with addresses_df to add 'Start Hub Geolocation'.
# The lookup column is renamed up front so the merge produces the final column name directly.
start_hub_lookup = addresses_df[['Hub', 'Geolocation']].rename(
    columns={'Geolocation': 'Start Hub Geolocation'}
)

# This cell overwrites df, so re-running it used to append a second
# 'Start Hub Geolocation' column. Dropping any copy left over from an earlier
# run (errors='ignore' makes this a no-op on the first run) keeps the cell re-runnable.
df = (
    df.drop(columns=['Start Hub Geolocation'], errors='ignore')
    .merge(start_hub_lookup, left_on='Start Hub', right_on='Hub', how='left')  # left join: keep every trip, even with an unknown hub
    .drop(columns=['Hub'])  # 'Hub' only served as the join key
)

df.sample(5)

Unnamed: 0,plate,Start Hub,End Hub,Date,Start Time,Trip Durration,Billing Zip Code,Start Hub Geolocation
911,9140030,Arcadian Shopping Center,Nelson park,2024-09-13,2024-09-13 16:20:44,0:11:30,10562.0,"(41.1394616, -73.861803)"
224,9140011,Market Square,Market Square,2024-11-03,2024-11-03 10:23:50,0:29:20,11230.0,"(41.1609922, -73.8627212)"
1283,9140016,Municipal Parking Lot Lot# 7,Municipal Parking Lot Lot# 7,2024-09-08,2024-09-08 14:16:00,1:26:29,10520.0,"(41.1618223, -73.8629757)"
45,9140035,Metro North - Plaza,Nelson park,2024-12-02,2024-12-02 20:00:04,0:06:27,10562.0,"(41.157844, -73.868112)"
801,9140038,Wishnie Park,Cronton Ave & Pleasantville Road,2024-10-04,2024-10-04 23:40:19,1:02:09,10562.0,"(41.1607067, -73.8477673)"


Second, merge the End Hub location.

In [6]:
# Merge df with addresses_df to add 'End Hub Geolocation'.
# The lookup column is renamed up front so the merge produces the final column name directly.
end_hub_lookup = addresses_df[['Hub', 'Geolocation']].rename(
    columns={'Geolocation': 'End Hub Geolocation'}
)

# This cell overwrites df, so re-running it used to append a second
# 'End Hub Geolocation' column. Dropping any copy left over from an earlier
# run (errors='ignore' makes this a no-op on the first run) keeps the cell re-runnable.
df = (
    df.drop(columns=['End Hub Geolocation'], errors='ignore')
    .merge(end_hub_lookup, left_on='End Hub', right_on='Hub', how='left')  # left join: keep every trip, even with an unknown hub
    .drop(columns=['Hub'])  # 'Hub' only served as the join key
)

df.sample(5)

Unnamed: 0,plate,Start Hub,End Hub,Date,Start Time,Trip Durration,Billing Zip Code,Start Hub Geolocation,End Hub Geolocation
752,9140005,Nelson park,Municipal Parking Lot Lot# 7,2024-09-17,2024-09-17 11:55:08,0:09:14,10562.0,"(41.1521048, -73.8623897)","(41.1618223, -73.8629757)"
1212,9140004,Metro North - Plaza,Nelson park,2024-09-02,2024-09-02 13:39:29,0:14:25,10562.0,"(41.157844, -73.868112)","(41.1521048, -73.8623897)"
67,9140034,Ossining Public Library,Nelson park,2024-11-29,2024-11-29 14:27:54,0:25:45,10562.0,"(41.1642872, -73.8604165)","(41.1521048, -73.8623897)"
811,9140047,Arcadian Shopping Center,Municipal Parking Lot Lot# 7,2024-10-09,2024-10-09 16:19:58,0:11:04,10562.0,"(41.1394616, -73.861803)","(41.1618223, -73.8629757)"
841,9140027,Metro North - Plaza,Cronton Ave & Pleasantville Road,2024-09-27,2024-09-27 23:59:39,0:15:11,10562.0,"(41.157844, -73.868112)","(41.1669377, -73.8497735)"


## Save
Save the merged trip and geolocation data to a new CSV file.

In [7]:
# Write the merged trip + geolocation table to disk (the index is not written)
file_path = Path('./data/all-trips-geo.csv').resolve()  # absolute, platform-agnostic path
df.to_csv(file_path, index=False)

## Quick map
Put together a quick map to visualize all the trip data.

Count how many trips start and end at each hub combination.

In [None]:
# Tally the trips for every (start, end) geolocation pair
geo_columns = ['Start Hub Geolocation', 'End Hub Geolocation']
grouped_df = df.groupby(geo_columns).size().reset_index(name='Count')

# Reverse lookup from a geolocation string to the name of the hub located there
geo_to_hub = dict(zip(addresses_df['Geolocation'], addresses_df['Hub']))

# Label both endpoints of each pair with the hub's name
for hub_column in ('Start Hub', 'End Hub'):
    grouped_df[hub_column] = grouped_df[f'{hub_column} Geolocation'].map(geo_to_hub)

grouped_df.head(50)

Unnamed: 0,Start Hub Geolocation,End Hub Geolocation,Count,Start Hub,End Hub
0,"(41.1394616, -73.861803)","(41.1394616, -73.861803)",40,Arcadian Shopping Center,Arcadian Shopping Center
1,"(41.1394616, -73.861803)","(41.1521048, -73.8623897)",8,Arcadian Shopping Center,Nelson park
2,"(41.1394616, -73.861803)","(41.1570899, -73.8640002)",4,Arcadian Shopping Center,"Spring St, Cofield"
3,"(41.1394616, -73.861803)","(41.157844, -73.868112)",8,Arcadian Shopping Center,Metro North - Plaza
4,"(41.1394616, -73.861803)","(41.1593892, -73.8638232)",16,Arcadian Shopping Center,Spring & Waller
5,"(41.1394616, -73.861803)","(41.1607067, -73.8477673)",6,Arcadian Shopping Center,Wishnie Park
6,"(41.1394616, -73.861803)","(41.1609922, -73.8627212)",14,Arcadian Shopping Center,Market Square
7,"(41.1394616, -73.861803)","(41.1618223, -73.8629757)",46,Arcadian Shopping Center,Municipal Parking Lot Lot# 7
8,"(41.1394616, -73.861803)","(41.1642872, -73.8604165)",9,Arcadian Shopping Center,Ossining Public Library
9,"(41.1394616, -73.861803)","(41.1653387, -73.856297)",9,Arcadian Shopping Center,C Town Markets


Calculate the min and max number of trips in the grouped data.

In [44]:
# Smallest and largest number of trips seen for any single hub pair
trip_counts = grouped_df['Count']
min_count = trip_counts.min()
max_count = trip_counts.max()

min_count, max_count

(np.int64(1), np.int64(159))

In [None]:
import ast

import folium

# Candidate colours for the trip lines. They are drawn from a seeded generator so
# the rendered map looks the same on every fresh run of the notebook.
line_colors = ['red', 'blue', 'green', 'purple', 'orange', 'yellow']
color_rng = np.random.default_rng(42)


def parse_geolocation(text):
    """Turn a "(lat, lon)" string from the hub table into a (lat, lon) tuple of floats.

    ast.literal_eval only accepts Python literals, so unlike eval() it cannot
    execute code that happens to be stored in the CSV.
    """
    lat, lon = ast.literal_eval(text)
    return float(lat), float(lon)


def compass_bearing(start, end):
    """Return the initial bearing from start to end in degrees (0 = north, clockwise).

    Both arguments are (lat, lon) tuples in decimal degrees.
    """
    lat1, lat2 = np.radians(start[0]), np.radians(end[0])
    delta_lon = np.radians(end[1] - start[1])
    x = np.sin(delta_lon) * np.cos(lat2)
    y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(delta_lon)
    return float((np.degrees(np.arctan2(x, y)) + 360) % 360)


# Create a map centered on Ossining, NY.
# NOTE: the name `map` shadows the Python builtin; it is kept because the cell
# that saves the map to HTML refers to it by this name.
map = folium.Map(location=[41.162, -73.861], zoom_start=14)

# Add circles for each unique hub
for _, hub in addresses_df.iterrows():
    folium.CircleMarker(
        location=parse_geolocation(hub['Geolocation']),
        radius=15,
        color='gray',
        fill=True,
        fill_color='black',
        fill_opacity=0.7,
        popup=folium.Popup(hub['Hub'], parse_html=True)
    ).add_to(map)


# Add lines for each trip
for _, row in grouped_df.iterrows():
    # Skip pairs where either end has no known location
    if pd.isnull(row['Start Hub Geolocation']) or pd.isnull(row['End Hub Geolocation']):
        continue

    start_coords = parse_geolocation(row['Start Hub Geolocation'])
    end_coords = parse_geolocation(row['End Hub Geolocation'])
    # offset the start coords a bit to separate them from other lines going the other direction
    start_coords = (start_coords[0] + 0.0002, start_coords[1] + 0.0002)

    line_color = str(color_rng.choice(line_colors))
    line_weight = row['Count'] / 10  # Adjust the weight based on the count
    line = folium.PolyLine([start_coords, end_coords], color=line_color, weight=line_weight)

    # Add an arrowhead in the middle of the line, pointing towards the end
    mid_coords = ((start_coords[0] + end_coords[0]) / 2, (start_coords[1] + end_coords[1]) / 2)

    # A triangle pointing in the direction of the trip. The rotation is derived
    # from the start -> end bearing instead of a fixed angle.
    # NOTE(review): the -90 assumes the unrotated triangle points east, as in the
    # commonly used folium arrow recipe -- confirm visually on the rendered map.
    folium.RegularPolygonMarker(
        location=mid_coords,
        number_of_sides=3,
        radius=8,
        color=line_color,
        fill=True,
        fill_color=line_color,
        rotation=compass_bearing(start_coords, end_coords) - 90
    ).add_to(map)

    # a popup that shows the number of trips between the two hubs in this direction
    popup_text = f"Trips: {row['Count']}"
    folium.Popup(popup_text).add_to(line)

    # add line to map
    line.add_to(map)


# Display the map
map

In [36]:
# Target location for the rendered map: <renders>/all-trips-map.html
renders_dir = Path('./renders').resolve()
map_file_path = renders_dir / 'all-trips-map.html'

# Create the renders directory (and any missing parents) if it isn't there yet
map_file_path.parent.mkdir(parents=True, exist_ok=True)

# Write the map out as a standalone HTML file
map.save(str(map_file_path))