# Step 1: Clean up the raw trip data

## Import dependencies

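Load the libraries this notebook depends on: pandas and NumPy for data handling, and geopy for geocoding the hub addresses.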

In [33]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
from geopy.geocoders import Nominatim # max 1 requeset per second
from geopy.exc import GeocoderTimedOut #




Import the original data file.

In [34]:
# Read the raw trip export into a dataframe. The relative path is resolved
# against the current working directory to obtain an absolute location.
trips_csv_path = Path("data/all-trips.csv").resolve()
df = pd.read_csv(trips_csv_path)

# Quick sanity check: (number of rows, number of columns)
df.shape

(1458, 7)

Show a random sample of the data.

In [48]:
# Seed the sampler so the same ten rows are shown on every Restart & Run All;
# change or drop `random_state` to look at a different sample.
df.sample(10, random_state=42)

Unnamed: 0,plate,Start Hub,End Hub,Date,Start Time,Trip Durration,Billing Zip Code
966,9140012,Market Square,Arcadian Shopping Center,2024-09-09,2024-09-09 13:46:00,1:07:12,10565.0
772,9140015,Ossining Public Library,Cronton Ave & Pleasantville Road,2024-09-19,2024-09-19 14:39:01,0:04:38,10562.0
283,9140045,Ossining Public Library,Ossining Public Library,2024-10-19,2024-10-19 16:17:55,4:54:16,10562.0
801,9140038,Wishnie Park,Cronton Ave & Pleasantville Road,2024-10-04,2024-10-04 23:40:19,1:02:09,10562.0
816,9140031,Arcadian Shopping Center,Arcadian Shopping Center,2024-10-07,2024-10-07 21:53:25,0:22:10,10562.0
785,9140007,Arcadian Shopping Center,Wishnie Park,2024-10-13,2024-10-13 14:37:03,1:06:01,10562.0
890,9140008,Market Square,Cronton Ave & Pleasantville Road,2024-09-09,2024-09-09 14:51:52,0:01:32,10562.0
1245,9140009,Market Square,Market Square,2024-08-30,2024-08-30 11:38:52,0:39:26,10562.0
586,9140003,"Spring St, Cofield","Spring St, Cofield",2024-09-12,2024-09-12 6:43:13,0:09:29,10562.0
1043,9140034,C Town Markets,C Town Markets,2024-09-20,2024-09-20 22:37:42,0:38:59,10562.0


## Hub names
Establish a set of the hub names in the data.

In [35]:
# Gather the distinct, non-null hub names seen at either end of a trip
unique_starts = df['Start Hub'].dropna().unique()  # hubs where trips began
unique_ends = df['End Hub'].dropna().unique()  # hubs where trips finished

# union1d concatenates both arrays and returns the sorted, de-duplicated values
unique_hubs_combined = np.union1d(unique_starts, unique_ends)

# NOTE: despite the name, this stays a NumPy array (no list conversion happens)
unique_hubs_list = unique_hubs_combined

unique_hubs_list

array(['Arcadian Shopping Center', 'C Town Markets',
       'Cronton Ave & Pleasantville Road', 'Market Square',
       'Metro North - Plaza', 'Municipal Parking Lot Lot# 7',
       'Nelson park', 'Ossining Public Library', 'Spring & Waller',
       'Spring St, Cofield', 'Wishnie Park'], dtype=object)

## Hub addresses
Hard-code approximate addresses for each hub so we can geolocate them.

In [36]:
# Approximate street address for every hub, used as the geocoder query.
# Keys reproduce the hub names exactly as they appear in the trip data
# (including its spelling, e.g. "Cronton"), so do not "correct" them here.
addresses = {
    'Arcadian Shopping Center': "225-207 Albany Post Rd, Briarcliff Manor, NY 10510",
    'C Town Markets': "100 Croton Ave, Ossining, NY 10562",
    'Cronton Ave & Pleasantville Road': "Croton Ave & Pleasantville Rd, Ossining, NY 10562",
    'Market Square': "160 Main St, Ossining, NY 10562",
    'Metro North - Plaza': "1 Secor Rd, Ossining, NY 10562",
    'Municipal Parking Lot Lot# 7': "1-15 Leonard St, Ossining, NY 10562",
    'Nelson park': "20 Madison Ave, Ossining, NY 10562",
    'Ossining Public Library': "53 Croton Ave, Ossining, NY 10562",
    'Spring & Waller': "Spring St & Waller Ave, Ossining, NY 10562",
    'Spring St, Cofield': "Spring St & Broad Ave, Ossining, NY 10562",
    'Wishnie Park': "145-149 Orchard Rd, Briarcliff Manor, NY 10510",
}

In [37]:
# Turn the hub -> address mapping into a two-column table, one row per hub
addresses_df = pd.DataFrame(
    {
        'Hub': list(addresses.keys()),
        'Address': list(addresses.values()),
    }
)

addresses_df

Unnamed: 0,Hub,Address
0,Arcadian Shopping Center,"225-207 Albany Post Rd, Briarcliff Manor, NY 1..."
1,C Town Markets,"100 Croton Ave, Ossining, NY 10562"
2,Cronton Ave & Pleasantville Road,"Croton Ave & Pleasantville Rd, Ossining, NY 10562"
3,Market Square,"160 Main St, Ossining, NY 10562"
4,Metro North - Plaza,"1 Secor Rd, Ossining, NY 10562"
5,Municipal Parking Lot Lot# 7,"1-15 Leonard St, Ossining, NY 10562"
6,Nelson park,"20 Madison Ave, Ossining, NY 10562"
7,Ossining Public Library,"53 Croton Ave, Ossining, NY 10562"
8,Spring & Waller,"Spring St & Waller Ave, Ossining, NY 10562"
9,"Spring St, Cofield","Spring St & Broad Ave, Ossining, NY 10562"


## Geocode
Use the `geopy` library to geocode hub addresses.

In [38]:

# Geocoding client backed by OpenStreetMap's Nominatim service, which allows
# at most one request per second.
geolocator = Nominatim(user_agent="bike-ped")
# TODO: throttle calls to respect that limit, e.g. by wrapping the lookup with
# geopy.extra.rate_limiter.RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Function to geocode addresses
def geocode_address(address):
    """Look up a single address with the module-level ``geolocator``.

    Parameters
    ----------
    address : str
        Free-form address to geocode.

    Returns
    -------
    tuple
        Always a 3-tuple ``(latitude, longitude, raw)``. On success ``raw`` is
        the geocoder's raw result record. When the address is missing or
        blank, nothing is found, or the request times out, the result is
        ``(np.nan, np.nan, None)``.
    """
    # Same arity as the success path, so callers can always unpack or index
    # three elements (the failure value used to have only two).
    not_found = (np.nan, np.nan, None)

    # Don't spend a network request on a missing or blank address.
    is_blank = isinstance(address, str) and not address.strip()
    is_nan = isinstance(address, float) and np.isnan(address)
    if address is None or is_blank or is_nan:
        return not_found

    try:
        location = geolocator.geocode(address, exactly_one=True, extratags=True, timeout=10)
    except GeocoderTimedOut:
        # Best effort: report the timeout instead of hiding it, then carry on
        print(f"Geocoding timed out for {address!r}")
        return not_found

    if not location:
        return not_found

    print(location.raw)
    return location.latitude, location.longitude, location.raw


Geocode each hub address. **Known issue:** this cell raises an error at the end of the run, but the geocoding itself otherwise works.

In [None]:

# Run every hub address through the geocoder, one request per row; each cell of
# the new column holds the tuple that geocode_address returned for that row.
osm_lookups = addresses_df['Address'].apply(geocode_address)
addresses_df['OpenStreetMap Data'] = osm_lookups


{'place_id': 328948482, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'way', 'osm_id': 260854757, 'lat': '41.1394616', 'lon': '-73.8618030', 'class': 'highway', 'type': 'primary', 'place_rank': 26, 'importance': 0.05338392538338289, 'addresstype': 'road', 'name': 'South Highland Avenue', 'display_name': 'South Highland Avenue, Scarborough, Village of Briarcliff Manor, Town of Ossining, Westchester County, New York, 10510, United States', 'extratags': {'lanes': '3', 'surface': 'asphalt', 'maxspeed': '40 mph', 'sidewalk': 'no', 'lanes:forward': '1', 'lanes:backward': '1', 'lanes:both_ways': '1', 'turn:lanes:both_ways': 'left'}, 'boundingbox': ['41.1379390', '41.1411018', '-73.8618067', '-73.8612486']}
{'place_id': 330584028, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'way', 'osm_id': 745301090, 'lat': '41.1653387', 'lon': '-73.8562970', 'class': 'building', 'type': 'retail', 'place_rank

ValueError: Columns must be same length as key

In [40]:
# Display the table again to inspect the geocoding results
addresses_df

Unnamed: 0,Hub,Address,Coordinates
0,Arcadian Shopping Center,"225-207 Albany Post Rd, Briarcliff Manor, NY 1...","(41.1394616, -73.861803, {'place_id': 32894848..."
1,C Town Markets,"100 Croton Ave, Ossining, NY 10562","(41.1653387, -73.856297, {'place_id': 33058402..."
2,Cronton Ave & Pleasantville Road,"Croton Ave & Pleasantville Rd, Ossining, NY 10562","(41.1669377, -73.8497735, {'place_id': 3285865..."
3,Market Square,"160 Main St, Ossining, NY 10562","(41.1609922, -73.8627212, {'place_id': 3278134..."
4,Metro North - Plaza,"1 Secor Rd, Ossining, NY 10562","(41.157844, -73.868112, {'place_id': 365405434..."
5,Municipal Parking Lot Lot# 7,"1-15 Leonard St, Ossining, NY 10562","(41.1618223, -73.8629757, {'place_id': 3354653..."
6,Nelson park,"20 Madison Ave, Ossining, NY 10562","(41.1521048, -73.8623897, {'place_id': 3298846..."
7,Ossining Public Library,"53 Croton Ave, Ossining, NY 10562","(41.1642872, -73.8604165, {'place_id': 3301173..."
8,Spring & Waller,"Spring St & Waller Ave, Ossining, NY 10562","(41.1593892, -73.8638232, {'place_id': 3301882..."
9,"Spring St, Cofield","Spring St & Broad Ave, Ossining, NY 10562","(41.1570899, -73.8640002, {'place_id': 3301221..."


Extract just the latitude and longitude from the OpenStreetMap data.

In [None]:
def _format_lat_lon(osm_result):
    """Render the first two items of a geocoding result as the text "(lat, lon)"."""
    lat = osm_result[0]
    lon = osm_result[1]
    return f"({lat}, {lon})"


# Add a 'Geolocation' column holding "(latitude, longitude)" as a string
addresses_df['Geolocation'] = addresses_df['OpenStreetMap Data'].apply(_format_lat_lon)
addresses_df


Unnamed: 0,Hub,Address,Coordinates,Geolocation
0,Arcadian Shopping Center,"225-207 Albany Post Rd, Briarcliff Manor, NY 1...","(41.1394616, -73.861803, {'place_id': 32894848...","(41.1394616, -73.861803)"
1,C Town Markets,"100 Croton Ave, Ossining, NY 10562","(41.1653387, -73.856297, {'place_id': 33058402...","(41.1653387, -73.856297)"
2,Cronton Ave & Pleasantville Road,"Croton Ave & Pleasantville Rd, Ossining, NY 10562","(41.1669377, -73.8497735, {'place_id': 3285865...","(41.1669377, -73.8497735)"
3,Market Square,"160 Main St, Ossining, NY 10562","(41.1609922, -73.8627212, {'place_id': 3278134...","(41.1609922, -73.8627212)"
4,Metro North - Plaza,"1 Secor Rd, Ossining, NY 10562","(41.157844, -73.868112, {'place_id': 365405434...","(41.157844, -73.868112)"
5,Municipal Parking Lot Lot# 7,"1-15 Leonard St, Ossining, NY 10562","(41.1618223, -73.8629757, {'place_id': 3354653...","(41.1618223, -73.8629757)"
6,Nelson park,"20 Madison Ave, Ossining, NY 10562","(41.1521048, -73.8623897, {'place_id': 3298846...","(41.1521048, -73.8623897)"
7,Ossining Public Library,"53 Croton Ave, Ossining, NY 10562","(41.1642872, -73.8604165, {'place_id': 3301173...","(41.1642872, -73.8604165)"
8,Spring & Waller,"Spring St & Waller Ave, Ossining, NY 10562","(41.1593892, -73.8638232, {'place_id': 3301882...","(41.1593892, -73.8638232)"
9,"Spring St, Cofield","Spring St & Broad Ave, Ossining, NY 10562","(41.1570899, -73.8640002, {'place_id': 3301221...","(41.1570899, -73.8640002)"


## Save geolocations
Save each hub's address and geolocation to a `hub-locations.csv` file for later use.

In [46]:
# Write the hub table to disk
# resolve() turns the relative location into an absolute path (based on the current working directory)
file_path = Path('./data/hub-locations.csv').resolve()
addresses_df.to_csv(file_path, index=False)  # index=False: leave the row index out of the file