In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the raw bicycle-parking street-furniture export and preview its first five rows.
bike_parking_df = pd.read_csv(
    '../data/raw/streetfurniture-bicycle_parkingdata.csv'
)
bike_parking_df.head(n=5)

Unnamed: 0,_id,OBJECTID,ID,ADDRESSNUMBERTEXT,ADDRESSSTREET,FRONTINGSTREET,SIDE,FROMSTREET,DIRECTION,SITEID,WARD,BIA,ASSETTYPE,STATUS,SDE_STATE_ID,geometry
0,1,5,BP-05830,4841-4881,Yonge St,,,Harlandale Ave,,,18.0,Willowdale,Ring,Existing,,"{'type': 'MultiPoint', 'coordinates': [[-79.41..."
1,2,34,BP-03501,8,Kensington Ave,,,Kensington Ave,,,11.0,Kensington Market,Ring,Existing,,"{'type': 'MultiPoint', 'coordinates': [[-79.40..."
2,3,35,BP-11699,70,The Pond Rd,,,Seneca Lane,,,7.0,,Rack,Existing,,"{'type': 'MultiPoint', 'coordinates': [[-79.49..."
3,4,107,BP-12883,21,Canniff St,,,Strachan Ave,,,10.0,,Ring,Existing,,"{'type': 'MultiPoint', 'coordinates': [[-79.41..."
4,5,172,BP-15331,911,Davenport Rd,,,Davenport Rd,,,12.0,,,Existing,,"{'type': 'MultiPoint', 'coordinates': [[-79.42..."


In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the raw short-term cyclist volume counts and preview the first five rows.
cyclist_traffic_df = pd.read_csv(
    '../data/raw/volumes_atr_cyclists_shortterm.csv'
)
cyclist_traffic_df.head(n=5)

Unnamed: 0,centreline_id,direction,location,class_type,daily_temperature,daily_precipitation,datetime_bin_start,datetime_bin_end,volume
0,8313231,Eastbound,ADELAIDE ST E EB W OF JARVIS ST,Cyclists,26.0,0.0,2018-10-10 23:00:00,2018-10-11 00:00:00,32
1,8313231,Eastbound,ADELAIDE ST E EB W OF JARVIS ST,Cyclists,26.0,0.0,2018-10-10 22:00:00,2018-10-10 23:00:00,50
2,8313231,Eastbound,ADELAIDE ST E EB W OF JARVIS ST,Cyclists,26.0,0.0,2018-10-10 21:00:00,2018-10-10 22:00:00,44
3,8313231,Eastbound,ADELAIDE ST E EB W OF JARVIS ST,Cyclists,26.0,0.0,2018-10-10 20:00:00,2018-10-10 21:00:00,58
4,8313231,Eastbound,ADELAIDE ST E EB W OF JARVIS ST,Cyclists,26.0,0.0,2018-10-10 19:00:00,2018-10-10 20:00:00,145


In [25]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time
import logging
import re
import requests

# Set up root logging at INFO level with a "timestamp - level - message" layout
# so log records show up in the notebook's output.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


def extract_intersection(location):
    """Rewrite a counter location string as a plain street or intersection name.

    Three shapes are recognised, tried in this order:

    1. ``"<street> <EB|WB|NB|SB> <E|W|N|S> OF <cross street>"`` becomes
       ``"<street> AND <cross street>"``.
    2. A string that already contains ``" AND "`` is returned unchanged.
    3. ``"<street> <EB|WB|NB|SB>"`` (direction code at the end) becomes
       ``"<street>"``.

    Anything else is returned unchanged.
    """
    # Case 1: direction code followed by "<side> OF <cross street>".
    directional = re.match(r'(.+?)\s+(?:EB|WB|NB|SB)\s+(?:E|W|N|S)\s+OF\s+(.+)', location)
    if directional is not None:
        main_street, cross_street = (part.strip() for part in directional.groups())
        return f"{main_street} AND {cross_street}"

    # Case 2: already written as "X AND Y" -- nothing to do.
    if re.match(r'(.+?)\s+AND\s+(.+)', location) is not None:
        return location

    # Case 3: a single street with a trailing direction code.
    trailing_direction = re.match(r'(.+?)\s+(?:EB|WB|NB|SB)$', location)
    if trailing_direction is None:
        return location
    return trailing_direction.group(1).strip()

def geocode_with_geogratis(location, attempt=1, max_attempts=2, delay=1, timeout=10):
    """Look up coordinates for a free-text location via the GeoGratis locate service.

    Parameters
    ----------
    location : str
        Query text, sent as the ``q`` query parameter.
    attempt : int, default 1
        Number of the attempt to start from; the default means "first try".
    max_attempts : int, default 2
        Total number of requests made before a request error is re-raised.
    delay : float, default 1
        Seconds to sleep before every request.
    timeout : float, default 10
        Seconds to wait on the server before the request counts as failed.

    Returns
    -------
    dict or None
        ``{'latitude': ..., 'longitude': ...}`` taken from the first result.
        Both values are empty strings when the service returns no results.
        ``None`` when the service answers with a non-200 status.

    Raises
    ------
    requests.RequestException
        When the last allowed attempt still fails.
    """
    url = "https://geogratis.gc.ca/services/geolocation/en/locate"

    while True:
        try:
            time.sleep(delay)  # Pause before every request
            # `params` lets requests URL-encode the query, so characters such
            # as '&' or '#' in a location cannot corrupt the URL. The timeout
            # keeps a stalled connection from hanging the cell forever.
            response = requests.get(url, params={'q': location}, timeout=timeout)
            if response.status_code != 200:
                logging.warning(
                    f"Geocoding returned HTTP {response.status_code} for location: {location}"
                )
                return None
            data = response.json()
        except requests.RequestException:
            # `attempt` counts requests already made, so `<` (not `<=`) caps
            # the total at `max_attempts`, matching the error message below.
            if attempt < max_attempts:
                time.sleep(2 ** attempt)  # Exponential backoff
                attempt += 1
                continue
            logging.error(f"Geocoding failed for location: {location} after {max_attempts} attempts")
            raise

        if not data:
            # No match: return empty values so callers still receive a dict.
            return {'latitude': '', 'longitude': ''}

        # Use the first result; it is read as [longitude, latitude].
        coordinates = data[0]['geometry']['coordinates']
        return {'latitude': coordinates[1], 'longitude': coordinates[0]}
    
# Geocode every distinct counter location once rather than once per traffic row.
# Missing locations are skipped because extract_intersection only accepts strings.
distinct_locations = cyclist_traffic_df['location'].dropna().unique()

# Create a DataFrame for distinct locations
distinct_locations_df = pd.DataFrame(distinct_locations, columns=['location'])

# Clean the column to extract intersections
distinct_locations_df['intersection'] = distinct_locations_df['location'].apply(extract_intersection)

# Add full location for geocoding
distinct_locations_df['full_location'] = distinct_locations_df['intersection'] + ", Toronto"

# Apply geocoding to distinct locations; anything that is not a dict
# (e.g. a None result) yields a missing coordinate.
distinct_locations_df['geocoded'] = distinct_locations_df['full_location'].apply(geocode_with_geogratis)
distinct_locations_df['latitude'] = distinct_locations_df['geocoded'].apply(lambda loc: loc['latitude'] if isinstance(loc, dict) else None)
distinct_locations_df['longitude'] = distinct_locations_df['geocoded'].apply(lambda loc: loc['longitude'] if isinstance(loc, dict) else None)

# Merge geocoded results back to the original DataFrame.
# Coordinates left over from an earlier run of this cell are dropped first;
# otherwise the merge would emit latitude_x/latitude_y style duplicates and
# later lookups of 'latitude' / 'longitude' would fail.
cyclist_traffic_df = (
    cyclist_traffic_df
    .drop(columns=['latitude', 'longitude'], errors='ignore')
    .merge(distinct_locations_df[['location', 'latitude', 'longitude']], on='location', how='left')
)

# Print the DataFrame to verify
print(cyclist_traffic_df.head())

   centreline_id  direction                         location class_type  \
0        8313231  Eastbound  ADELAIDE ST E EB W OF JARVIS ST   Cyclists   
1        8313231  Eastbound  ADELAIDE ST E EB W OF JARVIS ST   Cyclists   
2        8313231  Eastbound  ADELAIDE ST E EB W OF JARVIS ST   Cyclists   
3        8313231  Eastbound  ADELAIDE ST E EB W OF JARVIS ST   Cyclists   
4        8313231  Eastbound  ADELAIDE ST E EB W OF JARVIS ST   Cyclists   

   daily_temperature  daily_precipitation   datetime_bin_start  \
0               26.0                  0.0  2018-10-10 23:00:00   
1               26.0                  0.0  2018-10-10 22:00:00   
2               26.0                  0.0  2018-10-10 21:00:00   
3               26.0                  0.0  2018-10-10 20:00:00   
4               26.0                  0.0  2018-10-10 19:00:00   

      datetime_bin_end  volume                 intersection   latitude  \
0  2018-10-11 00:00:00      32  ADELAIDE ST E AND JARVIS ST  43.651314   
1  2

In [26]:
import pandas as pd
import re
import logging

# Spot-check the coordinates attached to each traffic row ...
print(cyclist_traffic_df.loc[:, ['location', 'latitude', 'longitude']])

# ... and the raw geometry strings of the parking dataset.
print(bike_parking_df.loc[:, ['geometry']])


                                 location   latitude  longitude
0         ADELAIDE ST E EB W OF JARVIS ST  43.651314 -79.372248
1         ADELAIDE ST E EB W OF JARVIS ST  43.651314 -79.372248
2         ADELAIDE ST E EB W OF JARVIS ST  43.651314 -79.372248
3         ADELAIDE ST E EB W OF JARVIS ST  43.651314 -79.372248
4         ADELAIDE ST E EB W OF JARVIS ST  43.651314 -79.372248
...                                   ...        ...        ...
28041  LANSDOWNE AVE NB S OF DAVENPORT RD  43.671555 -79.448289
28042  LANSDOWNE AVE NB S OF DAVENPORT RD  43.671555 -79.448289
28043  LANSDOWNE AVE NB S OF DAVENPORT RD  43.671555 -79.448289
28044  LANSDOWNE AVE NB S OF DAVENPORT RD  43.671555 -79.448289
28045  LANSDOWNE AVE NB S OF DAVENPORT RD  43.671555 -79.448289

[28046 rows x 3 columns]
                                                geometry
0      {'type': 'MultiPoint', 'coordinates': [[-79.41...
1      {'type': 'MultiPoint', 'coordinates': [[-79.40...
2      {'type': 'MultiPoint', 'coor

In [32]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import ast

# Clean the geocoded coordinates, then convert cyclist_traffic_df to a GeoDataFrame.
#
# pd.to_numeric(errors='coerce') turns empty strings -- and any other value
# that is not a number -- into NaN, so such rows are dropped below rather
# than crashing a float conversion. Building the cleaned frame in one chain
# also avoids assigning columns onto the result of dropna(), a pattern that
# can trigger SettingWithCopyWarning.
coordinate_columns = ['longitude', 'latitude']
cyclist_traffic_df = (
    cyclist_traffic_df
    .assign(**{
        column: pd.to_numeric(cyclist_traffic_df[column], errors='coerce')
        for column in coordinate_columns
    })
    .dropna(subset=coordinate_columns)
    .astype({column: float for column in coordinate_columns})
)

# Point geometries are built from (longitude, latitude) and tagged as EPSG:4326.
cyclist_traffic_gdf = gpd.GeoDataFrame(
    cyclist_traffic_df,
    geometry=gpd.points_from_xy(cyclist_traffic_df['longitude'], cyclist_traffic_df['latitude']),
    crs="EPSG:4326"
)

def parse_geometry(geom_str):
    """Turn a stringified geometry mapping into a single shapely ``Point``.

    ``geom_str`` is parsed with ``ast.literal_eval`` and must yield a mapping
    with ``'type'`` and ``'coordinates'`` keys. For a ``'MultiPoint'`` only
    the first coordinate pair is used; for any other type the coordinates
    are passed to ``Point`` as they are.
    """
    parsed = ast.literal_eval(geom_str)
    is_multipoint = parsed['type'] == 'MultiPoint'
    coords = parsed['coordinates']
    # A MultiPoint is reduced to its first member; other types are used as-is.
    return Point(coords[0]) if is_multipoint else Point(coords)

# Build the parking GeoDataFrame: every stored geometry string is parsed
# into a Point and the layer is tagged as EPSG:4326.
bike_parking_gdf = gpd.GeoDataFrame(
    bike_parking_df,
    crs="EPSG:4326",
    geometry=bike_parking_df['geometry'].apply(parse_geometry),
)

# Project both layers to a local CRS for accurate distance measurement.
# For Toronto, EPSG:32617 (UTM zone 17N) can be used.
bike_parking_gdf_utm = bike_parking_gdf.to_crs("EPSG:32617")
cyclist_traffic_gdf_utm = cyclist_traffic_gdf.to_crs("EPSG:32617")

# Left-join each traffic row to its nearest parking feature, limited to
# candidates within 1000 metres; the distance is stored in 'distance_meters'.
merged_gdf_utm = gpd.sjoin_nearest(
    cyclist_traffic_gdf_utm,
    bike_parking_gdf_utm,
    how="left",
    max_distance=1000,
    distance_col="distance_meters",
)

# Convert the joined layer back to EPSG:4326.
merged_gdf_final = merged_gdf_utm.to_crs("EPSG:4326")

# merged_gdf_final now holds the columns of both datasets plus 'distance_meters'.
print(merged_gdf_final.head())




   centreline_id  direction                         location class_type  \
0        8313231  Eastbound  ADELAIDE ST E EB W OF JARVIS ST   Cyclists   
1        8313231  Eastbound  ADELAIDE ST E EB W OF JARVIS ST   Cyclists   
2        8313231  Eastbound  ADELAIDE ST E EB W OF JARVIS ST   Cyclists   
3        8313231  Eastbound  ADELAIDE ST E EB W OF JARVIS ST   Cyclists   
4        8313231  Eastbound  ADELAIDE ST E EB W OF JARVIS ST   Cyclists   

   daily_temperature  daily_precipitation   datetime_bin_start  \
0               26.0                  0.0  2018-10-10 23:00:00   
1               26.0                  0.0  2018-10-10 22:00:00   
2               26.0                  0.0  2018-10-10 21:00:00   
3               26.0                  0.0  2018-10-10 20:00:00   
4               26.0                  0.0  2018-10-10 19:00:00   

      datetime_bin_end  volume                 intersection  ...  SIDE  \
0  2018-10-11 00:00:00      32  ADELAIDE ST E AND JARVIS ST  ...  East   
1  2

In [34]:
# Save the merged GeoDataFrame to a file in the data/processed folder,
# creating that folder first so the export does not fail when it is missing.
from pathlib import Path

output_path = "../data/processed/merged_gdf_final.geojson"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
merged_gdf_final.to_file(output_path, driver="GeoJSON")

2024-07-27 17:42:48,732 - INFO - Created 20,558 records


In [35]:
# Also export the merged data as CSV (without the index column),
# creating the data/processed folder first so the export does not fail when it is missing.
from pathlib import Path

output_path = "../data/processed/merged_gdf_final.csv"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
merged_gdf_final.to_csv(output_path, index=False)