# Visualize Commute Routes
This notebook visualizes the commute routes generated by `main.py`. The routes are loaded from the `commute_routes.csv` file and plotted on an interactive map using Plotly.

In [41]:
import gcsfs
import pandas as pd

In [None]:
notes_df = pd.read_csv("data/dexter/commute_routes.csv")
notes_df['route_id'] = notes_df['origin'] + ' to ' + notes_df['destination']


In [None]:
# Set your project ID and bucket name
project_id = 'dig-es-nws-gemini-projects'
bucket_name = 'marketplace-commutes'

# Initialize gcsfs
gcs = gcsfs.GCSFileSystem(project=project_id)

# List all CSV files in the bucket
files = gcs.glob(f'gs://{bucket_name}/*.csv')

print(files)

# Read all CSV files into a list of DataFrames
all_dfs = []
for f in files:
    df_temp = pd.read_csv(f"gs://{f}")
    # Extract timezone from filename
    filename = f.split('/')[-1]
    if 'atlantic' in filename:
        df_temp['timezone'] = 'America/Halifax'
    elif 'central' in filename:
        df_temp['timezone'] = 'America/Winnipeg'
    elif 'eastern' in filename:
        df_temp['timezone'] = 'America/Toronto'
    elif 'mountain' in filename:
        df_temp['timezone'] = 'America/Edmonton'
    elif 'pacific' in filename:
        df_temp['timezone'] = 'America/Vancouver'
    else:
        df_temp['timezone'] = 'UTC' # Default or handle as needed
    all_dfs.append(df_temp)


# Concatenate all DataFrames into a single DataFrame
df = pd.concat(all_dfs, ignore_index=True)

# Create a unique identifier for each route
df['route_id'] = df['origin'] + ' to ' + df['destination']

# --- Create timestamp_local column converting 'timestamp' into local time per-row ---
# Parse 'timestamp' into UTC-aware Timestamps (assume UTC if timezone-naive)
df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True, errors='coerce')

# Helper to convert a UTC-aware timestamp to a per-row timezone with safe fallback
def _to_local(ts, tz):
    try:
        if pd.isna(ts):
            return pd.NaT
        if pd.isna(tz) or tz == '':
            tz = 'UTC'
        return ts.tz_convert(tz)
    except Exception:
        # If conversion fails (invalid tz string), return the original UTC timestamp
        try:
            return ts.tz_convert('UTC')
        except Exception:
            return pd.NaT

# Apply conversion per-row (vectorized conversion per value isn't possible when tz varies by row)
if 'timezone' in df.columns:
    df['timestamp_local'] = df.apply(lambda r: _to_local(r['timestamp'], r['timezone']), axis=1)
else:
    # If timezone column is missing, fallback to UTC
    df['timestamp_local'] = df['timestamp']

# Display a quick sample to check results
display(df.head())

['marketplace-commutes/commute_routes_atlantic.csv', 'marketplace-commutes/commute_routes_central.csv', 'marketplace-commutes/commute_routes_eastern.csv', 'marketplace-commutes/commute_routes_mountain.csv', 'marketplace-commutes/commute_routes_pacific.csv']


Unnamed: 0,route_id,timestamp,timezone,timestamp_local
0,"Larry Uteck, Halifax to Hospital in Halifax",2025-08-26 20:10:05+00:00,America/Halifax,2025-08-26 17:10:05-03:00
1,"Windsor Street, Halifax to St. Margaret's Bay ...",2025-08-26 20:10:05+00:00,America/Halifax,2025-08-26 17:10:05-03:00
2,"Armdale, Halifax to Bayers Lake, Halifax",2025-08-26 20:10:06+00:00,America/Halifax,2025-08-26 17:10:06-03:00
3,"North End, Halifax to Macdonald Bridge, Halifax",2025-08-26 20:10:06+00:00,America/Halifax,2025-08-26 17:10:06-03:00
4,"Larry Uteck, Halifax to Hospital in Halifax",2025-08-26 20:20:05+00:00,America/Halifax,2025-08-26 17:20:05-03:00


In [71]:
df.head()

Unnamed: 0,origin,destination,travel_mode,distance_km,duration_min,warnings,line_geometry,timestamp,timezone,route_id,timestamp_local,notes,hour
0,"Larry Uteck, Halifax",Hospital in Halifax,DRIVE,10.113,20,,"LINESTRING (-63.67528 44.70207, -63.67451 44.7...",2025-08-26 20:10:05+00:00,America/Halifax,"Larry Uteck, Halifax to Hospital in Halifax",2025-08-26 17:10:05-03:00,,17
1,"Windsor Street, Halifax","St. Margaret's Bay Road, Halifax",DRIVE,5.678,13,,"LINESTRING (-63.60553 44.65555, -63.60527 44.6...",2025-08-26 20:10:05+00:00,America/Halifax,"Windsor Street, Halifax to St. Margaret's Bay ...",2025-08-26 17:10:05-03:00,,17
2,"Armdale, Halifax","Bayers Lake, Halifax",DRIVE,7.775,12,,"LINESTRING (-63.60788 44.63002, -63.60795 44.6...",2025-08-26 20:10:06+00:00,America/Halifax,"Armdale, Halifax to Bayers Lake, Halifax",2025-08-26 17:10:06-03:00,,17
3,"North End, Halifax","Macdonald Bridge, Halifax",BICYCLE,1.723,10,,"LINESTRING (-63.60227 44.65962, -63.60234 44.6...",2025-08-26 20:10:06+00:00,America/Halifax,"North End, Halifax to Macdonald Bridge, Halifax",2025-08-26 17:10:06-03:00,,17
4,"Larry Uteck, Halifax",Hospital in Halifax,DRIVE,10.113,21,,"LINESTRING (-63.67528 44.70207, -63.67451 44.7...",2025-08-26 20:20:05+00:00,America/Halifax,"Larry Uteck, Halifax to Hospital in Halifax",2025-08-26 17:20:05-03:00,,17


In [44]:
len(df["route_id"].unique())

55

In [45]:
# Merge notes from the local CSV
df = pd.merge(df, notes_df[['route_id', 'notes']], on='route_id', how='left')

# Display the first few rows of the combined DataFrame
df.head()

Unnamed: 0,origin,destination,travel_mode,distance_km,duration_min,warnings,line_geometry,timestamp,timezone,route_id,timestamp_local,notes
0,"Larry Uteck, Halifax",Hospital in Halifax,DRIVE,10.113,20,,"LINESTRING (-63.67528 44.70207, -63.67451 44.7...",2025-08-26 20:10:05+00:00,America/Halifax,"Larry Uteck, Halifax to Hospital in Halifax",2025-08-26 17:10:05-03:00,
1,"Windsor Street, Halifax","St. Margaret's Bay Road, Halifax",DRIVE,5.678,13,,"LINESTRING (-63.60553 44.65555, -63.60527 44.6...",2025-08-26 20:10:05+00:00,America/Halifax,"Windsor Street, Halifax to St. Margaret's Bay ...",2025-08-26 17:10:05-03:00,
2,"Armdale, Halifax","Bayers Lake, Halifax",DRIVE,7.775,12,,"LINESTRING (-63.60788 44.63002, -63.60795 44.6...",2025-08-26 20:10:06+00:00,America/Halifax,"Armdale, Halifax to Bayers Lake, Halifax",2025-08-26 17:10:06-03:00,
3,"North End, Halifax","Macdonald Bridge, Halifax",BICYCLE,1.723,10,,"LINESTRING (-63.60227 44.65962, -63.60234 44.6...",2025-08-26 20:10:06+00:00,America/Halifax,"North End, Halifax to Macdonald Bridge, Halifax",2025-08-26 17:10:06-03:00,
4,"Larry Uteck, Halifax",Hospital in Halifax,DRIVE,10.113,21,,"LINESTRING (-63.67528 44.70207, -63.67451 44.7...",2025-08-26 20:20:05+00:00,America/Halifax,"Larry Uteck, Halifax to Hospital in Halifax",2025-08-26 17:20:05-03:00,


In [60]:
# keep wall-clock local hour (preserves local time)
df['hour'] = df['timestamp_local'].apply(lambda ts: ts.hour if pd.notnull(ts) else pd.NA)

In [61]:
# Create a 'route_id' column using origin and destination
if 'origin' in df.columns and 'destination' in df.columns:
    df['route_id'] = df['origin'].astype(str) + ' to ' + df['destination'].astype(str)
    df["route_id"] = df["route_id"].str.replace("44.41906079312099, -80.09223079949398", "Stayner, Ontario L0M 1S0")
    df["route_id"] = df["route_id"].str.replace("44.672247457480196, -63.478352018683125", "Cole Harbour, Nova Scotia")
    
else:
    raise ValueError("'origin' and/or 'destination' columns not found in DataFrame.")
df.head()

Unnamed: 0,origin,destination,travel_mode,distance_km,duration_min,warnings,line_geometry,timestamp,timezone,route_id,timestamp_local,notes,hour
0,"Larry Uteck, Halifax",Hospital in Halifax,DRIVE,10.113,20,,"LINESTRING (-63.67528 44.70207, -63.67451 44.7...",2025-08-26 20:10:05+00:00,America/Halifax,"Larry Uteck, Halifax to Hospital in Halifax",2025-08-26 17:10:05-03:00,,17
1,"Windsor Street, Halifax","St. Margaret's Bay Road, Halifax",DRIVE,5.678,13,,"LINESTRING (-63.60553 44.65555, -63.60527 44.6...",2025-08-26 20:10:05+00:00,America/Halifax,"Windsor Street, Halifax to St. Margaret's Bay ...",2025-08-26 17:10:05-03:00,,17
2,"Armdale, Halifax","Bayers Lake, Halifax",DRIVE,7.775,12,,"LINESTRING (-63.60788 44.63002, -63.60795 44.6...",2025-08-26 20:10:06+00:00,America/Halifax,"Armdale, Halifax to Bayers Lake, Halifax",2025-08-26 17:10:06-03:00,,17
3,"North End, Halifax","Macdonald Bridge, Halifax",BICYCLE,1.723,10,,"LINESTRING (-63.60227 44.65962, -63.60234 44.6...",2025-08-26 20:10:06+00:00,America/Halifax,"North End, Halifax to Macdonald Bridge, Halifax",2025-08-26 17:10:06-03:00,,17
4,"Larry Uteck, Halifax",Hospital in Halifax,DRIVE,10.113,21,,"LINESTRING (-63.67528 44.70207, -63.67451 44.7...",2025-08-26 20:20:05+00:00,America/Halifax,"Larry Uteck, Halifax to Hospital in Halifax",2025-08-26 17:20:05-03:00,,17


In [62]:
df["route_id"]

0               Larry Uteck, Halifax to Hospital in Halifax
1         Windsor Street, Halifax to St. Margaret's Bay ...
2                  Armdale, Halifax to Bayers Lake, Halifax
3           North End, Halifax to Macdonald Bridge, Halifax
4               Larry Uteck, Halifax to Hospital in Halifax
                                ...                        
287812               Colwood, Victoria to Downtown Victoria
287813    Tillicum Centre, Victoria to Patricia Bay High...
287814    Esquimalt, Victoria to University of Victoria,...
287815        Port Coquitlam, BC to Richmond Sea Island, BC
287816                            Squamish, BC to Delta, BC
Name: route_id, Length: 287817, dtype: object

In [63]:
df["timestamp"].min()

Timestamp('2025-08-26 20:10:05+0000', tz='UTC')

In [64]:
df["timestamp"].max()

Timestamp('2025-10-18 16:50:14+0000', tz='UTC')

In [65]:
df[df["route_id"] == "43.481393453134494, -79.97285174417762 to 43.83903876957066, -79.12450551817862"].sort_values("timestamp")

Unnamed: 0,origin,destination,travel_mode,distance_km,duration_min,warnings,line_geometry,timestamp,timezone,route_id,timestamp_local,notes,hour
206621,"43.481393453134494, -79.97285174417762","43.83903876957066, -79.12450551817862",DRIVE,97.331,61,This route has tolls.; This route includes a h...,"LINESTRING (-79.97287 43.48133, -79.97313 43.4...",2025-10-17 19:10:10+00:00,America/Toronto,"43.481393453134494, -79.97285174417762 to 43.8...",2025-10-17 15:10:10-04:00,Michaela's commute,15
206645,"43.481393453134494, -79.97285174417762","43.83903876957066, -79.12450551817862",DRIVE,97.331,61,This route has tolls.; This route includes a h...,"LINESTRING (-79.97287 43.48133, -79.97313 43.4...",2025-10-17 19:20:11+00:00,America/Toronto,"43.481393453134494, -79.97285174417762 to 43.8...",2025-10-17 15:20:11-04:00,Michaela's commute,15
206669,"43.481393453134494, -79.97285174417762","43.83903876957066, -79.12450551817862",DRIVE,97.331,58,This route has tolls.; This route includes a h...,"LINESTRING (-79.97287 43.48133, -79.97313 43.4...",2025-10-17 19:30:11+00:00,America/Toronto,"43.481393453134494, -79.97285174417762 to 43.8...",2025-10-17 15:30:11-04:00,Michaela's commute,15
206693,"43.481393453134494, -79.97285174417762","43.83903876957066, -79.12450551817862",DRIVE,97.331,59,This route has tolls.; This route includes a h...,"LINESTRING (-79.97287 43.48133, -79.97313 43.4...",2025-10-17 19:40:12+00:00,America/Toronto,"43.481393453134494, -79.97285174417762 to 43.8...",2025-10-17 15:40:12-04:00,Michaela's commute,15
206717,"43.481393453134494, -79.97285174417762","43.83903876957066, -79.12450551817862",DRIVE,97.331,61,This route has tolls.; This route includes a h...,"LINESTRING (-79.97287 43.48133, -79.97313 43.4...",2025-10-17 19:50:12+00:00,America/Toronto,"43.481393453134494, -79.97285174417762 to 43.8...",2025-10-17 15:50:12-04:00,Michaela's commute,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
208781,"43.481393453134494, -79.97285174417762","43.83903876957066, -79.12450551817862",DRIVE,97.331,56,This route has tolls.; This route includes a h...,"LINESTRING (-79.97287 43.48133, -79.97313 43.4...",2025-10-18 16:10:11+00:00,America/Toronto,"43.481393453134494, -79.97285174417762 to 43.8...",2025-10-18 12:10:11-04:00,Michaela's commute,12
208805,"43.481393453134494, -79.97285174417762","43.83903876957066, -79.12450551817862",DRIVE,97.331,55,This route has tolls.; This route includes a h...,"LINESTRING (-79.97287 43.48133, -79.97313 43.4...",2025-10-18 16:20:14+00:00,America/Toronto,"43.481393453134494, -79.97285174417762 to 43.8...",2025-10-18 12:20:14-04:00,Michaela's commute,12
208829,"43.481393453134494, -79.97285174417762","43.83903876957066, -79.12450551817862",DRIVE,97.331,55,This route has tolls.; This route includes a h...,"LINESTRING (-79.97287 43.48133, -79.97313 43.4...",2025-10-18 16:30:15+00:00,America/Toronto,"43.481393453134494, -79.97285174417762 to 43.8...",2025-10-18 12:30:15-04:00,Michaela's commute,12
208853,"43.481393453134494, -79.97285174417762","43.83903876957066, -79.12450551817862",DRIVE,97.331,57,This route has tolls.; This route includes a h...,"LINESTRING (-79.97287 43.48133, -79.97313 43.4...",2025-10-18 16:40:14+00:00,America/Toronto,"43.481393453134494, -79.97285174417762 to 43.8...",2025-10-18 12:40:14-04:00,Michaela's commute,12


In [70]:
# Filter for off-peak hours
off_peak_df = df[~((df['hour'] >= 7) & (df['hour'] < 9) | (df['hour'] >= 17) & (df['hour'] < 19))]

# Calculate the mean travel time for each route during off-peak hours
off_peak_stats = off_peak_df.groupby(['route_id', 'travel_mode'])['duration_min'].agg(['mean']).reset_index()
off_peak_stats.columns = ['route_id', 'travel_mode', 'mean_off_peak_duration_min']

# Filter for peak hours
peak_df = df[((df['hour'] >= 8) & (df['hour'] < 10) | (df['hour'] >= 17) & (df['hour'] < 19))]

# Calculate the median and 95th percentile travel time for each route during peak hours
peak_stats = peak_df.groupby(['route_id', 'travel_mode'])['duration_min'].agg(['mean']).reset_index()
peak_stats.columns = ['route_id', 'travel_mode', 'mean_peak_duration_min']

# Merge the stats
final_stats1 = pd.merge(off_peak_stats, peak_stats, on=['route_id', 'travel_mode'], how='left')

# Add calculated columns
final_stats1['peak_vs_off_peak_mean_diff'] = final_stats1['mean_peak_duration_min'] - final_stats1['mean_off_peak_duration_min']

# Calculate the score and sort by it
final_stats = final_stats1.sort_values(by='peak_vs_off_peak_mean_diff', ascending=False)

# Drop the specified route
final_stats = final_stats[final_stats['route_id'] != 'Westshore, Victoria to Downtown Victoria']

# Merge notes from the read-in commute_routes.csv (if available)
if 'notes' in locals() or 'notes_df' in globals():
    try:
        final_stats = final_stats.merge(notes_df[['route_id', 'notes']], on='route_id', how='left')
    except Exception:
        # if merge fails for any reason, create an empty notes column
        final_stats['notes'] = ''
else:
    final_stats['notes'] = ''

# Display the final table with notes
display(final_stats)

Unnamed: 0,route_id,travel_mode,mean_off_peak_duration_min,mean_peak_duration_min,peak_vs_off_peak_mean_diff,notes
0,"Highway 401, Mississauga to Highway 401 and Do...",DRIVE,39.749359,48.14951,8.400151,
1,"Cloverdale, Surrey to Richmond/Vancouver",DRIVE,53.042409,60.620283,7.577874,
2,"Port Coquitlam, BC to Richmond Sea Island, BC",DRIVE,50.436131,55.336039,4.899908,
3,"Gardiner Expressway and Highway 427, Toronto t...",DRIVE,18.457111,21.919811,3.462701,
4,"The Beaches, Toronto, ON to CBC Toronto, Canad...",DRIVE,22.343204,25.437908,3.094705,Submitted by Dana Dragone
5,"49.1628474739296, -122.84518311047466 to Coqui...",DRIVE,29.924031,32.913978,2.989947,88th avenue in Surrey
6,"Stayner, Ontario L0M 1S0 to 43.67371164802397,...",DRIVE,101.396688,104.039216,2.642527,
7,"Riverside South, Ottawa to Downtown Ottawa",DRIVE,26.47088,29.018868,2.547988,
8,"Bathurst and St. Clair, Toronto to 43.73014981...",DRIVE,17.976451,20.520425,2.543973,Submitted by Steven Dennis
9,"Larry Uteck, Halifax to Hospital in Halifax",DRIVE,16.701782,19.23978,2.537998,


In [67]:
final_stats["travel_mode"].value_counts()

travel_mode
DRIVE      32
TRANSIT    13
BICYCLE     8
Name: count, dtype: int64

In [68]:
# # Filter for weekdays (Monday=0, Sunday=6)
# weekday_day = df
# df["weekday_df"] = df['timestamp_local'].apply(lambda x: x.weekday())

# weekday_df = df[df["weekday_df"] < 5]

# # Fill NaN values in 'notes' with an empty string to prevent them from being dropped
# weekday_df['notes'] = weekday_df['notes'].fillna('')

# # Filter for off-peak hours
# off_peak_df = weekday_df[~((weekday_df['hour'] >= 7) & (weekday_df['hour'] < 9) | (weekday_df['hour'] >= 17) & (weekday_df['hour'] < 19))]

# # Calculate the median travel time for each route during off-peak hours
# off_peak_stats = off_peak_df.groupby(['route_id', 'travel_mode', 'notes']).agg(
#     median_off_peak_duration_min=('duration_min', 'median'),
#     off_peak_data_points=('duration_min', 'count'),
# ).reset_index()

# # Filter for peak hours
# peak_df = weekday_df[((weekday_df['hour'] >= 7) & (weekday_df['hour'] < 9) | (weekday_df['hour'] >= 17) & (weekday_df['hour'] < 19))]

# # Calculate the median and 90th percentile travel time for each route during peak hours
# peak_stats = peak_df.groupby(['route_id', 'travel_mode'])['duration_min'].agg(['median', lambda x: x.quantile(0.90), 'count']).reset_index()
# peak_stats.columns = ['route_id', 'travel_mode', 'median_peak_duration_min', 'p90_peak_duration_min', 'peak_data_points']

# # Merge the stats
# final_stats = pd.merge(off_peak_stats, peak_stats, on=['route_id', 'travel_mode'], how='left')

# # Add calculated columns
# final_stats['peak_vs_off_peak_median_diff'] = final_stats['median_peak_duration_min'] - final_stats['median_off_peak_duration_min']
# final_stats['p90_vs_median_peak_diff'] = final_stats['p90_peak_duration_min'] - final_stats['median_peak_duration_min']

# # Calculate the score and sort by it
# final_stats['score'] = (final_stats['peak_vs_off_peak_median_diff'] + final_stats['p90_vs_median_peak_diff']) / final_stats['median_off_peak_duration_min']
# final_stats = final_stats.sort_values(by='score', ascending=False)

# # Drop the specified route
# final_stats = final_stats[final_stats['route_id'] != 'Westshore, Victoria to Downtown Victoria']

# print("Travel time statistics (Weekdays only):")
# display(final_stats)

In [69]:
# from shapely.wkt import loads
# import numpy as np
# from geopy.distance import great_circle
# from itertools import combinations

# # --- Linestring Consistency Check ---

# # Function to extract start and end points from a WKT linestring
# def get_start_end_points(wkt_string):
#     if not isinstance(wkt_string, str) or not wkt_string.startswith('LINESTRING'):
#         return None, None
#     try:
#         line = loads(wkt_string)
#         # Note: Shapely coordinates are (lon, lat), geopy expects (lat, lon)
#         start = (line.coords[0][1], line.coords[0][0])
#         end = (line.coords[-1][1], line.coords[-1][0])
#         return start, end
#     except Exception as e:
#         return None, None

# # Apply the function to the dataframe
# df[['start_point', 'end_point']] = df['line_geometry'].apply(get_start_end_points).apply(pd.Series)

# # Function to calculate max distance between a list of points
# def get_max_distance(points):
#     points = [p for p in points if p is not None and all(v is not None for v in p)]
#     if len(points) < 2:
#         return 0
    
#     max_dist = 0
#     for p1, p2 in combinations(points, 2):
#         dist = great_circle(p1, p2).kilometers
#         if dist > max_dist:
#             max_dist = dist
#     return max_dist

# # Group by route and get unique start/end points
# route_points = df.groupby('route_id').agg({
#     'start_point': lambda x: list(x.unique()),
#     'end_point': lambda x: list(x.unique())
# }).reset_index()

# # Calculate the max distance for start and end points
# route_points['max_start_point_dist_km'] = route_points['start_point'].apply(get_max_distance)
# route_points['max_end_point_dist_km'] = route_points['end_point'].apply(get_max_distance)

# # Identify routes with start or end points spread by more than 1 km
# inconsistent_routes = route_points[
#     (route_points['max_start_point_dist_km'] > 1) | 
#     (route_points['max_end_point_dist_km'] > 1)
# ]

# print("Inconsistent Route Analysis (Max distance > 1km):")
# if not inconsistent_routes.empty:
#     print("Found routes with significant variation in start/end points:")
#     display(inconsistent_routes[['route_id', 'max_start_point_dist_km', 'max_end_point_dist_km']])
# else:
#     print("All routes appear to have consistent start and end points within a 1km radius.")