# Visualize Commute Routes
This notebook visualizes the commute routes generated by `main.py`. The routes are loaded from the per-region `commute_routes_*.csv` files stored in a Google Cloud Storage bucket, combined into a single DataFrame, and plotted on an interactive map using Plotly.

In [2]:
import gcsfs
import pandas as pd

In [3]:
# Load every per-region commute CSV from the GCS bucket into one DataFrame `df`,
# tagging each row with the timezone implied by the file name and attaching the
# optional per-route notes.

# Set your project ID and bucket name
project_id = 'dig-es-nws-gemini-projects'
bucket_name = 'marketplace-commutes'

# Filename keyword -> IANA timezone. Keywords are tested in insertion order and
# the first one contained in the file name wins.
timezone_by_keyword = {
    'atlantic': 'America/Halifax',
    'central': 'America/Winnipeg',
    'eastern': 'America/Toronto',
    'mountain': 'America/Edmonton',
    'pacific': 'America/Vancouver',
}
default_timezone = 'UTC'  # Used when no keyword matches the file name

# Initialize gcsfs
gcs = gcsfs.GCSFileSystem(project=project_id)

# List all CSV files in the bucket
files = gcs.glob(f'gs://{bucket_name}/*.csv')

print(files)

# Fail early with a clear message instead of pandas' "No objects to concatenate".
if not files:
    raise ValueError(f"No CSV files found in gs://{bucket_name}/")

# Read all CSV files into a list of DataFrames
all_dfs = []
for path in files:
    # Only the base name is inspected, so the bucket name cannot match a keyword.
    filename = path.split('/')[-1]
    region_timezone = next(
        (tz for keyword, tz in timezone_by_keyword.items() if keyword in filename),
        default_timezone,
    )
    region_df = pd.read_csv(f"gs://{path}")
    region_df['timezone'] = region_timezone
    all_dfs.append(region_df)

# Concatenate all DataFrames into a single DataFrame
df = pd.concat(all_dfs, ignore_index=True)

# Create a unique identifier for each route
df['route_id'] = df['origin'] + ' to ' + df['destination']

# Merge notes when a `notes_df` frame is available in the notebook namespace.
# `notes_df` is not defined in this cell (the original comment says it comes from
# a local CSV), so on a fresh kernel the merge used to fail with a NameError.
# Without it, an empty `notes` column keeps the downstream cells runnable.
if 'notes_df' in globals():
    df = pd.merge(df, notes_df[['route_id', 'notes']], on='route_id', how='left')
else:
    print("notes_df is not defined; adding an empty 'notes' column.")
    df['notes'] = pd.Series([None] * len(df), index=df.index, dtype='object')

# Display the first few rows of the combined DataFrame
df.head()



['marketplace-commutes/commute_routes_atlantic.csv', 'marketplace-commutes/commute_routes_central.csv', 'marketplace-commutes/commute_routes_eastern.csv', 'marketplace-commutes/commute_routes_mountain.csv', 'marketplace-commutes/commute_routes_pacific.csv']


Unnamed: 0,origin,destination,travel_mode,distance_km,duration_min,warnings,line_geometry,timestamp,timezone,route_id,notes
0,"Larry Uteck, Halifax",Hospital in Halifax,DRIVE,10.113,20,,"LINESTRING (-63.67528 44.70207, -63.67451 44.7...",2025-08-26 20:10:05,America/Halifax,"Larry Uteck, Halifax to Hospital in Halifax",
1,"Windsor Street, Halifax","St. Margaret's Bay Road, Halifax",DRIVE,5.678,13,,"LINESTRING (-63.60553 44.65555, -63.60527 44.6...",2025-08-26 20:10:05,America/Halifax,"Windsor Street, Halifax to St. Margaret's Bay ...",
2,"Armdale, Halifax","Bayers Lake, Halifax",DRIVE,7.775,12,,"LINESTRING (-63.60788 44.63002, -63.60795 44.6...",2025-08-26 20:10:06,America/Halifax,"Armdale, Halifax to Bayers Lake, Halifax",
3,"North End, Halifax","Macdonald Bridge, Halifax",BICYCLE,1.723,10,,"LINESTRING (-63.60227 44.65962, -63.60234 44.6...",2025-08-26 20:10:06,America/Halifax,"North End, Halifax to Macdonald Bridge, Halifax",
4,"Larry Uteck, Halifax",Hospital in Halifax,DRIVE,10.113,21,,"LINESTRING (-63.67528 44.70207, -63.67451 44.7...",2025-08-26 20:20:05,America/Halifax,"Larry Uteck, Halifax to Hospital in Halifax",


In [11]:
# Derive an integer 'hour' column (0-23) for the time-based analysis, unless the
# DataFrame already carries one. 'datetime' is preferred over 'timestamp'.
# NOTE(review): the hour is read straight from the parsed value with no timezone
# conversion - confirm the source column already holds local time.
if 'hour' not in df.columns:
    time_column = next(
        (name for name in ('datetime', 'timestamp') if name in df.columns),
        None,
    )
    if time_column is None:
        raise ValueError("No suitable datetime or time column found to extract hour.")
    df['hour'] = pd.to_datetime(df[time_column]).dt.hour
df.head()

Unnamed: 0,origin,destination,travel_mode,distance_km,duration_min,warnings,line_geometry,timestamp,timezone,hour
0,"Larry Uteck, Halifax",Hospital in Halifax,DRIVE,10.113,20,,"LINESTRING (-63.67528 44.70207, -63.67451 44.7...",2025-08-26 20:10:05,America/Halifax,20
1,"Windsor Street, Halifax","St. Margaret's Bay Road, Halifax",DRIVE,5.678,13,,"LINESTRING (-63.60553 44.65555, -63.60527 44.6...",2025-08-26 20:10:05,America/Halifax,20
2,"Armdale, Halifax","Bayers Lake, Halifax",DRIVE,7.775,12,,"LINESTRING (-63.60788 44.63002, -63.60795 44.6...",2025-08-26 20:10:06,America/Halifax,20
3,"North End, Halifax","Macdonald Bridge, Halifax",BICYCLE,1.723,10,,"LINESTRING (-63.60227 44.65962, -63.60234 44.6...",2025-08-26 20:10:06,America/Halifax,20
4,"Larry Uteck, Halifax",Hospital in Halifax,DRIVE,10.113,21,,"LINESTRING (-63.67528 44.70207, -63.67451 44.7...",2025-08-26 20:20:05,America/Halifax,20


In [13]:
# Label every row with "<origin> to <destination>" in a 'route_id' column.
# Both parts are cast to str first, so a missing value becomes the text 'nan'
# rather than propagating as NaN.
if not ('origin' in df.columns and 'destination' in df.columns):
    raise ValueError("'origin' and/or 'destination' columns not found in DataFrame.")
df['route_id'] = df['origin'].astype(str).add(' to ').add(df['destination'].astype(str))
df.head()

Unnamed: 0,origin,destination,travel_mode,distance_km,duration_min,warnings,line_geometry,timestamp,timezone,hour,route_id
0,"Larry Uteck, Halifax",Hospital in Halifax,DRIVE,10.113,20,,"LINESTRING (-63.67528 44.70207, -63.67451 44.7...",2025-08-26 20:10:05,America/Halifax,20,"Larry Uteck, Halifax to Hospital in Halifax"
1,"Windsor Street, Halifax","St. Margaret's Bay Road, Halifax",DRIVE,5.678,13,,"LINESTRING (-63.60553 44.65555, -63.60527 44.6...",2025-08-26 20:10:05,America/Halifax,20,"Windsor Street, Halifax to St. Margaret's Bay ..."
2,"Armdale, Halifax","Bayers Lake, Halifax",DRIVE,7.775,12,,"LINESTRING (-63.60788 44.63002, -63.60795 44.6...",2025-08-26 20:10:06,America/Halifax,20,"Armdale, Halifax to Bayers Lake, Halifax"
3,"North End, Halifax","Macdonald Bridge, Halifax",BICYCLE,1.723,10,,"LINESTRING (-63.60227 44.65962, -63.60234 44.6...",2025-08-26 20:10:06,America/Halifax,20,"North End, Halifax to Macdonald Bridge, Halifax"
4,"Larry Uteck, Halifax",Hospital in Halifax,DRIVE,10.113,21,,"LINESTRING (-63.67528 44.70207, -63.67451 44.7...",2025-08-26 20:20:05,America/Halifax,20,"Larry Uteck, Halifax to Hospital in Halifax"


In [4]:
# Smallest value in the raw 'timestamp' column.
# NOTE(review): the saved output is a quoted string, which suggests the column is
# still text and this is a lexicographic minimum - confirm the dtype.
df["timestamp"].min()

'2025-08-26 20:10:05'

In [None]:
# Compare median travel time per route and travel mode between peak and
# off-peak hours, then rank the routes by relative slowdown.

# Peak windows are hours 7-8 and 17-18 (07:00-08:59 and 17:00-18:59).
# The mask is built once, with explicit parentheses, so both filters below are
# guaranteed to be exact complements of each other.
is_peak = (
    ((df['hour'] >= 7) & (df['hour'] < 9))
    | ((df['hour'] >= 17) & (df['hour'] < 19))
)

# Filter for off-peak hours
off_peak_df = df[~is_peak]

# Calculate the median travel time for each route during off-peak hours
off_peak_stats = off_peak_df.groupby(['route_id', 'travel_mode'])['duration_min'].agg(['median']).reset_index()
off_peak_stats.columns = ['route_id', 'travel_mode', 'median_off_peak_duration_min']

# Filter for peak hours
peak_df = df[is_peak]

# Calculate the median travel time for each route during peak hours
peak_stats = peak_df.groupby(['route_id', 'travel_mode'])['duration_min'].agg(['median']).reset_index()
peak_stats.columns = ['route_id', 'travel_mode', 'median_peak_duration_min']

# Merge the stats. The left join keeps every route with off-peak data; routes
# without peak observations get NaN in the peak columns.
final_stats1 = pd.merge(off_peak_stats, peak_stats, on=['route_id', 'travel_mode'], how='left')

# Add calculated columns
final_stats1['peak_vs_off_peak_median_diff'] = final_stats1['median_peak_duration_min'] - final_stats1['median_off_peak_duration_min']

# Score = peak slowdown relative to the off-peak median; highest score first.
# The sorted frame is assigned back to `final_stats1`, the frame that is
# filtered and displayed below, so the table is actually shown in score order.
final_stats1['score'] = final_stats1['peak_vs_off_peak_median_diff'] / final_stats1['median_off_peak_duration_min']
final_stats1 = final_stats1.sort_values(by='score', ascending=False)

# Drop the specified route
final_stats1 = final_stats1[final_stats1['route_id'] != 'Westshore, Victoria to Downtown Victoria']

print("Travel time statistics:")
display(final_stats1)

Travel time statistics:


Unnamed: 0,route_id,travel_mode,median_off_peak_duration_min,median_peak_duration_min,peak_vs_off_peak_median_diff,score
46,"Stayner, Ontario L0M 1S0 to 43.67371164802397,...",DRIVE,93.0,111.0,18.0,0.193548
48,"The Glebe, Ottawa to Any highway, Ottawa",DRIVE,6.0,7.0,1.0,0.166667
4,"44.672247457480196, -63.478352018683125 to She...",TRANSIT,67.0,78.0,11.0,0.164179
44,"St. James, Winnipeg to Transcona, Winnipeg",DRIVE,26.0,30.0,4.0,0.153846
39,"Port Coquitlam, BC to Richmond Sea Island, BC",DRIVE,47.0,54.0,7.0,0.148936
34,"Larry Uteck, Halifax to Hospital in Halifax",DRIVE,16.0,18.0,2.0,0.125
18,"Charleswood, Winnipeg to St. James, Winnipeg",DRIVE,17.0,19.0,2.0,0.117647
49,"Tillicum Centre, Victoria to Patricia Bay High...",DRIVE,9.0,10.0,1.0,0.111111
53,"Windsor Street, Halifax to St. Margaret's Bay ...",DRIVE,9.0,10.0,1.0,0.111111
29,"Fleetwood, Surrey to Langley",DRIVE,27.0,30.0,3.0,0.111111


In [None]:
# Copy the `final_stats1` table to the system clipboard.
final_stats1.to_clipboard()

In [17]:
# Weekday-only version of the peak vs off-peak comparison, with the 90th
# percentile of peak travel time and the number of observations per route.

# Weekday index per row (Monday=0, Sunday=6).
# `timestamp_local` is not created by any visible earlier cell, so fall back to
# the raw `timestamp` column when it is missing instead of raising a KeyError.
# NOTE(review): confirm which column holds local time; a UTC timestamp would
# misclassify rows close to midnight.
if 'timestamp_local' in df.columns:
    weekday_source = df['timestamp_local']
else:
    weekday_source = pd.to_datetime(df['timestamp'])
df["weekday_df"] = weekday_source.apply(lambda ts: ts.weekday())

# Take an explicit copy so the `notes` assignment below modifies an independent
# frame and does not trigger pandas' SettingWithCopyWarning.
weekday_df = df[df["weekday_df"] < 5].copy()

# Fill NaN values in 'notes' with an empty string to prevent them from being dropped
# (groupby discards rows whose key is NaN).
weekday_df['notes'] = weekday_df['notes'].fillna('')

# Peak windows are hours 7-8 and 17-18; built once so that the peak and off-peak
# filters are exact complements.
weekday_is_peak = (
    ((weekday_df['hour'] >= 7) & (weekday_df['hour'] < 9))
    | ((weekday_df['hour'] >= 17) & (weekday_df['hour'] < 19))
)

# Filter for off-peak hours
off_peak_df = weekday_df[~weekday_is_peak]

# Calculate the median travel time for each route during off-peak hours
off_peak_stats = off_peak_df.groupby(['route_id', 'travel_mode', 'notes']).agg(
    median_off_peak_duration_min=('duration_min', 'median'),
    off_peak_data_points=('duration_min', 'count'),
).reset_index()

# Filter for peak hours
peak_df = weekday_df[weekday_is_peak]

# Calculate the median and 90th percentile travel time for each route during peak hours
peak_stats = peak_df.groupby(['route_id', 'travel_mode']).agg(
    median_peak_duration_min=('duration_min', 'median'),
    p90_peak_duration_min=('duration_min', lambda durations: durations.quantile(0.90)),
    peak_data_points=('duration_min', 'count'),
).reset_index()

# Merge the stats. The left join keeps every route with off-peak data; routes
# without peak observations get NaN in the peak columns.
final_stats = pd.merge(off_peak_stats, peak_stats, on=['route_id', 'travel_mode'], how='left')

# Add calculated columns
final_stats['peak_vs_off_peak_median_diff'] = final_stats['median_peak_duration_min'] - final_stats['median_off_peak_duration_min']
final_stats['p90_vs_median_peak_diff'] = final_stats['p90_peak_duration_min'] - final_stats['median_peak_duration_min']

# Score = (median peak slowdown + spread between p90 and median at peak),
# relative to the off-peak median; highest score first.
final_stats['score'] = (final_stats['peak_vs_off_peak_median_diff'] + final_stats['p90_vs_median_peak_diff']) / final_stats['median_off_peak_duration_min']
final_stats = final_stats.sort_values(by='score', ascending=False)

# Drop the specified route
final_stats = final_stats[final_stats['route_id'] != 'Westshore, Victoria to Downtown Victoria']

print("Travel time statistics (Weekdays only):")
display(final_stats)

Travel time statistics (Weekdays only):


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weekday_df['notes'] = weekday_df['notes'].fillna('')


Unnamed: 0,route_id,travel_mode,notes,median_off_peak_duration_min,off_peak_data_points,median_peak_duration_min,p90_peak_duration_min,peak_data_points,peak_vs_off_peak_median_diff,p90_vs_median_peak_diff,score,Unnamed: 12,Unnamed: 13
22,"Dartmouth Bridge Terminal, Dartmouth to Mount ...",TRANSIT,23.0,1242.0,23.0,39.25,156.0,0.0,16.25,0.706522,,,
18,"Charleswood, Winnipeg to St. James, Winnipeg",DRIVE,Submitted by Steven Dennis,17.0,1415.0,19.0,25.65,168.0,2.0,6.65,0.508824,,
53,"Windsor Street, Halifax to St. Margaret's Bay ...",DRIVE,9.0,1419.0,10.0,13.0,168.0,1.0,3.0,0.444444,,,
15,"Bathurst and St. Clair, Toronto to 43.73014981...",DRIVE,17.0,1238.0,18.0,24.0,156.0,1.0,6.0,0.411765,,,
46,"Stayner, Ontario L0M 1S0 to 43.67371164802397,...",DRIVE,93.0,470.0,111.0,130.1,60.0,18.0,19.1,0.398925,,,
48,"The Glebe, Ottawa to Any highway, Ottawa",DRIVE,6.0,1415.0,7.0,8.0,168.0,1.0,1.0,0.333333,,,
34,"Larry Uteck, Halifax to Hospital in Halifax",DRIVE,16.0,1419.0,18.0,21.0,168.0,2.0,3.0,0.3125,,,
32,"Highway 401, Mississauga to Highway 401 and Do...",DRIVE,40.0,1238.0,43.0,52.0,156.0,3.0,9.0,0.3,,,
30,"Gardiner Expressway and Highway 427, Toronto t...",DRIVE,20.0,1415.0,20.5,26.0,168.0,0.5,5.5,0.3,,,
31,Gatineau to Ottawa,TRANSIT,20.0,1238.0,20.0,26.0,156.0,0.0,6.0,0.3,,,


In [None]:
from shapely.wkt import loads
import numpy as np
from geopy.distance import great_circle
from itertools import combinations

# --- Linestring Consistency Check ---

# Function to extract start and end points from a WKT linestring
def get_start_end_points(wkt_string):
    """Return the first and last vertex of a WKT LINESTRING as (lat, lon) tuples.

    Args:
        wkt_string: Candidate WKT text. Anything that is not a str starting
            with 'LINESTRING' is rejected without being parsed.

    Returns:
        A (start, end) pair of (lat, lon) tuples, or (None, None) when the
        input is rejected or any error occurs while parsing or indexing it.
    """
    no_points = (None, None)

    is_linestring = isinstance(wkt_string, str) and wkt_string.startswith('LINESTRING')
    if not is_linestring:
        return no_points

    try:
        coords = loads(wkt_string).coords
        # WKT vertices are ordered (lon, lat); geopy expects (lat, lon), so swap.
        first_lon, first_lat = coords[0][0], coords[0][1]
        last_lon, last_lat = coords[-1][0], coords[-1][1]
    except Exception:
        # Best effort: any failure is reported as "no points".
        return no_points

    return (first_lat, first_lon), (last_lat, last_lon)

# Parse every 'line_geometry' value with get_start_end_points and expand the
# returned (start, end) pair into two new columns on `df`: 'start_point' and
# 'end_point'. `.apply(pd.Series)` is what splits each pair into separate columns.
df[['start_point', 'end_point']] = df['line_geometry'].apply(get_start_end_points).apply(pd.Series)

# Function to calculate max distance between a list of points
def get_max_distance(points):
    """Return the largest pairwise great-circle distance, in km, among points.

    Args:
        points: Iterable of coordinate tuples. Entries that are None, or that
            contain a None component, are ignored.

    Returns:
        The maximum distance in kilometers over all pairs of usable points,
        or 0 when fewer than two usable points remain.
    """
    usable = []
    for candidate in points:
        if candidate is None:
            continue
        if any(value is None for value in candidate):
            continue
        usable.append(candidate)

    if len(usable) < 2:
        return 0

    pair_distances = [
        great_circle(first, second).kilometers
        for first, second in combinations(usable, 2)
    ]
    # Seeding with 0 mirrors a running maximum that starts at 0.
    return max([0, *pair_distances])

# Collect, for every route, the distinct start and end points seen across all
# of its recorded trips.
def _unique_points(values):
    """Return the distinct values of a Series as a plain list."""
    return list(values.unique())


route_points = (
    df.groupby('route_id')
    .agg({'start_point': _unique_points, 'end_point': _unique_points})
    .reset_index()
)

# Widest spread (km) among the start points and among the end points of each route.
route_points = route_points.assign(
    max_start_point_dist_km=route_points['start_point'].apply(get_max_distance),
    max_end_point_dist_km=route_points['end_point'].apply(get_max_distance),
)

# A route is flagged when either endpoint moves by more than 1 km between trips.
start_spread_too_large = route_points['max_start_point_dist_km'] > 1
end_spread_too_large = route_points['max_end_point_dist_km'] > 1
inconsistent_routes = route_points[start_spread_too_large | end_spread_too_large]

print("Inconsistent Route Analysis (Max distance > 1km):")
if inconsistent_routes.empty:
    print("All routes appear to have consistent start and end points within a 1km radius.")
else:
    print("Found routes with significant variation in start/end points:")
    display(inconsistent_routes[['route_id', 'max_start_point_dist_km', 'max_end_point_dist_km']])

Inconsistent Route Analysis (Max distance > 1km):
Found routes with significant variation in start/end points:


Unnamed: 0,route_id,max_start_point_dist_km,max_end_point_dist_km
8,"88th Ave, Surrey to Coquitlam",7.784787,0.0
12,"Barrhaven Centre Station, Ottawa to Campus Sta...",0.0,4.853496
19,"Cole Harbour, Nova Scotia to Shearwater, Nova ...",1.522534,0.0
45,"Stayner, Ontario L0M 1S0 to 43.67371164802397,...",8.436852,0.0
50,"Westshore, Victoria to Downtown Victoria",9.921697,0.002343
