In [None]:
# --- Rush Hour Analysis most recent year
# For every trip (tripid + endpoints + trip_dist) compare the median `traffic_min`
# observed during rush hour with the median observed outside rush hour.
# Only weekday rows with trip_dist >= 10 and mode == 0 inside the date window are used.

df['time_dt'] = pd.to_datetime(df['time_full_str'])
start_date = pd.Timestamp("2024-10-01")
end_date = pd.Timestamp("2025-09-30")

# `end_date` is midnight at the *start* of 2025-09-30, so `time_dt <= end_date`
# would silently drop every observation made later that day. Use an exclusive
# upper bound one day later so the last day of the window is fully included.
end_exclusive = end_date + pd.Timedelta(days=1)
in_window = (df['time_dt'] >= start_date) & (df['time_dt'] < end_exclusive)
is_weekday = df['time_dt'].dt.weekday < 5  # Monday=0 ... Friday=4
all_data = df[in_window & is_weekday & (df['trip_dist'] >= 10) & (df['mode'] == 0)].copy()

# Rush hour = hours 8-9 and 17-18 (i.e. 08:00-09:59 and 17:00-18:59).
hour = all_data['time_dt'].dt.hour
is_rush_hour = ((hour >= 8) & (hour < 10)) | ((hour >= 17) & (hour < 19))
all_data['period'] = 'Non-Rush Hour'
all_data.loc[is_rush_hour, 'period'] = 'Rush Hour'

grouping_cols = ['tripid', 'cityname_corrected', 'lat_orig', 'lon_orig', 'lat_dest', 'lon_dest', 'trip_dist', 'period']
agg_times = all_data.groupby(grouping_cols)['traffic_min'].agg(['median', 'count']).reset_index()

# One row per trip, with the per-period median/count side by side.
comparison_table = agg_times.pivot_table(
    index=[col for col in grouping_cols if col != 'period'],
    columns='period',
    values=['median', 'count']
).reset_index()

# Flatten the (statistic, period) column MultiIndex; key columns come out as "<name>_".
comparison_table.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in comparison_table.columns.values]
comparison_table = comparison_table.rename(columns={
    'median_Non-Rush Hour': 'median_non_rush_hour',
    'median_Rush Hour': 'median_rush_hour',
    'count_Non-Rush Hour': 'count_non_rush_hour',
    'count_Rush Hour': 'count_rush_hour',
    'tripid_': 'tripid',
    'cityname_corrected_': 'cityname_corrected',
    'lat_orig_': 'lat_orig',
    'lon_orig_': 'lon_orig',
    'lat_dest_': 'lat_dest',
    'lon_dest_': 'lon_dest',
    'trip_dist_': 'trip_dist'
})

# A period with no observations has a count of 0, but its median is *unknown*,
# not 0 minutes. Fill only the counts; leaving the medians as NaN keeps
# rush_hour_diff / rush_hour_impact as NaN for such trips instead of producing
# a fake slowdown or +/-inf.
for col in ['count_non_rush_hour', 'count_rush_hour']:
    if col in comparison_table.columns:
        comparison_table[col] = comparison_table[col].fillna(0).astype(int)

median_rush = comparison_table['median_rush_hour']
median_non_rush = comparison_table['median_non_rush_hour']
comparison_table['rush_hour_diff'] = median_rush - median_non_rush
# Percent change relative to non-rush hour; a zero baseline yields NaN rather than inf.
comparison_table['rush_hour_impact'] = ((median_rush - median_non_rush) / median_non_rush.where(median_non_rush != 0)) * 100

comparison_table['origin'] = comparison_table['lat_orig'].astype(str) + ',' + comparison_table['lon_orig'].astype(str)
comparison_table['dest'] = comparison_table['lat_dest'].astype(str) + ',' + comparison_table['lon_dest'].astype(str)

# Largest slowdown first; rows with an undefined diff (NaN) sort to the end.
final_table = comparison_table.sort_values('rush_hour_diff', ascending=False)

# Only show trips with at least 3 observations in each period.
has_enough_obs = (final_table["count_non_rush_hour"] >= 3) & (final_table["count_rush_hour"] >= 3)
display_cols = ['tripid', 'cityname_corrected', 'origin', 'dest', 'trip_dist', 'median_non_rush_hour', 'median_rush_hour', 'rush_hour_impact', 'count_non_rush_hour', 'count_rush_hour']
display(final_table.loc[has_enough_obs, display_cols].head(20))


In [None]:
# Per-trip comparison of median `traffic_min` in rush hour vs. non-rush hour.
# Expects `all_data` to already carry a 'period' column with the values
# 'Rush Hour' / 'Non-Rush Hour'.
grouping_cols = ['tripid', 'cityname_corrected', 'lat_orig', 'lon_orig', 'lat_dest', 'lon_dest', 'trip_dist', 'period']
agg_times = all_data.groupby(grouping_cols)['traffic_min'].agg(['median', 'count']).reset_index()

# One row per trip, with the per-period median/count side by side.
comparison_table = agg_times.pivot_table(
    index=[col for col in grouping_cols if col != 'period'],
    columns='period',
    values=['median', 'count']
).reset_index()

# Flatten the (statistic, period) column MultiIndex; key columns come out as "<name>_".
comparison_table.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in comparison_table.columns.values]
comparison_table = comparison_table.rename(columns={
    'median_Non-Rush Hour': 'median_non_rush_hour',
    'median_Rush Hour': 'median_rush_hour',
    'count_Non-Rush Hour': 'count_non_rush_hour',
    'count_Rush Hour': 'count_rush_hour',
    'tripid_': 'tripid',
    'cityname_corrected_': 'cityname_corrected',
    'lat_orig_': 'lat_orig',
    'lon_orig_': 'lon_orig',
    'lat_dest_': 'lat_dest',
    'lon_dest_': 'lon_dest',
    'trip_dist_': 'trip_dist'
})

# A period with no observations has a count of 0, but its median is unknown,
# not 0 minutes. Fill only the counts so that trips missing one period get a
# NaN diff/impact instead of a fabricated slowdown or +/-inf.
for col in ['count_non_rush_hour', 'count_rush_hour']:
    if col in comparison_table.columns:
        comparison_table[col] = comparison_table[col].fillna(0).astype(int)

median_rush = comparison_table['median_rush_hour']
median_non_rush = comparison_table['median_non_rush_hour']
comparison_table["rush_hour_diff"] = median_rush - median_non_rush
# Percent change relative to non-rush hour; a zero baseline yields NaN rather than inf.
comparison_table['rush_hour_impact'] = ((median_rush - median_non_rush) / median_non_rush.where(median_non_rush != 0)) * 100

comparison_table['origin'] = comparison_table['lat_orig'].astype(str) + ',' + comparison_table['lon_orig'].astype(str)
comparison_table['dest'] = comparison_table['lat_dest'].astype(str) + ',' + comparison_table['lon_dest'].astype(str)

# Largest slowdown first; rows with an undefined diff (NaN) sort to the end.
final_table = comparison_table.sort_values('rush_hour_diff', ascending=False)

# Only show trips with at least 3 observations in each period.
has_enough_obs = (final_table["count_non_rush_hour"] >= 3) & (final_table["count_rush_hour"] >= 3)
display_cols = ['tripid', 'cityname_corrected', 'origin', 'dest', 'trip_dist', 'median_non_rush_hour', 'median_rush_hour', 'rush_hour_impact', 'count_non_rush_hour', 'count_rush_hour']
display(final_table.loc[has_enough_obs, display_cols].head(20))


In [None]:
# Downcast the four coordinate columns to float32 to reduce memory use
for coord_col in ('lat_orig', 'lon_orig', 'lat_dest', 'lon_dest'):
    all_data[coord_col] = all_data[coord_col].astype(np.float32)

def cluster_and_assign(full_coords, sample_size=10000, eps_km=0.5, random_state=None):
    """Cluster coordinates with DBSCAN on a random sample and label every point.

    At most ``sample_size`` points are clustered with haversine DBSCAN
    (``min_samples=1``, so every sampled point belongs to a cluster). Each
    point in ``full_coords`` then receives the label of its nearest sampled
    point. The nearest-neighbour step has no distance cut-off, so a point
    farther than ``eps_km`` from every sampled point is still assigned to the
    closest cluster.

    Parameters
    ----------
    full_coords : array-like of shape (n_points, 2)
        Coordinates in degrees, one row per point. Callers in this notebook
        pass columns in (latitude, longitude) order.
    sample_size : int, default 10000
        Maximum number of points used to fit DBSCAN. Must be >= 1.
    eps_km : float, default 0.5
        DBSCAN neighbourhood radius in kilometres (converted to radians using
        an Earth radius of 6371 km).
    random_state : int, numpy.random.Generator or None, default None
        Seed or generator used to draw the sample. ``None`` keeps the previous
        behaviour of drawing from NumPy's global random state, so an earlier
        ``np.random.seed(...)`` call is still honoured.

    Returns
    -------
    numpy.ndarray of shape (n_points,)
        Integer cluster label for every input row.

    Raises
    ------
    ValueError
        If ``sample_size`` is smaller than 1 while there are points to cluster.
    """
    full_coords = np.asarray(full_coords)
    n_points = len(full_coords)

    # DBSCAN refuses an empty array; an empty input simply has no labels.
    if n_points == 0:
        return np.empty(0, dtype=np.intp)
    if sample_size < 1:
        raise ValueError("sample_size must be a positive integer")

    # Convert once; both the sample and the full query reuse these radians.
    coords_rad = np.radians(full_coords)

    # Sample for clustering
    if n_points > sample_size:
        rng = np.random if random_state is None else np.random.default_rng(random_state)
        sample_idx = rng.choice(n_points, sample_size, replace=False)
        sample_rad = coords_rad[sample_idx]
    else:
        sample_rad = coords_rad

    # Cluster the sample (haversine works on radians, so eps is km / Earth radius)
    earth_radius_km = 6371
    db = DBSCAN(eps=eps_km / earth_radius_km, min_samples=1, metric='haversine', algorithm='ball_tree')
    db.fit(sample_rad)
    sample_labels = db.labels_

    # Assign clusters to all points using nearest neighbor in the sample
    tree = BallTree(sample_rad, metric='haversine')
    _, nearest_idx = tree.query(coords_rad, k=1)
    return sample_labels[nearest_idx.ravel()]

# Label each trip's origin with a spatial cluster id
orig_coords = all_data.loc[:, ['lat_orig', 'lon_orig']].to_numpy()
all_data['origin_cluster'] = cluster_and_assign(orig_coords)

# Same for each trip's destination
dest_coords = all_data.loc[:, ['lat_dest', 'lon_dest']].to_numpy()
all_data['dest_cluster'] = cluster_and_assign(dest_coords)

# A route group is the pair "<origin cluster>_<destination cluster>"
all_data['route_group'] = all_data['origin_cluster'].astype(str).str.cat(
    all_data['dest_cluster'].astype(str), sep='_'
)

all_data.head()

In [None]:
# --- Rush Hour Analysis by Route Group (without trip_dist in grouping) ---
# Groups by route_group and city, compares the mean `traffic_min` in rush hour
# against non-rush hour, then merges one representative trip_dist / origin / dest
# per route_group back in.

grouping_cols = ['route_group', 'cityname_corrected', 'period']

# Ensure 'period' column exists
if 'period' not in all_data.columns:
    hour = all_data['time_dt'].dt.hour
    is_rush_hour = ((hour >= 8) & (hour < 10)) | ((hour >= 17) & (hour < 19))
    all_data['period'] = 'Non-Rush Hour'
    all_data.loc[is_rush_hour, 'period'] = 'Rush Hour'

agg_times = all_data.groupby(grouping_cols)['traffic_min'].agg(['mean', 'count']).reset_index()

# One row per (route_group, city), with the per-period mean/count side by side.
comparison_table = agg_times.pivot_table(
    index=[col for col in grouping_cols if col != 'period'],
    columns='period',
    values=['mean', 'count']
).reset_index()

# Flatten the (statistic, period) column MultiIndex; key columns come out as "<name>_".
comparison_table.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in comparison_table.columns.values]
comparison_table = comparison_table.rename(columns={
    'mean_Non-Rush Hour': 'mean_non_rush_hour',
    'mean_Rush Hour': 'mean_rush_hour',
    'count_Non-Rush Hour': 'count_non_rush_hour',
    'count_Rush Hour': 'count_rush_hour',
    'route_group_': 'route_group',
    'cityname_corrected_': 'cityname_corrected'
})

# A period with no observations has a count of 0, but its mean is unknown,
# not 0 minutes. Fill only the counts so rush_hour_diff stays NaN for groups
# that lack one of the two periods.
for col in ['count_non_rush_hour', 'count_rush_hour']:
    if col in comparison_table.columns:
        comparison_table[col] = comparison_table[col].fillna(0).astype(int)

comparison_table["rush_hour_diff"] = comparison_table['mean_rush_hour'] - comparison_table['mean_non_rush_hour']
# comparison_table['rush_hour_impact'] = ((comparison_table['mean_rush_hour'] - comparison_table['mean_non_rush_hour']) / comparison_table['mean_non_rush_hour']) * 100

# Merge trip_dist back in (using first value for each route_group).
# NOTE(review): none of the cells visible here add 'origin'/'dest' to `all_data`
# (they are only built on the per-trip comparison table), so fall back to
# deriving them from the coordinates when they are absent.
if {'origin', 'dest'}.issubset(all_data.columns):
    trip_dist_map = all_data.groupby('route_group')[['trip_dist', "origin", "dest"]].first().reset_index()
else:
    first_rows = all_data.groupby('route_group')[['trip_dist', 'lat_orig', 'lon_orig', 'lat_dest', 'lon_dest']].first().reset_index()
    first_rows['origin'] = first_rows['lat_orig'].astype(str) + ',' + first_rows['lon_orig'].astype(str)
    first_rows['dest'] = first_rows['lat_dest'].astype(str) + ',' + first_rows['lon_dest'].astype(str)
    trip_dist_map = first_rows[['route_group', 'trip_dist', 'origin', 'dest']]
comparison_table = comparison_table.merge(trip_dist_map, on='route_group', how='left')

# Show the top 10 route groups by rush_hour_diff (mean rush-hour minus mean
# non-rush-hour minutes), restricted to groups with enough observations.
comparison_table = comparison_table.sort_values('rush_hour_diff', ascending=False)
has_enough_obs = (comparison_table["count_non_rush_hour"] >= 20) & (comparison_table["count_rush_hour"] >= 15)
# This table aggregates with the mean, so the rush-hour column is 'mean_rush_hour'
# (selecting 'median_rush_hour' here raised a KeyError).
display_cols = ['route_group', 'cityname_corrected', 'trip_dist', "origin", "dest", 'mean_non_rush_hour', 'mean_rush_hour', 'count_non_rush_hour', 'count_rush_hour', "rush_hour_diff"]
display_table = comparison_table.loc[has_enough_obs, display_cols]
display_table.head(10)