# PT Route Generation

This notebook loads matched e-scooter trips and queries a public transport (PT) route for each trip so the two modes can be compared.
The merged results are saved to `pt_comparison_all_trips.parquet` for analysis.

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import gc
import sys

sys.path.insert(0, '..')
from routing import get_pt_route, batch_pt_routes

# Notebook-wide figure defaults: whitegrid style, wide figures, slightly larger text.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

## Load Matched Trips

In [None]:
from city_info import normalize_city

# Read only the candidate columns listed here instead of the full table.
# NOTE(review): 'prob_null' is loaded but never read in this notebook — confirm
# whether the best candidate should also be compared against it.
needed_cols = [
    'd_idx', 'd_lat', 'd_lon', 'f_lat', 'f_lon',
    'd_time', 'f_time', 'prob', 'prob_null',
    'opt_route_km', 'opt_route_min', 'provider',
]
candidates = pd.read_parquet(
    '../matching/matching_candidates_scored.parquet',
    columns=needed_cols,
)

# One row per d_idx: keep the candidate with the highest `prob`.
best_idx = candidates.groupby('d_idx')['prob'].idxmax()
matched_trips = candidates.loc[best_idx].copy()
print(f"Matched trips: {len(matched_trips):,}")

# The full candidate table is no longer needed; free it before the next load.
del candidates
gc.collect()

# Attach clustered_provider from the main events file, using that file's index
# as the d_idx join key.
events_cp = pd.read_parquet('../vehicle_events_export.parquet', columns=['clustered_provider'])
events_cp['d_idx'] = events_cp.index
matched_trips = matched_trips.merge(
    events_cp[['d_idx', 'clustered_provider']],
    how='left',
    on='d_idx',
)
del events_cp
gc.collect()

# Actual trip duration in minutes: f_time minus d_time.
for time_col in ('d_time', 'f_time'):
    matched_trips[time_col] = pd.to_datetime(matched_trips[time_col])
trip_duration = matched_trips['f_time'] - matched_trips['d_time']
matched_trips['delta_time'] = trip_duration.dt.total_seconds() / 60
# NOTE(review): this replaces the loaded opt_route_min with the observed duration,
# so every later use of opt_route_min sees delta_time — confirm this is intended.
matched_trips['opt_route_min'] = matched_trips['delta_time']

# Create city column
def get_city(row):
    """Derive the normalized city name for one trip row.

    Uses `clustered_provider` when it is present (with every 'voi_' substring
    removed); otherwise strips the first matching operator prefix from
    `provider`, falling back to the raw provider string. The result is passed
    through `normalize_city`.
    """
    clustered = row['clustered_provider']
    if pd.notna(clustered):
        # NOTE(review): str.replace removes 'voi_' anywhere in the string, not
        # only as a prefix — confirm that is acceptable for the data.
        return normalize_city(clustered.replace('voi_', ''))

    # NOTE(review): assumes provider is always a string here; a missing value
    # would raise on .startswith — verify against the candidates file.
    provider = row['provider']
    known_prefixes = ('dott_', 'bolt_', 'lime_', 'zeus_', 'voi_', 'yoio_', 'hopp_')
    matched_prefix = next((p for p in known_prefixes if provider.startswith(p)), None)
    city = provider if matched_prefix is None else provider[len(matched_prefix):]
    return normalize_city(city)


matched_trips['city'] = matched_trips.apply(get_city, axis=1)

In [None]:
def _plot_distribution(ax, values, color, xlabel, title, unit):
    """Draw a 50-bin histogram of `values` on `ax` and mark its median with a dashed red line."""
    ax.hist(values, bins=50, edgecolor='white', alpha=0.7, color=color)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of Trips')
    ax.set_title(title)
    median_value = values.median()
    ax.axvline(median_value, color='red', linestyle='--',
               label=f"Median: {median_value:.1f} {unit}")
    ax.legend()


fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: trip distance; right panel: trip duration.
_plot_distribution(
    ax1, matched_trips['opt_route_km'], 'coral',
    'Trip Distance (km)', 'E-Scooter Trip Distance Distribution', 'km',
)
_plot_distribution(
    ax2, matched_trips['opt_route_min'], 'teal',
    'Trip Duration (minutes)', 'E-Scooter Trip Duration Distribution', 'min',
)

plt.tight_layout()
plt.show()

## Query Public Transport Routes

In [None]:
# Bucket trips by route distance (km).
# pd.cut bins are right-closed and left-open by default, so without
# include_lowest=True a trip with opt_route_km == 0 would fall outside the
# first bin and get NaN instead of '0-1km'.
matched_trips['distance_bin'] = pd.cut(
    matched_trips['opt_route_km'],
    bins=[0, 1, 2, 3, 5, 10, float('inf')],
    labels=['0-1km', '1-2km', '2-3km', '3-5km', '5-10km', '>10km'],
    include_lowest=True,
)

In [7]:
# Query PT routes for all matched trips. The recorded output of this cell shows
# the run resuming from the checkpoint file rather than starting from scratch.
pt_query_options = {
    'max_workers': 24,
    'checkpoint_path': "pt_results_checkpoint.parquet",
    'chunk_size': 50000,
}
pt_results = batch_pt_routes(matched_trips, **pt_query_options)


[DEBUG] Starting batch_pt_routes...
[DEBUG] DataFrame shape: (2239465, 16)
[DEBUG] Loading checkpoint from pt_results_checkpoint.parquet...
[DEBUG] Checkpoint loaded in 1.1s, shape: (2200000, 12)
[DEBUG] Built completed_indices set in 0.1s
Resuming from checkpoint: 2,200,000 / 2,239,465 already done
[DEBUG] Building remaining indices list...
[DEBUG] Built remaining list in 0.1s, 39,465 items to process


Querying PT routes:  98%|#########8| 2200000/2239465 [00:00<?, ?it/s]

[DEBUG] Starting chunk 1, indices 0 to 39465
[DEBUG] Submitting 39465 tasks to executor...
[DEBUG] Submitted all tasks in 30.8s, waiting for results...
[DEBUG] Chunk 1 complete, processing results...
[DEBUG] Saving checkpoint...
[DEBUG] Checkpoint saved in 2.6s


## Save Results

In [None]:
# Merge PT results with matched_trips
# Prefix the PT result columns with `pt_` so they are distinguishable from the
# e-scooter columns after the join.
pt_results_renamed = pt_results.rename(columns={
    'success': 'pt_success',
    'journey_duration_min': 'pt_journey_min',
    'walking_min': 'pt_walking_min',
    'transit_min': 'pt_transit_min',
    'wait_min': 'pt_wait_min',
    'total_min': 'pt_total_min',
    'transfers': 'pt_transfers',
    'modes': 'pt_modes',
    'walk_only_min': 'pt_walk_only_min',
    'journey_start': 'pt_journey_start',
})
pt_results_renamed = pt_results_renamed.drop(columns=['original_time', 'query_time'], errors='ignore')

# NOTE(review): the join aligns on index, so it assumes pt_results is indexed by
# the row position of matched_trips — confirm against batch_pt_routes.
pt_comparison = matched_trips.reset_index(drop=True).join(pt_results_renamed)
pt_comparison['time_diff_min'] = pt_comparison['pt_total_min'] - pt_comparison['delta_time']
# This column is False (not missing) wherever either duration is NaN, because a
# comparison against NaN evaluates to False. It is kept as a plain bool column.
pt_comparison['escooter_faster'] = pt_comparison['delta_time'] < pt_comparison['pt_total_min']

# Only trips with both durations can be compared. Averaging over all rows would
# count every trip without a PT travel time as "e-scooter not faster" and bias
# the share downward.
comparable = pt_comparison['pt_total_min'].notna() & pt_comparison['delta_time'].notna()
escooter_faster_share = pt_comparison.loc[comparable, 'escooter_faster'].mean()

print(f"PT Success rate: {pt_comparison['pt_success'].mean()*100:.1f}%")
print(f"E-scooter faster: {escooter_faster_share*100:.1f}% "
      f"(of {comparable.sum():,} trips with a PT travel time)")

In [None]:
# Persist the merged comparison table next to this notebook.
output_file = 'pt_comparison_all_trips.parquet'
pt_comparison.to_parquet(output_file)
print(f"Saved {len(pt_comparison):,} trips to {output_file}")