Read in pre-computed files of real-time (RT) and scheduled data and compare them.

In [11]:
# required for pandas to read csv from aws
import s3fs
import os
import pandas as pd
import pendulum
import shapely
import geopandas

In [12]:
# One entry per schedule feed version, with the first and last date
# (both inclusive, 'YYYY-MM-DD') over which that version is compared.
schedule_feeds = [
    {
        'schedule_version': version,
        'feed_start_date': first_day,
        'feed_end_date': last_day,
    }
    for version, first_day, last_day in (
        ('20220507', '2022-05-20', '2022-06-02'),
        ('20220603', '2022-06-04', '2022-06-07'),
        ('20220608', '2022-06-09', '2022-07-08'),
        ('20220709', '2022-07-10', '2022-07-17'),
        ('20220718', '2022-07-19', '2022-07-20'),
    )
]

In [13]:
# For every schedule feed version: load the scheduled route/day/hour summary
# and the daily real-time hourly summaries for the feed's date window, total
# trips per route per day on both sides, keep only route-days present in both,
# aggregate by route and day type (wk / sat / sun / hol), and write the
# real-time-to-scheduled trip ratio back to S3.

# pandas' dt.dayofweek numbers Monday as 0 and Sunday as 6.
day_type_by_weekday = {0: 'wk', 1: 'wk', 2: 'wk', 3: 'wk', 4: 'wk', 5: 'sat', 6: 'sun'}

# Dates classified as holidays regardless of weekday. Parsed to timestamps so
# the comparison against the datetime64 'date' column does not rely on pandas
# implicitly casting strings.
# NOTE(review): this cell previously listed 2022-05-31, which was an ordinary
# Tuesday; Memorial Day 2022 fell on Monday 2022-05-30. Confirm against the
# data provider's holiday service calendar.
holidays = pd.to_datetime(['2022-05-30', '2022-07-04'])

for feed in schedule_feeds:
    start_date = feed['feed_start_date']
    end_date = feed['feed_end_date']
    # Every day from start_date through end_date.
    date_range = list(
        pendulum.period(
            pendulum.from_format(start_date, 'YYYY-MM-DD'),
            pendulum.from_format(end_date, 'YYYY-MM-DD'),
        ).range('days')
    )

    print(f"Loading schedule version {feed['schedule_version']}")
    schedule_raw = pd.read_csv(f's3://chn-ghost-buses-private/schedule_summaries/route_level/schedule_route_daily_hourly_summary_v{feed["schedule_version"]}.csv')

    # Collect the per-day frames and concatenate once: DataFrame.append copied
    # the accumulated frame on every iteration and was removed in pandas 2.0.
    daily_frames = []
    for day in date_range:
        date_str = day.to_date_string()
        print(f"Processing {date_str} at {pendulum.now().to_datetime_string()}")
        daily_data = pd.read_csv(f's3://chn-ghost-buses-private/bus_hourly_summary_v2/{date_str}.csv')
        daily_frames.append(daily_data)
    rt_raw = pd.concat(daily_frames)

    # basic reformatting: work on copies so the raw frames stay untouched
    rt = rt_raw.copy()
    schedule = schedule_raw.copy()
    rt['date'] = pd.to_datetime(rt.data_date, format = '%Y-%m-%d')
    # the real-time files call the route column 'rt'; align it with the schedule
    rt['route_id'] = rt['rt']
    schedule['date'] = pd.to_datetime(schedule.date, format = '%Y-%m-%d')

    # get total by route by day
    rt_daily_by_rte = rt.groupby(by = ['date', 'route_id'])['trip_count'].sum().reset_index()
    sched_daily_by_rte = schedule.groupby(by = ['date', 'route_id'])['trip_count'].sum().reset_index()

    # inner join: only route-days present in both sources are compared
    compare_daily_by_rte = rt_daily_by_rte.merge(
        sched_daily_by_rte,
        how = 'inner',
        on = ['date', 'route_id'],
        suffixes = ['_rt', '_sched'],
    )

    # compare by day of week
    compare_daily_by_rte['dayofweek'] = compare_daily_by_rte['date'].dt.dayofweek
    compare_daily_by_rte['day_type'] = compare_daily_by_rte['dayofweek'].map(day_type_by_weekday)

    # holidays override the weekday-based classification
    compare_daily_by_rte.loc[compare_daily_by_rte['date'].isin(holidays), 'day_type'] = 'hol'

    compare_by_day_type = compare_daily_by_rte.groupby(['route_id', 'day_type'])[['trip_count_rt', 'trip_count_sched']].sum().reset_index()

    # share of scheduled trips that were observed in the real-time data
    compare_by_day_type['ratio'] = compare_by_day_type['trip_count_rt'] / compare_by_day_type['trip_count_sched']

    compare_by_day_type.to_csv(f's3://chn-ghost-buses-private/schedule_rt_comparisons/route_level/schedule_v{feed["schedule_version"]}_realtime_rt_level_comparison_{feed["feed_start_date"]}_to_{feed["feed_end_date"]}.csv', index = False)
    
    

Loading schedule version 20220507
Processing 2022-05-20 at 2022-08-08 21:56:56
Processing 2022-05-21 at 2022-08-08 21:56:56
Processing 2022-05-22 at 2022-08-08 21:56:57
Processing 2022-05-23 at 2022-08-08 21:56:57
Processing 2022-05-24 at 2022-08-08 21:56:57
Processing 2022-05-25 at 2022-08-08 21:56:57
Processing 2022-05-26 at 2022-08-08 21:56:57
Processing 2022-05-27 at 2022-08-08 21:56:58
Processing 2022-05-28 at 2022-08-08 21:56:58
Processing 2022-05-29 at 2022-08-08 21:56:58
Processing 2022-05-30 at 2022-08-08 21:56:58
Processing 2022-05-31 at 2022-08-08 21:56:58
Processing 2022-06-01 at 2022-08-08 21:56:59
Processing 2022-06-02 at 2022-08-08 21:56:59
Loading schedule version 20220603
Processing 2022-06-04 at 2022-08-08 21:57:00
Processing 2022-06-05 at 2022-08-08 21:57:00
Processing 2022-06-06 at 2022-08-08 21:57:00
Processing 2022-06-07 at 2022-08-08 21:57:01
Loading schedule version 20220608
Processing 2022-06-09 at 2022-08-08 21:57:01
Processing 2022-06-10 at 2022-08-08 21:57:0

In [17]:
# build an overall summary from the versioned schedule comparisons
# Read each per-feed comparison CSV written earlier and stack them into one
# frame. The frames are gathered in a list and concatenated once, because
# DataFrame.append re-copied the accumulated frame on every iteration and was
# removed in pandas 2.0.
feed_comparisons = []
for feed in schedule_feeds:
    print(f"Processing {feed['schedule_version']}")
    feed_comp = pd.read_csv(f's3://chn-ghost-buses-private/schedule_rt_comparisons/route_level/schedule_v{feed["schedule_version"]}_realtime_rt_level_comparison_{feed["feed_start_date"]}_to_{feed["feed_end_date"]}.csv')
    print(f"Loaded {len(feed_comp)} rows")
    feed_comparisons.append(feed_comp)
combined = pd.concat(feed_comparisons)
    

Processing 20220507
Loaded 423 rows
Processing 20220603
Loaded 301 rows
Processing 20220608
Loaded 385 rows
Processing 20220709
Loaded 298 rows
Processing 20220718
Loaded 123 rows


In [18]:
# Total real-time and scheduled trips per route and day type across all feeds.
summary = combined.groupby(['route_id', 'day_type'], as_index=False)[
    ['trip_count_rt', 'trip_count_sched']
].sum()

In [19]:
# Real-time trips divided by scheduled trips, per route and day type.
summary['ratio'] = summary['trip_count_rt'].div(summary['trip_count_sched'])

In [24]:
# Name the combined file after the overall date span covered by
# schedule_feeds, so the file name cannot drift out of sync when feeds are
# added or changed. ISO 'YYYY-MM-DD' strings sort chronologically, so min/max
# on the strings gives the earliest start and latest end.
combined_start = min(feed['feed_start_date'] for feed in schedule_feeds)
combined_end = max(feed['feed_end_date'] for feed in schedule_feeds)
summary.to_csv(
    's3://chn-ghost-buses-private/schedule_rt_comparisons/route_level/'
    f'combined_schedule_realtime_rt_level_comparison_{combined_start}_to_{combined_end}.csv',
    index = False,
)
