Overall, it's an amazing library! Only 2 complaints/challenges:
1. doesn't deal well with the hours being over 24. Not exactly sure how to fix it... 
2. mapping the routes doesn't give the "normal" route. It gives ALL variations meaning that flexible routes like the 5 are an absolute mess
3. schedule variations are still present (weekday-1)

- [Documentation](https://mrcagney.github.io/gtfs_kit_docs/index.html)
- [Github](https://github.com/mrcagney/gtfs_kit/tree/master)
- [more gtfs links](https://gtfs.org/resources/visualizations/)

# Imports/Installation

In [86]:
import gtfs_kit as gk
import pandas as pd 
import os
import sys
import folium
import numpy as np

In [2]:
# getting functions from the parent directory
library_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if library_path not in sys.path:
    sys.path.append(library_path)
from functions import *

In [3]:
path = "../data/google_transit"

# Using the Library


### Quality Assessment

In [4]:
# gk.list_feed(path)

In [5]:
# removing some rows with null start times
orig_feed = gk.read_feed(path, dist_units='mi')
feed = gk.miscellany.create_shapes(orig_feed, all_trips=True) 
feed = feed.clean()
feed.describe()

Unnamed: 0,indicator,value
0,agencies,[MTA New York City Transit]
1,timezone,America/New_York
2,start_date,20241215
3,end_date,20250117
4,num_routes,29
5,num_trips,20302
6,num_stops,1490
7,num_shapes,214
8,sample_date,20241219
9,num_routes_active_on_sample_date,29


In [6]:
# 214 shape variations!!
feed.trips['shape_id'].nunique()


214

In [7]:
# perfect except for num_stop_time_dists_missing
# gk.miscellany.assess_quality(feed)

In [8]:
gk.miscellany.summarize(feed)

Unnamed: 0,table,column,num_values,num_nonnull_values,num_unique_values,min_value,max_value
0,agency,agency_id,1,1,1,MTA_NYCT,MTA_NYCT
1,agency,agency_name,1,1,1,MTA New York City Transit,MTA New York City Transit
2,agency,agency_url,1,1,1,http://www.mta.info,http://www.mta.info
3,agency,agency_timezone,1,1,1,America/New_York,America/New_York
4,agency,agency_lang,1,1,1,en,en
5,agency,agency_phone,1,1,1,718-330-1234,718-330-1234
0,calendar,service_id,3,3,3,Saturday,Weekday
1,calendar,monday,3,3,2,0,1
2,calendar,tuesday,3,3,2,0,1
3,calendar,wednesday,3,3,2,0,1


In [9]:
week = feed.get_first_week()
# Getting 3 days so that we see the stats for each schedule type
dates = [week[4], week[5], week[6]]  # First Friday, Saturday & Sunday
dates

['20241220', '20241221', '20241222']

In [10]:
trip_stats = feed.compute_trip_stats()
trip_stats.head().T

Unnamed: 0,4,312,684,6,314
trip_id,AFA24GEN-1038-Sunday-00_007200_1..N03R,AFA24GEN-1039-Saturday-00_007200_1..N03R,AFA24GEN-1093-Weekday-00_007450_1..N03R,AFA24GEN-1038-Sunday-00_009200_1..N03R,AFA24GEN-1039-Saturday-00_009200_1..N03R
route_id,1,1,1,1,1
route_short_name,1,1,1,1,1
route_type,1,1,1,1,1
direction_id,0,0,0,0,0
shape_id,shape_006,shape_006,shape_006,shape_006,shape_006
stop_pattern_name,0-1,0-1,0-1,0-1,0-1
num_stops,38,38,38,38,38
start_time,01:12:00,01:12:00,01:14:30,01:32:00,01:32:00
end_time,02:10:30,02:10:30,02:12:30,02:30:30,02:30:30


In [11]:
trip_stats_L = trip_stats[trip_stats['route_id']=='L']

In [12]:
fts = feed.compute_feed_time_series((trip_stats_L), dates, freq='60min')
fts

indicator,num_trip_ends,num_trip_starts,num_trips,service_distance,service_duration,service_speed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-12-20 00:00:00,4,10,9,106,7,14
2024-12-20 01:00:00,9,7,13,75,5,14
2024-12-20 02:00:00,6,6,10,58,4,14
2024-12-20 03:00:00,6,6,10,58,4,14
2024-12-20 04:00:00,6,6,10,58,4,14
...,...,...,...,...,...,...
2024-12-22 19:00:00,28,24,41,256,16,16
2024-12-22 20:00:00,24,24,38,235,15,16
2024-12-22 21:00:00,24,23,37,227,14,16
2024-12-22 22:00:00,20,15,28,171,11,16


## Plots of Service by Day Type

In [13]:
fts['service_speed'].plot()

<Axes: xlabel='datetime'>

In [14]:
fts['num_trips'].plot()

<Axes: xlabel='datetime'>

## Route Timetables

In [15]:
# possible routes feed.routes['route_id'].unique()
route_id = 'L'
route_timetables = feed.build_route_timetable(route_id, dates)

In [16]:
example_trip_id = [route_timetables['trip_id'].iloc[0]]
route_timetables.head()

Unnamed: 0,route_id,trip_id,service_id,trip_headsign,direction_id,shape_id,stop_id,arrival_time,departure_time,stop_sequence,date
407695,L,BFA24GEN-L049-Weekday-00_000650_L..N01R,Weekday,8 Av,0,shape_173,L29N,00:06:30,00:06:30,1,20241220
407696,L,BFA24GEN-L049-Weekday-00_000650_L..N01R,Weekday,8 Av,0,shape_173,L28N,00:08:00,00:08:00,2,20241220
407697,L,BFA24GEN-L049-Weekday-00_000650_L..N01R,Weekday,8 Av,0,shape_173,L27N,00:10:00,00:10:00,3,20241220
407698,L,BFA24GEN-L049-Weekday-00_000650_L..N01R,Weekday,8 Av,0,shape_173,L26N,00:11:30,00:11:30,4,20241220
407699,L,BFA24GEN-L049-Weekday-00_000650_L..N01R,Weekday,8 Av,0,shape_173,L25N,00:13:00,00:13:00,5,20241220


In [17]:
# Getting all Shape IDs in one file 
print(f"There are {route_timetables['shape_id'].nunique()} unique routes for the {route_id}")

There are 5 unique routes for the L


In [18]:
# getting one trip_id per shape_id
unique_shape_ids = route_timetables.drop_duplicates(subset=['route_id', 'service_id', 'shape_id'])
unique_shape_ids

Unnamed: 0,route_id,trip_id,service_id,trip_headsign,direction_id,shape_id,stop_id,arrival_time,departure_time,stop_sequence,date
407695,L,BFA24GEN-L049-Weekday-00_000650_L..N01R,Weekday,8 Av,0,shape_173,L29N,00:06:30,00:06:30,1,20241220
407791,L,BFA24GEN-L049-Weekday-00_005400_L..S01R,Weekday,Canarsie-Rockaway Pkwy,1,shape_170,L01S,00:54:00,00:54:00,1,20241220
408487,L,BFA24GEN-L049-Weekday-00_031950_L..N02R,Weekday,8 Av,0,shape_172,L28N,05:19:30,05:19:30,1,20241220
409581,L,BFA24GEN-L049-Weekday-00_045300_L..S05R,Weekday,Myrtle-Wyckoff Avs,1,shape_169,L01S,07:33:00,07:33:00,1,20241220
410003,L,BFA24GEN-L049-Weekday-00_048750_L..N05R,Weekday,8 Av,0,shape_171,L17N,08:07:30,08:07:30,1,20241220
385056,L,BFA24GEN-L026-Saturday-00_000650_L..N01R,Saturday,8 Av,0,shape_173,L29N,00:06:30,00:06:30,1,20241221
385152,L,BFA24GEN-L026-Saturday-00_005400_L..S01R,Saturday,Canarsie-Rockaway Pkwy,1,shape_170,L01S,00:54:00,00:54:00,1,20241221
386472,L,BFA24GEN-L026-Saturday-00_044400_L..N02R,Saturday,8 Av,0,shape_172,L28N,07:24:00,07:24:00,1,20241221
396998,L,BFA24GEN-L026-Sunday-00_000650_L..N01R,Sunday,8 Av,0,shape_173,L29N,00:06:30,00:06:30,1,20241222
397094,L,BFA24GEN-L026-Sunday-00_005400_L..S01R,Sunday,Canarsie-Rockaway Pkwy,1,shape_170,L01S,00:54:00,00:54:00,1,20241222


In [None]:
# You can map routes with a trip ID
# some routes don't work b/c there's no shape ID in trips for them
unique_shape_id_example = [unique_shape_ids['trip_id'].iloc[0]]
plot = feed.map_trips(trip_ids=unique_shape_id_example, show_stops=True, show_direction=True)
display_gk_plot(plot)


## Trip Stats

In [20]:
friday = dates = [week[4]]
trip_information_fri = feed.compute_route_stats(trip_stats, friday)
# I don't trust the peak information
trip_information_fri = trip_information_fri.drop(columns=['route_short_name', 'date', 'peak_num_trips', 'is_bidirectional'
                                                          ,'route_type', 'is_loop', 'peak_start_time', 'peak_end_time'])
trip_information_fri.round(3)

Unnamed: 0,route_id,num_trips,num_trip_starts,num_trip_ends,num_stop_patterns,start_time,end_time,max_headway,min_headway,mean_headway,service_distance,service_duration,service_speed,mean_trip_distance,mean_trip_duration
0,1,462,462,446,6,00:06:30,25:57:00,9,0,5,6634,434,15,14,1
1,2,324,324,301,7,00:19:00,27:40:30,13,0,7,7929,541,15,24,2
2,3,304,304,300,5,00:02:30,24:48:00,12,2,7,4750,315,15,16,1
3,4,370,370,351,21,00:05:30,27:06:00,12,0,6,7363,445,17,20,1
4,5,329,329,328,29,00:02:00,24:16:00,14,0,7,6337,391,16,19,1
5,6,439,439,419,6,00:07:00,26:06:00,10,0,5,5854,417,14,13,1
6,6X,113,113,113,2,06:09:00,21:15:00,10,3,7,1582,111,14,14,1
7,7,535,535,520,5,00:11:30,25:23:30,10,0,4,5207,342,15,10,1
8,7X,107,107,107,2,06:15:00,22:15:30,10,2,6,1052,66,16,10,1
9,A,375,375,351,22,00:11:30,27:49:00,16,0,7,9343,521,18,25,1


In [21]:
# the headway values don't help at all...
# gk.routes.compute_route_stats_0(trip_stats
#                                 , headway_start_time='00:00:00', headway_end_time='06:00:00', split_directions=True)

In [22]:
# to get the headway for this I'll need to include avg trip duration for each route 
gk.routes.compute_route_time_series_0(trip_stats, friday[0], split_directions=False, freq='60Min')['num_trips']

route_id,1,2,3,4,5,6,6X,7,7X,A,...,H,J,L,M,N,Q,R,SI,W,Z
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-12-20 00:00:00,25,35,17,32,18,27,0,25,0,44,...,16,18,27,21,28,21,18,12,0,0
2024-12-20 01:00:00,44,37,24,42,18,46,0,36,0,58,...,21,36,39,18,33,36,27,18,0,0
2024-12-20 02:00:00,37,52,24,48,18,38,0,27,0,69,...,18,36,30,18,42,36,27,18,0,0
2024-12-20 03:00:00,36,52,24,48,18,37,0,27,0,69,...,18,36,30,18,42,36,27,18,0,0
2024-12-20 04:00:00,37,53,24,50,19,38,0,30,0,70,...,18,36,31,18,42,37,27,18,0,0
2024-12-20 05:00:00,44,58,33,60,25,46,0,40,0,74,...,19,39,39,27,49,43,45,19,0,0
2024-12-20 06:00:00,58,74,45,71,46,57,7,82,9,86,...,20,49,58,44,64,58,65,23,4,0
2024-12-20 07:00:00,79,93,61,93,69,78,18,119,21,103,...,21,58,85,53,74,73,86,33,15,4
2024-12-20 08:00:00,101,104,74,101,86,97,19,130,21,111,...,24,67,117,65,79,83,90,29,23,6
2024-12-20 09:00:00,110,110,79,100,90,104,15,116,14,109,...,23,73,125,70,81,82,88,19,22,1


In [23]:
## this ignores trips that end after 24:00
# trip_information_fri.sort_values(by='num_trip_ends')
trip_information_fri.sort_values(by='service_speed').head()

Unnamed: 0,route_id,num_trips,num_trip_starts,num_trip_ends,num_stop_patterns,start_time,end_time,max_headway,min_headway,mean_headway,service_distance,service_duration,service_speed,mean_trip_distance,mean_trip_duration
15,FS,238,238,238,2,00:00:00,23:56:30,10,10,10,322,27,12,1,0
27,W,187,187,187,4,06:13:00,22:55:30,18,2,10,1805,138,13,10,1
25,R,303,303,297,9,00:15:00,25:17:00,12,2,7,5676,410,14,19,1
5,6,439,439,419,6,00:07:00,26:06:00,10,0,5,5854,417,14,13,1
6,6X,113,113,113,2,06:09:00,21:15:00,10,3,7,1582,111,14,14,1


## Stop Patterns

In [24]:
#  trip_stats = feed.compute_trip_stats()
trip_stats.head()

Unnamed: 0,trip_id,route_id,route_short_name,route_type,direction_id,shape_id,stop_pattern_name,num_stops,start_time,end_time,start_stop_id,end_stop_id,is_loop,duration,distance,speed
4,AFA24GEN-1038-Sunday-00_007200_1..N03R,1,1,1,0,shape_006,0-1,38,01:12:00,02:10:30,142N,101N,0,1,15,15
312,AFA24GEN-1039-Saturday-00_007200_1..N03R,1,1,1,0,shape_006,0-1,38,01:12:00,02:10:30,142N,101N,0,1,15,15
684,AFA24GEN-1093-Weekday-00_007450_1..N03R,1,1,1,0,shape_006,0-1,38,01:14:30,02:12:30,142N,101N,0,1,15,15
6,AFA24GEN-1038-Sunday-00_009200_1..N03R,1,1,1,0,shape_006,0-1,38,01:32:00,02:30:30,142N,101N,0,1,15,15
314,AFA24GEN-1039-Saturday-00_009200_1..N03R,1,1,1,0,shape_006,0-1,38,01:32:00,02:30:30,142N,101N,0,1,15,15


In [25]:
unique_f_stop_patterns = trip_stats[(trip_stats['route_id']=='F')].drop_duplicates(subset='stop_pattern_name').sort_values(by='stop_pattern_name')
unique_f_stop_patterns.head(3)

Unnamed: 0,trip_id,route_id,route_short_name,route_type,direction_id,shape_id,stop_pattern_name,num_stops,start_time,end_time,start_stop_id,end_stop_id,is_loop,duration,distance,speed
10336,BFA24GEN-F078-Weekday-00_030750_F..N69R,F,F,1,0,shape_122,0-1,45,05:07:30,06:37:30,D43N,F01N,0,2,26,17
9900,BFA24GEN-F045-Saturday-00_014150_F..N07R,F,F,1,0,shape_124,0-2,55,02:21:30,04:05:30,D43N,F01N,0,2,26,15
10398,BFA24GEN-F078-Weekday-00_049150_F..N68R,F,F,1,0,shape_145,0-3,40,08:11:30,09:42:00,F35N,F01N,0,2,24,16


In [26]:
# the Z has 6 trips with different stopping patterns
# I think the stop patterns are for number of variations of each direction shape
## e.g.: 0-1 is the most popular stop pattern going south, 0-2 is 2nd most popular, etc...
trip_stats[(trip_stats['route_id']=='F')].groupby('stop_pattern_name')['duration'].count()

stop_pattern_name
0-1    256
0-2     96
0-3     32
0-4     11
0-5      7
0-6      1
1-1    299
1-2     65
1-3     38
1-4      1
Name: duration, dtype: int64

In [27]:
unique_shape_ids1 = unique_f_stop_patterns
unique_shape_ids1.head(3)

Unnamed: 0,trip_id,route_id,route_short_name,route_type,direction_id,shape_id,stop_pattern_name,num_stops,start_time,end_time,start_stop_id,end_stop_id,is_loop,duration,distance,speed
10336,BFA24GEN-F078-Weekday-00_030750_F..N69R,F,F,1,0,shape_122,0-1,45,05:07:30,06:37:30,D43N,F01N,0,2,26,17
9900,BFA24GEN-F045-Saturday-00_014150_F..N07R,F,F,1,0,shape_124,0-2,55,02:21:30,04:05:30,D43N,F01N,0,2,26,15
10398,BFA24GEN-F078-Weekday-00_049150_F..N68R,F,F,1,0,shape_145,0-3,40,08:11:30,09:42:00,F35N,F01N,0,2,24,16


In [42]:
unique_shape_id_example = unique_shape_ids1['trip_id'].iloc[6]
# weekday-2 trips happen in august to sept (20240812,20240902)
feed.trips[feed.trips['trip_id']==unique_shape_id_example]

Unnamed: 0,route_id,trip_id,service_id,trip_headsign,direction_id,shape_id
10335,F,BFA24GEN-F078-Weekday-00_030250_F..S69R,Weekday,Coney Island-Stillwell Av,1,shape_140


In [43]:
# F
# 0-2 stop pattern, shape=186, local
plot = feed.map_trips(trip_ids=[unique_shape_id_example], show_stops=True, show_direction=True)
display_gk_plot(plot)

## Mapping Routes

In [44]:
# will show different variations depending on the service variations
plot = gk.routes.map_routes(feed, route_ids=['F'], show_stops=True)
display_gk_plot(plot)

## Mean duration by shape ID

In [31]:
duration_by_shape = trip_stats.groupby(['route_id', 'stop_pattern_name', 'shape_id']).mean('duration')
duration_by_shape_no_S = duration_by_shape.reset_index()[~duration_by_shape.reset_index().route_id.isin(['FS', 'GS', 'H'])]

In [32]:
# A has the longest service (barely beating the 2)
# Ironically the A shuttle (lefferts branch late night) is the shortest non-shuttle
# duration_by_shape.sort_values(by='duration')
duration_by_shape_no_S.sort_values(by='duration')


Unnamed: 0,route_id,stop_pattern_name,shape_id,route_type,direction_id,num_stops,is_loop,duration,distance,speed
100,A,1-4,shape_098,1,1,8,0,0,3,14
88,A,0-4,shape_100,1,0,8,0,0,3,14
61,5,1-2,shape_062,1,1,6,0,0,4,21
47,5,0-2,shape_014,1,0,6,0,0,4,20
172,M,1-3,shape_177,1,1,8,0,0,3,13
...,...,...,...,...,...,...,...,...,...,...
8,2,0-3,shape_037,1,0,52,0,2,26,15
10,2,1-2,shape_007,1,1,61,0,2,24,14
87,A,0-3,shape_163,1,0,59,0,2,31,17
11,2,1-3,shape_009,1,1,52,0,2,26,14


In [33]:
# the 2 has the most stops (61)
duration_by_shape.sort_values(by='num_stops')
duration_by_shape_no_S.sort_values(by='num_stops')[0:15]

Unnamed: 0,route_id,stop_pattern_name,shape_id,route_type,direction_id,num_stops,is_loop,duration,distance,speed
61,5,1-2,shape_062,1,1,6,0,0,4,21
47,5,0-2,shape_014,1,0,6,0,0,4,20
100,A,1-4,shape_098,1,1,8,0,0,3,14
88,A,0-4,shape_100,1,0,8,0,0,3,14
169,M,0-3,shape_174,1,0,8,0,0,3,13
199,R,1-3,shape_201,1,1,8,0,0,3,14
172,M,1-3,shape_177,1,1,8,0,0,3,13
14,3,0-2,shape_003,1,0,9,0,0,6,16
16,3,1-2,shape_041,1,1,9,0,0,6,16
206,SI,1-2,shape_211,2,1,10,0,1,14,25


### Mapping one of the routes

In [34]:
unique_shape_ids

Unnamed: 0,route_id,trip_id,service_id,trip_headsign,direction_id,shape_id,stop_id,arrival_time,departure_time,stop_sequence,date
407695,L,BFA24GEN-L049-Weekday-00_000650_L..N01R,Weekday,8 Av,0,shape_173,L29N,00:06:30,00:06:30,1,20241220
407791,L,BFA24GEN-L049-Weekday-00_005400_L..S01R,Weekday,Canarsie-Rockaway Pkwy,1,shape_170,L01S,00:54:00,00:54:00,1,20241220
408487,L,BFA24GEN-L049-Weekday-00_031950_L..N02R,Weekday,8 Av,0,shape_172,L28N,05:19:30,05:19:30,1,20241220
409581,L,BFA24GEN-L049-Weekday-00_045300_L..S05R,Weekday,Myrtle-Wyckoff Avs,1,shape_169,L01S,07:33:00,07:33:00,1,20241220
410003,L,BFA24GEN-L049-Weekday-00_048750_L..N05R,Weekday,8 Av,0,shape_171,L17N,08:07:30,08:07:30,1,20241220
385056,L,BFA24GEN-L026-Saturday-00_000650_L..N01R,Saturday,8 Av,0,shape_173,L29N,00:06:30,00:06:30,1,20241221
385152,L,BFA24GEN-L026-Saturday-00_005400_L..S01R,Saturday,Canarsie-Rockaway Pkwy,1,shape_170,L01S,00:54:00,00:54:00,1,20241221
386472,L,BFA24GEN-L026-Saturday-00_044400_L..N02R,Saturday,8 Av,0,shape_172,L28N,07:24:00,07:24:00,1,20241221
396998,L,BFA24GEN-L026-Sunday-00_000650_L..N01R,Sunday,8 Av,0,shape_173,L29N,00:06:30,00:06:30,1,20241222
397094,L,BFA24GEN-L026-Sunday-00_005400_L..S01R,Sunday,Canarsie-Rockaway Pkwy,1,shape_170,L01S,00:54:00,00:54:00,1,20241222


In [35]:
shape_ids_R = collect_shape_ids_for_route(feed, 'R')
print(shape_ids_R.keys())
plot = display_gk_plot(shape_ids_R['shape_199'])
plot

dict_keys(['shape_199', 'shape_202', 'shape_156', 'shape_205', 'shape_201', 'shape_203', 'shape_155', 'shape_200', 'shape_204'])


In [36]:
fig = folium.Figure(width=800, height=500)
fig.add_child(shape_ids_R[0])


KeyError: 0

# Testing Route ID identification

In [46]:
stop_times_df = pd.read_csv(f"../data/google_transit/stop_times.txt")
stops_df = pd.read_csv(f"../data/google_transit/stops.txt")
trips_df = pd.read_csv(f"../data/google_transit/trips.txt")

In [78]:
first_stop_in_trip = stop_times_df[stop_times_df['stop_sequence']==1]
first_stop_in_trip = first_stop_in_trip.drop(columns=['stop_sequence'])
# first_stop_in_trip['departure_hour'] = first_stop_in_trip['departure_time'].dt.hour
# first_stop_in_trip['route_id'] = [x.split("_")[-1].split('.')[0] 
#                                         for x in first_stop_in_trip['trip_id']]

In [79]:
first_stop_in_trip['route_id'] = [x for x in trips_df['service_id'] if ]

Unnamed: 0,trip_id,stop_id,arrival_time,departure_time
0,AFA24GEN-1038-Sunday-00_000600_1..S03R,101S,00:06:00,00:06:00
38,AFA24GEN-1038-Sunday-00_002600_1..S03R,101S,00:26:00,00:26:00
76,AFA24GEN-1038-Sunday-00_004600_1..S03R,101S,00:46:00,00:46:00
114,AFA24GEN-1038-Sunday-00_006600_1..S03R,101S,01:06:00,01:06:00
152,AFA24GEN-1038-Sunday-00_007200_1..N03R,142N,01:12:00,01:12:00
...,...,...,...,...
564955,SIR-FA2017-SI017-Weekday-08_138600_SI..S03R,S31S,23:06:00,23:06:00
564976,SIR-FA2017-SI017-Weekday-08_141100_SI..N03R,S09N,23:31:00,23:31:00
564997,SIR-FA2017-SI017-Weekday-08_141600_SI..S03R,S31S,23:36:00,23:36:00
565018,SIR-FA2017-SI017-Weekday-08_144100_SI..N03R,S09N,24:01:00,24:01:00


In [82]:
trips_df.drop(columns=['direction_id', 'service_id', 'trip_headsign', 'shape_id'])
first_stop_in_trip_w_new_route_id = first_stop_in_trip.merge(trips_df.drop(columns=['direction_id', 'service_id', 'trip_headsign', 'shape_id']), on='trip_id')

In [65]:
first_stop_in_trip_w_new_route_id = first_stop_in_trip.merge(trips_df.drop(columns=['direction_id']), on='trip_id')
first_stop_in_trip_w_new_route_id = first_stop_in_trip_w_new_route_id.drop(columns=['trip_headsign', 'shape_id'])
first_stop_in_trip_w_new_route_id.columns = ['trip_id', 'stop_id', 'arrival_time', 'departure_time', 'route_id_guess','route_id_reality', 'service_id']

In [None]:
# 423 trips that were different
## nothing in the trip id would have helped...
first_stop_in_trip_w_new_route_id[first_stop_in_trip_w_new_route_id['route_id_guess']!=first_stop_in_trip_w_new_route_id['route_id_reality']].head(10)

Unnamed: 0,trip_id,stop_id,arrival_time,departure_time,route_id_guess,route_id_reality,service_id
5165,AFA24GEN-6090-Weekday-00_036900_6..S02R,601S,06:09:00,06:09:00,6,6X,Weekday
5168,AFA24GEN-6090-Weekday-00_037800_6..S02R,601S,06:18:00,06:18:00,6,6X,Weekday
5172,AFA24GEN-6090-Weekday-00_038700_6..S02R,601S,06:27:00,06:27:00,6,6X,Weekday
5175,AFA24GEN-6090-Weekday-00_039500_6..S02R,601S,06:35:00,06:35:00,6,6X,Weekday
5178,AFA24GEN-6090-Weekday-00_040250_6..S02R,601S,06:42:30,06:42:30,6,6X,Weekday
5181,AFA24GEN-6090-Weekday-00_041000_6..S02R,601S,06:50:00,06:50:00,6,6X,Weekday
5184,AFA24GEN-6090-Weekday-00_041650_6..S02R,601S,06:56:30,06:56:30,6,6X,Weekday
5186,AFA24GEN-6090-Weekday-00_042200_6..S02R,601S,07:02:00,07:02:00,6,6X,Weekday
5188,AFA24GEN-6090-Weekday-00_042550_6..S02R,601S,07:05:30,07:05:30,6,6X,Weekday
5191,AFA24GEN-6090-Weekday-00_043150_6..S02R,601S,07:11:30,07:11:30,6,6X,Weekday


In [91]:
stop_times_df = pd.read_csv(f"../data/google_transit/stop_times.txt")
str_departure_time = [str_time_to_minutes(x) for x in stop_times_df['departure_time']]
stop_times_df['str_departure_time'] = str_departure_time
# getting the amount of time each trip takes
trip_time_diff = stop_times_df.groupby('trip_id')['str_departure_time'].agg(np.ptp)
valid_trip_times = pd.DataFrame(trip_time_diff).reset_index()
# valid_trip_times['route_id'] = valid_trip_times.merge(trips_df.drop(
#                                                         columns=['direction_id', 'service_id', 'trip_headsign', 'shape_id']), on='trip_id')

In [92]:
valid_trip_times
# trip_time_diff

Unnamed: 0,trip_id,str_departure_time
0,AFA24GEN-1038-Sunday-00_000600_1..S03R,58
1,AFA24GEN-1038-Sunday-00_002600_1..S03R,58
2,AFA24GEN-1038-Sunday-00_004600_1..S03R,58
3,AFA24GEN-1038-Sunday-00_006600_1..S03R,58
4,AFA24GEN-1038-Sunday-00_007200_1..N03R,58
...,...,...
20297,SIR-FA2017-SI017-Weekday-08_138600_SI..S03R,42
20298,SIR-FA2017-SI017-Weekday-08_141100_SI..N03R,42
20299,SIR-FA2017-SI017-Weekday-08_141600_SI..S03R,42
20300,SIR-FA2017-SI017-Weekday-08_144100_SI..N03R,42


In [94]:
valid_trip_times = valid_trip_times.merge(trips_df.drop(columns=['direction_id', 'service_id', 'trip_headsign', 'shape_id']), on='trip_id')

Unnamed: 0,trip_id,str_departure_time,route_id
0,AFA24GEN-1038-Sunday-00_000600_1..S03R,58,1
1,AFA24GEN-1038-Sunday-00_002600_1..S03R,58,1
2,AFA24GEN-1038-Sunday-00_004600_1..S03R,58,1
3,AFA24GEN-1038-Sunday-00_006600_1..S03R,58,1
4,AFA24GEN-1038-Sunday-00_007200_1..N03R,58,1
...,...,...,...
20297,SIR-FA2017-SI017-Weekday-08_138600_SI..S03R,42,SI
20298,SIR-FA2017-SI017-Weekday-08_141100_SI..N03R,42,SI
20299,SIR-FA2017-SI017-Weekday-08_141600_SI..S03R,42,SI
20300,SIR-FA2017-SI017-Weekday-08_144100_SI..N03R,42,SI
