## GTFS
In order to compute accessibility based on GTFS data (General Transit Feed Specification; the feed can be downloaded from ftp://199.203.58.18/), we will first perform basic processing of the data.
We are using the pandas library.

The code is based on the following structure of GTFS tables:
![GTFS Tables](../../input_data/GTFS_tables.PNG)

In [329]:
import pandas as pd
import datetime as dt
import pickle
from tqdm.auto import tqdm
tqdm.pandas()

# Directory holding the GTFS text files read by this notebook
# (calendar.txt, trips.txt, stop_times.txt, stops.txt). The path is relative,
# so it is resolved against the kernel's working directory.
DATA_PATH = '../../../input_data/synthetic_examples/input_data/Test1/'

In [330]:
# Directory the node tables are pickled into (all_nodes.pkl, morning_nodes.pkl).
# NOTE(review): no visible cell creates this directory, so it presumably has to
# exist before the to_pickle cells run — confirm.
OUTPUT_PATH = '../../../output_data/validation/test1/'

In [331]:
# The single service day analysed by this notebook, as a datetime at midnight.
# It is used both to pick the services active on that date and as the base
# that GTFS clock times are added to.
DAY = dt.datetime(2016, 1, 4)

In [386]:
# datetime.weekday() returns 0 for Monday through 6 for Sunday, so this list
# must start with 'monday' for the lookup below to be correct. The names are
# the weekday column names used to index calendar_df later on.
weekdays = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
# Name of the calendar column that flags whether a service runs on DAY.
DAY_OF_WEEK = weekdays[DAY.weekday()]

In [388]:
DAY_OF_WEEK

'monday'

## Process Calendar - Get trips for a single day

In [389]:
# Load calendar.
# 'utf-8-sig' drops a leading byte-order mark, which is the usual source of
# "hidden" characters in the first header name of a GTFS text file.
calendar_df = pd.read_csv(DATA_PATH + 'calendar.txt', encoding='utf-8-sig')

# Clean the header names instead of overwriting them by position: a positional
# overwrite silently mislabels the weekday flags whenever a feed lists the days
# in a different order (the GTFS reference order starts with monday, while this
# notebook's layout starts with sunday).
calendar_df.columns = calendar_df.columns.str.strip()

expected_calendar_columns = ['service_id',
                             'sunday',
                             'monday',
                             'tuesday',
                             'wednesday',
                             'thursday',
                             'friday',
                             'saturday',
                             'start_date',
                             'end_date']
missing_calendar_columns = [c for c in expected_calendar_columns if c not in calendar_df.columns]
assert not missing_calendar_columns, (
    'calendar.txt is missing columns: ' + ', '.join(missing_calendar_columns))

# Select by name so the frame keeps the column layout the rest of the notebook expects.
calendar_df = calendar_df[expected_calendar_columns]

# Convert the YYYYMMDD values (read as integers) to pandas datetimes.
calendar_df['start_date'] = pd.to_datetime(calendar_df['start_date'].astype(str), format='%Y%m%d')
calendar_df['end_date'] = pd.to_datetime(calendar_df['end_date'].astype(str), format='%Y%m%d')

calendar_df[:3]

Unnamed: 0,service_id,sunday,monday,tuesday,wednesday,thursday,friday,saturday,start_date,end_date
0,31349658,1,1,1,1,1,0,0,2015-12-06,2015-12-07
1,31349659,1,1,1,1,1,0,0,2015-12-15,2016-02-04
2,31349660,0,0,0,0,0,1,0,2015-12-15,2016-02-04


In [390]:
# Select the services that run on DAY.
# The "sunday_" prefix is historical and kept because later cells use the name;
# the frame actually holds the services for DAY_OF_WEEK, whichever weekday that is.
# A service qualifies when all three conditions hold:
runs_on_weekday = calendar_df[DAY_OF_WEEK] == 1      # flagged for that weekday
already_started = calendar_df['start_date'] <= DAY   # starts on/before the selected date
not_yet_ended = calendar_df['end_date'] >= DAY       # ends on/after the selected date

sunday_services_df = calendar_df.loc[
    runs_on_weekday & already_started & not_yet_ended,
    ['service_id', 'start_date', 'end_date'],
]

In [391]:
sunday_services_df

Unnamed: 0,service_id,start_date,end_date
1,31349659,2015-12-15,2016-02-04
14,31360273,2015-12-16,2016-02-04


## Process Trips

In [392]:
# Load trips
# trips.txt is read as-is from DATA_PATH; its header names are overwritten
# positionally in a later cell.
trips_df = pd.read_csv(DATA_PATH + 'trips.txt')

In [393]:
trips_df.head(2)

Unnamed: 0,route_id,service_id,trip_id,direction_id,shape_id
0,9807,31360272,18673606_161215,1,64069
1,9807,31360272,18673610_161215,1,64069


In [394]:


# Overwrite the header names (some carry 'hidden' characters we want to get rid of).
# NOTE(review): 'direction_is' looks like a typo for 'direction_id', but later
# cells drop the column under exactly this name, so it is kept unchanged here.
trips_df.columns = ['route_id', 'service_id', 'trip_id', 'direction_is', 'shape_id']

# Inner join: keep only the trips whose service runs on the selected day.
trips_calendar_df = pd.merge(sunday_services_df, trips_df,
                             on='service_id', suffixes=('_calendar', '_trips'))
# The service validity dates are no longer needed once the trips are selected.
sunday_trips_df = trips_calendar_df.drop(columns=['start_date', 'end_date'])
sunday_trips_df.head(3)

Unnamed: 0,service_id,route_id,trip_id,direction_is,shape_id
0,31349659,2544,19286012_151215,1,62838
1,31349659,2544,19286013_151215,1,62838
2,31349659,2544,19286016_151215,1,62838


In [395]:
sunday_trips_df.nunique()

service_id        2
route_id          2
trip_id         150
direction_is      1
shape_id          2
dtype: int64

### We now have all trips that occurred on the selected date

## Process Stop Times
Note: This is pretty heavy compared to the rest of the tables

In [396]:
# Read the stop_times table
stop_times_df = pd.read_csv(DATA_PATH + 'stop_times.txt')

# A trip's departure is taken to be the smallest departure_time among its stops.
# NOTE(review): departure_time appears to still be text here, in which case
# 'min' compares strings; that matches chronological order only while every
# time is zero-padded HH:MM:SS — confirm for the feed.
trips_start_times_df = stop_times_df.groupby('trip_id')[['departure_time']].min()

# Join the two tables to attach a departure time to every trip of the selected day
sunday_departures_df = pd.merge(sunday_trips_df, trips_start_times_df,
                                on='trip_id', suffixes=('_departures', '_trips'))

In [397]:
# Remove this
# Leftover validation query: displays the stop_times rows for stop_id 12816.
# The result is only shown, never assigned, so deleting the cell does not
# affect any later cell.
stop_times_df[stop_times_df['stop_id'] == 12816]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type


In [398]:
sunday_departures_df.head(2)

Unnamed: 0,service_id,route_id,trip_id,direction_is,shape_id,departure_time
0,31349659,2544,19286012_151215,1,62838,18:50:00
1,31349659,2544,19286013_151215,1,62838,19:05:00


## Process Stops

In [399]:
def convert_gtfs_time_to_datetime(gtfs_time, day=None):
    """Turn a GTFS ``HH:MM:SS`` clock string into an absolute datetime.

    GTFS times are offsets from the start of the service day, so the hour
    field may be 24 or more for stops served after midnight (for example
    ``'25:10:00'`` is 01:10 on the following calendar day).

    Parameters
    ----------
    gtfs_time : str
        Time in ``H:MM:SS`` / ``HH:MM:SS`` form.
    day : datetime.datetime, optional
        Midnight of the service day the time belongs to. Defaults to the
        notebook-level ``DAY`` constant, looked up at call time.

    Returns
    -------
    datetime.datetime
        ``day`` shifted by the given hours, minutes and seconds.

    Raises
    ------
    ValueError
        If ``gtfs_time`` does not consist of three integer fields.
    """
    if day is None:
        day = DAY
    hours, minutes, seconds = [int(field) for field in gtfs_time.split(':')]
    # A timedelta carries hour values of 24 and above into the following
    # day(s) by itself, so no special case is needed for after-midnight times.
    return day + dt.timedelta(hours=hours, minutes=minutes, seconds=seconds)

In [400]:
# Load stops
# stops.txt is read as-is from DATA_PATH; later cells join it on 'stop_id'.
stops_df = pd.read_csv(DATA_PATH + 'stops.txt')

# Preview the first two rows.
stops_df.head(2)

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,location_type,parent_station
0,12834,21038,הועד הפועל/ארלוזורוב,רחוב:ארלוזורוב 93 עיר: תל אביב יפו רציף: קו...,32.084755,34.784677,0,
1,12841,21068,ארלוזורוב/משה שרת,רחוב:ארלוזורוב 117 עיר: תל אביב יפו רציף: ק...,32.084074,34.787875,0,


In [401]:
# Remove this! only for validation:
# Displays the stops row(s) whose stop_id is 12816. The result is only shown,
# never assigned, so deleting the cell does not affect any later cell.
stops_df[stops_df['stop_id'] == 12816]

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,location_type,parent_station


In [402]:
# Attach each stop's stop_code to its stop_times rows (inner join on stop_id).
# Only stop_code is brought over; no zone id column is involved.
stop_times_with_stop_codes_df = pd.merge(
    stop_times_df,
    stops_df[['stop_id', 'stop_code']],
    on='stop_id',
)

# Replace the GTFS clock strings with absolute datetimes.
departure_as_datetime = stop_times_with_stop_codes_df['departure_time'].apply(
    convert_gtfs_time_to_datetime)
stop_times_with_stop_codes_df['departure_time'] = departure_as_datetime

## Construct Nodes

In [403]:
# Step 1: right-join stop_times onto the selected day's trip departures so that
# every selected trip is kept. Both inputs carry a 'departure_time' column; the
# suffixes tell the per-stop value ('_stop') from the trip's first departure
# ('_trip_departure'). Bookkeeping columns are dropped straight away.
sunday_nodes_df = (
    stop_times_df
    .merge(sunday_departures_df, how='right', on='trip_id',
           suffixes=('_stop', '_trip_departure'))
    .drop(columns=['pickup_type', 'drop_off_type', 'service_id',
                   'direction_is', 'shape_id'])
)

# Step 2: attach the stop attributes and keep only the code and the coordinates.
nodes_df = (
    sunday_nodes_df
    .merge(stops_df, on='stop_id', suffixes=('_node', '_stop'))
    .drop(columns=['stop_desc', 'stop_name', 'parent_station', 'location_type'])
)

nodes_df.head(3)

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,departure_time_trip_departure,stop_code,stop_lat,stop_lon
0,19285948_151215,05:20:00,05:20:00,25123,1,2544,05:20:00,38160,32.050923,34.849426
1,19285949_151215,05:40:00,05:40:00,25123,1,2544,05:40:00,38160,32.050923,34.849426
2,19285950_151215,06:00:00,06:00:00,25123,1,2544,06:00:00,38160,32.050923,34.849426


In [404]:
# Right-join stop_times onto the selected day's trip departures; the clashing
# 'departure_time' columns get the '_stop' / '_trip_departure' suffixes.
# NOTE(review): this repeats the merge from the "Construct Nodes" cell and
# rebinds sunday_nodes_df to the frame that still has all its columns.
sunday_nodes_df = stop_times_df.merge(sunday_departures_df, how='right', on='trip_id', suffixes=('_stop', '_trip_departure'))

In [405]:
sunday_nodes_df.head(1)

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,pickup_type,drop_off_type,service_id,route_id,direction_is,shape_id,departure_time_trip_departure
0,19285948_151215,05:20:00,05:20:00,25123,1,0,1,31349659,2544,1,62838,05:20:00


In [406]:
# Remove some columns to clear the data
sunday_nodes_df = sunday_nodes_df.drop(['pickup_type',
                                        'drop_off_type', 'service_id', 'direction_is', 'shape_id'], 
                                       axis=1)

In [407]:
# Attach the stops table to every node via an inner join on stop_id.
# NOTE(review): repeats a step already performed in the "Construct Nodes" cell.
nodes_df = sunday_nodes_df.merge(stops_df, on='stop_id', suffixes=('_node', '_stop'))


In [408]:
# Drop the descriptive stop columns; the stop code and coordinates stay.
# NOTE(review): repeats a step already performed in the "Construct Nodes" cell.
nodes_df = nodes_df.drop(['stop_desc', 'stop_name', 'parent_station', 'location_type'],axis=1)


In [409]:
# Add 'arrival' / 'departure' columns holding the per-stop GTFS clock strings
# converted to absolute datetimes ("real-world time").
nodes_df = nodes_df.assign(
    arrival=nodes_df['arrival_time'].apply(convert_gtfs_time_to_datetime),
    departure=nodes_df['departure_time_stop'].apply(convert_gtfs_time_to_datetime),
)

In [410]:
nodes_df.head()

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure
0,19285948_151215,05:20:00,05:20:00,25123,1,2544,05:20:00,38160,32.050923,34.849426,2016-01-04 05:20:00,2016-01-04 05:20:00
1,19285949_151215,05:40:00,05:40:00,25123,1,2544,05:40:00,38160,32.050923,34.849426,2016-01-04 05:40:00,2016-01-04 05:40:00
2,19285950_151215,06:00:00,06:00:00,25123,1,2544,06:00:00,38160,32.050923,34.849426,2016-01-04 06:00:00,2016-01-04 06:00:00
3,19285951_151215,06:15:00,06:15:00,25123,1,2544,06:15:00,38160,32.050923,34.849426,2016-01-04 06:15:00,2016-01-04 06:15:00
4,19285952_151215,06:29:00,06:29:00,25123,1,2544,06:29:00,38160,32.050923,34.849426,2016-01-04 06:29:00,2016-01-04 06:29:00


In [411]:
nodes_df.shape

(4902, 12)

## Some Stats on the Overall Nodes For the Day 

In [412]:
nodes_df.nunique()

trip_id                           150
arrival_time                     4711
departure_time_stop              4711
stop_id                            64
stop_sequence                      37
route_id                            2
departure_time_trip_departure     143
stop_code                          64
stop_lat                           64
stop_lon                           64
arrival                          4711
departure                        4711
dtype: int64

In [413]:
# TODO: add node_id according to index, and save in pkl

In [414]:
# Persist the node table for the whole day as a pandas pickle under OUTPUT_PATH.
nodes_df.to_pickle(OUTPUT_PATH + 'all_nodes.pkl')

In [415]:
# Morning window: the three hours starting at 07:00 on the selected day.
start_time = DAY + dt.timedelta(hours=7)
end_time = start_time + dt.timedelta(hours=3, minutes=0)

# Build one combined mask. Chaining two boolean selections
# (df[mask_a][mask_b]) applies a full-length mask to an already-filtered frame,
# which makes pandas re-align the mask and emit a "Boolean Series key will be
# reindexed" UserWarning.
# Both bounds are exclusive, so a node arriving at exactly 07:00:00 or
# 10:00:00 is not part of the morning set.
in_morning_window = (nodes_df['arrival'] > start_time) & (nodes_df['arrival'] < end_time)
morning_nodes_df = nodes_df.loc[in_morning_window]
morning_nodes_df.head(3)

  after removing the cwd from sys.path.


Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure
25,19285956_151215,07:08:00,07:08:00,25123,1,2544,07:08:00,38160,32.050923,34.849426,2016-01-04 07:08:00,2016-01-04 07:08:00
26,19285957_151215,07:17:00,07:17:00,25123,1,2544,07:17:00,38160,32.050923,34.849426,2016-01-04 07:17:00,2016-01-04 07:17:00
27,19285958_151215,07:26:00,07:26:00,25123,1,2544,07:26:00,38160,32.050923,34.849426,2016-01-04 07:26:00,2016-01-04 07:26:00


In [416]:
morning_nodes_df.nunique()

trip_id                            42
arrival_time                     1080
departure_time_stop              1080
stop_id                            64
stop_sequence                      37
route_id                            2
departure_time_trip_departure      41
stop_code                          64
stop_lat                           64
stop_lon                           64
arrival                          1080
departure                        1080
dtype: int64

In [417]:
# Renumber the rows from 0; the previous row labels (positions in nodes_df)
# are kept in a new 'index' column.
# NOTE(review): not idempotent — re-running this cell would push the fresh
# index into yet another column.
morning_nodes_df = morning_nodes_df.reset_index()

In [418]:
# Use the current row label as the node identifier. This yields 0..n-1 only
# if the index was reset immediately beforehand.
morning_nodes_df['node_id'] = morning_nodes_df.index

In [419]:
morning_nodes_df.head(3)

Unnamed: 0,index,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
0,25,19285956_151215,07:08:00,07:08:00,25123,1,2544,07:08:00,38160,32.050923,34.849426,2016-01-04 07:08:00,2016-01-04 07:08:00,0
1,26,19285957_151215,07:17:00,07:17:00,25123,1,2544,07:17:00,38160,32.050923,34.849426,2016-01-04 07:17:00,2016-01-04 07:17:00,1
2,27,19285958_151215,07:26:00,07:26:00,25123,1,2544,07:26:00,38160,32.050923,34.849426,2016-01-04 07:26:00,2016-01-04 07:26:00,2


In [420]:
# Persist the morning node table (including node_id) as a pandas pickle under OUTPUT_PATH.
morning_nodes_df.to_pickle(OUTPUT_PATH + 'morning_nodes.pkl')