## GTFS
In order to compute accessibility based on GTFS data (General Transit Feed Specification, can by downloaded from ftp://199.203.58.18/), we will first perform basic processing of the data.
We are using the pandas library.

The code is based on the following structure of GTFS tables:
![GTFS Tables](./GTFS_tables.PNG)

In [9]:
import pandas as pd
import datetime

# DATA_PATH = '../data/'
DATA_PATH = '../../input_data/GTFS-28-Oct-19/'

In [10]:
OUTPUT_PATH = '../../output_data/'
# OUTPUT_PATH = 'D:/'

## Process Calendar - Get trips for a single day

In [11]:
# Load calendar
calendar_df = pd.read_csv(DATA_PATH + 'calendar.txt')
# Convert dates to python's datetime type
calendar_df['start_date'] = calendar_df['start_date'].apply(lambda x: datetime.datetime.strptime(str(x), '%Y%m%d'))
calendar_df['end_date'] = calendar_df['end_date'].apply(lambda x: datetime.datetime.strptime(str(x), '%Y%m%d'))

calendar_df.columns = ['service_id',
 'sunday',
 'monday',
 'tuesday',
 'wednesday',
 'thursday',
 'friday',
 'saturday',
 'start_date',
 'end_date']

calendar_df[:3]

Unnamed: 0,service_id,sunday,monday,tuesday,wednesday,thursday,friday,saturday,start_date,end_date
0,1,1,1,1,1,1,0,0,2019-10-27,2019-10-31
1,2,1,1,1,1,1,0,0,2019-11-01,2019-12-26
2,3,0,0,0,0,0,1,0,2019-11-01,2019-12-26


In [13]:
# Let's say we want all trips that occurred on the first Sunday after the feed was published 
# (Feed was published on Tuesday October 24, 2017, and the following Sunday is October 29, 2017).

# Filter so we only keep services that are active on Sunday.
sunday_services_df = calendar_df[calendar_df['sunday'] == 1][['service_id', 'start_date', 'end_date']]

date = datetime.datetime(2019, 11, 3)

# Keep only services that start during/before November 3
sunday_services_df = sunday_services_df[sunday_services_df['start_date'] <= date]

# Keep only services that end during/after November 3
sunday_services_df = sunday_services_df[sunday_services_df['end_date'] >= date]

## Process Trips

In [15]:
trips_df.head()

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,shape_id
0,1,1,28876224_271019,רכבת יבנה מזרח,0,97105.0
1,1,1,28876225_271019,רכבת יבנה מזרח,0,97105.0
2,1,1,28876226_271019,רכבת יבנה מזרח,0,97105.0
3,1,1,28876227_271019,רכבת יבנה מזרח,0,97105.0
4,1,1,28876228_271019,רכבת יבנה מזרח,0,97105.0


In [16]:
# Load trips
trips_df = pd.read_csv(DATA_PATH + 'trips.txt')

# Fix column names (some columns have special 'hudden' characters that we want to remove)
trips_df.columns = ['route_id', 'service_id', 'trip_id', 'trip_headsign', 'direction_is', 'shape_id']

trips_calendar_df = sunday_services_df.merge(trips_df, on='service_id', suffixes=('_calendar', '_trips'))
sunday_trips_df = trips_calendar_df.drop(['start_date', 'end_date', 'trip_headsign'], axis=1)
sunday_trips_df[:3]

Unnamed: 0,service_id,route_id,trip_id,direction_is,shape_id
0,2,1,30900053_011119,0,97105.0
1,2,1,30900054_011119,0,97105.0
2,2,1,30900055_011119,0,97105.0


In [17]:
sunday_trips_df.nunique()

service_id       6742
route_id         6742
trip_id         99653
direction_is        2
shape_id         6665
dtype: int64

### We now have all trips that occurred on Sunday, November 3, 2019

## Process Stop Times
Note: This is pretty heavy compared to the rest of the tables

In [18]:
# Load stop times
stop_times_df = pd.read_csv(DATA_PATH + 'stop_times.txt')

# Get all trips departures by getting the minimal departure time for each trip
trips_start_times_df = stop_times_df.groupby('trip_id').agg({'departure_time': 'min'})

# Let's join the last two tabled to get the departure times of all sunday trips
sunday_departures_df = sunday_trips_df.merge(trips_start_times_df, on='trip_id', suffixes=('_departures', '_trips'))

## Process Stops

In [19]:
import datetime as dt
def convert_gtfs_time_to_datetime(gtfs_time):
    date = dt.datetime(2019, 11, 3)
    h, m, s = [int(x) for x in gtfs_time.split(':')]
    if h < 24:
        # This is a 'normal' situation, we can simply create a datetime object using the date we defined before
        return date + dt.timedelta(hours=h, minutes=m, seconds=s)
    # Otherwise we have a 'strange' time: it's after midnight
    new_date = date + dt.timedelta(days=1)
    return new_date + dt.timedelta(hours=h-24, minutes=m, seconds=s)

In [20]:
# Load stops
stops_df = pd.read_csv(DATA_PATH + 'stops.txt')

# Add stop code and zone id to stop times
stop_times_with_stop_codes_df = stop_times_df.merge(
    stops_df[['stop_id', 'stop_code', 'zone_id']], on='stop_id')

stop_times_with_stop_codes_df['departure_time'] = stop_times_with_stop_codes_df[
    'departure_time'].apply(convert_gtfs_time_to_datetime)

## Construct "Nodes"

In [21]:
# We want to (right) join this table with stop_times in order to get the sunday stop times with trip departure time.
sunday_nodes_df = stop_times_df.merge(sunday_departures_df, how='right', on='trip_id', suffixes=('_stop', '_trip_departure'))

# Remove some columns to clear the data
sunday_nodes_df = sunday_nodes_df.drop(['pickup_type', 'shape_dist_traveled', 
                                        'drop_off_type', 'service_id', 'direction_is', 'shape_id'], 
                                       axis=1)

# Add stops data to nodes
nodes_df = sunday_nodes_df.merge(stops_df, on='stop_id', suffixes=('_node', '_stop'))
nodes_df = nodes_df.drop(['stop_desc', 'stop_name', 'zone_id', 'parent_station', 'location_type'],axis=1)
nodes_df[:3]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,departure_time_trip_departure,stop_code,stop_lat,stop_lon
0,10021427_271019,00:00:00,00:00:00,36133,1,10376,00:00:00,47915,32.791771,35.034289
1,10021428_271019,01:00:00,01:00:00,36133,1,10376,01:00:00,47915,32.791771,35.034289
2,10021429_271019,02:00:00,02:00:00,36133,1,10376,02:00:00,47915,32.791771,35.034289


In [22]:
nodes_df[5300:5306]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,departure_time_trip_departure,stop_code,stop_lat,stop_lon
5300,33543899_271019,10:46:23,10:46:23,2364,11,6543,10:30:00,41484,32.83619,35.084962
5301,33543907_271019,11:16:23,11:16:23,2364,11,6543,11:00:00,41484,32.83619,35.084962
5302,33543912_271019,08:16:23,08:16:23,2364,11,6543,08:00:00,41484,32.83619,35.084962
5303,33543917_271019,08:46:23,08:46:23,2364,11,6543,08:30:00,41484,32.83619,35.084962
5304,33543922_271019,12:16:23,12:16:23,2364,11,6543,12:00:00,41484,32.83619,35.084962
5305,35199651_271019,06:17:18,06:17:18,2364,51,15761,05:32:00,41484,32.83619,35.084962


In [23]:
import datetime as dt
def convert_gtfs_time_to_datetime(gtfs_time):
    date = dt.datetime(2019, 11, 3)
    h, m, s = [int(x) for x in gtfs_time.split(':')]
    if h < 24:
        # This is a 'normal' situation, we can simply create a datetime object using the date we defined before
        return date + dt.timedelta(hours=h, minutes=m, seconds=s)
    # Otherwise we have a 'strange' time: it's after midnight
    new_date = date + dt.timedelta(days=1)
    return new_date + dt.timedelta(hours=h-24, minutes=m, seconds=s)

In [24]:
nodes_df['arrival'] = nodes_df['arrival_time'].apply(convert_gtfs_time_to_datetime)

In [25]:
nodes_df['departure'] = nodes_df['departure_time_stop'].apply(convert_gtfs_time_to_datetime)

In [26]:
nodes_df[5300:5306]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure
5300,33543899_271019,10:46:23,10:46:23,2364,11,6543,10:30:00,41484,32.83619,35.084962,2019-11-03 10:46:23,2019-11-03 10:46:23
5301,33543907_271019,11:16:23,11:16:23,2364,11,6543,11:00:00,41484,32.83619,35.084962,2019-11-03 11:16:23,2019-11-03 11:16:23
5302,33543912_271019,08:16:23,08:16:23,2364,11,6543,08:00:00,41484,32.83619,35.084962,2019-11-03 08:16:23,2019-11-03 08:16:23
5303,33543917_271019,08:46:23,08:46:23,2364,11,6543,08:30:00,41484,32.83619,35.084962,2019-11-03 08:46:23,2019-11-03 08:46:23
5304,33543922_271019,12:16:23,12:16:23,2364,11,6543,12:00:00,41484,32.83619,35.084962,2019-11-03 12:16:23,2019-11-03 12:16:23
5305,35199651_271019,06:17:18,06:17:18,2364,51,15761,05:32:00,41484,32.83619,35.084962,2019-11-03 06:17:18,2019-11-03 06:17:18


## Some stats

In [27]:
nodes_df.nunique()

trip_id                          99653
arrival_time                     88667
departure_time_stop              88667
stop_id                          27622
stop_sequence                      117
route_id                          6742
departure_time_trip_departure     1202
stop_code                        27273
stop_lat                         26961
stop_lon                         26795
arrival                          88667
departure                        88667
dtype: int64

In [28]:
start_time = datetime.datetime(2019, 11, 3, 8)
end_time = datetime.datetime(2019, 11, 3, 9)

morning_trips_df = nodes_df[nodes_df['arrival'] > start_time][nodes_df['arrival'] < end_time]
morning_trips_df.head(5)

  after removing the cwd from sys.path.


Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure
18,14824097_271019,08:02:40,08:02:40,2356,2,1606,08:00:00,41476,32.793214,35.038925,2019-11-03 08:02:40,2019-11-03 08:02:40
36,19590744_271019,08:14:17,08:14:17,2356,2,16379,08:10:00,41476,32.793214,35.038925,2019-11-03 08:14:17,2019-11-03 08:14:17
61,24004495_271019,08:09:09,08:09:09,2356,2,4418,08:05:00,41476,32.793214,35.038925,2019-11-03 08:09:09,2019-11-03 08:09:09
63,24004510_271019,08:54:09,08:54:09,2356,2,4418,08:50:00,41476,32.793214,35.038925,2019-11-03 08:54:09,2019-11-03 08:54:09
68,24004542_271019,08:29:09,08:29:09,2356,2,4418,08:25:00,41476,32.793214,35.038925,2019-11-03 08:29:09,2019-11-03 08:29:09


In [29]:
morning_trips_df.nunique()

trip_id                          10728
arrival_time                      3599
departure_time_stop               3599
stop_id                          24750
stop_sequence                      112
route_id                          3556
departure_time_trip_departure      169
stop_code                        24437
stop_lat                         24172
stop_lon                         24031
arrival                           3599
departure                         3599
dtype: int64

In [30]:
morning_trips_df.to_pickle(OUTPUT_PATH + 'morning_trips_nodes.pkl')

## Filter only Tel Aviv Metropolitan stops

Let's get all Tel Aviv (TLV) stops 

In [37]:
for zone in stops_df['zone_id'].unique():
    print(zone)

6900
469
8300
7000
8400
276
2530
753
379
1748
14039
193
7400
759
2550
12926
12921
7200
1165
1200
8500
828
1166
288
157
316
182
195
154
175
880
13575
12831
1304
466
1893
681
680
734
422
219
1309
2018
377
12433
41
1905
698
1876
807
661
167
12732
7900
8600
14050
2640
10687
382
1882
3760
12623
6600
8700
168
12565
12566
6200
12544
12543
565
12558
12752
12753
12737
717
2038
802
722
705
9700
592
187
1746
170
1167
327
800
6500
13173
13172
13175
13176
13180
13179
1869
1020
13171
9800
12483
13165
13164
13166
13167
13168
13163
13162
4000
9300
12221
12331
10746
7800
1860
2500
12208
12207
12205
12327
12204
12326
12201
12203
12200
12202
12188
12325
12324
12230
12321
12229
12227
12228
12225
12224
12223
6800
220
8200
9500
12074
11943
11942
12316
12314
12183
679
12313
12344
12343
2008
12340
12338
12186
12337
2055
12431
12440
715
12439
12336
12437
12335
12436
12334
773
12481
12480
12487
12488
12434
12475
12485
12486
12473
12185
617
12490
12491
12493
12430
12492
444
6000
360
628
1247
12428
648
12347
1393

520
347
997
528
11753
482
14014
11754
10907
11844
80
13121
11948
11949
11868
29
357
10751
11950
12128
10770
854
11076
13537
12127
12283
14000
10944
10943
10940
10960
4013
11460
11450
11684
11763
12255
12254
11639
11961
11829
11628
11627
10800
10952
10948
10946
1160
1174
11018
11608
11605
10866
319
10890
11751
11749
11747
11746
11744
11743
11745
10841
10838
10837
10835
10834
10833
10832
10831
11595
4012
13993
10848
10766
10845
10846
10756
10755
10754
10753
10865
10864
10953
10930
10931
10926
4101
10810
11726
10869
10910
10777
10871
10919
10920
10889
4009
10959
10872
10868
10888
10882
10883
11447
10842
10927
10925
10924
10752
10767
10764
11906
11796
4002
4011
11268
10763
10759
10823
11905
10829
10899
10898
10900
10896
11600
4014
4007
11269
11597
11596
10897
453
4006
11414
11691
11687
12050
12059
12060
12048
12236
11547
11546
11544
11541
11540
11542
11548
11552
11554
843
11675
11424
11838
12235
1179
10884
10885
12064
12062
12056
11839
12035
13100
12105
12108
10784
10761
10783
10962
10961


In [38]:
test_stops = stops_df[stops_df['zone_id'] == 6900]
test_stops.to_csv(OUTPUT_PATH + 'test_stops.csv')

In [39]:
test_stops.shape

(196, 9)

In [32]:
stops_df.shape

(27989, 9)

In [33]:
# Let's see how manu NaN zones we have:
stops_df['zone_id'].isna().sum()

0

In [34]:
# I think zone 210 is tel aviv metropolitan area (only the small surrounding part, we would need to extend for our real computations). 
# We need to filter only trips that contain stops (and then only nodes with those stops) that are in this zone.
tlv_stops_df = stops_df[stops_df['zone_id'] == 210]

In [35]:
tlv_stops_df.shape

(0, 9)

Now we need to get the TLV stop times, in order to find all trips that include TLV stops

In [177]:
# stop_times_df
tlv_stop_times_df = stop_times_df.merge(tlv_stops_df[['stop_id']], on='stop_id', how='inner')

In [179]:
tlv_stop_times_df.shape

(3040687, 8)

In [180]:
stop_times_df.shape

(11880208, 8)

Next, the TLV stop times include all actual trip ids with TLV stops in them. Let's find relevant trip in the Sunday's trips we're examining

In [191]:
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#
# We want to get unique trips which pass through TLV, then we will use these trips to filter 
# Sunday trips to contain only trips that pass through TLV.
#
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

tlv_trips_df = tlv_stop_times_df[['trip_id']].drop_duplicates(subset ="trip_id", 
                     keep = False, inplace = False)

In [195]:
tlv_trips_df.head()

Unnamed: 0,trip_id
11766,29454550_291017
11767,28918821_301017
11768,28918821_051117
11769,28916754_301017
11770,28916754_051117


In [194]:
sunday_trips_df.head()

Unnamed: 0,service_id,route_id,trip_id,direction_is,shape_id
0,52601699,4322,25569747_241017,1,77876.0
1,52601699,4322,25569748_241017,1,77876.0
2,52596472,39,27729716_241017,0,82768.0
3,52596472,39,27729718_241017,0,82768.0
4,52596472,39,27729719_241017,0,82768.0


In [196]:
tlv_sunday_trips_df = sunday_trips_df.merge(tlv_trips_df[['trip_id']], on='trip_id', how='inner')

In [197]:
sunday_trips_df.shape

(89319, 5)

In [198]:
tlv_trips_df.shape

(2905, 1)

Now, we want to save only stops that are part of TLV trips

In [199]:
tlv_stop_times_df.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled
0,23005614_051117,20:43:00,20:43:00,37336,3,0.0,0.0,59700
1,29577221_051117,15:58:00,15:58:00,37336,4,0.0,0.0,59700
2,29577222_051117,14:58:00,14:58:00,37336,4,0.0,0.0,59700
3,15441332_051117,20:39:00,20:39:00,37336,5,0.0,0.0,20100
4,29684401_031117,14:58:00,14:58:00,37336,4,0.0,0.0,59700


In [203]:
tlv_stop_times_df.merge(tlv_trips_df[['trip_id']], on='trip_id', how='inner').shape
#[['stop_id']].drop_duplicates(subset ='stop_id', keep = False, inplace = False)

(2905, 8)

In [None]:
tlv_trips_stops_df = tlv_stop_times_df.merge(tlv_trips_df[['trip_id']], on='trip_id', how='inner')

Finally, let's save only nodes of these stops, so that all nodes are part of TLV trips

In [168]:
tlv_nodes_df = nodes_df.merge(tlv_stops_df[['stop_id']], on='stop_id', how='inner')

In [171]:
nodes_df.shape

(3186123, 13)

In [172]:
tlv_nodes_df.shape

(864459, 13)

In [173]:
tlv_nodes_df.to_csv('tlv_nodes.csv')

## Compute Direct Edges

In [174]:
# First we verify that the stops in each trip are consecutive

# tmp_df = nodes_df
tmp_df = tlv_nodes_df

def verify_consective(l):
    if sorted(list(l)) != list(range(min(l), max(l)+1)):
        print('Found non-consecutive stop sequence')
        raise Exception('NON CONSECUTIVE STOP SEQUENCE')

In [175]:
tmp_df[['trip_id', 'stop_sequence']].groupby('trip_id').apply(lambda x: verify_consective(x.stop_sequence))

Found non-consecutive stop sequence
Found non-consecutive stop sequence
Found non-consecutive stop sequence
Found non-consecutive stop sequence


Exception: NON CONSECUTIVE STOP SEQUENCE

In [81]:
# ==> No exception -> all stops are consecutive, we can resume

In [140]:
def create_direct_edges_for_trip(x):
    for index, node in x.iterrows():
        stop_seq = node['stop_sequence']
        # For the same trip we want to take the next node
        next_node = x[x['stop_sequence'] == stop_seq + 1]
        if next_node.shape[0] == 0:
            # This is the last node of the current trip, no outgoing edge
            continue
        assert next_node.shape[0] == 1
        next_node_index = next_node.index[0]

        w = ((next_node['departure'] - node['arrival']) / np.timedelta64(1, 's')).values[0]    
        direct_edges.add((index, next_node_index, w))

<span style="background-color: #F1D9F9">TODO: make this multiprocessed: group by trip_id, and then split the data to batches of groups (trips). For each group we will apply the 'create direct edges' using a pool. We need to make sure that directed_edges is thread/process-safe.</span>

In [145]:
# For each trip (grouping by trip), create edges for this trip's consecutive nodes.
from tqdm.auto import tqdm
tqdm.pandas()

direct_edges = set()
tmp_df[['trip_id', 'stop_sequence', 'arrival', 'departure']].groupby('trip_id').progress_apply(create_direct_edges_for_trip)

HBox(children=(IntProgress(value=0, max=89319), HTML(value='')))

In [147]:
len(direct_edges)

3096804

In [148]:
import pickle

with open('direct_edges_all_israel.pkl', 'wb') as f:
    pickle.dump(direct_edges, f)

In [149]:
nodes_df.shape

(3186123, 13)

In [150]:
nodes_df.to_pickle('all_israel_nodes.pkl')

In [152]:
nodes_df[['id']].to_pickle('all_israel_node_ids.pkl')

## Compute Transfer Edges