## GTFS - Transfer Edges

In [4]:
import bisect
import pandas as pd
import datetime
import numpy as np
from tqdm.auto import tqdm
from tqdm import tqdm_notebook
tqdm.pandas()
import datetime as dt
from multiprocessing import Pool, cpu_count
from contextlib import closing
import math

# from dask.diagnostics import ProgressBar
# ProgressBar().register()

DATA_PATH = '../../input_data/GTFS-28-Oct-19/'

In [6]:
OUTPUT_PATH = '../../output_data/'

In [3]:
DAY = dt.datetime(2019, 11, 3)

## Load Nodes

In [4]:
# Load nodes
FILES_PREFIX = 'morning'
file_name = OUTPUT_PATH + FILES_PREFIX + '_nodes.pkl'
print(f'Loading nodes from file {file_name}')
nodes_df = pd.read_pickle(file_name)
nodes_df.head(3)

Loading nodes from file ../../output_data/morning_nodes.pkl


Unnamed: 0,index,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,trip_headsign,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
0,18,14824097_271019,08:02:40,08:02:40,2356,2,1606,נוף הגליל_שלום,08:00:00,41476,32.793214,35.038925,2019-11-03 08:02:40,2019-11-03 08:02:40,0
1,19,14824098_271019,09:02:40,09:02:40,2356,2,1606,נוף הגליל_שלום,09:00:00,41476,32.793214,35.038925,2019-11-03 09:02:40,2019-11-03 09:02:40,1
2,36,19590744_271019,08:14:17,08:14:17,2356,2,16379,טבריה_תחנה מרכזית,08:10:00,41476,32.793214,35.038925,2019-11-03 08:14:17,2019-11-03 08:14:17,2


In [5]:
nodes_df.shape

(325870, 15)

In [59]:
nodes_df.nunique()

index                            325870
trip_id                           13765
arrival_time                       5399
departure_time_stop                5399
stop_id                           25713
stop_sequence                       117
route_id                           3793
trip_headsign                      1127
departure_time_trip_departure       199
stop_code                         25383
stop_lat                          25107
stop_lon                          24961
arrival                            5399
departure                          5399
node_id                          325870
dtype: int64

### Filter nodes for development

In [None]:
# start_time = DAY + dt.timedelta(hours=8)
# end_time = start_time + dt.timedelta(minutes=10)

# nodes_df = nodes_df[nodes_df['arrival'] > start_time][nodes_df['arrival'] < end_time]

In [None]:
# nodes_df.shape

In [None]:
# nodes_df.nunique()

## Parallelizing

In [None]:
def run_parallel_df(df_grouped, func):
    with closing(Pool()) as pool:
        res_list = pool.map(func, [group for name, group in df_grouped])
    return pd.concat(res_list)

In [None]:
def run_parallel_dict(data, num_of_batches, func):
    batch_size = math.ceil(len(data) / num_of_batches)
    with closing(Pool()) as pool:
#         res_list = []
        data_items = list(data.items())
        print(f'Processing data in workers pool. Numner of batches is {num_of_batches} where each batch has approx. {batch_size} keys')
        sliced_data = [data_items[i:i+batch_size] for i in range(num_of_batches)]
        print('Finished slicing the data.')
#         curr_batch = 1
        res_list = pool.map(func, sliced_data)
#         for res in tqdm_notebook(pool.imap(func, sliced_data), total=num_of_batches):
#             res_list.append(res)
#             print(f'Finished processing batch number {curr_batch} of {num_of_batches} total. Number of transfers from this batch: {len(res)}')
#             curr_batch += 1
    return [item for sublist in res_list for item in sublist]

#### Testing parallelization

In [None]:
# d = {'a': [1,2,3], 'b':[1]}
# # num_of_batches = 2
# # d_items = list(d.items())
# # batch_size = math.ceil(len(d) / num_of_batches)
# # sliced_d = [d_items[i:i+batch_size] for i in range(num_of_batches)]
# # print(len(d))

# def print_dict_with_sleep(data):
#     print('processing..')
#     data = dict(data)
#     for k,v in data.items():
#         print(len(v))
# #     time.sleep(3)
# run_parallel_dict(d, 2, print_dict_with_sleep)

## Load Nearby Stops

In [7]:
stops_mapping_df = pd.read_csv(OUTPUT_PATH + 'stop_nbh_1000.txt')

In [None]:
# stops_mapping_df.head(3)

In [None]:
# stops_mapping_df.shape

In [None]:
# stops_mapping_df.nunique()

If we allow walking 300 meters we get an average of 6.24 transfers to neighboring stops from each stop. 

In [None]:
# TODO: 
# change walk speed to 1m/s - IMPL
# maybe switch to 100-150
# 15 minutes walk + wait

# hoping to get to 10-20 edges per node

In [60]:
stops_df = pd.read_csv(DATA_PATH + 'stops.txt')

### Get all stops from which we wish to start computations

In [None]:
nodes_df.head()

In [12]:
t = DAY+dt.timedelta(hours=8, minutes=25)
start_nodes_df = nodes_df[nodes_df['departure'] < t]

In [13]:
start_nodes_df.shape

(96674, 15)

In [None]:
start_nodes_df.head()

#### Validation - let's find the most frequent origin stops and make sure it seems right.

In [None]:
# stops_mapping_150m_df.IN_FID.mode()

In [None]:
# stops_df.iloc[10841]

In [None]:
# stops_df.iloc[10841]

We can see this is the central bus station, which makes sense it will have the most possible transfers.

Now we must re-map the nearby stops to stop_id as used in the GTFS feeds.

In [61]:
stops_df['IN_FID'] = stops_df.index

In [62]:
stops_df[['IN_FID', 'stop_id']].head()

Unnamed: 0,IN_FID,stop_id
0,0,1
1,1,2
2,2,3
3,3,6
4,4,7


In [63]:
stops_df.shape

(27989, 10)

In [13]:
stops_mapping_df = stops_mapping_df.merge(stops_df[['IN_FID', 'stop_id']], on='IN_FID', how='left')
stops_mapping_df.rename(columns={'stop_id':'from_stop_id'}, inplace=True)
stops_mapping_df = stops_mapping_df.merge(stops_df[['IN_FID', 'stop_id']], left_on='NEAR_FID', right_on='IN_FID', how='left')
stops_mapping_df.rename(columns={'stop_id':'to_stop_id', 'NEAR_DIST':'dist'}, inplace=True)

In [14]:
stops_mapping_df = stops_mapping_df[['from_stop_id', 'to_stop_id', 'dist']]
stops_mapping_df.head()

Unnamed: 0,from_stop_id,to_stop_id,dist
0,1,1011,97.818515
1,1,848,210.233162
2,1,850,214.393854
3,1,846,268.285651
4,1,849,346.204359


In [15]:
AVG_WALK_SPEED = 1  # meters per second (m/s)
stops_mapping_df['walk_time_sec'] = stops_mapping_df['dist'] / AVG_WALK_SPEED

In [16]:
stops_mapping_df.head(3)

Unnamed: 0,from_stop_id,to_stop_id,dist,walk_time_sec
0,1,1011,97.818515,97.818515
1,1,848,210.233162,210.233162
2,1,850,214.393854,214.393854


In [17]:
stops_mapping_df.to_csv(OUTPUT_PATH + 'nearby_stops.csv')

In [18]:
stops_mapping_df.to_pickle(OUTPUT_PATH + 'nearby_stops.pkl')

In [21]:
stops_mapping_df.shape

(1260752, 4)

In [22]:
stops_mapping_df.nunique()

from_stop_id      27785
to_stop_id        27785
dist             600126
walk_time_sec    600126
dtype: int64

In [None]:
# stops_mapping_df = pd.read_pickle(OUTPUT_PATH + 'nearby_stops.pkl')

In [38]:
stops_mapping_200m_df = stops_mapping_df[stops_mapping_df['dist'] < 200]

In [39]:
stops_mapping_200m_df.shape

(90360, 4)

In [41]:
stops_mapping_200m_df.nunique()

from_stop_id     25438
to_stop_id       25438
dist             38970
walk_time_sec    38970
dtype: int64

In [64]:
stops_mapping_150m_df.shape

(58766, 4)

## Compute Transfer Edges

In [65]:
stops_to_nodes = nodes_df.groupby('stop_id')[[
    'node_id', 'trip_id', 'arrival', 'departure', 'route_id']].apply(lambda g: g.values.tolist()).to_dict()

In [66]:
dict(list(stops_to_nodes.items())[0:2])

{2: [[210307,
   '39289949_271019',
   Timestamp('2019-11-03 08:43:17'),
   Timestamp('2019-11-03 08:43:17'),
   26018],
  [210308,
   '39554229_271019',
   Timestamp('2019-11-03 08:15:41'),
   Timestamp('2019-11-03 08:15:41'),
   25529],
  [210309,
   '42203990_271019',
   Timestamp('2019-11-03 08:17:34'),
   Timestamp('2019-11-03 08:17:34'),
   26015],
  [210310,
   '42203991_271019',
   Timestamp('2019-11-03 09:02:34'),
   Timestamp('2019-11-03 09:02:34'),
   26015],
  [210311,
   '42203992_271019',
   Timestamp('2019-11-03 09:22:34'),
   Timestamp('2019-11-03 09:22:34'),
   26015]],
 3: [[196896,
   '36196412_271019',
   Timestamp('2019-11-03 08:11:42'),
   Timestamp('2019-11-03 08:11:42'),
   10109],
  [196897,
   '36196413_271019',
   Timestamp('2019-11-03 08:26:42'),
   Timestamp('2019-11-03 08:26:42'),
   10109],
  [196898,
   '36196414_271019',
   Timestamp('2019-11-03 08:41:42'),
   Timestamp('2019-11-03 08:41:42'),
   10109],
  [196899,
   '36196415_271019',
   Timestamp('20

In [None]:
dict(list(stops_to_nodes.items()))

In [67]:
DEPARTURE_INDEX = 3
for stop, nodes in stops_to_nodes.items():
    stops_to_nodes[stop] = sorted(nodes, key=lambda x: x[DEPARTURE_INDEX])

In [68]:
total_values = 0
max_nodes_in_stop = 0
for n in stops_to_nodes.values():
    num_nodes = len(n)
    if num_nodes > max_nodes_in_stop:
        max_nodes_in_stop = num_nodes
    total_values += num_nodes
print(f'There is a total of {total_values} nodes in the stops_to_nodes dictionary')

There is a total of 325870 nodes in the stops_to_nodes dictionary


In [69]:
len(stops_to_nodes)

25713

In [70]:
print(f'We have an average of {total_values/len(stops_to_nodes)}, and a max of {max_nodes_in_stop} nodes in a single stop.')

We have an average of 12.673355890016722, and a max of 305 nodes in a single stop.


In [71]:
ARRIVAL_INDEX = 2
DEPARTURE_INDEX = 3
ROUTE_ID_INDEX = 4
MAX_WAIT_TIME = dt.timedelta(minutes=15)
def get_transfer_edges_from_nodes(stops_to_nodes_batch):
    print('Processing batch...')
    stops_to_nodes_batch = dict(stops_to_nodes_batch)
    transfer_edges = []
    for stop, nodes in tqdm(stops_to_nodes_batch.items()):
        for start_n in nodes:
            nearby_stops_df = stops_mapping_200m_df[stops_mapping_200m_df['from_stop_id'] == stop]
            # Add current stop to check transfers from the same stop
            nearby_stops_df.append({'from_stop_id': [stop], 'to_stop_id': [stop], 'dist': [0], 'walk_time_sec': [0]}, ignore_index=True)
            # TODO: verify this we're not staying to the same line in same direction
            for s in nearby_stops_df.iterrows():
                nearby_stop_id = s[1]['to_stop_id']
                if nearby_stop_id not in stops_to_nodes:
                    # Some stops don't have trips that operate all week. Some operate only on weekends.
                    # If this is such a stop we should continue to look at other stops, we won't find any
                    # nodes here.
                    continue
                nearby_nodes = stops_to_nodes[nearby_stop_id]
                second_line_earliest_start_time = start_n[ARRIVAL_INDEX] + dt.timedelta(seconds=s[1]['walk_time_sec'])
                second_line_latest_start_time = second_line_earliest_start_time + MAX_WAIT_TIME
                # Find index of first node that departs at least at second_start_time or later
                _, _, _, departures, _ = zip(*nearby_nodes)
                i = bisect.bisect_left(departures, second_line_earliest_start_time)
                while (i < len(nearby_nodes) and 
                       nearby_nodes[i][DEPARTURE_INDEX] >= second_line_earliest_start_time and 
                       nearby_nodes[i][DEPARTURE_INDEX] <= second_line_latest_start_time):
                    node = nearby_nodes[i][0]
                    if node == start_n[0] or (nearby_nodes[i][ROUTE_ID_INDEX] == start_n[ROUTE_ID_INDEX] and s == stop):
                        # We don't wish to transfer to the same node (no self-edges). 
                        # Another case we wish to avoid is transferring to the same line (route__id) in the same stop.
                        i += 1
                        continue
                    transfer_edges.append((start_n[0], nearby_nodes[i][0], nearby_nodes[i][DEPARTURE_INDEX] - start_n[ARRIVAL_INDEX]))
                    i += 1
    return transfer_edges

In [None]:
# transfer_edges = run_parallel_dict(stops_to_nodes, 1000, get_transfer_edges_from_nodes)

In [72]:
transfer_edges = get_transfer_edges_from_nodes(stops_to_nodes)

Processing batch...


HBox(children=(IntProgress(value=0, max=25713), HTML(value='')))




In [73]:
len(transfer_edges)

4191638

In [74]:
transfer_edges_sec = []
for s, t, w in transfer_edges:
    transfer_edges_sec.append((s, t, w.total_seconds()))

In [76]:
transfer_edges_sec[1000:1005]

[(104345, 301218, 360.0),
 (104345, 301213, 844.0),
 (104345, 301223, 709.0),
 (104345, 104315, 592.0),
 (104345, 104309, 737.0)]

In [75]:
import pickle
with open(OUTPUT_PATH + FILES_PREFIX + '_transfer_edges.pkl', 'wb') as f:
    print(f'Saving transfers to file {f.name}')
    pickle.dump(transfer_edges_sec, f)

Saving transfers to file ../../output_data/morning_transfer_edges.pkl


## [FOR TESTING] Let's check some transfers manually

In [46]:
transfer_edges[1000:1005]

[(1120240, 3243432, Timedelta('0 days 00:06:00')),
 (1120240, 3243400, Timedelta('0 days 00:14:04')),
 (1120240, 3243469, Timedelta('0 days 00:11:49')),
 (1120240, 1119942, Timedelta('0 days 00:09:52')),
 (1120240, 1119886, Timedelta('0 days 00:12:17'))]

In [47]:
nodes_df[nodes_df['node_id'] == 3243432]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,trip_headsign,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
3243432,36877179_271019,08:41:49,08:41:49,38971,39,25434,ראשון לציון_רכבת הראשונים,07:50:00,30064,31.947781,34.823278,2019-11-03 08:41:49,2019-11-03 08:41:49,3243432


In [48]:
nodes_df[nodes_df['node_id'] == 1120240]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,trip_headsign,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
1120240,42245728_271019,08:35:49,08:35:49,76,53,25581,רכבת הראשונים,07:40:00,38906,31.948552,34.822422,2019-11-03 08:35:49,2019-11-03 08:35:49,1120240


In [49]:
nodes_df[nodes_df['node_id'] == 3243400]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,trip_headsign,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
3243400,36873380_271019,08:49:53,08:49:53,38971,31,25405,ראשון לציון_רכבת הראשונים,08:10:00,30064,31.947781,34.823278,2019-11-03 08:49:53,2019-11-03 08:49:53,3243400


In [50]:
transfer_edges[4000000:4000002]

[(1763216, 3026510, Timedelta('0 days 00:09:05')),
 (1763216, 3026503, Timedelta('0 days 00:14:05'))]

In [51]:
nodes_df[nodes_df['node_id'] == 3026510]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,trip_headsign,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
3026510,42299653_011119,09:00:00,09:00:00,42875,1,51,אשדוד_תחנה מרכזית,09:00:00,20001,32.083157,34.796091,2019-11-03 09:00:00,2019-11-03 09:00:00,3026510


In [52]:
nodes_df[nodes_df['node_id'] == 1763216]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,trip_headsign,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
1763216,36873382_271019,08:50:55,08:50:55,38946,2,25405,ראשון לציון_רכבת הראשונים,08:50:00,20015,32.081706,34.795561,2019-11-03 08:50:55,2019-11-03 08:50:55,1763216


In [53]:
nodes_df[nodes_df['node_id'] == 3026503]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,trip_headsign,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
3026503,35771757_271019,09:05:00,09:05:00,42875,1,20871,ראש העין_פארק תעשיה אפק,09:05:00,20001,32.083157,34.796091,2019-11-03 09:05:00,2019-11-03 09:05:00,3026503


## Load edges for stats

In [9]:
import pickle

with open(OUTPUT_PATH + 'morning_direct_edges.pkl', 'rb') as f:
    direct_edges = pickle.load(f)

In [10]:
with open(OUTPUT_PATH + 'morning_transfer_edges.pkl', 'rb') as f:
    transfer_edges = pickle.load(f)

In [12]:
len(direct_edges)

312105

In [13]:
len(transfer_edges)

4191638