## GTFS - Transfer Edges

In [74]:
import bisect
import pandas as pd
import datetime
import numpy as np
import geopy.distance
from tqdm.auto import tqdm
from tqdm import tqdm_notebook
tqdm.pandas()
import dask.dataframe as dd
from dask.multiprocessing import get
from geopy.point import Point
import datetime as dt
from multiprocessing import Pool, cpu_count
from contextlib import closing
import math

from dask.diagnostics import ProgressBar
ProgressBar().register()

DATA_PATH = '../../../input_data/synthetic_examples/input_data/Test0_b_TwoLines/'

In [75]:
OUTPUT_PATH = '../../../output_data/validation/two_lines/'

In [76]:
DAY = dt.datetime(2015, 12, 14)

## Load Nodes

In [77]:
# Load nodes
FILES_PREFIX = 'morning'
file_name = OUTPUT_PATH + FILES_PREFIX + '_nodes.pkl'
print(f'Loading nodes from file {file_name}')
nodes_df = pd.read_pickle(file_name)
nodes_df.head(3)

Loading nodes from file ../../../output_data/validation/two_lines/morning_nodes.pkl


Unnamed: 0,index,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
0,22,18854212_161215,07:35:48,07:35:48,13257,30,9823,06:53:00,22557,32.040172,34.775255,2015-12-14 07:35:48,2015-12-14 07:35:48,0
1,40,18854213_161215,07:52:48,07:52:48,13257,30,9823,07:10:00,22557,32.040172,34.775255,2015-12-14 07:52:48,2015-12-14 07:52:48,1
2,41,18854214_161215,08:22:48,08:22:48,13257,30,9823,07:40:00,22557,32.040172,34.775255,2015-12-14 08:22:48,2015-12-14 08:22:48,2


In [78]:
nodes_df.shape

(557, 14)

In [79]:
nodes_df.nunique()

index                            557
trip_id                           22
arrival_time                     533
departure_time_stop              533
stop_id                           80
stop_sequence                     50
route_id                           2
departure_time_trip_departure     21
stop_code                         80
stop_lat                          80
stop_lon                          80
arrival                          533
departure                        533
node_id                          557
dtype: int64

### Filter nodes for development

In [80]:
# start_time = DAY + dt.timedelta(hours=8)
# end_time = start_time + dt.timedelta(minutes=10)

# nodes_df = nodes_df[nodes_df['arrival'] > start_time][nodes_df['arrival'] < end_time]

In [81]:
# nodes_df.shape

In [82]:
# nodes_df.nunique()

## Parallelizing

In [83]:
def run_parallel_df(df_grouped, func):
    """Apply `func` to every group of a grouped DataFrame using a process pool.

    Args:
        df_grouped: An iterable of ``(name, group)`` pairs, e.g. the result of
            ``DataFrame.groupby(...)``. Only the group part is used.
        func: A picklable callable that receives one group and returns a
            pandas object that ``pd.concat`` can combine.

    Returns:
        The per-group results concatenated in group order.
    """
    # Materialise the groups first; the group names are not needed.
    group_frames = []
    for _, frame in df_grouped:
        group_frames.append(frame)

    workers = Pool()
    with closing(workers):
        per_group_results = workers.map(func, group_frames)

    return pd.concat(per_group_results)

In [84]:
def run_parallel_dict(data, num_of_batches, func):
    """Process a dictionary in parallel by splitting its items into batches.

    The dictionary items are cut into consecutive, non-overlapping batches of
    roughly ``len(data) / num_of_batches`` items each. Every batch (a list of
    ``(key, value)`` tuples) is handed to `func` in a worker process, and the
    iterables returned by the workers are flattened into a single list.

    Args:
        data: The dictionary to process.
        num_of_batches: Desired number of batches. The actual number can be
            lower when there are fewer items than batches.
        func: A picklable callable that receives a list of ``(key, value)``
            tuples and returns an iterable of results.

    Returns:
        A flat list with the results of all batches, in batch order.
    """
    data_items = list(data.items())
    # Never let the batch size drop to zero (e.g. for an empty dictionary),
    # since it is used as the slicing stride below.
    batch_size = max(1, math.ceil(len(data_items) / num_of_batches))
    print(f'Processing data in workers pool. Numner of batches is {num_of_batches} where each batch has approx. {batch_size} keys')
    # Step through the items in strides of batch_size so that every item lands
    # in exactly one batch. (Stepping by 1 would yield overlapping batches and
    # leave most of the items unprocessed.)
    sliced_data = [data_items[start:start + batch_size]
                   for start in range(0, len(data_items), batch_size)]
    print('Finished slicing the data.')
    if not sliced_data:
        # Nothing to do; avoid starting worker processes for no work.
        return []
    with closing(Pool()) as pool:
        res_list = pool.map(func, sliced_data)
    return [item for sublist in res_list for item in sublist]

#### Testing parallelization

In [85]:
# d = {'a': [1,2,3], 'b':[1]}
# # num_of_batches = 2
# # d_items = list(d.items())
# # batch_size = math.ceil(len(d) / num_of_batches)
# # sliced_d = [d_items[i:i+batch_size] for i in range(num_of_batches)]
# # print(len(d))

# def print_dict_with_sleep(data):
#     print('processing..')
#     data = dict(data)
#     for k,v in data.items():
#         print(len(v))
# #     time.sleep(3)
# run_parallel_dict(d, 2, print_dict_with_sleep)

## Load Nearby Stops

In [86]:
stops_mapping_df = pd.read_csv(DATA_PATH + 'StopsToStops.csv')

In [87]:
stops_mapping_df.head(3)

Unnamed: 0,NEAR_DIST,IN_STOP_ID,NEAR_STOP_ID
0,175.710767,12936,13950
1,199.559271,12936,13049
2,199.559271,13049,12936


In [88]:
stops_mapping_df.columns = ['NEAR_DIST', 'IN_FID', 'NEAR_FID']

In [89]:
# stops_mapping_df.shape

In [90]:
# stops_mapping_df.nunique()

If we allow walking 300 meters we get an average of 6.24 transfers to neighboring stops from each stop. 

In [91]:
# TODO: 
# change walk speed to 1m/s - IMPL
# maybe switch to 100-150
# 15 minutes walk + wait

# hoping to get to 10-20 edges per node

In [92]:
stops_df = pd.read_csv(DATA_PATH + 'stops.txt')

### Get all stops from which we wish to start computations

In [93]:
nodes_df.head()

Unnamed: 0,index,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
0,22,18854212_161215,07:35:48,07:35:48,13257,30,9823,06:53:00,22557,32.040172,34.775255,2015-12-14 07:35:48,2015-12-14 07:35:48,0
1,40,18854213_161215,07:52:48,07:52:48,13257,30,9823,07:10:00,22557,32.040172,34.775255,2015-12-14 07:52:48,2015-12-14 07:52:48,1
2,41,18854214_161215,08:22:48,08:22:48,13257,30,9823,07:40:00,22557,32.040172,34.775255,2015-12-14 08:22:48,2015-12-14 08:22:48,2
3,49,18854385_161215,08:07:48,08:07:48,13257,30,9823,07:25:00,22557,32.040172,34.775255,2015-12-14 08:07:48,2015-12-14 08:07:48,3
4,63,18854384_161215,07:17:48,07:17:48,13257,30,9823,06:35:00,22557,32.040172,34.775255,2015-12-14 07:17:48,2015-12-14 07:17:48,4


In [94]:
t = DAY+dt.timedelta(hours=7, minutes=15)
start_nodes_df = nodes_df[nodes_df['departure'] < t]

In [95]:
start_nodes_df.shape

(79, 14)

In [96]:
start_nodes_df.head()

Unnamed: 0,index,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
5,85,18854211_161215,07:00:37,07:00:37,13428,31,9823,06:15:00,25273,32.03778,34.777806,2015-12-14 07:00:37,2015-12-14 07:00:37,5
11,149,18854211_161215,07:05:45,07:05:45,21520,32,9823,06:15:00,33347,32.027036,34.773495,2015-12-14 07:05:45,2015-12-14 07:05:45,11
16,213,18854211_161215,07:07:09,07:07:09,24141,33,9823,06:15:00,36375,32.02375,34.774357,2015-12-14 07:07:09,2015-12-14 07:07:09,16
21,277,18854211_161215,07:08:07,07:08:07,24135,34,9823,06:15:00,36367,32.021815,34.773176,2015-12-14 07:08:07,2015-12-14 07:08:07,21
26,341,18854211_161215,07:09:23,07:09:23,24717,35,9823,06:15:00,37566,32.018598,34.770659,2015-12-14 07:09:23,2015-12-14 07:09:23,26


#### Validation - let's find the most frequent origin stop and make sure it looks right.

In [97]:
# stops_mapping_150m_df.IN_FID.mode()

In [98]:
# stops_df.iloc[10841]

In [99]:
# stops_df.iloc[10841]

We can see this is the central bus station, so it makes sense that it has the most possible transfers.

Now we must re-map the nearby stops to stop_id as used in the GTFS feeds.

In [100]:
stops_df['IN_FID'] = stops_df['stop_id']

In [101]:
stops_df[['IN_FID', 'stop_id']].head()

Unnamed: 0,IN_FID,stop_id
0,775,775
1,12882,12882
2,12936,12936
3,13042,13042
4,13049,13049


In [102]:
stops_df.shape

(80, 9)

In [103]:
stops_mapping_df.head(2)

Unnamed: 0,NEAR_DIST,IN_FID,NEAR_FID
0,175.710767,12936,13950
1,199.559271,12936,13049


In [104]:
# Translate both ends of every stop pair to GTFS stop ids: first the origin
# (IN_FID -> from_stop_id), then the destination (NEAR_FID -> to_stop_id).
# The distance column is renamed to 'dist' along the way.
stop_id_lookup_df = stops_df[['IN_FID', 'stop_id']]
stops_mapping_df = (
    stops_mapping_df
    .merge(stop_id_lookup_df, on='IN_FID', how='left')
    .rename(columns={'stop_id': 'from_stop_id'})
    .merge(stop_id_lookup_df, left_on='NEAR_FID', right_on='IN_FID', how='left')
    .rename(columns={'stop_id': 'to_stop_id', 'NEAR_DIST': 'dist'})
)

In [105]:
stops_mapping_df = stops_mapping_df[['from_stop_id', 'to_stop_id', 'dist']]
stops_mapping_df.head()

Unnamed: 0,from_stop_id,to_stop_id,dist
0,12936,13950,175.710767
1,12936,13049,199.559271
2,13049,12936,199.559271
3,13226,13226,0.0
4,13598,29488,177.205069


In [106]:
AVG_WALK_SPEED = 1  # meters per second (m/s)
stops_mapping_df['walk_time_sec'] = stops_mapping_df['dist'] / AVG_WALK_SPEED

In [107]:
stops_mapping_df.head(3)

Unnamed: 0,from_stop_id,to_stop_id,dist,walk_time_sec
0,12936,13950,175.710767,175.710767
1,12936,13049,199.559271,199.559271
2,13049,12936,199.559271,199.559271


In [108]:
stops_mapping_df.to_csv(OUTPUT_PATH + 'nearby_stops.csv')

In [109]:
stops_mapping_df.to_pickle(OUTPUT_PATH + 'nearby_stops.pkl')

In [110]:
stops_mapping_df.shape

(98, 4)

In [111]:
stops_mapping_df.nunique()

from_stop_id     80
to_stop_id       80
dist             10
walk_time_sec    10
dtype: int64

In [112]:
# stops_mapping_df = pd.read_pickle(OUTPUT_PATH + 'nearby_stops.pkl')

In [113]:
stops_mapping_200m_df = stops_mapping_df[stops_mapping_df['dist'] < 200]

In [114]:
stops_mapping_200m_df.shape

(98, 4)

In [115]:
stops_mapping_200m_df.nunique()

from_stop_id     80
to_stop_id       80
dist             10
walk_time_sec    10
dtype: int64

## Compute Transfer Edges

In [116]:
stops_to_nodes = nodes_df.groupby('stop_id')[[
    'node_id', 'trip_id', 'arrival', 'departure', 'route_id']].apply(lambda g: g.values.tolist()).to_dict()

In [117]:
dict(list(stops_to_nodes.items())[0:2])

{775: [[31,
   '18854211_161215',
   Timestamp('2015-12-14 07:10:36'),
   Timestamp('2015-12-14 07:10:36'),
   9823],
  [32,
   '18854212_161215',
   Timestamp('2015-12-14 07:48:36'),
   Timestamp('2015-12-14 07:48:36'),
   9823],
  [33,
   '18854213_161215',
   Timestamp('2015-12-14 08:05:36'),
   Timestamp('2015-12-14 08:05:36'),
   9823],
  [34,
   '18854385_161215',
   Timestamp('2015-12-14 08:20:36'),
   Timestamp('2015-12-14 08:20:36'),
   9823],
  [35,
   '18854384_161215',
   Timestamp('2015-12-14 07:30:36'),
   Timestamp('2015-12-14 07:30:36'),
   9823]],
 12816: [[332,
   '19283212_161215',
   Timestamp('2015-12-14 07:08:16'),
   Timestamp('2015-12-14 07:08:16'),
   2517],
  [333,
   '19283218_161215',
   Timestamp('2015-12-14 08:05:16'),
   Timestamp('2015-12-14 08:05:16'),
   2517],
  [334,
   '19283220_161215',
   Timestamp('2015-12-14 08:23:16'),
   Timestamp('2015-12-14 08:23:16'),
   2517],
  [335,
   '19283214_161215',
   Timestamp('2015-12-14 07:28:16'),
   Timestamp(

In [118]:
dict(list(stops_to_nodes.items()))

{775: [[31,
   '18854211_161215',
   Timestamp('2015-12-14 07:10:36'),
   Timestamp('2015-12-14 07:10:36'),
   9823],
  [32,
   '18854212_161215',
   Timestamp('2015-12-14 07:48:36'),
   Timestamp('2015-12-14 07:48:36'),
   9823],
  [33,
   '18854213_161215',
   Timestamp('2015-12-14 08:05:36'),
   Timestamp('2015-12-14 08:05:36'),
   9823],
  [34,
   '18854385_161215',
   Timestamp('2015-12-14 08:20:36'),
   Timestamp('2015-12-14 08:20:36'),
   9823],
  [35,
   '18854384_161215',
   Timestamp('2015-12-14 07:30:36'),
   Timestamp('2015-12-14 07:30:36'),
   9823]],
 12816: [[332,
   '19283212_161215',
   Timestamp('2015-12-14 07:08:16'),
   Timestamp('2015-12-14 07:08:16'),
   2517],
  [333,
   '19283218_161215',
   Timestamp('2015-12-14 08:05:16'),
   Timestamp('2015-12-14 08:05:16'),
   2517],
  [334,
   '19283220_161215',
   Timestamp('2015-12-14 08:23:16'),
   Timestamp('2015-12-14 08:23:16'),
   2517],
  [335,
   '19283214_161215',
   Timestamp('2015-12-14 07:28:16'),
   Timestamp(

In [119]:
DEPARTURE_INDEX = 3
# Order the nodes of every stop by departure time so that later lookups can
# binary-search them.
for stop in stops_to_nodes:
    stops_to_nodes[stop] = sorted(stops_to_nodes[stop], key=lambda node: node[DEPARTURE_INDEX])

In [120]:
# Count the nodes attached to every stop, then derive the total and the maximum.
nodes_per_stop = [len(stop_nodes) for stop_nodes in stops_to_nodes.values()]
total_values = sum(nodes_per_stop)
max_nodes_in_stop = max(nodes_per_stop, default=0)
print(f'There is a total of {total_values} nodes in the stops_to_nodes dictionary')

There is a total of 557 nodes in the stops_to_nodes dictionary


In [121]:
len(stops_to_nodes)

80

In [122]:
print(f'We have an average of {total_values/len(stops_to_nodes)}, and a max of {max_nodes_in_stop} nodes in a single stop.')

We have an average of 6.9625, and a max of 15 nodes in a single stop.


In [123]:
ARRIVAL_INDEX = 2
DEPARTURE_INDEX = 3
ROUTE_ID_INDEX = 4
MAX_WAIT_TIME = dt.timedelta(minutes=10)
def get_transfer_edges_from_nodes(stops_to_nodes_batch):
    """Compute the transfer edges that start at the nodes of the given stops.

    For every node of every stop in the batch, all stops listed as reachable
    from that stop in ``stops_mapping_200m_df`` (plus the stop itself) are
    scanned for nodes that depart after the rider could have walked there and
    no more than ``MAX_WAIT_TIME`` later.

    Notes:
        * Destination nodes are looked up in the notebook-level
          ``stops_to_nodes`` dictionary (not in the batch), so transfers to
          stops outside the batch are found as well.
        * The node lists in ``stops_to_nodes`` must already be sorted by
          departure time, because the candidates are located by binary search.
        * Each node is a list laid out as
          ``[node_id, trip_id, arrival, departure, route_id]``.

    Args:
        stops_to_nodes_batch: A dictionary, or an iterable of
            ``(stop_id, nodes)`` pairs, holding the origin stops to process.

    Returns:
        A list of ``(from_node_id, to_node_id, wait)`` tuples, where ``wait``
        is the destination node's departure minus the origin node's arrival
        (so it includes the walking time).
    """
    print('Processing batch...')
    stops_to_nodes_batch = dict(stops_to_nodes_batch)
    transfer_edges = []
    # Departure times per destination stop, built lazily and reused for bisect.
    departures_by_stop = {}
    for stop, nodes in tqdm(stops_to_nodes_batch.items()):
        # The reachable stops depend on the stop only, so look them up once per
        # stop rather than once per node.
        nearby_stops_df = stops_mapping_200m_df[stops_mapping_200m_df['from_stop_id'] == stop]
        walk_options = list(zip(nearby_stops_df['to_stop_id'], nearby_stops_df['walk_time_sec']))
        # Add the current stop to check transfers within the same stop, unless
        # the mapping already lists it (which would duplicate the edges).
        if not any(to_stop_id == stop for to_stop_id, _ in walk_options):
            walk_options.append((stop, 0))
        for start_n in nodes:
            for nearby_stop_id, walk_time_sec in walk_options:
                if nearby_stop_id not in stops_to_nodes:
                    # Some stops don't have trips that operate all week. Some operate only on weekends.
                    # If this is such a stop we should continue to look at other stops, we won't find any
                    # nodes here.
                    continue
                nearby_nodes = stops_to_nodes[nearby_stop_id]
                if nearby_stop_id not in departures_by_stop:
                    departures_by_stop[nearby_stop_id] = [n[DEPARTURE_INDEX] for n in nearby_nodes]
                departures = departures_by_stop[nearby_stop_id]
                second_line_earliest_start_time = (
                    start_n[ARRIVAL_INDEX] + dt.timedelta(seconds=float(walk_time_sec)))
                second_line_latest_start_time = second_line_earliest_start_time + MAX_WAIT_TIME
                # Nodes departing within [earliest, latest] form one contiguous
                # slice of the departure-sorted list.
                first = bisect.bisect_left(departures, second_line_earliest_start_time)
                last = bisect.bisect_right(departures, second_line_latest_start_time)
                for candidate in nearby_nodes[first:last]:
                    if candidate[0] == start_n[0]:
                        # We don't wish to transfer to the same node (no self-edges).
                        continue
                    if nearby_stop_id == stop and candidate[ROUTE_ID_INDEX] == start_n[ROUTE_ID_INDEX]:
                        # Avoid transferring to the same line (route_id) in the same stop.
                        continue
                    # TODO: transfers between two nodes of the same trip_id are still emitted.
                    transfer_edges.append(
                        (start_n[0], candidate[0], candidate[DEPARTURE_INDEX] - start_n[ARRIVAL_INDEX]))
    return transfer_edges

In [124]:
# transfer_edges = run_parallel_dict(stops_to_nodes, 1000, get_transfer_edges_from_nodes)

In [125]:
transfer_edges = get_transfer_edges_from_nodes(stops_to_nodes)

Processing batch...


HBox(children=(IntProgress(value=0, max=80), HTML(value='')))

In [126]:
len(transfer_edges)

318

In [66]:
transfer_edges[:3]

[(9, 63, Timedelta('0 days 00:08:01')),
 (6, 59, Timedelta('0 days 00:08:01')),
 (7, 60, Timedelta('0 days 00:08:01'))]

In [68]:
nodes_df[nodes_df['node_id'] == 9]

Unnamed: 0,index,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
9,90,19415585_091215,07:12:00,07:12:00,12936,2,9823,07:10:00,21227,32.106707,34.802776,2015-12-14 07:12:00,2015-12-14 07:12:00,9


In [69]:
nodes_df[nodes_df['node_id'] == 63]

Unnamed: 0,index,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
63,639,19415585_091215,07:20:01,07:20:01,13950,11,9823,07:10:00,26315,32.106109,34.8045,2015-12-14 07:20:01,2015-12-14 07:20:01,63


## TODO: we should eliminate transfers between nodes of the same trip_id

In [127]:
# Convert the wait time of every edge from a Timedelta to plain seconds.
# A comprehension keeps the loop variables local, so the notebook-level `t`
# (the cut-off time used to select the start nodes) is not overwritten.
transfer_edges_sec = [
    (source_node, target_node, wait.total_seconds())
    for source_node, target_node, wait in transfer_edges
]

In [128]:
transfer_edges_sec[0:5]

[(332, 340, 600.0),
 (340, 335, 600.0),
 (335, 336, 600.0),
 (336, 337, 540.0),
 (337, 338, 540.0)]

In [129]:
import pickle
with open(OUTPUT_PATH + FILES_PREFIX + '_transfer_edges.pkl', 'wb') as f:
    print(f'Saving transfers to file {f.name}')
    pickle.dump(transfer_edges_sec, f)

Saving transfers to file ../../../output_data/validation/two_lines/morning_transfer_edges.pkl


## Let's check some transfers manually

In [46]:
transfer_edges[1000:1005]

[(1120240, 3243432, Timedelta('0 days 00:06:00')),
 (1120240, 3243400, Timedelta('0 days 00:14:04')),
 (1120240, 3243469, Timedelta('0 days 00:11:49')),
 (1120240, 1119942, Timedelta('0 days 00:09:52')),
 (1120240, 1119886, Timedelta('0 days 00:12:17'))]

In [47]:
nodes_df[nodes_df['node_id'] == 3243432]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,trip_headsign,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
3243432,36877179_271019,08:41:49,08:41:49,38971,39,25434,ראשון לציון_רכבת הראשונים,07:50:00,30064,31.947781,34.823278,2019-11-03 08:41:49,2019-11-03 08:41:49,3243432


In [48]:
nodes_df[nodes_df['node_id'] == 1120240]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,trip_headsign,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
1120240,42245728_271019,08:35:49,08:35:49,76,53,25581,רכבת הראשונים,07:40:00,38906,31.948552,34.822422,2019-11-03 08:35:49,2019-11-03 08:35:49,1120240


In [49]:
nodes_df[nodes_df['node_id'] == 3243400]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,trip_headsign,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
3243400,36873380_271019,08:49:53,08:49:53,38971,31,25405,ראשון לציון_רכבת הראשונים,08:10:00,30064,31.947781,34.823278,2019-11-03 08:49:53,2019-11-03 08:49:53,3243400


In [50]:
transfer_edges[4000000:4000002]

[(1763216, 3026510, Timedelta('0 days 00:09:05')),
 (1763216, 3026503, Timedelta('0 days 00:14:05'))]

In [51]:
nodes_df[nodes_df['node_id'] == 3026510]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,trip_headsign,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
3026510,42299653_011119,09:00:00,09:00:00,42875,1,51,אשדוד_תחנה מרכזית,09:00:00,20001,32.083157,34.796091,2019-11-03 09:00:00,2019-11-03 09:00:00,3026510


In [52]:
nodes_df[nodes_df['node_id'] == 1763216]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,trip_headsign,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
1763216,36873382_271019,08:50:55,08:50:55,38946,2,25405,ראשון לציון_רכבת הראשונים,08:50:00,20015,32.081706,34.795561,2019-11-03 08:50:55,2019-11-03 08:50:55,1763216


In [53]:
nodes_df[nodes_df['node_id'] == 3026503]

Unnamed: 0,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,trip_headsign,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
3026503,35771757_271019,09:05:00,09:05:00,42875,1,20871,ראש העין_פארק תעשיה אפק,09:05:00,20001,32.083157,34.796091,2019-11-03 09:05:00,2019-11-03 09:05:00,3026503
