## GTFS - Direct Edges

In [1]:
import datetime
import pickle

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Presumably the directory of the raw GTFS feed (judging by the path); it is not
# referenced by any other cell visible in this notebook.
DATA_PATH = '../../input_data/GTFS-28-Oct-19/'

In [2]:
# Prefix for the node pickle that is read and the edge pickle that is written below.
# It is joined to file names by plain string concatenation, so the trailing slash matters.
OUTPUT_PATH = '../../output_data/'

## Load Nodes

In [4]:
# Read the pickled node table selected by FILES_PREFIX.
# The same prefix is reused later when the edge file is written.
FILES_PREFIX = 'morning'
# Alternative data set: FILES_PREFIX = 'single_trip'
file_name = f'{OUTPUT_PATH}{FILES_PREFIX}_nodes.pkl'
print('Loading nodes from file ' + file_name)
nodes_df = pd.read_pickle(file_name)
# Show the first rows as a quick visual check of the loaded columns.
nodes_df.head(3)

Loading nodes from file ../../output_data/morning_nodes.pkl


Unnamed: 0,index,trip_id,arrival_time,departure_time_stop,stop_id,stop_sequence,route_id,trip_headsign,departure_time_trip_departure,stop_code,stop_lat,stop_lon,arrival,departure,node_id
0,18,14824097_271019,08:02:40,08:02:40,2356,2,1606,נוף הגליל_שלום,08:00:00,41476,32.793214,35.038925,2019-11-03 08:02:40,2019-11-03 08:02:40,0
1,19,14824098_271019,09:02:40,09:02:40,2356,2,1606,נוף הגליל_שלום,09:00:00,41476,32.793214,35.038925,2019-11-03 09:02:40,2019-11-03 09:02:40,1
2,36,19590744_271019,08:14:17,08:14:17,2356,2,16379,טבריה_תחנה מרכזית,08:10:00,41476,32.793214,35.038925,2019-11-03 08:14:17,2019-11-03 08:14:17,2


In [5]:
# (rows, columns) of the loaded node table.
nodes_df.shape

(325870, 15)

## Compute Direct Edges

In [8]:
# First we verify that the stops in each trip are consecutive

# NOTE: this binds a second name to the same DataFrame object; no copy is made.
tmp_df = nodes_df

def verify_consective(l):
    """Check that the values of ``l`` form a gap-free, duplicate-free integer run.

    The values are sorted and compared with ``range(min(l), max(l) + 1)``.
    Nothing is returned when they match; otherwise a message is printed and
    a plain ``Exception`` is raised.
    """
    observed = list(l)
    observed.sort()
    expected = list(range(min(l), max(l) + 1))
    if observed == expected:
        return
    print('Found non-consecutive stop sequence')
    raise Exception('NON CONSECUTIVE STOP SEQUENCE')

In [9]:
# Run the check once per trip_id group; the first trip whose stop_sequence values
# are not consecutive raises and aborts the cell.
tmp_df[['trip_id', 'stop_sequence']].groupby('trip_id').apply(lambda x: verify_consective(x.stop_sequence))

No exception was raised, so the stop sequences of every trip are consecutive and we can proceed.

In [10]:
# Drops only the temporary name; the DataFrame itself stays reachable through nodes_df.
del tmp_df

In [11]:
def create_direct_edges_for_trip(x, edges=None):
    """Add one directed edge for every pair of successive stops of a single trip.

    For each row of ``x`` the row whose ``stop_sequence`` is exactly one
    higher is looked up. If it exists, the tuple
    ``(node_id, next_node_id, weight)`` is added to ``edges``, where the
    weight is the successor's ``departure`` minus the current row's
    ``arrival``, expressed in seconds. Rows without a successor (such as the
    last stop of the trip) produce no edge.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one trip. Must provide the columns ``node_id``,
        ``stop_sequence``, ``arrival`` and ``departure``.
    edges : set, optional
        Collection that receives the edge tuples. When omitted, the
        module-level ``direct_edges`` set is used; it must exist before the
        function is called.

    Returns
    -------
    None
        Results are delivered through ``edges`` only.

    Raises
    ------
    AssertionError
        If more than one row carries the ``stop_sequence`` value that follows
        some row's value.
    """
    if edges is None:
        # Backward-compatible default: accumulate into the notebook-level set.
        edges = direct_edges

    # Pull the columns out once; per-row DataFrame filtering is what made the
    # previous implementation quadratic in the number of stops of a trip.
    node_ids = x['node_id'].values
    stop_seqs = x['stop_sequence'].tolist()
    arrivals = x['arrival'].values
    departures = x['departure'].values

    # stop_sequence value -> positions of the rows carrying it.
    rows_by_seq = {}
    for pos, seq in enumerate(stop_seqs):
        rows_by_seq.setdefault(seq, []).append(pos)

    one_second = np.timedelta64(1, 's')
    for pos, seq in enumerate(stop_seqs):
        # For the same trip we want to take the next node
        successors = rows_by_seq.get(seq + 1, ())
        if not successors:
            # No row follows this one in the trip, so there is no outgoing edge.
            continue
        assert len(successors) == 1
        nxt = successors[0]

        # NOTE(review): the weight runs from arrival at the current stop to
        # departure from the next stop, so it includes dwell time at both
        # stops - confirm this is intended.
        w = (departures[nxt] - arrivals[pos]) / one_second
        edges.add((node_ids[pos], node_ids[nxt], w))

<span style="background-color: #F1D9F9">TODO: parallelize this step: group by trip_id, then split the groups (trips) into batches and apply `create_direct_edges_for_trip` to each group using a worker pool. We need to make sure that the shared `direct_edges` set is accessed in a thread/process-safe way.</span>

In [12]:
# For each trip (grouping by trip), create edges for this trip's consecutive nodes.
# tqdm is imported together with the other imports at the top of the notebook;
# tqdm.pandas() registers progress_apply on pandas groupby objects.
tqdm.pandas()

# Start from an empty set so that re-running this cell does not keep edges
# collected by an earlier run.
direct_edges = set()
nodes_df[['node_id', 'trip_id', 'stop_sequence', 'arrival', 'departure']].groupby('trip_id').progress_apply(create_direct_edges_for_trip)

  from pandas import Panel


HBox(children=(FloatProgress(value=0.0, max=13765.0), HTML(value='')))




In [13]:
# Number of distinct edges collected. With consecutive stop sequences every trip
# contributes one edge per node except one, so this should equal the number of
# nodes minus the number of trips (325870 - 13765 = 312105 in the recorded run).
len(direct_edges)

312105

In [14]:
# Persist the collected edge set under OUTPUT_PATH, using the same prefix as the node file.
direct_edges_path = f'{OUTPUT_PATH}{FILES_PREFIX}_direct_edges.pkl'
with open(direct_edges_path, mode='wb') as f:
    pickle.dump(direct_edges, f)