In [16]:
from tqdm import tqdm
import json
import pandas as pd
import sys

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Relative locations of the shared helper library and of the data tree.
LIB_DIR = '../../lib'
DATA_DIR = '../../../data'
# Exports live in a subdirectory of the data tree.
EXPORTS_DIR = f'{DATA_DIR}/exports'

# EPSG identifier used as the latitude/longitude coordinate reference system.
CRS_LATLON = 'EPSG:4326'

In [3]:
# Custom imports
# Make the project's shared library importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
if LIB_DIR not in sys.path:
    sys.path.append(LIB_DIR)
from gtfs_sequence import Sequence

In [3]:
def peek(df, n=3):
    """Print the number of rows in `df` and display its first `n` rows.

    Args:
        df: Object supporting `len()` and `.iloc` slicing (e.g. a DataFrame).
        n: Number of leading rows to show. Defaults to 3.
    """
    print(len(df))
    # `display` is injected by IPython/Jupyter, so this helper is notebook-only.
    display(df.iloc[:n])

In [4]:
# Extract stops used for each trip.
# GTFS defines the order of a trip's stops by stop_sequence and does not
# guarantee the row order of stop_times.txt, so sort explicitly before
# dropping that column. Only the needed columns are parsed.
stop_times_df = (
    pd.read_csv(
        f'{DATA_DIR}/gtfs/manhattan/stop_times.txt',
        usecols=['trip_id', 'stop_id', 'stop_sequence'])
    .sort_values(['trip_id', 'stop_sequence'])
    .reset_index(drop=True)
    .loc[:, ['trip_id', 'stop_id']]
)
peek(stop_times_df)

934109


Unnamed: 0,trip_id,stop_id
0,MQ_C1-Weekday-032400_M57_451,400745
1,MQ_C1-Weekday-032400_M57_451,402233
2,MQ_C1-Weekday-032400_M57_451,403986


In [5]:
# Convert to/from a stop sequence.
def serialize_stop_ids(stop_ids):
    """Join the given stop IDs into one comma-separated string key."""
    return ','.join(map(str, stop_ids))
def deserialize_stops_key(stops_key):
    """Parse a comma-separated stops key into a list of integer stop IDs.

    An empty key yields an empty list: ''.split(',') produces [''], and
    int('') would otherwise raise ValueError.

    Args:
        stops_key: String of integer stop IDs separated by ','.

    Returns:
        List of ints in the order they appear in the key.

    Raises:
        ValueError: If a piece of a non-empty key is not a valid integer.
    """
    if stops_key == '':
        return []
    return [int(s) for s in stops_key.split(',')]

In [6]:
# Create dictionary of trip ID to stop sequence.
# Selecting the column before aggregating avoids running a lambda over every
# sub-frame, and sidesteps the pandas deprecation of groupby.apply operating
# on the grouping columns.
trip_id_stop_ids_dict = (
    stop_times_df.groupby('trip_id')['stop_id'].apply(list).to_dict())

# Serialize stop sequences.
trip_id_stops_key_dict = {
    trip_id: serialize_stop_ids(stop_ids)
    for trip_id, stop_ids in trip_id_stop_ids_dict.items()
}

# Map stop sequences to list of matching trips.
stops_key_trip_ids_dict = {}
for trip_id, stops_key in trip_id_stops_key_dict.items():
    # setdefault creates the list the first time a stop sequence is seen.
    stops_key_trip_ids_dict.setdefault(stops_key, []).append(trip_id)

In [7]:
# Extract information for each trip.
# Load trips.txt indexed by trip ID, then derive a plain trip -> route lookup.
trips_df = pd.read_csv(
    f'{DATA_DIR}/gtfs/manhattan/trips.txt'
).set_index('trip_id')
trip_id_route_id_dict = trips_df['route_id'].to_dict()
peek(trips_df)

29055


Unnamed: 0_level_0,route_id,service_id,trip_headsign,direction_id,shape_id
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MQ_C1-Weekday-032400_M57_451,M57,MQ_C1-Weekday,WEST SIDE BROADWAY-72 ST CROSSTOWN,1,M570110
MQ_C1-Weekday-034000_M57_451,M57,MQ_C1-Weekday,EAST SIDE YORK-60 ST CROSSTOWN,0,M570111
MQ_C1-Weekday-037300_M57_451,M57,MQ_C1-Weekday,WEST SIDE BROADWAY-72 ST CROSSTOWN,1,M570109


In [8]:
# Create sequences from map of stops and matching trips.
# Each distinct stop sequence becomes one Sequence, built from its parsed stop
# IDs, the trips that share it, and the trips table.
stops_key_progress = tqdm(stops_key_trip_ids_dict.items())
sequences = [
    Sequence(deserialize_stops_key(key), matching_trip_ids, trips_df)
    for key, matching_trip_ids in stops_key_progress
]

  0%|          | 0/146 [00:00<?, ?it/s]

NameError: name 'Sequence' is not defined

In [None]:
# Export sequences to JSON.
# Serialize fully before opening the file: opening with 'w' truncates it, so a
# failure in to_dict() or JSON encoding must not leave an empty or partial
# export behind.
sequence_dicts = [s.to_dict() for s in sequences]
sequences_json = json.dumps(sequence_dicts)
with open(f'{EXPORTS_DIR}/json/manhattan/sequences.json', 'w') as fp:
    fp.write(sequences_json)