In [1]:
from collections import Counter
from tqdm import tqdm
import json
import pandas as pd

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Root of the data directory, relative to this notebook.
DATA_DIR = '../../../data'
# Directory for exported artifacts, located under the data root.
EXPORTS_DIR = f'{DATA_DIR}/exports'
# Identifier of the latitude/longitude coordinate reference system.
CRS_LATLON = 'EPSG:4326'

In [3]:
def peek(df):
    """Print the number of rows in ``df`` and display its first three rows.

    Args:
        df: An object supporting ``len()`` and ``.iloc`` slicing (the
            notebook passes pandas DataFrames).
    """
    row_count = len(df)
    print(row_count)
    preview = df.iloc[:3]
    display(preview)

In [4]:
# Extract stops used for each trip.
# GTFS does not require stop_times.txt rows to be stored in visiting order, so
# order each trip's rows by stop_sequence before that column is dropped. Any
# later step that reads stop IDs in row order then sees them in travel order.
stop_times_df = pd.read_csv(f'{DATA_DIR}/gtfs/manhattan/stop_times.txt')
stop_times_df = stop_times_df.sort_values(['trip_id', 'stop_sequence'])
stop_times_df = stop_times_df[['trip_id', 'stop_id']].reset_index(drop=True)
peek(stop_times_df)

NameError: name 'pd' is not defined

In [None]:
# Convert to/from a stop sequence.
def serialize_stop_ids(stop_ids):
    """Join stop IDs into a single comma-separated string.

    Args:
        stop_ids: Iterable of stop IDs; each is converted with ``str``.

    Returns:
        The IDs joined by ``','`` (an empty string for an empty iterable).
    """
    return ','.join(map(str, stop_ids))
def deserialize_stop_ids(stop_ids):
    """Parse a comma-separated string of stop IDs into a list of ints.

    Args:
        stop_ids: String of integer stop IDs separated by ``','``.

    Returns:
        List of stop IDs as ``int``; an empty list for an empty string.

    Raises:
        ValueError: If any comma-separated piece is not a valid integer.
    """
    # ''.split(',') yields [''] and int('') raises, so an empty sequence
    # (which serializes to '') must be handled explicitly.
    if stop_ids == '':
        return []
    return [int(s) for s in stop_ids.split(',')]

In [None]:
# Trip ID -> list of that trip's stop IDs, in the row order of stop_times_df.
trip_id_stop_ids_dict = (
    stop_times_df
    .groupby('trip_id')
    .apply(lambda trip_rows: list(trip_rows['stop_id']))
    .to_dict()
)

# Trip ID -> serialized stop sequence (a hashable key for the stop list).
trip_id_stops_key_dict = dict(zip(
    trip_id_stop_ids_dict.keys(),
    map(serialize_stop_ids, trip_id_stop_ids_dict.values()),
))

# Serialized stop sequence -> every trip ID that follows exactly that sequence.
stops_key_trip_ids_dict = {}
for trip_id, stops_key in trip_id_stops_key_dict.items():
    stops_key_trip_ids_dict.setdefault(stops_key, []).append(trip_id)

In [None]:
# Load per-trip information, indexed by trip ID so rows can be looked up
# directly by trip.
trips_df = (
    pd.read_csv(f'{DATA_DIR}/gtfs/manhattan/trips.txt')
    .set_index('trip_id')
)
# Trip ID -> route ID lookup.
trip_id_route_id_dict = trips_df.loc[:, 'route_id'].to_dict()
peek(trips_df)

In [None]:
def most_common(list):
    """Return the value that occurs most often in ``list``.

    Ties are resolved by ``collections.Counter.most_common``.

    Raises:
        IndexError: If ``list`` is empty.
    """
    ranked = Counter(list).most_common(1)
    winner, _ = ranked[0]
    return winner

In [None]:
class Sequence:
    """An ordered list of stops together with the trips that follow it.

    The descriptive attributes (``direction_id``, ``route_id``,
    ``service_id``, ``trip_headsign``, ``shape_id``) are each set to the value
    that occurs most often among the matching trips.
    """

    def __init__(self, stop_ids, trip_ids, trips_df):
        """Store the stop and trip IDs and derive the sequence attributes.

        Args:
            stop_ids: Stop IDs making up this sequence.
            trip_ids: IDs of the trips that use this stop sequence.
            trips_df: Trip table indexed by trip ID; it must contain the
                columns read by ``set_attributes``.
        """
        self.stop_ids = stop_ids
        self.trip_ids = trip_ids
        self.set_attributes(trips_df)

    def get_most_common(self, trips_df, column):
        """Returns the most common value of the given column for this
        sequence."""
        # Look up this column for every trip that shares the stop sequence.
        values = [trips_df.loc[trip_id][column] for trip_id in self.trip_ids]
        return most_common(values)

    def set_attributes(self, trips_df):
        """Assign attributes to this sequence based on the attribute that
        appears most often in matching trips. For example, if the majority of
        trips with this stop sequence have a route ID "M15", then this sequence
        of stops will be labelled as having an "M15" route ID. This is necessary
        due to mislabelled attributes."""
        self.direction_id = self.get_most_common(trips_df, 'direction_id')
        self.route_id = self.get_most_common(trips_df, 'route_id')
        self.service_id = self.get_most_common(trips_df, 'service_id')
        self.trip_headsign = self.get_most_common(trips_df, 'trip_headsign')
        self.shape_id = self.get_most_common(trips_df, 'shape_id')

    def to_dict(self):
        """Return this sequence as a plain dictionary for export.

        ``direction_id`` and every stop ID are converted with ``int``; the
        remaining attributes are passed through unchanged.
        """
        return {
            'direction_id': int(self.direction_id),
            'route_id': self.route_id,
            # Export the service ID itself (previously the trip headsign was
            # written under this key by mistake).
            'service_id': self.service_id,
            'shape_id': self.shape_id,
            'stop_ids': [int(s) for s in self.stop_ids],
            'trip_headsign': self.trip_headsign,
            'trip_ids': self.trip_ids,
        }

In [None]:
# Create sequences from map of stops and matching trips.
# Each key is a serialized stop sequence, so decode it back into a list of
# stop IDs with deserialize_stop_ids before building the Sequence.
sequences = [
    Sequence(deserialize_stop_ids(stops_key), trip_ids, trips_df)
    for stops_key, trip_ids in tqdm(stops_key_trip_ids_dict.items())
]

In [None]:
# Export sequences to JSON.
# Serialize everything before opening the file: opening with 'w' truncates it,
# so a failure while encoding would otherwise leave an empty or partial
# sequences.json behind.
sequence_dicts = [s.to_dict() for s in sequences]
sequences_json = json.dumps(sequence_dicts)
with open(f'{EXPORTS_DIR}/json/manhattan/sequences.json', 'w') as fp:
    fp.write(sequences_json)