In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (such as the config module) are re-imported before each cell executes;
# edits to them then take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# The cells below use pd, os, timedelta, gtfs_path, gtfs_dataset and data_path
# without importing them anywhere else in view, so they are presumably all
# provided by this star import — NOTE(review): confirm against config.py.
from config import *

## Import GTFS datasets

In [None]:
# Stop identifiers are IDs, not numbers: read them as strings so values such as
# "0123" keep their leading zeros.
stops = pd.read_csv(os.path.join(gtfs_path, 'stops.txt'), dtype={"stop_id": str, "parent_station": str})
# Read stop_id with the same string dtype as in stops.txt; otherwise pandas may
# infer integers here and the stop references would no longer match the stops table.
stop_times = pd.read_csv(os.path.join(gtfs_path, 'stop_times.txt'), dtype={"stop_id": str})
# Order the rows of every trip by their position in the trip, so that
# neighbouring rows are consecutive stops.
stop_times = stop_times.sort_values(['trip_id', 'stop_sequence']).reset_index(drop=True)
print(f"Using \"{gtfs_dataset}\" dataset")

In [None]:
# Preview the first 10 rows of the stops table.
stops.head(10)

In [None]:
# Preview the first 10 rows of the sorted stop_times table.
stop_times.head(10)

## Convert the per-row stop information of each trip into "from_stop" and "to_stop" columns

In [None]:
# Function to process each group
def process_trip(trip):
    """Turn the stop rows of a single trip into stop-to-stop connections.

    Each row except the last is paired with the row that follows it, so a
    trip with ``n`` rows yields ``n - 1`` connections. Rows are used in the
    order given; the caller is expected to pass them sorted by stop_sequence.

    Parameters
    ----------
    trip : pandas.DataFrame
        Rows of one trip with the columns ``trip_id``, ``stop_sequence``,
        ``stop_id``, ``departure_time`` and ``arrival_time``.

    Returns
    -------
    pandas.DataFrame
        Columns ``trip_id``, ``stop_sequence``, ``from_stop``, ``to_stop``,
        ``departure_time`` and ``arrival_time``. ``stop_sequence`` and
        ``departure_time`` come from the origin row of each pair,
        ``arrival_time`` from the destination row.
    """
    sequences = trip['stop_sequence'].values
    stop_ids = trip['stop_id'].values
    departures = trip['departure_time'].values
    arrivals = trip['arrival_time'].values

    # [:-1] selects the origin of every pair, [1:] the matching destination.
    columns = {
        'trip_id': trip['trip_id'].values[0],
        'stop_sequence': sequences[:-1],
        'from_stop': stop_ids[:-1],
        'to_stop': stop_ids[1:],
        'departure_time': departures[:-1],
        'arrival_time': arrivals[1:],
    }
    return pd.DataFrame(columns)


# Group by trip_id and apply the processing function.
# process_trip reads trip.trip_id, so the grouping column has to be present in
# every group it receives. The explicit selection of all columns presumably
# keeps trip_id in the groups even though include_groups=False is passed —
# NOTE(review): confirm against the pandas version in use.
grouped = stop_times.groupby('trip_id', as_index=False)[stop_times.columns.tolist()]
# Combine the per-trip connection frames into one table with a fresh 0..n-1 index.
trips = grouped.apply(process_trip, include_groups=False).reset_index(drop=True)

## Load pathways.txt if it exists

In [None]:
# pathways.txt is optional: start from an empty frame so the later concat
# works whether or not the feed provides the file.
connections = pd.DataFrame()

pathways_file = os.path.join(gtfs_path, 'pathways.txt')
if os.path.exists(pathways_file):
    print("Loading pathways...")
    # Read the stop references as strings, the same way stop_id is read from
    # stops.txt, so identifiers keep leading zeros and match across tables.
    pathways = pd.read_csv(pathways_file, dtype={"from_stop_id": str, "to_stop_id": str})
    # A pathway without a traversal time cannot be turned into a timed connection.
    pathways = pathways[pathways['traversal_time'].notna()]
    # Render the traversal time (seconds) as a zero-padded "HH:MM:SS" string:
    # str(timedelta) yields "H:MM:SS", which is left-padded with "0" to 8 characters.
    pathways = pathways.assign(
        traversal_time=pathways['traversal_time'].apply(
            lambda traversal_time: "{:0>8}".format(str(timedelta(seconds=traversal_time)))
        )
    )
    # Each pathway becomes a connection that departs at 00:00:00 and arrives
    # after its traversal time, tagged with trip_id -1 and stop_sequence -1.
    # NOTE(review): the is_bidirectional column is not consulted, so every
    # pathway is emitted only in its from -> to direction — confirm this is intended.
    pathways_df = pd.DataFrame({
        'trip_id': -1,
        'stop_sequence': -1,
        'from_stop': pathways['from_stop_id'].values,
        'to_stop': pathways['to_stop_id'].values,
        'departure_time': "00:00:00",
        'arrival_time': pathways['traversal_time'].values
    })
    connections = pathways_df

## Add connections from parent stations to their child stops

In [None]:
def process_station(station):
    """Build the parent-station -> stop links for one group of stops.

    Parameters
    ----------
    station : pandas.DataFrame
        Stop rows with the columns ``parent_station`` and ``stop_id``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with ``from_stop`` set to the row's
        ``parent_station`` and ``to_stop`` set to its ``stop_id``. Every row
        carries ``trip_id`` 0, ``stop_sequence`` -1 and identical departure
        and arrival times of "00:00:00".
    """
    parent_ids = station['parent_station'].values
    child_ids = station['stop_id'].values

    # Departure and arrival share the same timestamp, i.e. the link takes no time.
    zero_time = "00:00:00"
    links = {
        'trip_id': 0,
        'stop_sequence': -1,
        'from_stop': parent_ids,
        'to_stop': child_ids,
        'departure_time': zero_time,
        'arrival_time': zero_time,
    }
    return pd.DataFrame(links)


# Group the stops by their parent station and emit one parent -> stop link per row.
# process_station reads station['parent_station'], so the grouping column has
# to be present in every group; the explicit selection of all columns
# presumably keeps it there even though include_groups=False is passed.
# Stops without a parent_station are presumably skipped, since groupby drops
# missing keys by default — NOTE(review): confirm both points against the
# pandas version in use.
grouped = stops.groupby('parent_station', as_index=False)[stops.columns.tolist()]
# Combine the per-station link frames into one table with a fresh 0..n-1 index.
parent_connections = grouped.apply(process_station, include_groups=False).reset_index(drop=True)

In [None]:
# Stack the scheduled trip connections, the pathway connections and the
# parent-station links into one table, force the two ordering columns to
# integers, then order the rows by trip and by position within the trip.
data = (
    pd.concat([trips, connections, parent_connections])
    .astype({'trip_id': int, 'stop_sequence': int})
    .sort_values(['trip_id', 'stop_sequence'])
    .reset_index(drop=True)
)

In [None]:
# Preview the first 10 rows of the combined connection table.
data.head(10)

In [None]:
# Write the combined connection table to data_path as CSV, without the row index.
data.to_csv(data_path, index=False)