# Notebook with GTFS methods

Goals: 

* Build a way to calculate the scheduled number of currently active trips for a given date, time, and route. 
    - Take datetime and find what services are active on that date 
    - Find what trips run on those services + route 
    - Find which of those trips are "in progress" per stop_times
* ~Output most common shape by route~

In [33]:
# imports 

import boto3
import os
import pandas as pd
import zipfile
import requests
import pendulum
from io import BytesIO
import shapely
import geopandas

In [3]:
# Alternative sources for the feed, kept for reference:
# local 
# CTA_GTFS = zipfile.ZipFile('cta_gtfs_20220509.zip')
# s3
# follow https://pythonguides.com/download-zip-file-from-url-using-python/
# CTA_GTFS = zipfile.ZipFile(BytesIO(requests.get('https://chn-ghost-buses-public.s3.us-east-2.amazonaws.com/cta_static_gtfs/cta_gtfs_20220509.zip').content))
# cta website
CTA_GTFS_URL = 'https://www.transitchicago.com/downloads/sch_data/google_transit.zip'
# A timeout keeps the cell from hanging forever on a stalled connection.
gtfs_response = requests.get(CTA_GTFS_URL, timeout=60)
# Fail on HTTP errors here; otherwise an error page would only surface
# later as a confusing zipfile.BadZipFile.
gtfs_response.raise_for_status()
CTA_GTFS = zipfile.ZipFile(BytesIO(gtfs_response.content))


In [5]:
class GTFSFeed:
    """Load the tables of a GTFS static feed from an open zip archive.

    Each GTFS file becomes an attribute named after the file without its
    ``.txt`` suffix (``stops``, ``stop_times``, ``routes``, ``trips``,
    ``calendar``, ``calendar_dates``, ``shapes``). Tables are read with
    ``dtype='object'``, so no column is converted to a numeric type.

    An attribute is ``None`` when its file is not in the archive, so the
    object is always fully initialised. Progress and missing-file messages
    are printed, one line per file.
    """

    # Files that must be present vs. files a feed may omit.
    REQUIRED_FILES = ('stops.txt', 'stop_times.txt', 'routes.txt', 'trips.txt')
    OPTIONAL_FILES = ('calendar.txt', 'calendar_dates.txt', 'shapes.txt')

    def __init__(self, gtfs_zipfile):
        """Read every known GTFS table out of ``gtfs_zipfile``.

        Args:
            gtfs_zipfile: An open ``zipfile.ZipFile`` holding the feed.
        """
        self.gtfs_zipfile = gtfs_zipfile

        # Each required file is attempted on its own, so one missing file
        # is reported without preventing the others from loading.
        for filename in self.REQUIRED_FILES:
            table = None
            try:
                table = self._read_table(filename)
            except KeyError as e:
                # ZipFile.open raises KeyError for an absent member.
                print("GTFS is missing required file")
                print(e)
            setattr(self, self._attribute_name(filename), table)

        members = set(self.gtfs_zipfile.namelist())
        for filename in self.OPTIONAL_FILES:
            if filename in members:
                table = self._read_table(filename)
            else:
                table = None
                print(f"no {filename} found")
            setattr(self, self._attribute_name(filename), table)

    @staticmethod
    def _attribute_name(filename):
        """Return the attribute name for a GTFS file, e.g. 'stops.txt' -> 'stops'."""
        return filename.rsplit('.', 1)[0]

    def _read_table(self, filename):
        """Read one archive member into a DataFrame and report success."""
        with self.gtfs_zipfile.open(filename) as file:
            table = pd.read_csv(file, dtype='object')
        print(f"{filename} loaded")
        return table
            

In [6]:
# Parse the archive into DataFrames; the constructor prints one line per
# GTFS file it loads (or fails to find).
data = GTFSFeed(CTA_GTFS)

stops.txt loaded
stop_times.txt loaded
routes.txt loaded
trips.txt loaded
calendar.txt loaded
calendar_dates.txt loaded
shapes.txt loaded


## Basic data transformations

Example: converting GTFS time strings into actual timestamps

In [7]:
# Take a copy of the first few stop_times rows so the experiments below
# do not modify the full table.
ex = data.stop_times.head().copy()

In [8]:
# Display the sample rows.
ex

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,shape_dist_traveled
0,68227594281,08:34:30,08:34:30,30166,9,Midway,0,55376
1,68227594281,08:35:00,08:35:00,30031,10,Midway,0,56336
2,68227594281,08:36:30,08:36:30,30007,11,Midway,0,57514
3,68227594281,08:38:00,08:38:00,30141,12,Midway,0,58962
4,68227594281,08:40:30,08:40:30,30074,13,Midway,0,60890


In [9]:
def make_timestamp(s, date=None):
    """Convert a GTFS ``HH:MM:SS`` string into a pendulum datetime.

    GTFS stop times count from the start of the service day, so an hour
    value of 24 or more means a time after midnight on a following calendar
    day. Such values are rolled forward from ``date`` instead of being
    wrapped back onto the same day.

    Args:
        s: Time string with exactly three colon-separated integer parts.
        date: Service date; any object exposing ``year``, ``month`` and
            ``day``. Defaults to ``pendulum.now()``, evaluated on each call
            rather than once when the function is defined.

    Returns:
        The datetime produced by ``pendulum.datetime``. No ``tz`` is passed,
        so pendulum's default zone applies and the zone of ``date`` is not
        used.

    Raises:
        ValueError: If ``s`` does not have three parts, a part is not an
            integer, or the minute/second values are out of range.
    """
    parts = s.split(':')
    if len(parts) != 3:
        raise ValueError(f"expected a time formatted as HH:MM:SS, got {s!r}")
    hours, minutes, seconds = (int(part) for part in parts)

    if date is None:
        date = pendulum.now()

    # Split e.g. 25h into (1 extra day, hour 1) so that times past midnight
    # land on the next calendar day.
    extra_days, hour_of_day = divmod(hours, 24)
    stamp = pendulum.datetime(
        year=date.year,
        month=date.month,
        day=date.day,
        hour=hour_of_day,
        minute=minutes,
        second=seconds,
    )
    return stamp.add(days=extra_days) if extra_days else stamp

In [10]:
# Parse both GTFS time columns into timestamps. The departure timestamp
# must come from departure_time, not arrival_time.
ex['arrival_timestamp'] = ex.arrival_time.apply(make_timestamp)
ex['departure_timestamp'] = ex.departure_time.apply(make_timestamp)

In [11]:
# Display the sample again, now including the timestamp columns.
ex

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,shape_dist_traveled,arrival_timestamp,departure_timestamp
0,68227594281,08:34:30,08:34:30,30166,9,Midway,0,55376,2022-06-21 08:34:30+00:00,2022-06-21 08:34:30+00:00
1,68227594281,08:35:00,08:35:00,30031,10,Midway,0,56336,2022-06-21 08:35:00+00:00,2022-06-21 08:35:00+00:00
2,68227594281,08:36:30,08:36:30,30007,11,Midway,0,57514,2022-06-21 08:36:30+00:00,2022-06-21 08:36:30+00:00
3,68227594281,08:38:00,08:38:00,30141,12,Midway,0,58962,2022-06-21 08:38:00+00:00,2022-06-21 08:38:00+00:00
4,68227594281,08:40:30,08:40:30,30074,13,Midway,0,60890,2022-06-21 08:40:30+00:00,2022-06-21 08:40:30+00:00


## Most common shape by route

In [20]:
# Count trips for every (route, shape, direction) combination.
# After reset_index the count is stored in the column named 'trip_id'.
trips_by_rte_direction = (
    data.trips
    .groupby(['route_id', 'shape_id', 'direction'])['trip_id']
    .count()
    .reset_index()
)

In [26]:
# keep only most common shape id by route, direction
# follow: https://stackoverflow.com/a/54041328
# Sorting ascending by trip count and keeping the last row per
# (route, direction) keeps the shape with the most trips. shape_id is a
# secondary sort key so that ties are broken the same way on every run,
# instead of depending on the internals of the sort algorithm.
most_common_shapes = (
    trips_by_rte_direction
    .sort_values(['trip_id', 'shape_id'])
    .drop_duplicates(['route_id', 'direction'], keep='last')
)

In [27]:
# Attach the routes.txt attributes to each selected shape. A left join
# keeps every shape row even if its route_id has no match in routes.
most_common_shapes = pd.merge(
    most_common_shapes,
    data.routes,
    how='left',
    on='route_id',
)

In [45]:
# make shapely points
# https://www.geeksforgeeks.org/apply-function-to-every-row-in-a-pandas-dataframe/
# shapely's Point takes (x, y), i.e. (longitude, latitude). Passing latitude
# first would swap the axes and put the shapes in the wrong place in the
# exported GeoJSON.
data.shapes['pt'] = data.shapes.apply(
    lambda row: shapely.geometry.Point(
        float(row['shape_pt_lon']),
        float(row['shape_pt_lat']),
    ),
    axis=1,
)

  arr = construct_1d_object_array_from_listlike(values)


In [53]:
# construct sorted list of shapely points
# custom aggregation function: https://stackoverflow.com/a/10964938

def make_list_of_points(sub_df):
    """Return the 'pt' values of one shape, ordered by shape_pt_sequence.

    The sequence column is compared numerically. It may be stored as text,
    in which case a plain sort would be lexicographic ('10' before '2') and
    scramble the order of the points.

    Args:
        sub_df: DataFrame with 'shape_pt_sequence' and 'pt' columns,
            typically the rows of a single shape_id.

    Returns:
        A list of the 'pt' values in ascending sequence order.
    """
    sequence = pd.to_numeric(sub_df['shape_pt_sequence']).to_numpy()
    # Positional, stable ordering: independent of the index labels and
    # keeps the original order for equal sequence numbers.
    order = sequence.argsort(kind='stable')
    return list(sub_df['pt'].iloc[order])

# One row per shape_id; the ordered point list ends up in the column
# labelled 0 after reset_index.
constructed_shapes = (
    data.shapes
    .groupby('shape_id')
    .apply(make_list_of_points)
    .reset_index()
)

In [54]:
# Join each route's most common shape with that shape's point list
# (left join on shape_id, so every route/direction row is kept).
final = pd.merge(
    most_common_shapes,
    constructed_shapes,
    how='left',
    on='shape_id',
)

In [57]:
# Build the "geometry" column geopandas expects by collecting each row's
# list of points (column 0) into a single geometry.
# NOTE(review): collecting points presumably yields a multi-point geometry
# rather than a line; confirm that is the intended shape type.
final['geometry'] = final[0].apply(geopandas.tools.collect)

In [60]:
# Wrap the merged table in a GeoDataFrame so it can be exported as GeoJSON.
final_gdf = geopandas.GeoDataFrame(final)

In [67]:
# Remove column 0 (the raw list of shapely points); the JSON
# serialization otherwise complains about the Point geometries.
final_gdf = final_gdf.drop(columns=[0])

In [69]:
# Export only the rows whose route_type is the string '3' as GeoJSON.
with open('route_shapes.geojson', mode='w') as f:
    f.write(final_gdf[final_gdf['route_type'].eq('3')].to_json())