In [1]:
import geopandas as gpd
import os
from shapely.geometry import shape

from data.constants import (DATA_FOLDER, WORLD_CRS, 
    L_STATIONS_TABLE, BUS_ROUTES_TABLE, BUS_STOPS_TABLE)
from data.cta import CTAClient

In [2]:
# Destination paths for the three GeoJSON layers, all under DATA_FOLDER/raw.
raw_dir = os.path.join(DATA_FOLDER, "raw")
train_file_out = os.path.join(raw_dir, "train_stations.geojson")
bus_routes_file_out = os.path.join(raw_dir, "bus_routes.geojson")
bus_stops_file_out = os.path.join(raw_dir, "bus_stops.geojson")

In [3]:
# Shared client used for the SODA table queries in the cells below.
# NOTE(review): the meaning of the positional 60 is not visible from this notebook
#   (possibly a timeout in seconds) — confirm against CTAClient's signature.
client = CTAClient(60)



# Pipeline in

(none)

# Train Stations

In [4]:
# Request only the fields used downstream from the L stations table.
l_station_fields = ["stop_id", "direction_id", "stop_name", "station_name", "map_id", "location"]
train_stations = client.soda_get_all(L_STATIONS_TABLE, select=", ".join(l_station_fields))

In [5]:
# nb: train_stations['location'].apply(shape) is not working, so the point geometry is
#     built from each location's 'latitude' / 'longitude' entries instead.
station_locations = train_stations['location']
station_lat = station_locations.apply(lambda loc: loc['latitude'])
station_lon = station_locations.apply(lambda loc: loc['longitude'])
train_stations = gpd.GeoDataFrame(
    train_stations
    .drop(columns=['location'])
    .assign(geometry=gpd.points_from_xy(station_lon, station_lat)),
    geometry='geometry',
    crs=WORLD_CRS,
)
# nb: Every station shows up as two "stops", one per direction of travel.
#     Direction of travel is not modelled here, so only one row per station is kept.
train_stations = train_stations.drop_duplicates(subset=['station_name', 'map_id', 'geometry'])

# Bus Routes

In [6]:
# Fetch the route records, then turn the 'the_geom' field into a shapely geometry
# column and wrap the result as a GeoDataFrame in WORLD_CRS.
bus_routes_raw = client.soda_get_all(BUS_ROUTES_TABLE, select="the_geom, route, name")
bus_routes = gpd.GeoDataFrame(
    bus_routes_raw
    .assign(geometry=bus_routes_raw['the_geom'].apply(shape))
    .drop(columns=['the_geom']),
    geometry='geometry',
    crs=WORLD_CRS,
)

# Bus Stops

In [7]:
# Read only the needed fields and expand the truncated PUBLIC_NAM field name.
bus_stop_columns = ['STREET', 'CROSS_ST', 'CITY', 'PUBLIC_NAM', 'ROUTESSTPG', 'geometry']
bus_stops = (
    gpd.read_file(BUS_STOPS_TABLE, columns=bus_stop_columns)
    .rename(columns={'PUBLIC_NAM': 'PUBLIC_NAME'})
)
# (STREET, CROSS_ST, CITY) acts as a composite, non-unique key for this table:
#   - it is non-unique because large multi-bay transit centers are conceptually co-located
# PUBLIC_NAM is human-readable but ambiguous with respect to:
#   - street names repeated between Chicago and Evanston
#   - street/transit intersections repeated across Chicago, e.g. Western

In [8]:
# Drop bus stops without route labels. Won't be able to get ridership for these.
# Report how many stops have no route label, then drop them.
unlabeled_count = bus_stops['ROUTESSTPG'].isna().sum()
print("Dropping {} rows".format(unlabeled_count))
bus_stops = bus_stops.dropna(subset=['ROUTESSTPG'])

Dropping 4 rows


In [9]:
# Impute city for unknown cities
# Report how many stops have no city, then fill those with the 'UNKNOWN' placeholder.
missing_city_count = bus_stops['CITY'].isna().sum()
print("Imputing {} rows".format(missing_city_count))
bus_stops = bus_stops.assign(CITY=bus_stops['CITY'].fillna('UNKNOWN'))

Imputing 1 rows


In [10]:
# Split the comma-separated route labels and emit one row per (stop, route) pair,
# storing the label in a column named 'route'.
route_lists = bus_stops['ROUTESSTPG'].str.split(',')
bus_stops = (
    bus_stops
    .assign(ROUTESSTPG=route_lists)
    .explode('ROUTESSTPG')
    .rename(columns={'ROUTESSTPG': 'route'})
)
# nb: Unlike train stations, bus stops on opposite sides of a street are not
#     AS CLEANLY paired in the dataset. A 1-nearest-neighbor spatial join could
#     pair them if that ever becomes necessary.

# Metra Lines

Metra does not publish machine-readable ridership reports; it only offers bar graphs of weekly total ridership and monthly ridership by line.

https://metra.com/ridership-reports

TODO!

However, the Regional Transportation Authority (RTA) does provide machine-readable monthly Metra ridership by line.

https://rtams.org/media/datasets/metra-ridership

# Pipeline out

In [11]:
# Write each layer to its GeoJSON path. The destination folder is created first
# so the save does not fail when the raw data directory does not exist yet
# (e.g. on a fresh checkout).
layers_out = [
    (train_stations, train_file_out),
    (bus_routes, bus_routes_file_out),
    (bus_stops, bus_stops_file_out),
]
for layer, path_out in layers_out:
    out_dir = os.path.dirname(path_out)
    if out_dir:  # a bare filename has no directory component to create
        os.makedirs(out_dir, exist_ok=True)
    layer.to_file(path_out, index=False)