In [1]:
import geopandas as gpd
from shapely.geometry import shape
from shapely.ops import linemerge, unary_union
import pandas as pd
import numpy as np

from data.constants import (WORLD_CRS, 
    L_STATIONS_TABLE, L_LINES_TABLE, BUS_ROUTES_TABLE, BUS_STOPS_TABLE)
from data.cta import CTAClient

In [2]:
# Output locations for the GeoJSON files written by the final "Pipeline out" cell.
# The paths are relative, so they resolve against the notebook's working directory.
train_stations_out = "../data/raw/train_stations.geojson"
train_lines_out = "../data/raw/train_lines.geojson"
bus_routes_file_out = "../data/raw/bus_routes.geojson"
bus_stops_file_out = "../data/raw/bus_stops.geojson"

In [3]:
# Shared API client used by every `soda_get_all` call below.
# NOTE(review): the meaning of 60 is not visible here -- presumably a request
# timeout in seconds; confirm against CTAClient's signature.
client = CTAClient(60)

# Pipeline in

(none)

# Train Stations

In [4]:
# Fetch the identifying columns plus the raw `location` field for each stop;
# the geometry is built from `location` in the next cell.
train_stations = client.soda_get_all(
    L_STATIONS_TABLE,
    select="stop_id, direction_id, stop_name, station_name, map_id, location",
)

In [5]:
# `train_stations['location'].apply(shape)` does not work on this field, so the
# point geometry is built by hand from its 'latitude' / 'longitude' entries.
station_location = train_stations['location']
station_points = gpd.points_from_xy(
    station_location.apply(lambda loc: loc['longitude']),
    station_location.apply(lambda loc: loc['latitude']),
)
train_stations = gpd.GeoDataFrame(
    train_stations.drop(columns=['location']),
    geometry=station_points,
    crs=WORLD_CRS,
)
# nb: Each train station appears as two "stops", one per direction of travel.
#     Direction is not modelled here, so only one row per station is kept.
train_stations = train_stations.drop_duplicates(subset=['station_name', 'map_id', 'geometry'])

# Train Lines

We need to get the lines in two parts because the datasets are hard to merge.

1. First, we just take the line identifiers, which do exist in the stations table.
2. Next we get the actual line segments table, which is better for geo mapping.

In [6]:
# The stations table is wide (one flag column per line), so consolidate it into a
# single comma-separated `line` label per station and attach it to `train_stations`.
# Assumes the per-line flag columns arrive as booleans (they are OR-ed and used
# directly as a query mask below) -- TODO confirm against the API response.
lines = ['red','blue','g','brn','p','pexp','y','pnk','o']
line_names = ['red','blue','green','brown','purple','purpleexp','yellow','pink','orange']
station_ids = ['station_name', 'map_id']

station_to_line = client.soda_get_all(L_STATIONS_TABLE, select=f"{','.join(station_ids)}, {','.join(lines)}")
station_to_line = station_to_line.rename(columns=dict(zip(lines, line_names)))
# Fold the Purple Express flag into Purple so both are reported as one line.
station_to_line['purple'] = station_to_line['purple'] | station_to_line['purpleexp']
station_to_line = station_to_line.drop(columns=['purpleexp'])
# Wide -> long: one row per (station, line) pair, keeping only the pairs that are set.
station_to_line = station_to_line.melt(id_vars=station_ids, var_name='line', value_name='is_line')
station_to_line = station_to_line.query('is_line').drop(columns='is_line')
# Sort the de-duplicated names: bare set iteration order for strings can change
# between kernel runs, which would make the written label non-reproducible.
station_to_line = (
    station_to_line
    .groupby(station_ids, as_index=False)['line']
    .agg(lambda names: ','.join(sorted(set(names))))
)

# Join on the station keys explicitly rather than on whatever columns happen to be shared.
train_stations = train_stations.merge(station_to_line, how='left', on=station_ids)
assert train_stations['line'].notna().all(), "every train station should map to at least one line"

In [7]:
# This is the legit lines gdf but it doesn't have neat identifiers to relate to the stations,
# so derive a `line` label for every segment here.
train_lines = gpd.read_file(L_LINES_TABLE)
# `legend` codes listed below translate directly to a line name; any other code maps to
# NaN and falls back to the names parsed from the `lines` column further down.
train_lines['single_line'] = train_lines['legend'].map({'BR':'brown','YL':'yellow','GR':'green',
                                        'OR':'orange','RD':'red','BL':'blue',
                                        'PK':'pink','PR':'purple'})
# `lines` can hold several comma-separated names: split them and emit one row per name.
train_lines['multi_lines'] = train_lines['lines'].str.split(', ')
train_lines = train_lines.explode('multi_lines')
# Normalise the names. regex=False is required: read as a regular expression,
# '(express)' is a capture group that matches only the word and leaves '()' behind,
# and the default for `regex` differs between pandas versions.
train_lines['multi_lines'] = (
    train_lines['multi_lines']
    .str.lower()
    .str.replace('(express)', '', regex=False)
    .str.replace('(exp)', '', regex=False)
    .str.strip()
)
# Prefer the legend-derived name; otherwise use the exploded per-line name.
train_lines['line'] = np.where(train_lines['single_line'].notna(), train_lines['single_line'], train_lines['multi_lines'])
# Exploding duplicates rows whose label came from `legend`; collapse them again.
train_lines = train_lines.filter(['geometry','line']).drop_duplicates()

In [8]:
# This table is station-to-station segments. Need to merge into one row per line.

def merge_lines(x: gpd.GeoSeries):
    """Combine all segment geometries of one line into a single geometry.

    Parameters
    ----------
    x : gpd.GeoSeries
        The segment geometries belonging to one line.

    Returns
    -------
    The unary union of the segments. When that union is a simple
    MultiLineString its parts are additionally stitched together with
    ``linemerge``; otherwise the union is returned unchanged.
    """
    merged = unary_union(x)
    # Only a MultiLineString can be handed to linemerge: if the union collapses
    # to a single LineString (e.g. a line made of one segment) linemerge raises
    # ValueError, so return the union as-is in that case.
    if merged.geom_type != 'MultiLineString' or not merged.is_simple:
        return merged
    return linemerge(merged)

# Collapse to one row per line. The CRS is captured first and re-applied afterwards
# (presumably because the groupby/agg result does not carry it -- confirm).
crs = train_lines.crs
train_lines = (
    train_lines
    .groupby('line', as_index=False)['geometry']
    .agg(merge_lines)
    .set_crs(crs)
)


# Bus Routes

In [9]:
bus_routes = client.soda_get_all(BUS_ROUTES_TABLE, select="the_geom, route, name")
# HACK: This one route is mysteriously present online but not returned by the API,
#       so it is loaded from a local file and appended below.
roosevelt_route = gpd.read_file("../data/raw/roosevelt_route.geojson")
# Convert the API's `the_geom` values into shapely geometries in a `geometry` column.
bus_routes = (
    bus_routes
    .assign(geometry=bus_routes['the_geom'].apply(shape))
    .drop(columns='the_geom')
)
bus_routes = gpd.GeoDataFrame(
    pd.concat([bus_routes, roosevelt_route], ignore_index=True),
    geometry='geometry',
    crs=WORLD_CRS,
)

# Bus Stops

In [10]:
# Read only the columns needed downstream and spell out the truncated PUBLIC_NAM.
#
# STREET + CROSS_ST + CITY is the composite, non-unique key for this table:
#   - it is not unique because large multi-bay transit centers are conceptually co-located.
# PUBLIC_NAM is human-readable but ambiguous with respect to:
#   - street names repeated in Chicago vs Evanston
#   - street/transit intersections repeated across Chicago, e.g. Western
bus_stops = gpd.read_file(
    BUS_STOPS_TABLE,
    columns=['STREET','CROSS_ST','CITY','PUBLIC_NAM','ROUTESSTPG','geometry'],
).rename(columns={'PUBLIC_NAM':'PUBLIC_NAME'})

In [11]:
# Stops with no route label cannot be matched to ridership, so discard them
# (and report how many were removed).
n_unlabelled_stops = bus_stops['ROUTESSTPG'].isna().sum()
print("Dropping {} rows".format(n_unlabelled_stops))
bus_stops = bus_stops.dropna(subset='ROUTESSTPG')

Dropping 4 rows


In [12]:
# Fill missing CITY values with a sentinel instead of dropping the stop
# (and report how many were filled).
n_missing_city = bus_stops['CITY'].isna().sum()
print("Imputing {} rows".format(n_missing_city))
bus_stops['CITY'] = bus_stops['CITY'].fillna('UNKNOWN')

Imputing 1 rows


In [13]:
# ROUTESSTPG holds a comma-separated list of routes: split it and emit one row
# per (stop, route) pair, with the column renamed to `route`.
bus_stops = (
    bus_stops
    .assign(ROUTESSTPG=bus_stops['ROUTESSTPG'].str.split(','))
    .explode('ROUTESSTPG')
    .rename(columns={'ROUTESSTPG': 'route'})
)
# nb: Unlike train stations, bus stop pairs on opposite sides of the street are
#     not as cleanly paired in the dataset. They could be spatially joined as
#     1-nearest-neighbor if that were ever needed.

# Metra Lines

Metra does not provide machine-readable ridership reports. They have bar graphs of weekly total ridership and monthly ridership by line.

https://metra.com/ridership-reports

TODO!

However, the Regional Transportation Authority (RTA) does provide machine-readable monthly ridership by line.

https://rtams.org/media/datasets/metra-ridership

# Pipeline out

In [14]:
# Write each prepared layer to its configured output path, without the index.
for layer, out_path in (
    (train_stations, train_stations_out),
    (train_lines, train_lines_out),
    (bus_routes, bus_routes_file_out),
    (bus_stops, bus_stops_file_out),
):
    layer.to_file(out_path, index=False)