In [1]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
from shapely import from_wkt

from data.constants import (WORLD_CRS, LOCAL_CRS,
                                OHARE_CENTROID, MIDWAY_CENTROID,
                                DNC_START, DNC_END, L_STATIONS_TABLE)
from data.geo import meter_to_foot, dms_to_decimal
from data.cta import CTAClient

In [2]:
# --- Inputs: static layers (stations, routes, stops, areas, points of interest) ---
# Most come from data/raw; the bike stations are read from data/interim.
train_file_in = "../data/raw/train_stations.geojson"
bus_routes_file_in = "../data/raw/bus_routes.geojson"
bus_stops_file_in = "../data/raw/bus_stops.geojson"
bike_stations_file_in = "../data/interim/bike_stations_gbfs.geoparquet"
tract_file_in = "../data/raw/tracts.geojson"
comm_file_in = "../data/raw/communities.geojson"
poi_file_in = "../data/raw/poi_buildings.geojson"

# --- Inputs: ride tables, one per mode ---
train_rides_in = "../data/raw/train_rides.csv"
bus_rides_in = "../data/raw/bus_rides.csv"
bike_rides_in = "../data/interim/bike_rides_v2.geoparquet"
uber_rides_in = "../data/raw/uber_rides.parquet"

# --- Outputs: static layers, all written under data/interim ---
# Note the tract layer is written as GeoParquet while the others are GeoJSON.
train_file_out = "../data/interim/train_stations.geojson"
bus_routes_file_out = "../data/interim/bus_routes.geojson"
bus_stops_file_out = "../data/interim/bus_stops.geojson"
bike_stations_file_out = "../data/interim/bike_stations_gbfs_v2.geojson"
tract_file_out = "../data/interim/tracts.geoparquet"
comm_file_out = "../data/interim/communities.geojson"

# --- Outputs: ride tables, same formats as their inputs ---
train_rides_out = "../data/interim/train_rides.csv"
bus_rides_out = "../data/interim/bus_rides.csv"
bike_rides_out = "../data/interim/bike_rides_v3.geoparquet"
uber_rides_out = "../data/interim/uber_rides.parquet"

# Pipeline in

In [3]:
# Load the static layers: six GeoJSON files plus the bike stations from GeoParquet.
train_stations = gpd.read_file(train_file_in)
bus_routes = gpd.read_file(bus_routes_file_in)
bus_stops = gpd.read_file(bus_stops_file_in)
# NOTE: despite the *_points names, these two layers are used as areas further
# down (tested with .contains and as the right side of a 'within' spatial join).
tract_points = gpd.read_file(tract_file_in)
comm_points = gpd.read_file(comm_file_in)
poi_buildings = gpd.read_file(poi_file_in)
bike_stations = gpd.read_parquet(bike_stations_file_in)

In [4]:
# Load the ride tables. The CSV reads use pandas defaults (no date parsing is
# requested), so the dtype of `date` in the train/bus tables is whatever
# read_csv infers.
# NOTE(review): later cells compare and map `date` across all four tables —
# confirm the parquet tables store `date` in a compatible type.
train_rides = pd.read_csv(train_rides_in)
bus_rides = pd.read_csv(bus_rides_in)
bike_rides = gpd.read_parquet(bike_rides_in)
uber_rides = pd.read_parquet(uber_rides_in)

# Time-Invariant Features

## Airports

### -> CTA

We can label the train stations and bus stops by name.

In [5]:
# Flag the two airport terminals by exact station name, stored as 0.0 / 1.0.
airport_station_names = ["O'Hare", "Midway"]
train_stations['airport'] = (
    train_stations['station_name'].isin(airport_station_names).astype(float)
)

In [6]:
# Note: only Midway is checked, because CTA buses don't go directly into O'Hare,
#       nor even to the adjacent Mixed Modal Transit center.
is_midway_stop = bus_stops['PUBLIC_NAME'].eq("Midway Orange Line Station")
bus_stops['airport'] = is_midway_stop.astype(float)

### -> Uber

The rideshare pickups are anonymized to the census area, so we can't use building catchments.

In [7]:
# O'Hare reference point in LOCAL_CRS. Element 1 of OHARE_CENTROID supplies x
# and element 0 supplies y, so the Point is built in lng/lat order.
oh_xy = (
    gpd.GeoSeries(
        [Point(dms_to_decimal(*OHARE_CENTROID[1]), dms_to_decimal(*OHARE_CENTROID[0]))],
        crs=WORLD_CRS,
    )
    .to_crs(LOCAL_CRS)
    .iloc[0]
)

# Boolean Series keyed by area id: True for the area whose shape contains the point.
oh_tract = (
    tract_points.set_index('geoid10')
    .geometry.to_crs(LOCAL_CRS)
    .contains(oh_xy)
)
oh_comm = (
    comm_points.set_index('area_num_1')
    .geometry.to_crs(LOCAL_CRS)
    .contains(oh_xy)
)

In [8]:
# Midway reference point in LOCAL_CRS. Element 1 of MIDWAY_CENTROID supplies x
# and element 0 supplies y, so the Point is built in lng/lat order.
mdw_xy = (
    gpd.GeoSeries(
        [Point(dms_to_decimal(*MIDWAY_CENTROID[1]), dms_to_decimal(*MIDWAY_CENTROID[0]))],
        crs=WORLD_CRS,
    )
    .to_crs(LOCAL_CRS)
    .iloc[0]
)

# Boolean Series keyed by area id: True for the area whose shape contains the point.
mdw_tract = (
    tract_points.set_index('geoid10')
    .geometry.to_crs(LOCAL_CRS)
    .contains(mdw_xy)
)
mdw_comm = (
    comm_points.set_index('area_num_1')
    .geometry.to_crs(LOCAL_CRS)
    .contains(mdw_xy)
)

In [9]:
# An area is an airport area when it contains either airport's reference point.
# The per-airport masks are keyed by area id, so they are looked up through the
# id column and OR-ed together; the result is stored as 0.0 / 1.0.
for areas, id_col, oh_mask, mdw_mask in (
    (tract_points, 'geoid10', oh_tract, mdw_tract),
    (comm_points, 'area_num_1', oh_comm, mdw_comm),
):
    area_ids = areas[id_col]
    areas['airport'] = (area_ids.map(oh_mask) | area_ids.map(mdw_mask)).astype(float)

### -> Divvy

We're going to assume that no one rides a bike to the airport for out-of-town travel.

In [10]:
# Bikes are assumed never to serve airport trips, so the flag is a constant 0.0
# on both the station table and the ride table.
bike_stations['airport'] = bike_rides['airport'] = 0.0

## United Center, McCormick Place catchments

In [11]:
# Pick the anchor footprints by exact (upper-case) building name.
# NOTE(review): the McCormick Place anchor is the row named
# 'HYATT REGENCY MCCORMICK PLACE' — confirm that is the intended footprint.
poi_names = poi_buildings['name']
uc_building = poi_buildings.loc[poi_names.eq('UNITED CENTER')]
mp_building = poi_buildings.loc[poi_names.eq('HYATT REGENCY MCCORMICK PLACE')]

In [12]:
def buffered_intersects(gdf: gpd.GeoDataFrame, geom:gpd.GeoSeries, geom_prefix:str, dists:list[int]):
    """
    Flag which rows of gdf fall inside catchment buffers drawn around one shape.

    For every radius in dists, geom is buffered by that radius and each row of
    gdf is tested for intersection with the buffer. Intersection is symmetric,
    so "row intersects buffer" and "buffer intersects row" are the same flag.

    Params:
        - gdf: frame whose rows are compared against the buffers
        - geom: the shape to buffer around; must hold exactly one row. Anything
                exposing `.geometry` works (the notebook passes a one-row
                selection of the POI layer).
        - geom_prefix: column prefix; columns are named "<geom_prefix>_<dist>"
        - dists: buffer radii in meters
    Returns:
        A copy of gdf (via `assign`) with one 0.0/1.0 column per radius.

    Both inputs are reprojected to LOCAL_CRS before buffering. Radii are passed
    through meter_to_foot, so LOCAL_CRS is presumably measured in feet.
    """
    assert len(geom) == 1, "Expects only one geom to buffer around."
    anchor_local = geom.geometry.to_crs(LOCAL_CRS)
    rows_local = gdf.geometry.to_crs(LOCAL_CRS)

    flags = {}
    for radius_m in dists:
        # Single-row input, so the buffer of interest is the first (only) element.
        catchment = anchor_local.buffer(meter_to_foot(radius_m)).iloc[0]
        flags[f"{geom_prefix}_{radius_m}"] = rows_local.intersects(catchment) * 1.0
    return gdf.assign(**flags)

**Coded as** *"is this station within the catchment of the POI"*

In [13]:
# Stations are coded as 
train_stations = train_stations.pipe(buffered_intersects, uc_building, "uc", [400,800,1600])
bike_stations = bike_stations.pipe(buffered_intersects, uc_building, "uc", [400,800,1600])
bike_rides = bike_rides.pipe(buffered_intersects, uc_building, "uc", [400,800,1600])
bus_stops = bus_stops.pipe(buffered_intersects, uc_building, "uc", [400,800,1600])

train_stations = train_stations.pipe(buffered_intersects, mp_building, "mp", [400,800,1600])
bike_stations = bike_stations.pipe(buffered_intersects, mp_building, "mp", [400,800,1600])
bike_rides = bike_rides.pipe(buffered_intersects, mp_building, "mp", [400,800,1600])
bus_stops = bus_stops.pipe(buffered_intersects, mp_building, "mp", [400,800,1600])

**Coded as** *"is this POI catchment intersecting the tract"*

In [14]:
# Areas are coded 1.0 when the POI catchment at each radius (meters) touches
# the area at all, else 0.0. United Center columns first, then McCormick Place.
area_radii_m = [400, 800, 1600]
for poi_prefix, poi_shape in (("uc", uc_building), ("mp", mp_building)):
    tract_points = buffered_intersects(tract_points, poi_shape, poi_prefix, area_radii_m)
    comm_points = buffered_intersects(comm_points, poi_shape, poi_prefix, area_radii_m)

## Station Lines

In [15]:
# NOTE(review): the meaning of the 60 passed to CTAClient is defined in data.cta — confirm.
cta_client = CTAClient(60)
# Per-line flag columns as named in the L-stations table, and the readable
# names they are renamed to (the two lists are paired by position).
lines = ['red','blue','g','brn','p','pexp','y','pnk','o']
line_names = ['red','blue','green','brown','purple','purpleexp','yellow','pink','orange']
station_ids = ['station_name', 'map_id']

# Fetch only the id columns and the per-line flags.
l_lines = cta_client.soda_get_all(L_STATIONS_TABLE, select=f"{','.join(station_ids)}, {','.join(lines)}")
l_lines = l_lines.rename(columns=dict(zip(lines, line_names)))
# Fold the Purple Express flag into Purple so each station has one purple label.
# Presumably the flag columns arrive as booleans (the query below relies on it) — verify.
l_lines['purple'] = l_lines['purple'] | l_lines['purpleexp']
l_lines = l_lines.drop(columns=['purpleexp'])
# Wide flags -> one row per (station, line), keeping only lines that serve the station.
l_lines = l_lines.melt(id_vars=station_ids, var_name='line', value_name='is_line')
l_lines = l_lines.query('is_line').drop(columns='is_line')
# Collapse to one comma-separated label per station. The set is sorted because
# string-set iteration order changes between interpreter runs (hash
# randomization); without it the same station could be labelled "red,brown"
# in one kernel session and "brown,red" in the next.
l_lines = l_lines.groupby(station_ids, as_index=False)['line'].agg(lambda x: ','.join(sorted(set(x))))



In [16]:
# Attach the line labels. No join keys are given, so merge joins on whatever
# columns the two frames have in common.
train_stations = train_stations.merge(right=l_lines, how='left')
# Every station must have picked up a label from the lookup.
assert not train_stations['line'].isna().any()

## Station Tracts

Label each station with its census tract (and community area) so we can aggregate to the tract level to match the Uber data.

In [17]:
def code_tract(gdf):
    """
    Spatially join each row of gdf to its census tract and community area.

    Params:
        - gdf: GeoDataFrame whose geometries are located with a 'within' join
    Returns:
        A copy of gdf with two columns set:
        - tract: 'geoid10' of the tract_points row the geometry lies within
        - comm_area: 'area_num_1' of the comm_points row the geometry lies within
        Rows that fall within no area get a missing value (left join).

    Reads the notebook-level tract_points and comm_points frames; all layers
    are reprojected to LOCAL_CRS before joining.

    The columns are set with `assign`, so running this again on an already
    coded frame overwrites 'tract' / 'comm_area' instead of appending a second
    pair of same-named columns (which the parquet / GeoJSON writers reject).

    Assumes each geometry lies within at most one tract and one community
    area; a multi-match would duplicate index labels in the join result and
    make the column assignment fail.
    """
    tracts_proj = tract_points.to_crs(LOCAL_CRS)
    comm_proj = comm_points.to_crs(LOCAL_CRS)
    # Only the geometry is needed on the left side; the left join keeps gdf's index.
    located = gdf.to_crs(LOCAL_CRS)[['geometry']]

    tract_ids = located.sjoin(
        tracts_proj[['geoid10', 'geometry']], how='left', predicate='within'
    )['geoid10']
    comm_ids = located.sjoin(
        comm_proj[['area_num_1', 'geometry']], how='left', predicate='within'
    )['area_num_1']

    return gdf.assign(tract=tract_ids, comm_area=comm_ids)

In [18]:
# Add 'tract' and 'comm_area' to every point layer that gets aggregated later.
train_stations = code_tract(train_stations)
bike_stations = code_tract(bike_stations)
bike_rides = code_tract(bike_rides)
bus_stops = code_tract(bus_stops)

## Tract Centroids

Just converting this to geom dtype.

In [19]:
# Parse the WKT text in 'centroid' into geometry objects, tagged with the same
# CRS as the tract layer itself.
centroid_shapes = tract_points['centroid'].map(from_wkt)
tract_points['centroid'] = gpd.GeoSeries(centroid_shapes, crs=tract_points.crs)

# Time-Varying Features

## DNC

In [20]:
# Flag rides dated within the DNC window; both end dates are included.
for rides in (train_rides, bus_rides, bike_rides, uber_rides):
    rides['DNC'] = rides['date'].between(DNC_START, DNC_END)

## Daytype

In [21]:
# Build a date -> daytype lookup from the two tables that carry the column.
# Train rows are stacked ahead of bus rows, and the first non-null daytype
# seen for each date wins.
daytypes = (
    pd.concat(
        [train_rides[['date', 'daytype']], bus_rides[['date', 'daytype']]],
        ignore_index=True,
    )
    .groupby('date')['daytype']
    .first()
)

In [22]:
# Copy the daytype onto the tables that lack it, looked up by date.
# NOTE(review): dates missing from the lookup (or stored in a different dtype)
# map to a missing value — confirm the date columns line up.
for rides in (bike_rides, uber_rides):
    rides['daytype'] = rides['date'].map(daytypes)

# Pipeline out

In [23]:
# Persist the static layers to data/interim. The output format follows each
# path: GeoJSON via to_file, GeoParquet via to_parquet for the tract layer.
train_stations.to_file(train_file_out, index=False)
# bus_routes gets no new columns in the cells above; it is written out as loaded.
bus_routes.to_file(bus_routes_file_out, index=False)
bus_stops.to_file(bus_stops_file_out, index=False)
tract_points.to_parquet(tract_file_out, index=False)
comm_points.to_file(comm_file_out, index=False)
bike_stations.to_file(bike_stations_file_out, index=False)

In [24]:
# Persist the ride tables, each in the same format it was read from
# (CSV for train/bus, parquet for bike/uber), without the index.
train_rides.to_csv(train_rides_out, index=False)
bus_rides.to_csv(bus_rides_out, index=False)
bike_rides.to_parquet(bike_rides_out, index=False)
uber_rides.to_parquet(uber_rides_out, index=False)