In [1]:
import geopandas as gpd
import pandas as pd
from shapely import from_wkt
from shapely.geometry import Point, shape
from calendar import SATURDAY

from data.constants import (WORLD_CRS, LOCAL_CRS,
                                OHARE_CENTROID, MIDWAY_CENTROID,
                                DNC_START, DNC_END)
from data.geo import meter_to_foot, dms_to_decimal
from data.datemath import from_ymd

In [2]:
# --- Inputs: geographic layers (stations, stops, census areas, POI footprints) ---
train_file_in = "../data/raw/train_stations.geojson"
bus_stops_file_in = "../data/raw/bus_stops.geojson"
bike_stations_file_in = "../data/interim/bike_stations_gbfs.geoparquet"
tract_file_in = "../data/raw/tracts.geojson"
comm_file_in = "../data/raw/communities.geojson"
poi_file_in = "../data/raw/poi_buildings.geojson"

# --- Inputs: ride tables, one per mode (Uber comes at two geographies) ---
train_rides_in = "../data/raw/train_rides.csv"
bus_rides_in = "../data/raw/bus_rides.csv"
bike_rides_in = "../data/interim/bike_rides_v2.geoparquet"
uber_tract_rides_in = "../data/raw/uber_tract_rides.parquet"
uber_comm_rides_in = "../data/raw/uber_comm_rides.parquet"

# --- Outputs: geographic layers enriched with the features built below ---
# NOTE(review): `train_lines_out` is not written anywhere in the visible
# notebook -- confirm whether it is still needed.
train_stations_out = "../data/interim/train_stations.geojson"
train_lines_out = "../data/interim/train_lines.csv"
bus_stops_file_out = "../data/interim/bus_stops.geojson"
bike_stations_file_out = "../data/interim/bike_stations_gbfs_v2.geojson"
tract_file_out = "../data/interim/tracts.geoparquet"
comm_file_out = "../data/interim/communities.geojson"

# --- Outputs: ride tables with the time-varying features added ---
train_rides_out = "../data/interim/train_rides.csv"
bus_rides_out = "../data/interim/bus_rides.csv"
bike_rides_out = "../data/interim/bike_rides_v3.geoparquet"
uber_rides_out = "../data/interim/uber_rides.parquet"

In [3]:
# Maximum snapping distance for the nearest-tract fallback join: a point that
# lies outside every tract is only assigned to a tract within this distance.
TOLERANCE = 200  # meters

# Pipeline in

In [4]:
# Load the geographic layers.
# NOTE: despite the `_points` suffix, `tract_points` and `comm_points` are used
# as area polygons further down (`.contains(...)`, `within` joins, centroids).
train_stations = gpd.read_file(train_file_in)
bus_stops = gpd.read_file(bus_stops_file_in)
tract_points = gpd.read_file(tract_file_in)
comm_points = gpd.read_file(comm_file_in)
poi_buildings = gpd.read_file(poi_file_in)
bike_stations = gpd.read_parquet(bike_stations_file_in)

In [5]:
# Load the ride tables. Only bike rides carry geometry (read as a
# GeoDataFrame); the other modes are plain tables.
train_rides = pd.read_csv(train_rides_in)
bus_rides = pd.read_csv(bus_rides_in)
bike_rides = gpd.read_parquet(bike_rides_in)
uber_tract_rides = pd.read_parquet(uber_tract_rides_in)
uber_comm_rides = pd.read_parquet(uber_comm_rides_in)

In [6]:
# Stack the tract-level and community-level Uber rides into one table.
# Both id columns are renamed to `id`; the concat key becomes a `unit` column
# ('tract' or 'comm') saying which geography each row's `id` refers to.
uber_rides = pd.concat(
    [
        uber_tract_rides.rename(columns={'tract':'id'}),
        uber_comm_rides.rename(columns={'comm':'id'}),
    ],
    axis=0,
    keys=['tract','comm'],
    names=['unit','index'],
)
# Promote the `unit` level to a column, then discard the original row labels.
uber_rides = uber_rides.reset_index('unit').reset_index(drop=True)

# Time-Invariant Features

## Airports

### -> CTA

We can label the airport train stations and bus stops by name.

In [7]:
# Flag the two airport terminals by station name (1.0 = airport, 0.0 = not).
train_stations['airport'] = (
    train_stations['station_name'].isin(["O'Hare", "Midway"]).astype(float)
)

In [8]:
# Note: Only Midway is checked because CTA buses don't go directly into O'Hare,
#       nor even to the adjacent Mixed Modal Transit center.
bus_stops['airport'] = (
    bus_stops['PUBLIC_NAME'].eq("Midway Orange Line Station").astype(float)
)

### -> Uber

The rideshare pickups are anonymized to census area, so we can't use building catchments. Instead, we flag the census areas that contain each airport's centroid.

In [9]:
# Build the O'Hare centroid as an x/y (lng/lat) point: element 1 of the
# constant is passed as x and element 0 as y.
oh_xy = Point(dms_to_decimal(*OHARE_CENTROID[1]),
              dms_to_decimal(*OHARE_CENTROID[0]))
# Reproject the single point into the local CRS used for all spatial tests.
oh_xy = gpd.GeoSeries([oh_xy], crs=WORLD_CRS).to_crs(LOCAL_CRS).iloc[0]
# Boolean Series keyed by area id: does this area contain the centroid?
oh_tract = tract_points.to_crs(LOCAL_CRS).set_index('geoid10').geometry.contains(oh_xy)
oh_comm = comm_points.to_crs(LOCAL_CRS).set_index('comm_area').geometry.contains(oh_xy)

In [10]:
# Build the Midway centroid as an x/y (lng/lat) point: element 1 of the
# constant is passed as x and element 0 as y.
mdw_xy = Point(dms_to_decimal(*MIDWAY_CENTROID[1]),
               dms_to_decimal(*MIDWAY_CENTROID[0]))
# Reproject the single point into the local CRS used for all spatial tests.
mdw_xy = gpd.GeoSeries([mdw_xy], crs=WORLD_CRS).to_crs(LOCAL_CRS).iloc[0]
# Boolean Series keyed by area id: does this area contain the centroid?
mdw_tract = tract_points.to_crs(LOCAL_CRS).set_index('geoid10').geometry.contains(mdw_xy)
mdw_comm = comm_points.to_crs(LOCAL_CRS).set_index('comm_area').geometry.contains(mdw_xy)

In [11]:
# An area is an airport area (1.0) when it contains either airport centroid.
# The two containment Series share the same id index, so they are OR-ed first
# and then looked up by each area's id.
tract_points['airport'] = (
    tract_points['geoid10'].map(oh_tract | mdw_tract).astype(float)
)
comm_points['airport'] = (
    comm_points['comm_area'].map(oh_comm | mdw_comm).astype(float)
)

### -> Divvy

We're going to assume that no one rides a bike to the airport for out of town travel.

In [12]:
# Bikes are assumed never to serve airport trips, so the flag is a constant
# 0.0 (a float, like the 0.0/1.0 airport flags built for the other modes).
bike_stations['airport'] = 0.0
bike_rides['airport'] = 0.0

## United Center, McCormick Place catchments

In [13]:
# Pick the two venue footprints by name. McCormick Place is represented by
# the 'HYATT REGENCY MCCORMICK PLACE' building in the POI layer.
uc_building = poi_buildings.loc[poi_buildings['name'].eq('UNITED CENTER')]
mp_building = poi_buildings.loc[poi_buildings['name'].eq('HYATT REGENCY MCCORMICK PLACE')]

In [14]:
def buffered_intersects(gdf: gpd.GeoDataFrame, geom:gpd.GeoSeries, geom_prefix:str, dists:list[int]):
    """
    Flag which rows of `gdf` touch a catchment drawn around a single shape.

    For every radius in `dists` a buffer is drawn around `geom` and each row
    of `gdf` is tested for intersection with it. Intersection is symmetric, so
    "row intersects buffer" and "buffer intersects row" are the same test.

    Params:
        - gdf: frame whose rows are compared against the buffers
        - geom: the shape to buffer around. MUST HOLD EXACTLY ONE ROW
        - geom_prefix: prefix of the new columns, named `{geom_prefix}_{dist}`
        - dists: buffer radii in meters

    Returns:
        A copy of `gdf` with one float column (1.0 / 0.0) per radius.
    """
    assert len(geom) == 1, "Expects only one geom to buffer around."
    anchor = geom.geometry.to_crs(LOCAL_CRS)

    # Radii are given in meters and converted with meter_to_foot before
    # buffering in LOCAL_CRS.
    rings = []
    for radius in dists:
        rings.append(anchor.buffer(meter_to_foot(radius)).iloc[0])

    rows_proj = gdf.geometry.to_crs(LOCAL_CRS)

    flags = {}
    for radius, ring in zip(dists, rings):
        # `* 1.0` turns the boolean result into a 0.0 / 1.0 float flag.
        flags[f"{geom_prefix}_{radius}"] = rows_proj.intersects(ring) * 1.0
    return gdf.assign(**flags)

**Coded as** *"is this station within the catchment of the POI"*

In [15]:
# Stations, stops and rides are coded as 1.0 when they fall inside the
# 400 / 800 / 1600 m catchment of the United Center ("uc") and of
# McCormick Place ("mp"), else 0.0.
train_stations = (train_stations
                  .pipe(buffered_intersects, uc_building, "uc", [400,800,1600])
                  .pipe(buffered_intersects, mp_building, "mp", [400,800,1600]))
bike_stations = (bike_stations
                 .pipe(buffered_intersects, uc_building, "uc", [400,800,1600])
                 .pipe(buffered_intersects, mp_building, "mp", [400,800,1600]))
bike_rides = (bike_rides
              .pipe(buffered_intersects, uc_building, "uc", [400,800,1600])
              .pipe(buffered_intersects, mp_building, "mp", [400,800,1600]))
bus_stops = (bus_stops
             .pipe(buffered_intersects, uc_building, "uc", [400,800,1600])
             .pipe(buffered_intersects, mp_building, "mp", [400,800,1600]))

**Coded as** *"does this POI catchment intersect the tract"*

In [16]:
# Areas are coded as 1.0 when the POI catchment overlaps any part of them.
tract_points = (tract_points
                .pipe(buffered_intersects, uc_building, "uc", [400,800,1600])
                .pipe(buffered_intersects, mp_building, "mp", [400,800,1600]))
comm_points = (comm_points
               .pipe(buffered_intersects, uc_building, "uc", [400,800,1600])
               .pipe(buffered_intersects, mp_building, "mp", [400,800,1600]))

## Tract Membership

Label each station with the census tract and community area it falls in, so we can aggregate to the same geographies as the Uber data.

In [17]:
def code_tract(gdf):
    """
    Label each row of `gdf` with the census tract and community area it lies
    strictly within.

    Reads the notebook-level `tract_points` and `comm_points` layers. Returns
    `gdf` with two extra columns, `tract` and `comm_area`; rows that fall
    inside no area get missing values (left join).
    """
    tract_polys = tract_points.to_crs(LOCAL_CRS)[['geoid10','geometry']]
    locations = gdf.to_crs(LOCAL_CRS)[['geometry']]
    comm_polys = comm_points.to_crs(LOCAL_CRS)[['comm_area','geometry']]

    # Left joins keep every input row, matched or not.
    tract_labels = locations.sjoin(tract_polys, how='left', predicate='within')['geoid10']
    comm_labels = locations.sjoin(comm_polys, how='left', predicate='within')['comm_area']

    # Attach the labels to the ORIGINAL (unprojected) frame by index.
    return pd.concat(
        [gdf, tract_labels.rename('tract'), comm_labels.rename('comm_area')],
        axis=1,
    )

In [18]:
# Attach `tract` / `comm_area` labels to every point layer.
train_stations = code_tract(train_stations)
bike_stations = code_tract(bike_stations)
bike_rides = code_tract(bike_rides)
bus_stops = code_tract(bus_stops)

In [19]:
# Share of bike rides that the strict `within` join could not place in a tract.
print("Bike rides with unknown tract {:.2%}".format(bike_rides['tract'].isna().mean()))

Bike rides with unknown tract 0.26%


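At this point about 0.3% of bike rides are not strictly within a tract. 
But they happen to be really close, so let's assign each of them to the nearest tract and community area, as long as it lies within `TOLERANCE` meters.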

In [20]:
def code_nearest_tract(gdf):
    """
    Label each row of `gdf` with its NEAREST census tract and community area.

    Fallback for points that lie just outside every polygon: a point is
    matched to the closest area, but only if that area is within `TOLERANCE`
    meters. Reads the notebook-level `tract_points`, `comm_points` and
    `TOLERANCE`.

    Params:
        - gdf: frame of geometries to label; must not already contain
               `tract` / `comm_area` columns

    Returns:
        `gdf` with two extra columns, `tract` and `comm_area`.

    Raises:
        AssertionError: if any row has no tract or community area within
        the tolerance.
    """
    # Distances reported by sjoin_nearest are in LOCAL_CRS units, so the
    # tolerance is converted once and used for both the search radius and the
    # check below. (The asserts previously compared against the raw meter
    # value, mixing units.)
    # NOTE(review): assumes LOCAL_CRS is in feet, as implied by meter_to_foot
    # being used for every distance in this notebook -- confirm.
    max_dist = meter_to_foot(TOLERANCE)

    tracts_proj = tract_points.to_crs(LOCAL_CRS)
    gdf_proj = gdf.to_crs(LOCAL_CRS)
    comm_proj = comm_points.to_crs(LOCAL_CRS)
    opts = dict(how='left', distance_col='dist', max_distance=max_dist)
    coded_tracts = gdf_proj[['geometry']].sjoin_nearest(tracts_proj[['geoid10','geometry']], **opts)
    coded_comms = gdf_proj[['geometry']].sjoin_nearest(comm_proj[['comm_area','geometry']], **opts)

    # sjoin_nearest emits one row per equidistant neighbour. When the input
    # index is unique, repeated labels can only be such ties; keep the first
    # so the column-wise concat below aligns one-to-one.
    if gdf.index.is_unique:
        coded_tracts = coded_tracts[~coded_tracts.index.duplicated(keep='first')]
        coded_comms = coded_comms[~coded_comms.index.duplicated(keep='first')]

    # Unmatched rows have a NaN distance, which also fails these checks.
    assert (coded_tracts['dist'] <= max_dist).all(), \
        "Some rows have no tract within TOLERANCE."
    assert (coded_comms['dist'] <= max_dist).all(), \
        "Some rows have no community area within TOLERANCE."

    gdf = pd.concat([gdf,
                    coded_tracts['geoid10'].rename('tract'),
                    coded_comms['comm_area']],
                    axis=1)
    return gdf

In [21]:
# Re-code only the rides the strict `within` join left without a tract.
# Their empty label columns are dropped first so the nearest-join can add
# fresh ones.
bike_rides_nearest = bike_rides[bike_rides.tract.isna()].drop(columns=['tract','comm_area'])
bike_rides_nearest = code_nearest_tract(bike_rides_nearest)
# Just take first in case any are equidistant.
# `.loc[index.drop_duplicates()]` would return EVERY row for each unique
# label and so remove nothing; mask the repeated labels instead.
bike_rides_nearest = bike_rides_nearest[~bike_rides_nearest.index.duplicated(keep='first')]
assert bike_rides_nearest[['tract','comm_area']].notna().all().all()

In [22]:
# Recombine: rides already placed by the strict join, followed by the rides
# that were placed by the nearest-tract fallback.
bike_rides = pd.concat([
    bike_rides.dropna(subset=['tract']),
    bike_rides_nearest,
])

## Transit access by tract

In [23]:
# Keep only the areas that actually appear in the Uber data, and rename the
# area key to `id` so both frames share the Uber table's key name.
uber_tracts = (
    tract_points
    .loc[tract_points['geoid10'].isin(uber_rides.loc[uber_rides['unit'] == 'tract', 'id'])]
    .rename(columns={'geoid10':'id'})
)
uber_comms = (
    comm_points
    .loc[comm_points['comm_area'].isin(uber_rides.loc[uber_rides['unit'] == 'comm', 'id'])]
    .rename(columns={'comm_area':'id'})
)

In [24]:
def access_stats(area_df, transit_df, transit_name, area_id, transit_id):
    """
    Add two transit-access columns to every area in `area_df`.

    Params:
        - area_df: area polygons, one row per `area_id`
        - transit_df: transit features with a geometry and a `transit_id`
                      column holding the id of the area each feature belongs to
        - transit_name: prefix for the new columns
        - area_id: name of the key column in `area_df`
        - transit_id: name of the matching key column in `transit_df`

    Returns:
        `area_df` with
        - `{transit_name}_distance`: distance from the area's centroid to the
          nearest feature of `transit_df`, in LOCAL_CRS units
        - `{transit_name}_contained`: number of `transit_df` rows whose
          `transit_id` equals the area's id (0 when there are none)
    """
    area_centroids = (area_df
                      .assign(centroid = area_df.geometry.to_crs(LOCAL_CRS).centroid)
                      .set_geometry('centroid')
                      .drop(columns=['geometry'])
                      .filter([area_id,'centroid']))

    # Note sjoin_nearest is m:m if points are equidistant ==> we further consolidate
    access = area_centroids.sjoin_nearest(transit_df[['geometry']].to_crs(LOCAL_CRS),
                                           how='left',
                                           distance_col=f'{transit_name}_distance') \
                        .groupby(area_id,as_index=False)[f'{transit_name}_distance'].min()

    # A left merge keeps one (all-null) row for areas with no transit feature,
    # so counting rows with .size() would report 1 for them. Count only the
    # rows that matched on both sides instead, so such areas get 0.
    count_col = f'{transit_name}_contained'
    matches = area_df[[area_id]].merge(transit_df[[transit_id]],
                                       left_on=area_id, right_on=transit_id,
                                       how='left', validate="1:m", indicator=True)
    contained = (matches
                 .assign(**{count_col: matches['_merge'].eq('both')})
                 .groupby(area_id, as_index=False)[count_col].sum())

    area_df = area_df.merge(access).merge(contained)
    return area_df

In [25]:
# Tract-level access to each mode: nearest-feature distance and count.
tract_points = (
    tract_points
    .pipe(access_stats, train_stations[['tract','geometry']], 'train', 'geoid10', 'tract')
    .pipe(access_stats, bike_stations[['tract','geometry']], 'bike', 'geoid10', 'tract')
    # Bus access is currently disabled:
    # .pipe(access_stats, bus_stops[['tract','geometry']], 'bus', 'geoid10', 'tract')
    .pipe(access_stats, uber_tracts[['id','geometry']], 'uber', 'geoid10', 'id')
)

In [26]:
# Community-area-level access to each mode: nearest-feature distance and count.
comm_points = (
    comm_points
    .pipe(access_stats, train_stations[['comm_area','geometry']], 'train', 'comm_area', 'comm_area')
    .pipe(access_stats, bike_stations[['comm_area','geometry']], 'bike', 'comm_area', 'comm_area')
    # Bus access is currently disabled:
    # .pipe(access_stats, bus_stops[['comm_area','geometry']], 'bus', 'comm_area', 'comm_area')
    .pipe(access_stats, uber_comms[['id','geometry']], 'uber', 'comm_area', 'id')
)

# Time-Varying Features

## DNC

In [27]:
# Flag rides dated inside the DNC window; both endpoints are inclusive.
train_rides['DNC'] = train_rides['date'].between(DNC_START, DNC_END)
bus_rides['DNC'] = bus_rides['date'].between(DNC_START, DNC_END)
bike_rides['DNC'] = bike_rides['date'].between(DNC_START, DNC_END)
uber_rides['DNC'] = uber_rides['date'].between(DNC_START, DNC_END)

## Daytype

In [28]:
def label_daytype(df: pd.DataFrame):
    """
    Add day-of-week features derived from the `date` column.

    Returns a copy of `df` with:
        - dotw: `from_ymd(date).weekday()` for each row
        - is_weekend: True when `dotw >= SATURDAY`
    """
    # NOTE(review): presumably from_ymd returns a date-like object whose
    # weekday() counts from Monday = 0, matching calendar.SATURDAY -- confirm.
    day_index = df.date.apply(lambda ymd: from_ymd(ymd).weekday())
    return df.assign(dotw=day_index, is_weekend=day_index >= SATURDAY)

In [29]:
# Add `dotw` / `is_weekend` to every ride table.
train_rides = label_daytype(train_rides)
bike_rides = label_daytype(bike_rides)
bus_rides = label_daytype(bus_rides)
uber_rides = label_daytype(uber_rides)

# Pipeline out

In [30]:
# Persist the enriched geographic layers. Tracts are written as parquet;
# the other layers are written with `to_file` to .geojson paths.
train_stations.to_file(train_stations_out, index=False)
bus_stops.to_file(bus_stops_file_out, index=False)
bike_stations.to_file(bike_stations_file_out, index=False)
tract_points.to_parquet(tract_file_out, index=False)
comm_points.to_file(comm_file_out, index=False)

In [31]:
# Persist the ride tables with their new features. Row labels are not
# written (`index=False`).
train_rides.to_csv(train_rides_out, index=False)
bus_rides.to_csv(bus_rides_out, index=False)
bike_rides.to_parquet(bike_rides_out, index=False)
uber_rides.to_parquet(uber_rides_out, index=False)