In [1]:
import os

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

from data.constants import (WORLD_CRS, LOCAL_CRS, DATA_FOLDER,
                                OHARE_CENTROID, MIDWAY_CENTROID)
from data.geo import meter_to_foot, dms_to_decimal

In [2]:
# Inputs are read from <DATA_FOLDER>/raw; results are written to <DATA_FOLDER>/interim.
raw_dir = os.path.join(DATA_FOLDER, "raw")
interim_dir = os.path.join(DATA_FOLDER, "interim")

# Raw inputs
train_file_in = os.path.join(raw_dir, "train_stations.geojson")
bus_routes_file_in = os.path.join(raw_dir, "bus_routes.geojson")
bus_stops_file_in = os.path.join(raw_dir, "bus_stops.geojson")
bike_stations_file_in = os.path.join(raw_dir, "bike_stations.geojson")
tract_file_in = os.path.join(raw_dir, "tracts.geojson")
comm_file_in = os.path.join(raw_dir, "communities.geojson")
poi_file_in = os.path.join(raw_dir, "poi_buildings.geojson")

# Interim outputs (the POI buildings are only read, so they have no output path)
train_file_out = os.path.join(interim_dir, "train_stations.geojson")
bus_routes_file_out = os.path.join(interim_dir, "bus_routes.geojson")
bus_stops_file_out = os.path.join(interim_dir, "bus_stops.geojson")
bike_stations_file_out = os.path.join(interim_dir, "bike_stations.geojson")
tract_file_out = os.path.join(interim_dir, "tracts.geojson")
comm_file_out = os.path.join(interim_dir, "communities.geojson")

# Pipeline in

In [3]:
# Load every raw layer; the names on the left pair up, in order, with the paths below.
(
    train_stations,
    bus_routes,
    bus_stops,
    tract_points,
    comm_points,
    poi_buildings,
    bike_stations,
) = (
    gpd.read_file(raw_path)
    for raw_path in (
        train_file_in,
        bus_routes_file_in,
        bus_stops_file_in,
        tract_file_in,
        comm_file_in,
        poi_file_in,
        bike_stations_file_in,
    )
)

# Airports

## CTA Airports

We can label the airport train stations and bus stops by name.

In [5]:
# A station is an airport station when its name is exactly one of these two.
airport_station_names = ["O'Hare", "Midway"]
train_stations['airport'] = train_stations['station_name'].isin(airport_station_names)

In [6]:
# Only Midway is matched here: CTA buses don't go directly into O'Hare,
# nor even to the adjacent Mixed Modal Transit center.
midway_stop_name = "Midway Orange Line Station"
bus_stops['airport'] = bus_stops['PUBLIC_NAM'].eq(midway_stop_name)

## Uber Airports

The rideshare pickups are anonymized to the census tract or community area, so we can't use building catchments.

In [7]:
# Build the O'Hare centroid as a shapely Point in x/y order: element [1] of
# OHARE_CENTROID is used as longitude (x) and element [0] as latitude (y).
oh_xy = dms_to_decimal(*OHARE_CENTROID[1]), dms_to_decimal(*OHARE_CENTROID[0])
oh_xy = Point(*oh_xy) # lng/lat
# Project the point once, then compare it against polygons in the SAME CRS.
# Previously only the point was projected to LOCAL_CRS while the polygons were
# used as read from file; if they are stored in WORLD_CRS nothing can match.
# to_crs leaves the coordinates unchanged when the layer is already in LOCAL_CRS.
# NOTE(review): to_crs requires the layers to carry a CRS; confirm read_file sets one.
oh_local = gpd.GeoSeries([oh_xy], crs=WORLD_CRS).to_crs(LOCAL_CRS).iloc[0]
# Boolean Series indexed by geoid10 / area_num_1: True where the polygon contains O'Hare.
oh_tract = tract_points.set_index('geoid10').geometry.to_crs(LOCAL_CRS).contains(oh_local)
oh_comm = comm_points.set_index('area_num_1').geometry.to_crs(LOCAL_CRS).contains(oh_local)

In [8]:
# Build the Midway centroid as a shapely Point in x/y order: element [1] of
# MIDWAY_CENTROID is used as longitude (x) and element [0] as latitude (y).
mdw_xy = dms_to_decimal(*MIDWAY_CENTROID[1]), dms_to_decimal(*MIDWAY_CENTROID[0])
mdw_xy = Point(*mdw_xy) # lng/lat
# Project the point once, then compare it against polygons in the SAME CRS.
# Previously only the point was projected to LOCAL_CRS while the polygons were
# used as read from file; if they are stored in WORLD_CRS nothing can match.
# to_crs leaves the coordinates unchanged when the layer is already in LOCAL_CRS.
# NOTE(review): to_crs requires the layers to carry a CRS; confirm read_file sets one.
mdw_local = gpd.GeoSeries([mdw_xy], crs=WORLD_CRS).to_crs(LOCAL_CRS).iloc[0]
# Boolean Series indexed by geoid10 / area_num_1: True where the polygon contains Midway.
mdw_tract = tract_points.set_index('geoid10').geometry.to_crs(LOCAL_CRS).contains(mdw_local)
mdw_comm = comm_points.set_index('area_num_1').geometry.to_crs(LOCAL_CRS).contains(mdw_local)

In [9]:
# The containment Series are indexed by geoid10 / area_num_1 (from set_index),
# while tract_points / comm_points still carry the index they were read with.
# Assigning a Series to a column aligns on index labels, so the mismatched
# labels would fill the column with NaN. set_index keeps row order, so assign
# the combined flags positionally instead.
tract_points['airport'] = (oh_tract | mdw_tract).to_numpy()
comm_points['airport'] = (oh_comm | mdw_comm).to_numpy()

## Bike Airports

We're going to assume that no one rides a bike to the airport for out of town travel.

In [10]:
# Constant flag: gives bike_stations the same 'airport' column the other layers get,
# with every station marked as non-airport.
bike_stations['airport'] = False

# UC and MP Catchments

In [14]:
def code_buffers(gdf: gpd.GeoDataFrame, geom:gpd.GeoSeries, geom_prefix:str, dists:list[int]):
    """
    Flag, for several radii, which rows of gdf intersect a buffer drawn around one shape.

    Params:
        - gdf: frame whose rows are tested against each buffer
        - geom: the single shape to buffer around. MUST HAVE EXACTLY ONE ROW
        - geom_prefix: prefix of the new columns, named "<geom_prefix>_<dist>"
        - dists: buffer radii in meters
    Returns:
        gdf with one extra 1.0 / 0.0 column per radius (built with assign, so gdf
        itself is not modified).
    """
    assert len(geom) == 1, "Expects only one geom to buffer around."
    anchor_local = geom.geometry.to_crs(LOCAL_CRS)
    # One buffer shape per radius; each radius goes through meter_to_foot before buffering.
    rings = []
    for radius in dists:
        rings.append((radius, anchor_local.buffer(meter_to_foot(radius)).iloc[0]))
    rows_local = gdf.geometry.to_crs(LOCAL_CRS)
    flags = {}
    for radius, ring in rings:
        # Multiplying by 1.0 turns the boolean result into floats.
        flags[f"{geom_prefix}_{radius}"] = rows_local.intersects(ring) * 1.0
    return gdf.assign(**flags)

In [None]:
# Pick the anchor buildings by exact name; each selection stays a (filtered) frame.
is_united_center = poi_buildings['name'].eq('UNITED CENTER')
is_mccormick_hyatt = poi_buildings['name'].eq('HYATT REGENCY MCCORMICK PLACE')
uc_building = poi_buildings.loc[is_united_center]
mp_building = poi_buildings.loc[is_mccormick_hyatt]

In [15]:
# Same three radii (meters) for every layer and both buildings.
buffer_dists = [400, 800, 1600]

# For each layer: add the "uc_*" columns first, then the "mp_*" columns.
train_stations = (
    train_stations
    .pipe(code_buffers, uc_building, "uc", buffer_dists)
    .pipe(code_buffers, mp_building, "mp", buffer_dists)
)
bike_stations = (
    bike_stations
    .pipe(code_buffers, uc_building, "uc", buffer_dists)
    .pipe(code_buffers, mp_building, "mp", buffer_dists)
)
bus_stops = (
    bus_stops
    .pipe(code_buffers, uc_building, "uc", buffer_dists)
    .pipe(code_buffers, mp_building, "mp", buffer_dists)
)

# TODO! Code tracts and community areas??
# Also maybe code the UC and MP community areas in the other things?

In [16]:
# Ridership is reported per bus route, not per stop, so stop-level flags are rolled
# up to routes: the mean of each flag is the share of a route's stops that fall
# inside the buffer (or are flagged as airport stops).
buffer_cols = ['uc_400','uc_800','uc_1600','mp_400','mp_800','mp_1600','airport']
stop_flags = bus_stops[['route'] + buffer_cols]
bus_stops_features = stop_flags.groupby('route', as_index=False).mean()
# No explicit key: the merge joins on whatever columns the two frames share.
bus_routes = bus_routes.merge(bus_stops_features, how='left')

# DNC

In [None]:
# Placeholder: the DNC flag has to be set on ride records, and no ride tables are
# loaded in the cells above. The raise stops this cell from silently doing nothing.
raise NotImplementedError("Need to pull in rides!")
        # Intended step once rides are available (DNC_START_ISO / DNC_END_ISO are not defined here):
        # .assign(DNC = lambda x: (x['date'] >= DNC_START_ISO) & (x['date'] <= DNC_END_ISO))


In [None]:
# Guard: the rest of this cell is mid-refactor and uses ride tables (bike_rides,
# train_rides, bus_rides) that are not defined in the cells above, so stop here.
raise NotImplementedError("Finish refactor!")
def coalesce(df, left, right, coalesced):
    """
    Merge two columns into one, preferring `left` and falling back to `right`.

    Params:
        - df: frame holding both source columns
        - left: column whose values win wherever they are present
        - right: column used only where `left` is missing (NaN/None)
        - coalesced: name of the resulting column
    Returns:
        A new frame with the `coalesced` column and without `left` / `right`
        (a source column is kept when it shares its name with `coalesced`).
        `df` itself is not modified.
    """
    missing = df[left].isna()
    merged = df[left].copy()
    # Write through .loc on the copy. The previous chained form
    # `df[predicate][coalesced] = ...` assigned into a temporary object, so the
    # missing values were never filled.
    merged.loc[missing] = df.loc[missing, right]
    # Don't drop the result when it reuses one of the source column names.
    leftovers = [col for col in (left, right) if col != coalesced]
    out = df.drop(columns=leftovers)
    out[coalesced] = merged
    return out

# About half of the bike rides are already denormalized (they carry their own
# geometry) and don't need bike_stations. The rest get it from a left join on
# station, after which the two geometry columns are coalesced into one.
stations_keyed = bike_stations.assign(station_id=bike_stations.station_id.astype(str))
bike_rides = bike_rides.merge(stations_keyed, on=['station_id','vintage'], how='left')
bike_rides = coalesce(bike_rides, 'geometry_x', 'geometry_y', 'geometry')

# Bike rides have no daytype column, so look it up by date from the train and
# bus rides, taking the first daytype seen for each date.
daytypes = pd.concat(
    [train_rides[['date','daytype']], bus_rides[['date','daytype']]],
    ignore_index=True,
)
daytypes = daytypes.groupby('date')['daytype'].first()
bike_rides['daytype'] = bike_rides['date'].map(daytypes)


# Pipeline out

In [18]:
# Write each processed layer to its interim path, in the same order as before.
outputs = [
    (train_stations, train_file_out),
    (bus_routes, bus_routes_file_out),
    (bus_stops, bus_stops_file_out),
    (tract_points, tract_file_out),
    (comm_points, comm_file_out),
    (bike_stations, bike_stations_file_out),
]
for layer, out_path in outputs:
    layer.to_file(out_path, index=False)