In [1]:
import geopandas as gpd
import pandas as pd
from src.data.constants import *

In [None]:
# Every output of this notebook is written as GeoJSON under DATA_FOLDER/interim.
interim_dir = os.path.join(DATA_FOLDER, "interim")

# Transit layers
train_file_out = os.path.join(interim_dir, "train_stations.geojson")
bus_routes_file_out = os.path.join(interim_dir, "bus_routes.geojson")
bus_stops_file_out = os.path.join(interim_dir, "bus_stops.geojson")
bike_stations_file_out = os.path.join(interim_dir, "bike_stations.geojson")

# Boundary layers
tract_file_out = os.path.join(interim_dir, "tracts.geojson")
comm_file_out = os.path.join(interim_dir, "communities.geojson")

# Pipeline in

# Panels

## Combine Panels

In [34]:
def combine_panels(bus=None, train=None, bike=None):
    """Stack the bus, train and bike panels into a single long table.

    Each supplied panel has its mode-specific key column renamed to ``id``
    (``route`` for bus, ``map_id`` for train, ``station_id`` for bike) and
    gains two columns: ``transit`` (the mode name) and ``tid``
    (``"<mode>_<original key>"``). The panels are concatenated with
    ``join='inner'``, so only columns shared by every included panel survive.
    The ``geometry`` column is then replaced by its centroid coordinates.

    Parameters
    ----------
    bus, train, bike : DataFrame-like or None
        Panels to combine. ``None`` or empty panels are skipped. Each panel
        that is included must carry its key column and a ``geometry`` column.

    Returns
    -------
    DataFrame
        The stacked panel with ``lat``/``long`` columns and no ``geometry``.
        ``lat``/``long`` are the centroid's y/x after reprojecting to
        ``LOCAL_CRS``, i.e. they are expressed in ``LOCAL_CRS`` units.
        NOTE(review): if LOCAL_CRS is a projected CRS these are not degrees
        -- confirm that downstream consumers expect that.

    Raises
    ------
    ValueError
        If every panel is ``None`` or empty.
    """
    # (mode label, mode-specific key column, frame)
    sources = (
        ('bus', 'route', bus),
        ('train', 'map_id', train),
        ('bike', 'station_id', bike),
    )
    tagged = []
    for mode, key, frame in sources:
        # Skip absent/empty panels up front so they never reach concat.
        if frame is None or frame.empty:
            continue
        tagged.append(
            frame.rename(columns={key: 'id'})
                 .assign(transit=mode, tid=f"{mode}_" + frame[key].astype(str))
        )

    if not tagged:
        # pd.concat would raise a generic "No objects to concatenate" here.
        raise ValueError("combine_panels needs at least one non-empty panel")

    panel = pd.concat(tagged, ignore_index=True, join='inner')

    # Reproject and take centroids once; both coordinates come from the same result.
    centroids = gpd.GeoSeries(panel.geometry).to_crs(LOCAL_CRS).centroid
    panel['lat'] = centroids.y
    panel['long'] = centroids.x
    return panel.drop(columns=['geometry'])

In [36]:
def coalesce(df, left, right, coalesced):
    """Merge two columns into one, preferring ``left`` and falling back to ``right``.

    For every row the result takes the value of ``df[left]``; where that value
    is missing, the value of ``df[right]`` is used instead.

    Parameters
    ----------
    df : DataFrame
        Input frame; it is not modified.
    left : str
        Name of the preferred column.
    right : str
        Name of the fallback column.
    coalesced : str
        Name of the output column.

    Returns
    -------
    DataFrame
        A new frame without ``left`` and ``right`` and with ``coalesced`` set
        to the merged values. ``coalesced`` may reuse the name of ``left`` or
        ``right``.
    """
    # Chained indexing (df[mask][col] = ...) assigns into a temporary copy and
    # silently drops the fallback values, so build the merged column explicitly.
    merged = df[left].where(df[left].notna(), df[right])
    # Drop first, then assign, so the output survives even if it reuses an input name.
    return df.drop(columns=[left, right]).assign(**{coalesced: merged})

def get_rides_panel(ctl_start, ctl_end, trt_start, trt_end):
    """Build the combined bus/train/bike ridership panel for two date windows.

    Rows are kept when their ``date`` falls inside the control window
    ``[ctl_start, ctl_end]`` or the treatment window ``[trt_start, trt_end]``
    (both inclusive).

    This function relies on names defined outside it: the ``train_stations``,
    ``bus_routes`` and ``bike_stations`` tables, the ``soda_get_all``,
    ``s3_bike_trips`` and ``agg_bike_trips`` loaders, the ridership table
    constants, and the ``coalesce`` / ``combine_panels`` helpers.

    Parameters
    ----------
    ctl_start, ctl_end : str
        Bounds of the control window. They are interpolated into the SoQL
        filter and parsed with ``pd.to_datetime``.
    trt_start, trt_end : str
        Bounds of the treatment window, handled the same way.

    Returns
    -------
    DataFrame
        The output of ``combine_panels`` plus a boolean ``DNC`` column that is
        True for rows whose ``date`` lies inside the treatment window.
    """
    soql_where_date = f"""(('{trt_start}' <= date) AND (date <= '{trt_end}'))
                OR (('{ctl_start}' <= date) AND (date <= '{ctl_end}'))"""

    # Parse the four bounds once, before any remote request is made.
    bounds = pd.to_datetime(pd.Series([ctl_start, ctl_end, trt_start, trt_end]))

    def _pd_where_date(x):
        # Same two-window filter as the SoQL clause, for frames filtered locally.
        return ((bounds[0] <= x['date']) & (x['date'] <= bounds[1])) | \
                ((bounds[2] <= x['date']) & (x['date'] <= bounds[3]))

    train_rides = soda_get_all(L_RIDERSHIP_TABLE,
                            select="station_id,date,daytype,rides",
                            where=soql_where_date) \
                .merge(train_stations, left_on='station_id', right_on='map_id')

    bus_rides = soda_get_all(BUS_RIDERSHIP_TABLE,
                            select="route,date,daytype,rides",
                            where=soql_where_date) \
                .merge(bus_routes, on='route')

    # Fetch every year touched by either window. Taking the span from all four
    # bounds keeps the fetch correct regardless of which window comes first.
    bound_years = bounds.dt.year
    first_year, last_year = int(bound_years.min()), int(bound_years.max())
    bike_rides = s3_bike_trips(first_year, last_year) \
                .pipe(agg_bike_trips) \
                .loc[_pd_where_date]
    # Half of the bike rides are already denormalized and don't need bike-stations
    # So we do a left-join and coalesce to get the missing geometries
    bike_rides = bike_rides.merge(bike_stations.assign(station_id=bike_stations.station_id.astype(str)),
                                   on=['station_id','vintage'], how='left')
    bike_rides = bike_rides.pipe(coalesce, 'geometry_x','geometry_y','geometry')
    # Bikes don't have the daytype column so we'll impute it from the
    # bus/train records that share the same date.
    daytypes = pd.concat([train_rides[['date','daytype']],bus_rides[['date','daytype']]],ignore_index=True)
    daytypes = daytypes.groupby('date')['daytype'].first()
    # NOTE(review): this lookup only matches if the bike dates and the
    # bus/train dates have the same dtype -- confirm upstream.
    bike_rides['daytype'] = bike_rides['date'].map(daytypes)

    rides = combine_panels(bus_rides, train_rides, bike_rides)
    # Flag rows that fall inside the treatment window.
    rides['DNC'] = (trt_start <= rides['date']) & (rides['date'] <= trt_end)
    return rides

# Define data extent

def _soql_where_date(ctl_start, ctl_end, trt_start, trt_end):
    return  f"""(('{trt_start}' <= date) AND (date <= '{trt_end}'))
                OR (('{ctl_start}' <= date) AND (date <= '{ctl_end}'))"""


def _pd_where_date(x: pd.DataFrame, ctl_start, ctl_end, trt_start, trt_end):
    dts = pd.to_datetime(pd.Series([ctl_start, ctl_end, trt_start, trt_end]))
    return ((dts[0] <= x['date']) & (x['date'] <= dts[1])) | \
            ((dts[2] <= x['date']) & (x['date'] <= dts[3]))
    