In [1]:
import geopandas as gpd
import pandas as pd

In [2]:
# Input ride tables (data/final), one per transit mode / aggregation level.
train_station_rides_in = "../data/final/train_station_rides.parquet"
train_line_rides_in = "../data/final/train_line_rides.parquet"
bus_rides_in = "../data/final/bus_rides.parquet"
bike_rides_in = "../data/final/bike_rides.geoparquet"
uber_tract_rides_in = "../data/final/uber_tract_rides.parquet"
uber_comm_rides_in = "../data/final/uber_comm_rides.parquet"

# Geography files (data/interim): census tracts and community areas.
tract_file_in = "../data/interim/tracts.geoparquet"
comm_file_in = "../data/interim/communities.geojson"

# Output locations for the four panels built in this notebook.
point_panel_out = "../data/interim/point_panel.parquet"
line_panel_out = "../data/interim/line_panel.parquet"
tract_panel_out = "../data/interim/tract_panel.parquet"
comm_panel_out = "../data/interim/comm_panel.parquet"



# Pipeline in

In [3]:
# Ride tables, one per transit mode / aggregation level.
# NOTE(review): gpd.read_parquet expects GeoParquet metadata in the file;
# confirm the plain ".parquet" inputs were written from GeoDataFrames.
bike_rides = gpd.read_parquet(bike_rides_in)
bus_rides = gpd.read_parquet(bus_rides_in)
train_rides = gpd.read_parquet(train_station_rides_in)
train_line_rides = gpd.read_parquet(train_line_rides_in)
uber_tract_rides = gpd.read_parquet(uber_tract_rides_in)
uber_comm_rides = gpd.read_parquet(uber_comm_rides_in)

# Geographies the aggregated rides are merged onto further down.
comms = gpd.read_file(comm_file_in)
tracts = gpd.read_parquet(tract_file_in)

# Panels

In [4]:
def combine_panels(bus=None, train=None, bike=None, uber=None):
    """Stack per-mode ride tables into one long panel labelled by transit type.

    Every frame that is neither ``None`` nor empty is concatenated row-wise,
    keeping only the columns shared by all of them (``join='inner'``). Two
    columns are added to the result:

    * ``transit`` -- the mode label ('bus', 'train', 'bike' or 'uber');
    * ``tid`` -- ``"<transit>_<id>"``, built from ``transit`` and ``id``.

    Parameters
    ----------
    bus, train, bike, uber : DataFrame or None
        Ride tables for each mode. Each frame that is used must contain an
        ``id`` column, otherwise the inner join drops it and building ``tid``
        fails with a ``KeyError``.

    Returns
    -------
    DataFrame
        The stacked panel with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If every argument is ``None`` or empty, so there is nothing to stack.
    """
    candidates = {'bus': bus, 'train': train, 'bike': bike, 'uber': uber}
    # Keep the fixed bus/train/bike/uber order; skip absent or empty frames.
    frames = {mode: df for mode, df in candidates.items()
              if df is not None and not df.empty}
    if not frames:
        raise ValueError(
            "combine_panels requires at least one non-empty panel "
            "(bus, train, bike or uber)"
        )

    # `keys` labels each frame's rows with its mode; to use keys and still end
    # up with a plain index, the index has to be reset after concatenating.
    panel = pd.concat(list(frames.values()), ignore_index=False, join='inner',
                      keys=list(frames.keys()), names=['transit'])
    panel = panel.reset_index(level='transit').reset_index(drop=True)
    panel['tid'] = panel['transit'] + "_" + panel['id'].astype(str)
    return panel

## Point Panel

In [5]:
# The point panel uses only sources that are station-level to begin with,
# to avoid heteroskedasticity issues.
point_panel = combine_panels(
    bike=bike_rides.rename(columns={"station_name": "id"}),
    train=train_rides.assign(id=lambda rides: rides['station_id'].astype(str)),
)

## Line Panel

In [6]:
# The line panel uses only sources that are line-level to begin with,
# to avoid heteroskedasticity issues.
line_panel = combine_panels(
    train=train_line_rides.rename(columns={'line': 'id'}),
    bus=bus_rides.rename(columns={'route': 'id'}),
)

## Tract Panel

Requires spatially aggregating train and bike rides. Bus rides are excluded because
disaggregating route-level counts to tracts is too imprecise.

In [7]:
# Grouping keys for every tract-level aggregation in this cell.
id_cols = ['tract', 'date', 'dotw', 'is_weekend', 'DNC']

# Sum total rides per tract/day for train.
train_rides_by_tract = (
    train_rides[[*id_cols, 'rides']]
    .groupby(id_cols)
    .sum()
    .reset_index()
)

# Bike and Uber also carry separate start/end counts.
bike_rides_by_tract = (
    bike_rides[[*id_cols, 'start_rides', 'end_rides', 'rides']]
    .groupby(id_cols)
    .sum()
    .reset_index()
)

# The Uber table stores its tract key as `id`; rename it before grouping.
uber_rides_by_tract = (
    uber_tract_rides.rename(columns={'id': 'tract'})[
        [*id_cols, 'start_rides', 'end_rides', 'rides']
    ]
    .groupby(id_cols)
    .sum()
    .reset_index()
)

In [8]:
# combine_panels expects the key column to be called `id`, so rename `tract`
# on each per-mode table before stacking them.
tract_rides = combine_panels(
    uber=uber_rides_by_tract.rename(columns={"tract": "id"}),
    bike=bike_rides_by_tract.rename(columns={"tract": "id"}),
    train=train_rides_by_tract.rename(columns={"tract": "id"}),
)

In [9]:
# Attach the tract-level features from `tracts` to every ride row
# (tracts.geoid10 matches tract_rides.id), then drop the `centroid` column
# and the now-redundant `geoid10` key.
tract_panel = (
    tracts
    .merge(tract_rides, how='inner', left_on='geoid10', right_on='id')
    .drop(columns=['centroid', 'geoid10'])
)

## Comm Area Panel

In [10]:
# Grouping keys for every community-area aggregation in this cell.
id_cols = ['comm_area', 'date', 'dotw', 'is_weekend', 'DNC']

# Sum total rides per community area/day for train.
train_rides_by_comm = (
    train_rides[[*id_cols, 'rides']]
    .groupby(id_cols)
    .sum()
    .reset_index()
)

# Bike and Uber also carry separate start/end counts.
bike_rides_by_comm = (
    bike_rides[[*id_cols, 'start_rides', 'end_rides', 'rides']]
    .groupby(id_cols)
    .sum()
    .reset_index()
)

# The Uber table stores its community-area key as `id`; rename it first.
uber_rides_by_comm = (
    uber_comm_rides.rename(columns={'id': 'comm_area'})[
        [*id_cols, 'start_rides', 'end_rides', 'rides']
    ]
    .groupby(id_cols)
    .sum()
    .reset_index()
)

In [11]:
# combine_panels expects the key column to be called `id`, so rename
# `comm_area` on each per-mode table before stacking them.
comm_rides = combine_panels(
    uber=uber_rides_by_comm.rename(columns={"comm_area": "id"}),
    bike=bike_rides_by_comm.rename(columns={"comm_area": "id"}),
    train=train_rides_by_comm.rename(columns={"comm_area": "id"}),
)

In [12]:
# Attach the community-area features from `comms` to every ride row
# (comms.comm_area matches comm_rides.id), then drop the `centroid` column
# and the now-redundant `comm_area` key.
comm_panel = (
    comms
    .merge(comm_rides, how='inner', left_on='comm_area', right_on='id')
    .drop(columns=['centroid', 'comm_area'])
)

# Pipeline out

In [13]:
# Write each panel to its configured output path, without the index.
for panel_frame, panel_path in (
    (point_panel, point_panel_out),
    (line_panel, line_panel_out),
    (tract_panel, tract_panel_out),
    (comm_panel, comm_panel_out),
):
    panel_frame.to_parquet(panel_path, index=False)