In [1]:
import geopandas as gpd
import pandas as pd

In [2]:
# --- Inputs: spatial layers -------------------------------------------------
# Raw layers are read from data/raw, already-processed layers from data/interim.
train_lines_in = "../data/raw/train_lines.geojson"
bus_routes_file_in = "../data/raw/bus_routes.geojson"
train_stations_in = "../data/interim/train_stations.geojson"
bus_stops_file_in = "../data/interim/bus_stops.geojson"
bike_stations_file_in = "../data/interim/bike_stations_gbfs_v2.geojson"
tract_file_in = "../data/interim/tracts.geoparquet"
comm_file_in = "../data/interim/communities.geojson"

# --- Inputs: ridership tables -----------------------------------------------
train_rides_in = "../data/interim/train_rides.csv"
bus_rides_in = "../data/interim/bus_rides.csv"
bike_rides_in = "../data/interim/bike_rides_v3.geoparquet"
uber_rides_in = "../data/interim/uber_rides.parquet"

# --- Outputs: one file per merged table, written under data/final ------------
train_station_rides_out = "../data/final/train_station_rides.parquet"
train_line_rides_out = "../data/final/train_line_rides.parquet"
bus_rides_out = "../data/final/bus_rides.parquet"
bike_rides_out = "../data/final/bike_rides.geoparquet"
uber_rides_out = "../data/final/uber_rides.parquet"

# Pipeline in

In [3]:
# Layers stored as GeoJSON.
train_lines = gpd.read_file(train_lines_in)
train_stations = gpd.read_file(train_stations_in)
bus_routes = gpd.read_file(bus_routes_file_in)
bus_stops = gpd.read_file(bus_stops_file_in)
bike_stations = gpd.read_file(bike_stations_file_in)
comms = gpd.read_file(comm_file_in)

# The tract layer is the only one stored as GeoParquet.
tracts = gpd.read_parquet(tract_file_in)

In [4]:
# Ridership tables stored as CSV.
train_rides = pd.read_csv(train_rides_in)
bus_rides = pd.read_csv(bus_rides_in)

# Ridership tables stored as Parquet; the bike rides file is GeoParquet and is
# therefore read through geopandas.
bike_rides = gpd.read_parquet(bike_rides_in)
uber_rides = pd.read_parquet(uber_rides_in)

# Train Rides + Stations

In [5]:
# Ridership is only known per station entry point: we cannot tell which line or
# direction a rider takes. Keep a single row (the first) for each `map_id`.
train_stations = train_stations.drop_duplicates(subset=['map_id'], keep='first')

In [6]:
# Attach station attributes to every ridership record. The right join keeps all
# ridership rows, and `validate` raises if `map_id` is duplicated on the
# station side.
train_rides = train_stations.merge(
    train_rides,
    left_on='map_id',
    right_on='station_id',
    how='right',
    validate="1:m",
)

# Train Rides + Lines

In [7]:
# Station-level flags that are rolled up to the line level.
feature_cols = ['airport', 'uc_400', 'uc_800', 'uc_1600', 'mp_400', 'mp_800', 'mp_1600']

# Number of lines serving each station (comma count + 1). This has to be taken
# while `line` is still the comma-separated string, i.e. before the explode.
lines_per_station = (
    train_stations
    .set_index('map_id')['line']
    .str.count(',')
    .add(1)
)

# One row per (station, line) pair, so every line sees each station it serves.
train_stations = (
    train_stations
    .assign(line=train_stations['line'].str.split(','))
    .explode('line')
)

# A line carries a feature if any of its stations does; stored as 0.0 / 1.0.
train_line_features = (
    train_stations
    .groupby('line')[feature_cols]
    .any()
    .astype(float)
    .reset_index()
)

In [8]:
# Aggregate ridership and merge line-level features:

# The ridership data isn't at directional granularity, even though some stations
# do have separate directional entrances. We will apportion ridership equally per
# line at multi-line stations.
# XXX: Improve this model of line popularity?
#
# `train_rides['line']` still holds the comma-separated string picked up in the
# station merge, so each record is split and exploded into one row per line.
# Without this step a multi-line station is grouped under a combined key such as
# "A,B", which matches no row of `train_line_features`, and its rides are
# dropped by the inner merge below.
ride_keys = ['line', 'date', 'dotw', 'is_weekend', 'DNC']
train_line_rides = (
    # Only the columns needed for the aggregation; this also leaves the
    # geometry behind so the explode works on a plain DataFrame.
    train_rides[['map_id', 'rides'] + ride_keys]
    .assign(
        # Equal share of the station's rides for each line serving it.
        rides=lambda df: df['rides'] / df['map_id'].map(lines_per_station),
        # Same ',' split as used to build `train_line_features`.
        line=lambda df: df['line'].str.split(','),
    )
    .explode('line')
)

# Sum the apportioned rides per line and day. Rows with a missing `line`
# (rides without a matching station) are excluded by groupby's default dropna.
train_line_rides = train_line_rides.groupby(ride_keys, as_index=False).agg({'rides': 'sum'})

# Line-level feature flags: exactly one feature row is expected per line.
train_line_rides = train_line_rides.merge(train_line_features, on='line', validate='m:1')

# Attach the line layer; the right join keeps every ridership row.
train_line_rides = train_lines.merge(train_line_rides, how='right', on='line')

# Bike Rides + Stations

In [9]:
# # Double-checking if all bike stations (for the years currently accessed)
# #       already have valid (ie chicago?) geometries
# from shapely import box
# chi_boundary = gpd.read_file(CHI_BOUNDARY_FILE)
# chi_bound_geo = chi_boundary['geometry'].to_crs(LOCAL_CRS).iloc[0]
# chi_box = box(*chi_bound_geo.bounds) # Using bounding box for faster computation
# is_valid = bike_rides['geometry'].to_crs(LOCAL_CRS).within(chi_box)

# # "Invalid" geos are all in Evanston essentially, which is fine for our purposes.
# fig = chi_boundary.boundary.plot()
# bike_rides[~is_valid]['geometry'].drop_duplicates().plot(ax=fig, color='red')

In [10]:
# Columns present on the station layer but absent from the ride records.
print("Columns to gain by merging: ", set(bike_stations.columns).difference(bike_rides.columns))

Columns to gain by merging:  {'station_id'}


XXX:

For now, since all data vintages currently pulled are denormalized and already
have valid geometries, on which we've already computed our spatial features,
there's actually nothing to merge via bike_stations.

If you return to this, note that within bike_stations the station_id <-> geometry mapping is not 1:1.
According to the documentation each station has multiple bike docks, but some of the points
that share a station_id are thousands of feet apart.
Therefore, be careful about how you construct the merge.

# Bus Rides + Stops + Routes

Dropping stops with an unknown route or tract (an unknown tract means the stop is outside Chicago)

In [11]:
# Flag stops that lack a route or a tract, report their count and share, then
# drop them.
has_missing_key = bus_stops[['route', 'tract']].isna().any(axis=1)
print("Dropping {} ({:.1%}) of rows".format(has_missing_key.sum(), has_missing_key.mean()))
bus_stops = bus_stops.dropna(subset=['route', 'tract'], how='any')

Dropping 1 (0.0%) of rows


Note: since the bus ridership is at route granularity, not stop or point granularity,
we need to aggregate the stop-level features up to the route level.

This is tricky since routes span miles across the city. 

## Numeric

My first thought is to 
compute the proportion of stops that exhibit a certain feature, ie proportion
of stops near the United Center. 


In [12]:
# POI flags carried by each stop. Note that this rebinds the `feature_cols`
# name used in the train section.
feature_cols = [
    'uc_400', 'uc_800', 'uc_1600',
    'mp_400', 'mp_800', 'mp_1600',
    'airport',
]

# Per route: the mean of each flag over the route's stops.
bus_stops_features = (
    bus_stops
    .groupby('route')[feature_cols]
    .mean()
)

This turns out poorly scaled -- most routes
have zero stops near the United Center, while those that do tend to have a very
small proportion.

In [13]:
# Per feature: fraction of routes whose mean flag value is above zero.
print('Proportion of routes with ANY POI stops')
print(bus_stops_features.gt(0).mean().round(2).to_dict())

Proportion of routes with ANY POI stops
{'uc_400': 0.03, 'uc_800': 0.05, 'uc_1600': 0.1, 'mp_400': 0.01, 'mp_800': 0.04, 'mp_1600': 0.1, 'airport': 0.07}


In [14]:
# Restrict to routes with at least one non-zero feature, then summarise them.
print("Distribution of mean POI stops per route, given route serves ANY POI's:")
bus_stops_features.loc[bus_stops_features.gt(0).any(axis=1)].describe()

Distribution of mean POI stops per route, given route serves ANY POI's:


Unnamed: 0,uc_400,uc_800,uc_1600,mp_400,mp_800,mp_1600,airport
count,39.0,39.0,39.0,39.0,39.0,39.0,39.0
mean,0.005838,0.024061,0.085773,0.001223,0.014325,0.049139,0.004464
std,0.018715,0.055232,0.150498,0.005439,0.043182,0.091428,0.008463
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.134408,0.0,0.0,0.093375,0.006944
max,0.086614,0.222222,0.666667,0.028571,0.234375,0.46875,0.03125


## Binary

Instead, we will interpret each feature simply as a binary flag: whether or not the route
serves the POI anywhere along the route, without taking into account the length of the route
or the ridership density along the route.

In [15]:
# 0.0 / 1.0 flag per route: does any stop on the route carry the feature?
bus_stops_features = (
    bus_stops
    .groupby('route')[feature_cols]
    .any()
    .astype(float)
)

# The left join keeps every route; routes absent from `bus_stops_features`
# end up with NaN flags.
bus_routes = bus_routes.merge(
    bus_stops_features,
    left_on='route',
    right_index=True,
    how='left',
)

In [16]:
# Attach route attributes to every ridership record. The right join keeps all
# ridership rows, and `validate` raises if `route` is duplicated in `bus_routes`.
bus_rides = bus_routes.merge(
    bus_rides,
    on='route',
    how='right',
    validate='1:m',
)

# Uber Rides + Tracts

XXX:
Not dropping anonymized observations (null tract/comm) because they still add to the non-spatial
time series count.

In [17]:
# Split the rides by spatial unit once; each subset is joined to its own layer.
uber_rides_tract = uber_rides[uber_rides['unit'] == 'tract']
uber_rides_comm = uber_rides[uber_rides['unit'] == 'comm']

# Columns taken from both area layers (uses the `feature_cols` currently in scope).
area_cols = feature_cols + ['geometry', 'centroid']

# Keep only the areas that appear in the rides, and expose their key as `id`.
uber_tracts = (
    tracts[area_cols + ['geoid10']]
    .loc[lambda df: df['geoid10'].isin(uber_rides_tract['id'])]
    .rename(columns={'geoid10': 'id'})
)
uber_comms = (
    comms[area_cols + ['comm_area']]
    .loc[lambda df: df['comm_area'].isin(uber_rides_comm['id'])]
    .rename(columns={'comm_area': 'id'})
)

# Exactly one column (the `id` key) may be shared with the rides; any further
# overlap would produce suffixed duplicate columns in the merges below.
assert len(uber_rides.columns.intersection(uber_tracts.columns)) == 1
assert len(uber_rides.columns.intersection(uber_comms.columns)) == 1

# Right joins keep every ride row of the unit, including ids without a matching area.
uber_tract_rides = uber_tracts.merge(uber_rides_tract, how='right', on='id')
uber_comm_rides = uber_comms.merge(uber_rides_comm, how='right', on='id')

# Stack both units back into a single table with a fresh index.
uber_rides = pd.concat([uber_tract_rides, uber_comm_rides], ignore_index=True)

# Pipeline out

In [18]:
# Write every final table to its configured path, in the same order as before.
outputs = [
    (uber_rides, uber_rides_out),
    (bike_rides, bike_rides_out),
    (train_rides, train_station_rides_out),
    (train_line_rides, train_line_rides_out),
    (bus_rides, bus_rides_out),
]
for frame, path in outputs:
    frame.to_parquet(path)