In [1]:
import geopandas as gpd
import pandas as pd
import os
from shapely.geometry import MultiPoint

from data.constants import DATA_FOLDER, WORLD_CRS, LOCAL_CRS, CHI_BOUNDARY_FILE
from data.datemath import to_ymd

In [None]:
# Every pipeline input and output lives in the "interim" data directory.
interim_dir = os.path.join(DATA_FOLDER, "interim")

# Geographic layers
train_file_in = os.path.join(interim_dir, "train_stations.geojson")
bus_routes_file_in = os.path.join(interim_dir, "bus_routes.geojson")
bus_stops_file_in = os.path.join(interim_dir, "bus_stops.geojson")
bike_stations_file_in = os.path.join(interim_dir, "bike_stations.geojson")
tract_file_in = os.path.join(interim_dir, "tracts.geoparquet")
comm_file_in = os.path.join(interim_dir, "communities.geojson")

# Ridership tables
train_rides_in = os.path.join(interim_dir, "train_rides.csv")
bus_rides_in = os.path.join(interim_dir, "bus_rides.csv")
bike_rides_in = os.path.join(interim_dir, "bike_rides.geoparquet")
uber_rides_in = os.path.join(interim_dir, "uber_rides.parquet")

# Panels written at the end of the notebook
point_panel_out = os.path.join(interim_dir, "point_panel.parquet")
line_panel_out = os.path.join(interim_dir, "line_panel.parquet")
tract_panel_out = os.path.join(interim_dir, "tract_panel.parquet")

# Pipeline in

In [3]:
# Tracts are stored as GeoParquet; every other layer is read with read_file.
tract_points = gpd.read_parquet(tract_file_in)
train_stations, bus_routes, bus_stops, comm_points, bike_stations = (
    gpd.read_file(layer_path)
    for layer_path in (
        train_file_in,
        bus_routes_file_in,
        bus_stops_file_in,
        comm_file_in,
        bike_stations_file_in,
    )
)

In [4]:
# Train and bus rides are CSV; bike rides carry geometry (GeoParquet); Uber rides are plain Parquet.
train_rides, bus_rides = (pd.read_csv(csv_path) for csv_path in (train_rides_in, bus_rides_in))
uber_rides = pd.read_parquet(uber_rides_in)
bike_rides = gpd.read_parquet(bike_rides_in)

# Train Rides + Stations

In [5]:
# We only care about the granularity of entry points, not line/direction,
# because we don't know which line or direction riders take.
# Keep the first row seen for each map_id.
train_stations = train_stations[~train_stations.duplicated('map_id')]

In [6]:
# Attach station attributes to every ridership row. The right join keeps ride
# rows whose station_id has no matching map_id; validate rejects duplicated map_ids.
train_rides = train_stations.merge(
    train_rides,
    left_on='map_id',
    right_on='station_id',
    how='right',
    validate="1:m",
)

## Aggregate to Line

In [7]:
# Record the number of lines per station before `line` is exploded:
# one more than the number of commas in the comma-separated `line` string.
station_line_strings = train_stations.set_index('map_id')['line']
lines_per_station = station_line_strings.str.count(',') + 1

In [8]:
# Give multi-line stations one row per line so that each line keeps its
# correct set of stations.
train_stations = (
    train_stations
    .assign(line=train_stations['line'].str.split(','))
    .explode('line')
)

In [9]:
feature_cols = ['airport', 'uc_400', 'uc_800', 'uc_1600', 'mp_400', 'mp_800', 'mp_1600']
stations_by_line = train_stations.groupby('line')

# A line has a feature when any of its stations has it (stored as 0.0 / 1.0).
train_line_features = stations_by_line[feature_cols].any().astype(float).reset_index()

# Each line is located at the centroid of its stations' geometries.
train_centroids = stations_by_line['geometry'].agg(lambda pts: MultiPoint(list(pts)).centroid)
train_line_features['centroid'] = gpd.GeoSeries(
    train_line_features['line'].map(train_centroids),
    crs=train_stations.crs,
)

In [10]:
# The ridership data isn't at directional granularity, even though some stations
# do have separate directional entrances. We will apportion ridership equally per
# line at multi-line stations.
# TODO: Improve this model of line popularity?
#
# `train_rides` was merged with the stations before `line` was exploded, so its
# `line` column still holds the comma-separated string. Split and explode it
# here; otherwise multi-line stations are grouped under the combined string and
# then dropped by the inner merge with the per-line features.
train_line_rides = train_rides[['map_id', 'line', 'date', 'daytype', 'DNC', 'rides']]
train_line_rides = train_line_rides.assign(
    # Each line of a station receives an equal share of that station's rides.
    rides=train_line_rides['rides'] / train_line_rides['map_id'].map(lines_per_station),
    line=train_line_rides['line'].str.split(','),
).explode('line')
train_line_rides = train_line_rides.groupby(['line','date','daytype','DNC'],as_index=False).agg({'rides':'sum'})
train_line_rides = train_line_rides.merge(train_line_features, on='line')

# Bike Rides + Stations

In [11]:
# # Double-checking if all bike stations (for the years currently accessed)
# #       already have valid (ie chicago?) geometries
# from shapely import box
# chi_boundary = gpd.read_file(CHI_BOUNDARY_FILE)
# chi_bound_geo = chi_boundary['geometry'].to_crs(LOCAL_CRS).iloc[0]
# chi_box = box(*chi_bound_geo.bounds) # Using bounding box for faster computation
# is_valid = bike_rides['geometry'].to_crs(LOCAL_CRS).within(chi_box)

# # "Invalid" geos are all in Evanston essentially, which is fine for our purposes.
# fig = chi_boundary.boundary.plot()
# bike_rides[~is_valid]['geometry'].drop_duplicates().plot(ax=fig, color='red')

In [12]:
# In the current dataset all ids are whole numbers, so the decimal part can be
# truncated before converting the ids to strings.
station_id_as_int = bike_stations['station_id'].astype(int)
assert (station_id_as_int == bike_stations['station_id']).all()
bike_stations['station_id'] = station_id_as_int.astype(str)

In [13]:
# Diagnostic: do any (station_id, vintage) pairs appear in both tables?
tmp = bike_stations.merge(
    bike_rides,
    how='inner',
    on=['station_id','vintage'],
)
print(f"No common stations? {tmp.empty}")

No common stations? True


In [14]:
# Diagnostic: match the two tables on identical geometries instead of ids.
tmp = bike_stations.merge(
    bike_rides,
    how='inner',
    on=['geometry'],
)
print(f"No common points? {tmp.empty}. {len(tmp)} / {len(bike_rides)} points in common.")

No common points? False. 359714 / 1086142 points in common.


In [15]:
# Columns present in bike_stations but absent from bike_rides.
print("Columns to gain by merging: ", set(bike_stations.columns).difference(bike_rides.columns))

Columns to gain by merging:  {'name'}


In [16]:
# Apply to_ymd to each date so the dtype matches the other transit tables (strings).
bike_ride_dates = bike_rides['date']
bike_rides['date'] = bike_ride_dates.apply(to_ymd)

XXX:

For now, since all data vintages currently pulled are denormalized and already
have valid geometries, on which we've already computed our spatial features,
there's actually nothing to merge via bike_stations.

If you return to this, note that within bike_stations the station_id <-> geometry is not 1:1,
even though according to the documentation each station has multiple bike docks.
Some of the points are thousands of feet away per station_id.
Therefore be careful about how you construct the merge.

# Bus Rides + Stops + Routes

Dropping stops with unknown tracts (ie outside chicago)

In [17]:
# Keep only stops where both the route and the tract are known.
has_route_and_tract = bus_stops[['route','tract']].notna().all(axis=1)
bus_stops = bus_stops[has_route_and_tract]

Note: since bus ridership is reported at route granularity, not stop or point granularity,
we need to aggregate the stop-level features up to the route level.

This is tricky since routes span miles across the city. 

## Numeric

My first thought is to 
compute the proportion of stops that exhibit a certain feature, ie proportion
of stops near the United Center. 


In [18]:
feature_cols = ['uc_400','uc_800','uc_1600','mp_400','mp_800','mp_1600','airport']
# Per route: the share of its stops that exhibit each feature.
stops_by_route = bus_stops.groupby('route')
bus_stops_features = stops_by_route[feature_cols].mean()

This turns out poorly scaled -- most routes
have zero stops near the United Center, while those that do tend to have a very
small proportion.

In [19]:
print('Proportion of routes with ANY POI stops')
# True where a route has at least one stop with the feature.
route_has_feature = bus_stops_features > 0
print(route_has_feature.mean().round(2).to_dict())

Proportion of routes with ANY POI stops
{'uc_400': 0.03, 'uc_800': 0.06, 'uc_1600': 0.1, 'mp_400': 0.01, 'mp_800': 0.04, 'mp_1600': 0.1, 'airport': 0.08}


In [20]:
print("Distribution of mean POI stops per route, given route serves ANY POI's:")
# Restrict to routes with a non-zero proportion for at least one feature.
serves_any_poi = (bus_stops_features > 0).any(axis=1)
bus_stops_features.loc[serves_any_poi].describe()

Distribution of mean POI stops per route, given route serves ANY POI's:


Unnamed: 0,uc_400,uc_800,uc_1600,mp_400,mp_800,mp_1600,airport
count,39.0,39.0,39.0,39.0,39.0,39.0,39.0
mean,0.005838,0.024061,0.086059,0.001506,0.015386,0.050852,0.005408
std,0.018715,0.055232,0.150561,0.00697,0.045943,0.093818,0.010453
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.135339,0.0,0.0,0.093375,0.00692
max,0.086614,0.222222,0.666667,0.039604,0.234375,0.46875,0.038462


## Binary

Instead, we will treat each feature as a binary flag: whether or not the route serves the POI
anywhere along the route, without taking into account the length of the route
or the ridership density along it.

In [21]:
# A route has a feature when any of its stops has it (stored as 0.0 / 1.0).
route_feature_flags = bus_stops.groupby('route')[feature_cols].any()
bus_stops_features = route_feature_flags.astype(float)
# bus_stops_features is indexed by route, so join on its index.
bus_routes = bus_routes.merge(
    bus_stops_features,
    left_on='route',
    right_index=True,
    how='left',
)

In [22]:
# Each route is located at the centroid of its stops' geometries.
bus_centroids = bus_stops.groupby('route')['geometry'].agg(
    lambda stop_pts: MultiPoint(list(stop_pts)).centroid
)
bus_routes['centroid'] = gpd.GeoSeries(
    bus_routes['route'].map(bus_centroids),
    crs=bus_stops.crs,
)

In [23]:
# Attach route attributes to every ridership row. The right join keeps rides
# for routes missing from bus_routes; validate rejects duplicated routes.
bus_rides = bus_routes.merge(
    bus_rides,
    how='right',
    on='route',
    validate='1:m',
)

# Uber Rides + Tracts

XXX:

Community areas are fairly large, so I'll use tracts instead and drop all rides
whose tract is missing (i.e., rides that are anonymized even at the tract level).


In [24]:
# Keep only rides with a known tract.
uber_rides = uber_rides[uber_rides['tract'].notna()]

Note that the Uber dataset contains rides that start OR end in chicago, but 
may include rides that start XOR end outside of chicago. We haven't computed
features on those, but for simplicity, we'll set them to false, which is ok 
for our definitions. Technically, OHare is inside of chicago even if it's 
surrounded by non-chicago tracts.

In [25]:
# Cast the tract id to int64 ahead of the merge with tract_points on geoid10.
uber_rides = uber_rides.astype({'tract': 'int64'})

In [26]:
# Attach tract features and centroid to every ride row. The right join keeps
# rides whose tract has no match in tract_points.
tract_features = tract_points[feature_cols+['centroid','geoid10']]
uber_rides = tract_features \
                .merge(uber_rides,
                       right_on='tract', left_on='geoid10',
                       how='right', validate="1:m") \
                .drop(columns=['geoid10'])

# No features were computed for unmatched tracts, so the join leaves them NaN.
# Set them to false as intended, then restore each feature column's original
# dtype (the NaNs introduced by the join may have changed it).
uber_rides[feature_cols] = (
    uber_rides[feature_cols]
    .fillna(False)
    .astype(tract_features[feature_cols].dtypes.to_dict())
)

# Panels

## Point Panel

In [27]:
def combine_panels(bus=None, train=None, bike=None, uber=None):
    """Stack per-transit ridership tables into one long panel.

    Parameters
    ----------
    bus, train, bike, uber : DataFrame or GeoDataFrame, optional
        Ridership tables. Each must contain an ``id`` column. Tables that are
        ``None`` or empty are skipped.

    Returns
    -------
    DataFrame
        Only the columns common to all supplied tables, plus ``transit`` (the
        source name) and ``tid`` (``"<transit>_<id>"``). If ``geometry`` is a
        common column it is replaced by ``lat`` / ``long`` of its centroid.

    Raises
    ------
    ValueError
        If every table is ``None`` or empty.
    """
    labelled = [
        (df, name)
        for df, name in zip([bus, train, bike, uber], ['bus', 'train', 'bike', 'uber'])
        if df is not None and not df.empty
    ]
    if not labelled:
        raise ValueError("combine_panels requires at least one non-empty panel")
    dfs, keys = zip(*labelled)

    # Prepend the categorical transit type and only take common columns.
    # note: to use keys and still ignore the index, we have to just reset it later.
    panel = pd.concat(dfs, ignore_index=False, join='inner',
                      keys=keys, names=['transit'])
    panel = panel.reset_index(level='transit').reset_index(drop=True)
    panel['tid'] = panel['transit'] + "_" + panel['id'].astype(str)

    # Convert geo into xy features.
    # Must project to compute the centroid; unprojecting afterwards because the
    # WGS scale is nicer for regression. Computed once and reused for both axes.
    if 'geometry' in panel.columns:
        centroids = gpd.GeoSeries(panel.geometry).to_crs(LOCAL_CRS).centroid.to_crs(WORLD_CRS)
        panel['lat'] = centroids.y
        panel['long'] = centroids.x
        panel = panel.drop(columns=['geometry'])
    return panel

All things equal, we should take the bus and uber components with a grain of salt.
The uber and bus components have systematically larger measurement error in lat/lon since they
are snapped to tract / route centroid. This breaks an OLS assumption. 

In [28]:
# Give every table an `id` column (and Uber a `geometry` column) before stacking.
train_points = train_rides.assign(id=train_rides['station_id'].astype(str))
bike_points = bike_rides.rename(columns={"station_id":"id"})
uber_points = uber_rides.rename(columns={'centroid':'geometry'}) \
                        .assign(id=uber_rides['tract'].astype(str))

point_panel = combine_panels(train=train_points, bike=bike_points, uber=uber_points)

In [29]:
# For line-level units the centroid column stands in for the geometry, and the
# route / line name serves as the id.
bus_lines = bus_rides.drop(columns=['geometry']) \
                     .rename(columns={'centroid':'geometry', 'route':'id'})
train_lines = train_line_rides.rename(columns={'centroid':'geometry', 'line':'id'})

line_panel = combine_panels(bus=bus_lines, train=train_lines)

## Tract Panel

In [30]:
def tract_point_statistics(tract_df, point_df, transit: str):
    """Add per-tract access and count statistics for one transit type.

    Parameters
    ----------
    tract_df : GeoDataFrame
        One row per tract, identified by ``geoid10``.
    point_df : GeoDataFrame
        Transit points with a ``tract`` column and a geometry.
    transit : str
        Prefix for the new columns.

    Returns
    -------
    GeoDataFrame
        ``tract_df`` with two added columns: ``{transit}_access``, the distance
        (measured in LOCAL_CRS) from the tract centroid to the nearest point,
        and ``{transit}_count``, the number of rows of ``point_df`` whose
        ``tract`` equals the tract's ``geoid10`` (0 when there are none).
    """
    tract_centroids = tract_df.set_geometry(tract_df.to_crs(LOCAL_CRS).centroid)

    # Note sjoin_nearest is m:m if points are equidistant ==> we further consolidate
    access = tract_centroids.sjoin_nearest(point_df.to_crs(LOCAL_CRS),
                                           how='left',
                                           distance_col=f'{transit}_access') \
                        .groupby('geoid10',as_index=False)[f'{transit}_access'].min()

    # Count matched points only. The left join keeps one all-NaN row for a
    # tract without points; counting rows (size) would report 1 for it, while
    # counting non-null `tract` values correctly reports 0.
    contained = tract_df[['geoid10']].merge(point_df[['tract']],
                                            left_on='geoid10', right_on='tract',
                                            how='left', validate="1:m") \
                        .groupby('geoid10',as_index=False)['tract'].count() \
                        .rename(columns={'tract':f'{transit}_count'})

    tract_df = tract_df.merge(access).merge(contained)
    return tract_df

In [31]:
# train_stations was exploded to one row per (station, line) for the line
# aggregation above. Collapse it back to one row per station so that multi-line
# stations are not counted once per line in train_count.
tract_points = tract_points.pipe(tract_point_statistics,
                                 train_stations.drop_duplicates('map_id')[['tract','geometry']], 'train')
tract_points = tract_points.pipe(tract_point_statistics, bike_stations[['tract','geometry']], 'bike')
tract_points = tract_points.pipe(tract_point_statistics, bus_stops[['tract','geometry']], 'bus')

# These don't have a lot of meaning for uber, particularly because the accessibility == 0 always,
# but just throwing them in for good measure.
tract_points = tract_points.pipe(tract_point_statistics, 
                                 uber_rides[['tract','centroid']].drop_duplicates().set_geometry('centroid'), 'uber')

In [32]:
id_cols = ['tract','date','daytype','DNC']
# Bike and Uber tables split rides into starts and ends; train has a single total.
split_ride_cols = ['start_rides','end_rides','rides']

train_rides_by_tract = train_rides[[*id_cols, 'rides']].groupby(id_cols, as_index=False).sum()
bike_rides_by_tract = bike_rides[[*id_cols, *split_ride_cols]].groupby(id_cols, as_index=False).sum()
uber_rides_by_tract = uber_rides[[*id_cols, *split_ride_cols]].groupby(id_cols, as_index=False).sum()
# Bus rides are only known per route, so they are handled separately!

In [33]:
# Bus ridership is only known per route. We will disaggregate to tracts by
# assuming equal ridership per stop: ridership per tract ~ nbr stops in tract.
stops_per_route_tract = bus_stops.value_counts(['route','tract'])
stops_per_route = bus_stops.value_counts(['route'])
prop_in_route_tract = stops_per_route_tract / stops_per_route
prop_in_route_tract = prop_in_route_tract.rename('prop').reset_index()
# The proportions are floats, so compare each route's total to 1 with a
# tolerance instead of exact equality.
route_totals = prop_in_route_tract.groupby('route')['prop'].sum()
assert ((route_totals - 1).abs() < 1e-9).all()

# prop_in_route_tract has exactly one row per (route, tract), so join the rides
# to it directly. Joining through the per-stop table would repeat each
# (route, tract) once per stop and, after multiplying by `prop`, over-count the
# rides assigned to that tract by a factor of its stop count.
route_ride_cols = ['route'] + [col for col in id_cols if col != 'tract'] + ['rides']
bus_rides_by_tract = bus_rides[route_ride_cols].merge(prop_in_route_tract, on='route')
bus_rides_by_tract['rides'] = bus_rides_by_tract['rides'] * bus_rides_by_tract['prop']
bus_rides_by_tract = bus_rides_by_tract[id_cols + ['rides']].groupby(id_cols, as_index=False).sum()

In [34]:
# At tract level the tract itself is the unit id for every transit type.
tract_level_rides = {
    'bus': bus_rides_by_tract,
    'train': train_rides_by_tract,
    'bike': bike_rides_by_tract,
    'uber': uber_rides_by_tract,
}
tract_rides = combine_panels(**{
    transit_name: rides.rename(columns={"tract":"id"})
    for transit_name, rides in tract_level_rides.items()
})

In [35]:
tract_panel = tract_points.merge(tract_rides, left_on='geoid10', right_on='id')
# Project to LOCAL_CRS to take the centroid, then convert back to WORLD_CRS
# to read off lat / long.
tract_centroids_world = tract_panel['centroid'].to_crs(LOCAL_CRS).centroid.to_crs(WORLD_CRS)
tract_panel['lat'] = tract_centroids_world.y
tract_panel['long'] = tract_centroids_world.x
tract_panel = tract_panel.drop(columns=['centroid','geometry','geoid10'])

# Pipeline out

In [36]:
# Write each panel to its output path without the index.
for _panel, _out_path in (
    (point_panel, point_panel_out),
    (line_panel, line_panel_out),
    (tract_panel, tract_panel_out),
):
    _panel.to_parquet(_out_path, index=False)