In [1]:
import geopandas as gpd
import pandas as pd
from plotly import express as px

from data.datemath import date_range_daily

In [2]:
# Ridership tables, one per mode (train, bus, bike, uber).
# Note that the bike rides are read from data/interim rather than data/raw.
train_rides_in = "../data/raw/train_rides.csv"
bus_rides_in = "../data/raw/bus_rides.csv"
bike_rides_in = "../data/interim/bike_rides_v2.geoparquet"
uber_rides_in = "../data/raw/uber_tract_rides.parquet"

# Geographic unit files (stations, routes, tracts) that the ridership tables
# are checked against below.
train_file_in = "../data/raw/train_stations.geojson"
bus_routes_file_in = "../data/raw/bus_routes.geojson"
bike_stations_file_in = "../data/raw/bike_stations.geojson"
tract_file_in = "../data/raw/tracts.geojson"

In [3]:
# Load the four ridership tables. Train and bus are CSVs, uber is a plain
# parquet, and bike is a geoparquet, so it is read through geopandas.
train_rides = pd.read_csv(train_rides_in)
bus_rides = pd.read_csv(bus_rides_in)
bike_rides = gpd.read_parquet(bike_rides_in)
uber_rides = pd.read_parquet(uber_rides_in)

In [4]:
# Load the geographic unit files (GeoJSON) that ridership is validated against.
train_stations = gpd.read_file(train_file_in)
bus_routes = gpd.read_file(bus_routes_file_in)
tract_points = gpd.read_file(tract_file_in)
bike_stations = gpd.read_file(bike_stations_file_in)

# Cardinality

In [5]:
# Count distinct station names vs. distinct geometries in the bike rides.
# If the two counts differ, names and locations are not one-to-one.
bike_rides[['station_name','geometry']].nunique()

station_name    1654
geometry        1651
dtype: int64

# Ride Units Not in Station Units

### Train

In [6]:
# Every station id that appears in train ridership must exist in the stations
# file. On failure, list the offending ids rather than raising a bare
# AssertionError (the message is only built when the check fails).
assert train_rides.station_id.isin(train_stations.map_id).all(), (
    "train_rides.station_id values missing from train_stations.map_id: {}".format(
        train_rides.station_id[
            ~train_rides.station_id.isin(train_stations.map_id)
        ].unique()
    )
)

### Uber

In [7]:
# Flag uber rides whose tract id has no match in the tract file, then count
# how many distinct tracts that covers.
excess = ~uber_rides['tract'].isin(tract_points['geoid10'])
n_unmatched_tracts = uber_rides.loc[excess, 'tract'].nunique()
print("{} uber tracts aren't in chicago".format(n_unmatched_tracts))
# We already know the uber dataset contains some rides TO or FROM chicago.

31 uber tracts aren't in chicago


### Bus

In [8]:
# Flag bus rides whose route id has no match in the routes file and show the
# distinct offending route ids.
excess = ~bus_rides['route'].isin(bus_routes['route'])
unmatched_routes = set(bus_rides.loc[excess, 'route'])
print("These bus rides aren't in the routes list: ", unmatched_routes)

These bus rides aren't in the routes list:  {'1001'}


### Bike

We already know the bike trip data will not line up with the station file: the later ride vintages are denormalized, i.e. station locations are stored directly on the ride records.

# Units not in Rides

### Train

In [9]:
# Every station in the stations file must appear at least once in ridership.
# On failure, list the stations with no rides rather than raising a bare
# AssertionError (the message is only built when the check fails).
assert train_stations.map_id.isin(train_rides.station_id).all(), (
    "train_stations.map_id values with no rows in train_rides: {}".format(
        train_stations.map_id[
            ~train_stations.map_id.isin(train_rides.station_id)
        ].unique()
    )
)

### Uber

In [10]:
# Report how many tracts from the tract file appear in the uber data.
# Both sides of the ratio use distinct, non-null tract ids: previously the
# numerator counted tract *rows* while the denominator counted *distinct* ids,
# so any duplicated geoid10 row would have inflated the numerator.
tract_ids = tract_points.geoid10.dropna().drop_duplicates()
print("Uber data has {} / {} cook county tracts".format(
    tract_ids.isin(uber_rides.tract).sum(),
    len(tract_ids)
))

Uber data has 1264 / 1331 cook county tracts


### Bus

In [11]:
# True only if every route in the routes file shows up at least once in the
# bus ridership table.
all_routes_have_rides = bus_routes['route'].isin(bus_rides['route']).all()
print("Are all bus routes present in ridership? ", all_routes_have_rides)

Are all bus routes present in ridership?  False


In [12]:
# Routes listed in the routes file that never appear in ridership.
# We are missing one route. That's fine:
# we'll implicitly drop it when we merge on ridership.
# This is unfortunate because that route runs near McCormick Place.
# It only runs during Bears games though, which happened on Aug 1, 10, 17, 22.
has_no_rides = ~bus_routes['route'].isin(bus_rides['route'])
missing = bus_routes.loc[has_no_rides]
missing

Unnamed: 0,route,name,geometry
114,128,SOLDIER FIELD EXPRESS,"MULTILINESTRING ((-87.61699 41.86502, -87.6153..."


### Bike

In [13]:
# Pull the first 4-digit run out of each vintage label and report the largest.
# The comparison is on strings, which orders 4-digit years correctly.
vintage_years = bike_stations['vintage'].str.extract(r"(\d{4})", expand=False)
print("Latest stations vintage: ", vintage_years.max())

Latest stations vintage:  2017


In [14]:
# The ridership years we care about are all denormalized, i.e. station
# locations are already stored directly in the ridership tables.
# We also only want post-pandemic data at the most, so this station file is
# not needed and we never merge these stations onto ridership.
# For that reason the check below is intentionally left disabled (it also
# takes a few seconds to run):
# assert bike_stations.geometry.isin(bike_rides.geometry).all()

# Dates

## All dates

In [15]:
# Distinct days between 2024-07-01 and 2024-08-31; only the first field of
# each item yielded by date_range_daily is kept.
dates = {item[0] for item in date_range_daily("2024-07-01", "2024-08-31")}

In [16]:
# Every date in `dates` must appear at least once in each mode's ridership.
# One loop replaces four copy-pasted asserts, and a failure now names the mode
# and the dates that are absent instead of raising a bare AssertionError.
ride_tables = {
    "train": train_rides,
    "bus": bus_rides,
    "uber": uber_rides,
    "bike": bike_rides,
}
for mode, rides in ride_tables.items():
    # Hoisted so the date array is fetched once per table, not once per date.
    observed = rides['date'].values
    absent = [d for d in dates if d not in observed]
    assert not absent, "{} rides have no records on: {}".format(mode, absent)

## Units x Dates

### Train

In [17]:
# Every train station must have a ridership row on every date in `dates`.
# A failure names the station and its missing dates, so the offending unit can
# be found without re-running the check by hand.
for station_id, station_rides in train_rides.groupby('station_id'):
    # Hoisted so the date array is fetched once per station, not once per date.
    observed = station_rides['date'].values
    absent = [d for d in dates if d not in observed]
    assert not absent, "train station {} has no rides on: {}".format(
        station_id, absent
    )

In [18]:
# Share of all train stations that report ridership on each date.
n_train_stations = train_rides['station_id'].nunique()
plot_data = (
    train_rides.groupby('date')['station_id']
    .nunique()
    .div(n_train_stations)
    .rename('completeness')
)
px.line(plot_data.reset_index(), x='date', y='completeness')
# The Green Line Damen station comes online during the study period, so we
# will need to impute its prior ridership.

### Bus

In [19]:
# Fraction of the dates in `dates` on which each bus route has at least one
# ridership row. A set intersection replaces the per-date scan of each route's
# date array and matches how the bike section computes the same metric.
compliance = bus_rides.groupby('route')['date'].agg(lambda x: len(set(x) & dates))
compliance = compliance / len(dates)
compliance.describe()
# We have a 50% panel of ridership for bus rides

count    127.000000
mean       0.910465
std        0.149557
min        0.000000
25%        0.733871
50%        1.000000
75%        1.000000
max        1.000000
Name: date, dtype: float64

In [20]:
# Share of all bus routes that report ridership on each date.
n_bus_routes = bus_rides['route'].nunique()
plot_data = (
    bus_rides.groupby('date')['route']
    .nunique()
    .div(n_bus_routes)
    .rename('completeness')
)
px.line(plot_data.reset_index(), x='date', y='completeness')
# Three things mess with the panel:
#  - some routes operate every day and some only on weekdays
#  - a small number of routes are added/removed from service over time
#  - the panel never records 0 ridership days so those might be implied

### Uber

In [21]:
def _days_observed(unit_dates):
    """Count how many entries of `dates` occur among one tract's ride dates."""
    recorded = unit_dates.values
    return sum(day in recorded for day in dates)


# Fraction of the dates in `dates` on which each tract has at least one row.
compliance = uber_rides.groupby('tract')['date'].agg(_days_observed)
compliance = compliance / len(dates)
compliance.describe()
# We have a 50% panel of ridership for uber tracts


count    1295.000000
mean        0.798231
std         0.335229
min         0.000000
25%         0.725806
50%         1.000000
75%         1.000000
max         1.000000
Name: date, dtype: float64

In [22]:
# Share of all uber tracts that report ridership on each date.
n_uber_tracts = uber_rides['tract'].nunique()
plot_data = (
    uber_rides.groupby('date')['tract']
    .nunique()
    .div(n_uber_tracts)
    .rename('completeness')
)
px.line(plot_data.reset_index(), x='date', y='completeness')
# I think a few things mess with the panel:
#   - we know that ridership is much higher on weekends than weekdays
#   - people go to certain tracts dichotomously on weekends vs weekdays
#   - the panel doesn't display 0 ridership days so these might be implied
#   - the panel anonymizes low ridership days to community area level, so
#     insofar as we ignore those we are treating them as 0 ridership too.
# The mean completeness probably saturates over time due to market forces.

### Bike

In [23]:
# Fraction of the dates in `dates` on which each bike station has at least one
# ridership row.
days_per_station = bike_rides.groupby('station_name')['date'].agg(
    lambda station_dates: len(dates.intersection(station_dates))
)
compliance = days_per_station / len(dates)
compliance.describe()
# In the recorded run only the 75th percentile reaches full coverage (1.0) and
# the median station covers ~42% of days, so at most about a quarter of bike
# stations form a complete panel -- much sparser than bus or uber.

count    1654.000000
mean        0.499035
std         0.416789
min         0.000000
25%         0.064516
50%         0.419355
75%         1.000000
max         1.000000
Name: date, dtype: float64

In [24]:
# Share of all bike stations that report ridership on each date.
n_bike_stations = bike_rides['station_name'].nunique()
plot_data = (
    bike_rides.groupby('date')['station_name']
    .nunique()
    .div(n_bike_stations)
    .rename('completeness')
)
px.line(plot_data.reset_index(), x='date', y='completeness')