In [1]:
from datetime import datetime as dt
import geopandas as gpd
import os
import pandas as pd
from tqdm import tqdm

from data.constants import (DATA_FOLDER, L_RIDERSHIP_TABLE, BUS_RIDERSHIP_TABLE)
from data.cta import CTAClient
from data.divvy import DivvyClient
from data.uber import UberClient
from data.datemath import iso_to_ymd, is_iso

In [2]:
# Destination files for the four raw extracts; each one lives under <DATA_FOLDER>/raw.
train_rides_out, bus_rides_out, bike_rides_out, uber_rides_out = (
    os.path.join(DATA_FOLDER, "raw", file_name)
    for file_name in (
        "train_rides.csv",
        "bus_rides.csv",
        "bike_rides.geoparquet",
        "uber_rides.parquet",
    )
)

In [3]:
# API clients shared by the data pulls in the cells below.
# NOTE(review): 60 and 900 are passed positionally and the constructors are not shown in this
# notebook; they look like timeouts or rate limits — confirm meaning and units in data.cta / data.uber.
cta_client = CTAClient(60)
divvy_client = DivvyClient()
uber_client = UberClient(900)



# Pipeline In

(None)

# Define Data Scope

Some of these tables are rather large so we need to make good choices about
what to pull in. We should abstract any logic that we might need to re-do
if we want to pull in additional dates, and cache anything that takes a while to load.

Looking ahead, we use models with -1 week, -1 month, and -1 YTD, at daily granularity.

Therefore we will pull in data from JANUARY 1, 2024 through AUGUST 31, 2024.

Note: we want to finish the whole month of August to ensure we have a FULL 
week of data for the DNC. Otherwise we may mis-infer a weekly/monthly effect of DNC on ridership
when actually we just mechanically omitted some days!

In [4]:
# Study window as ISO-8601 strings: midnight on 1 Jan 2024 through the last second of 31 Aug 2024.
data_start_iso = dt(year=2024, month=1, day=1).isoformat()
data_end_iso = dt(year=2024, month=8, day=31, hour=23, minute=59, second=59).isoformat()

# Train Rides

In [5]:
# Pull the selected columns from the 'L' ridership table, restricted to rows whose
# `date` falls inside the study window defined above.
train_rides = cta_client.soda_get_all(
    L_RIDERSHIP_TABLE,
    select="station_id,date,daytype,rides",
    where=f"date between '{data_start_iso}' and '{data_end_iso}'",
)

# Bus Rides

In [6]:
# Pull the selected columns from the bus ridership table, restricted to rows whose
# `date` falls inside the study window defined above.
bus_rides = cta_client.soda_get_all(
    BUS_RIDERSHIP_TABLE,
    select="route,date,daytype,rides",
    where=f"date between '{data_start_iso}' and '{data_end_iso}'",
)

# Bike Rides

The Divvy ridership data are at ride granularity, so we need to aggregate them to the station level (one row per station and date).

In [7]:
def agg_ridership(trips: pd.DataFrame) -> pd.DataFrame:
    """
    Get ride counts by station and date.

    Each trip is counted once at its start station (on its start date) and once
    at its end station (on its end date).

    Parameters
    ----------
    trips : pd.DataFrame
        One row per trip. Must provide datetime columns `start_time` and
        `end_time`, a `vintage` column, and `start_`/`end_`-prefixed columns
        that become `station_id` (and `geometry`, when geometry columns are
        present) once the prefix is stripped. The frame is NOT modified.

    Returns
    -------
    pd.DataFrame
        One row per (station_id, date, vintage[, geometry]) with columns
        `start_rides`, `end_rides` (NaN where a key only appears on the other
        side) and `rides` (their sum, treating NaN as 0).
    """
    # Derive the calendar dates on a new frame: the original assigned these
    # columns onto `trips` itself, silently mutating the caller's DataFrame.
    trips = trips.assign(start_date=trips['start_time'].dt.date,
                         end_date=trips['end_time'].dt.date)

    id_cols = ['station_id', 'date', 'vintage']
    # Geometry is only part of the key when the input carries geometry columns.
    if any('geometry' in col for col in trips.columns):
        id_cols.append('geometry')

    def _count_side(prefix: str, count_col: str) -> pd.DataFrame:
        """Strip `prefix` from the column names and count trips per key."""
        # NOTE(review): groupby drops rows with a missing key by default, so trips
        # without a station_id are not counted — confirm that this is intended.
        return (trips.rename(columns=lambda col: col.replace(prefix, ''))
                     .groupby(id_cols, as_index=False).size()
                     .rename(columns={'size': count_col}))

    start_rides = _count_side('start_', 'start_rides')
    end_rides = _count_side('end_', 'end_rides')

    # Outer merge keeps keys that only ever appear as a start or as an end.
    rides = start_rides.merge(end_rides, how='outer', on=id_cols)
    rides['rides'] = rides['start_rides'].fillna(0) + rides['end_rides'].fillna(0)
    return rides

In [8]:
# Parse the study window once; the years drive the S3 pull and the dates drive the trim below.
scope_start = dt.fromisoformat(data_start_iso)
scope_end = dt.fromisoformat(data_end_iso)

# The client is asked for whole years, so batches outside the window are read too and
# have to be trimmed after aggregation.
trip_batches = divvy_client.s3_bike_trips(scope_start.year, scope_end.year)
bike_rides = pd.concat(
    list(tqdm(agg_ridership(batch) for batch in trip_batches)),
    ignore_index=True,
)

# Keep only station-days inside the (inclusive) study window.
in_scope = bike_rides['date'].between(scope_start.date(), scope_end.date())
bike_rides = bike_rides.loc[in_scope]

0it [00:00, ?it/s]

DEBUG: populating bucket paths.
DEBUG: reading  s3://divvy-tripdata/202401-divvy-tripdata.zip


1it [00:26, 26.30s/it]

DEBUG: reading  s3://divvy-tripdata/202402-divvy-tripdata.zip


2it [00:29, 12.46s/it]

DEBUG: reading  s3://divvy-tripdata/202403-divvy-tripdata.zip


3it [00:32,  8.54s/it]

DEBUG: reading  s3://divvy-tripdata/202404-divvy-tripdata.zip


4it [00:38,  7.34s/it]

DEBUG: reading  s3://divvy-tripdata/202405-divvy-tripdata.zip


5it [00:46,  7.58s/it]

DEBUG: reading  s3://divvy-tripdata/202406-divvy-tripdata.zip


6it [00:56,  8.31s/it]

DEBUG: reading  s3://divvy-tripdata/202407-divvy-tripdata.zip


7it [01:05,  8.72s/it]

DEBUG: reading  s3://divvy-tripdata/202408-divvy-tripdata.zip


8it [01:14,  8.86s/it]

DEBUG: reading  s3://divvy-tripdata/202409-divvy-tripdata.zip


9it [01:24,  9.42s/it]


In [9]:
# Wrap the aggregated counts as a GeoDataFrame, using the existing 'geometry' column as the active geometry.
# NOTE(review): no `crs` is passed here, so the frame may end up without a CRS unless the column
# already carries one — confirm before relying on the geoparquet output downstream.
bike_rides = gpd.GeoDataFrame(bike_rides, geometry='geometry')

In [10]:
# Note: according to https://data.cityofchicago.org/Transportation/Divvy-Bicycle-Stations/bbyy-e7gq/about_data
# each station contains multiple bike docks. Nevertheless, in this data there are
# up to thousands of unique Points per station_id. When we aggregate station_id -> MultiPoint
# and compute the area and perimeter of the minimum bounding circle, convex hull, etc.,
# the footprints are typically small but can be rather large (tens of thousands of feet, millions of ft^2).
# Therefore I think we should never merge on station_id, and should rely only on X,Y when given.

# Analysis commented out ...
# from shapely import from_wkt
# from data.constants import LOCAL_CRS, WORLD_CRS
# from shapely.geometry import MultiPoint
# from shapely import minimum_bounding_circle, minimum_rotated_rectangle

# bike_rides['geometry'] = bike_rides['geometry'].apply(from_wkt)
# bike_rides = gpd.GeoDataFrame(bike_rides, geometry='geometry', crs=WORLD_CRS).to_crs(LOCAL_CRS)

# station_geos = bike_rides.groupby('station_id')['geometry'].agg(['nunique',lambda x: MultiPoint(list(set(x)))])

# station_geos.columns = ['n','points']
# station_geos['circle'] = station_geos['points'].apply(minimum_bounding_circle)
# station_geos['rect'] = station_geos['points'].apply(minimum_rotated_rectangle)
# station_geos['hull'] = station_geos['points'].apply(lambda x: x.convex_hull)

# print(station_geos['n'].describe())

# station_geos[['circle','rect','hull']].map(lambda x: x.area).agg('mean',axis=1).hist()
# station_geos[['circle','rect','hull']].map(lambda x: x.length).agg('mean',axis=1).hist()

# Uber Rides

In [11]:
# Daily ride counts per pickup census tract: rows are grouped by the truncated
# trip-start date and the pickup tract, and trip ids are counted within each group.
# NOTE(review): where_end is passed through iso_to_ymd, which presumably drops the 23:59:59
# time component of data_end_iso — confirm that soda_get_uber treats the end date as
# inclusive, otherwise trips on the final day of the window could be left out.
uber_pickups = uber_client.soda_get_uber(select="""
                                    date_trunc_ymd(trip_start_timestamp) as start_date, 
                                    pickup_census_tract,
                                    count(trip_id) as rides
                                    """,
                                    where_start=iso_to_ymd(data_start_iso), 
                                    where_end=iso_to_ymd(data_end_iso), 
                                    group="start_date, pickup_census_tract",
                                    pickup=True)
# Same query shape for drop-offs: grouped by the truncated trip-end date and the dropoff tract.
uber_dropoffs = uber_client.soda_get_uber(select="""
                                    date_trunc_ymd(trip_end_timestamp) as end_date, 
                                    dropoff_census_tract,
                                    count(trip_id) as rides
                                    """,
                                    where_start=iso_to_ymd(data_start_iso), 
                                    where_end=iso_to_ymd(data_end_iso), 
                                    group="end_date, dropoff_census_tract",
                                    pickup=False)

In [12]:
# Bring both pulls onto a shared (date, tract) schema and coerce tract ids to numbers;
# values that cannot be parsed become NaN.
uber_pickups = (
    uber_pickups
    .rename(columns={'start_date':'date', 'pickup_census_tract':'tract', 'rides':'start_rides'})
    .assign(tract=lambda frame: pd.to_numeric(frame['tract'], errors='coerce'))
)
uber_dropoffs = (
    uber_dropoffs
    .rename(columns={'end_date':'date', 'dropoff_census_tract':'tract', 'rides':'end_rides'})
    .assign(tract=lambda frame: pd.to_numeric(frame['tract'], errors='coerce'))
)

# Outer merge on the shared columns keeps tract-days that only have pickups or only drop-offs.
uber_rides = pd.merge(uber_pickups, uber_dropoffs, how='outer')

# Total activity per tract-day, treating a missing side as zero.
pickup_counts = uber_rides['start_rides'].fillna(0)
dropoff_counts = uber_rides['end_rides'].fillna(0)
uber_rides['rides'] = pickup_counts + dropoff_counts

# Normalise ISO values in the date column via iso_to_ymd; anything else is left untouched.
uber_rides['date'] = uber_rides['date'].apply(
    lambda value: iso_to_ymd(value) if is_iso(value) else value
)

# Pipeline out

In [13]:
# Make sure every destination folder exists before writing: on a fresh checkout the
# output directory may not have been created yet, and the writers below would fail.
for out_path in (train_rides_out, bus_rides_out, bike_rides_out, uber_rides_out):
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)

# Persist the raw extracts: CSV for the CTA pulls, (geo)parquet for bike and Uber rides.
train_rides.to_csv(train_rides_out, index=False)
bus_rides.to_csv(bus_rides_out, index=False)
bike_rides.to_parquet(bike_rides_out, index=False)
uber_rides.to_parquet(uber_rides_out, index=False)