In [1]:
from datetime import datetime as dt
import geopandas as gpd
import os
import pandas as pd
from tqdm import tqdm

from data.constants import (DATA_FOLDER, DNC_START_ISO, DNC_END_ISO,
                            L_RIDERSHIP_TABLE, BUS_RIDERSHIP_TABLE)
from data.cta import CTAClient
from data.divvy import DivvyClient
from data.uber import UberClient
from data.datemath import iso_to_ymd

In [2]:
# Input GeoJSON layers, all read from the "interim" stage of DATA_FOLDER.
# The names on the left line up one-to-one with the file names below.
(train_file_in,
 bus_routes_file_in,
 bus_stops_file_in,
 bike_stations_file_in,
 tract_file_in,
 comm_file_in) = (
    os.path.join(DATA_FOLDER, "interim", geojson_name)
    for geojson_name in (
        "train_stations.geojson",
        "bus_routes.geojson",
        "bus_stops.geojson",
        "bike_stations.geojson",
        "tracts.geojson",
        "communities.geojson",
    )
)

In [3]:
# Destination CSVs for the ridership tables, as (stage folder, file name) pairs.
# NOTE(review): bus and bike rides land in "raw" while train and uber rides
# land in "interim" -- confirm this split is intentional before changing it,
# since other pipeline steps may read from these exact paths.
train_rides_out, bus_rides_out, bike_rides_out, uber_rides_out = (
    os.path.join(DATA_FOLDER, stage, csv_name)
    for stage, csv_name in (
        ("interim", "train_rides.csv"),
        ("raw", "bus_rides.csv"),
        ("raw", "bike_rides.csv"),
        ("interim", "uber_rides.csv"),
    )
)

In [4]:
# API clients used by the download cells further down.
# NOTE(review): the integer arguments (60 and 600) are not explained in this
# notebook -- presumably a timeout or similar request setting; confirm against
# the CTAClient / UberClient definitions and consider naming them as constants.
cta_client = CTAClient(60)
divvy_client = DivvyClient()
uber_client = UberClient(600)



# Pipeline In

In [5]:
# Load every interim GeoJSON layer into a GeoDataFrame, in the same order as
# the target names on the left.
# NOTE(review): none of these frames are referenced again in the cells visible
# in this notebook -- confirm they are still needed here.
(train_stations,
 bus_routes,
 bus_stops,
 tract_points,
 comm_points,
 bike_stations) = map(gpd.read_file, (
    train_file_in,
    bus_routes_file_in,
    bus_stops_file_in,
    tract_file_in,
    comm_file_in,
    bike_stations_file_in,
))

# Define Data Scope

Some of these tables are rather large so we need to make good choices about
what to pull in. We should abstract any logic that we might need to re-do
if we want to pull in additional dates, and cache anything that takes a while to load.

Looking ahead, the models use lookback windows of one week, one month, and year-to-date (YTD), all at daily granularity.

In [6]:
# Inclusive date window for every ridership pull below: from 1 January 2024
# up to the project-wide DNC_END_ISO constant.
data_start_iso = dt(year=2024, month=1, day=1).isoformat()
data_end_iso = DNC_END_ISO

# Train Rides

In [7]:
# Ridership per train station, restricted to the data window.
# The `select` and `where` strings are passed to the client as written.
train_rides = cta_client.soda_get_all(
    L_RIDERSHIP_TABLE,
    select="station_id,date,daytype,rides",
    where=f"date between '{data_start_iso}' and '{data_end_iso}'",
)

# Bus Rides

In [8]:
# Ridership per bus route, restricted to the same data window as the trains.
# The `select` and `where` strings are passed to the client as written.
bus_rides = cta_client.soda_get_all(
    BUS_RIDERSHIP_TABLE,
    select="route,date,daytype,rides",
    where=f"date between '{data_start_iso}' and '{data_end_iso}'",
)

# Bike Rides

The Divvy ridership data is at ride granularity, so we need to aggregate it to the station level.

In [9]:
def agg_ridership(trips: pd.DataFrame) -> pd.DataFrame:
    """
    Get counts by station and date.

    Every trip is counted once at its start station on its start date and
    once at its end station on its end date.

    Parameters
    ----------
    trips : pd.DataFrame
        One row per trip. Must contain datetime columns ``start_time`` and
        ``end_time``, a ``vintage`` column, and ``start_station_id`` /
        ``end_station_id``. If any column name contains ``geometry``, both
        ``start_geometry`` and ``end_geometry`` are expected and geometry is
        added to the grouping keys. The frame is left unmodified.

    Returns
    -------
    pd.DataFrame
        One row per (station_id, date, vintage[, geometry]) with columns
        ``start_rides`` and ``end_rides`` (NaN where a key appears on only
        one side) and ``rides``, their sum with missing counts taken as 0.
    """
    def _strip_prefix(prefix):
        # Strip only a *leading* prefix. str.replace would also rewrite names
        # that merely contain the text (e.g. 'weekend_flag' -> 'weekflag').
        return lambda col: col[len(prefix):] if col.startswith(prefix) else col

    # assign() builds a new frame, so the caller's DataFrame does not pick up
    # the helper date columns as a side effect.
    dated = trips.assign(start_date=trips['start_time'].dt.date,
                         end_date=trips['end_time'].dt.date)

    id_cols = ['station_id', 'date', 'vintage']
    id_cols += ['geometry'] if any('geometry' in x for x in trips.columns) else []

    def _count_side(prefix, count_col):
        # Rename '<prefix>station_id' etc. to the shared key names, then
        # count trips per key.
        return dated.rename(columns=_strip_prefix(prefix)) \
                    .groupby(id_cols, as_index=False).size() \
                    .rename(columns={'size': count_col})

    start_rides = _count_side('start_', 'start_rides')
    end_rides = _count_side('end_', 'end_rides')

    # Outer join keeps stations/dates that only ever appear as a start or
    # only as an end; the join keys are spelled out rather than inferred.
    rides = start_rides.merge(end_rides, how='outer', on=id_cols)
    rides['rides'] = rides['start_rides'].fillna(0) + rides['end_rides'].fillna(0)
    return rides

In [10]:
# Pull the Divvy trip frames for the years spanned by the data window, reduce
# each frame to station/day counts as it arrives (tqdm reports progress per
# frame), then stack the results into a single table.
bike_rides = pd.concat(
    list(tqdm(map(
        agg_ridership,
        divvy_client.s3_bike_trips(dt.fromisoformat(data_start_iso).year,
                                   dt.fromisoformat(data_end_iso).year),
    ))),
    ignore_index=True,
)
# Only years are passed to the client, so trim the result back to the exact
# (inclusive) start and end dates.
bike_rides = bike_rides.loc[
    (bike_rides['date'] >= dt.fromisoformat(data_start_iso).date())
    & (bike_rides['date'] <= dt.fromisoformat(data_end_iso).date())
]

0it [00:00, ?it/s]

DEBUG: populating bucket paths.
DEBUG: reading  s3://divvy-tripdata/202401-divvy-tripdata.zip


1it [00:33, 33.25s/it]

DEBUG: reading  s3://divvy-tripdata/202402-divvy-tripdata.zip


2it [00:38, 17.03s/it]

DEBUG: reading  s3://divvy-tripdata/202403-divvy-tripdata.zip


3it [00:46, 12.69s/it]

DEBUG: reading  s3://divvy-tripdata/202404-divvy-tripdata.zip


4it [00:59, 12.75s/it]

DEBUG: reading  s3://divvy-tripdata/202405-divvy-tripdata.zip


5it [01:09, 11.98s/it]

DEBUG: reading  s3://divvy-tripdata/202406-divvy-tripdata.zip


6it [01:26, 13.51s/it]

DEBUG: reading  s3://divvy-tripdata/202407-divvy-tripdata.zip


7it [01:47, 16.01s/it]

DEBUG: reading  s3://divvy-tripdata/202408-divvy-tripdata.zip


8it [01:59, 14.87s/it]

DEBUG: reading  s3://divvy-tripdata/202409-divvy-tripdata.zip


9it [02:15, 15.10s/it]


In [None]:
# TODO: buffer-aggregate these bike stations.
# Each "station" may actually be a single dock rather than the whole rack (unverified).

# Uber Rides

In [11]:
# Two server-side aggregations over the same date window: trips are counted
# per (day, census tract), once keyed on the pickup side and once on the
# dropoff side. The query text inside the triple-quoted strings is sent as-is.
# NOTE(review): the `pickup` flag presumably tells the client which timestamp
# column the where_start/where_end bounds apply to -- confirm in UberClient.

# Pickups: one row per (start_date, start_tract) with the trip count in `rides`.
uber_pickups = uber_client.soda_get_uber(select="""
                                         date_trunc_ymd(trip_start_timestamp) as start_date, 
                                         pickup_census_tract as start_tract,
                                         count(trip_id) as rides
                                         """,
                                         where_start=iso_to_ymd(data_start_iso), 
                                         where_end=iso_to_ymd(data_end_iso), 
                                         group="start_date, start_tract",
                                         pickup=True)
# Dropoffs: one row per (end_date, end_tract) with the trip count in `rides`.
uber_dropoffs = uber_client.soda_get_uber(select="""
                                         date_trunc_ymd(trip_end_timestamp) as end_date, 
                                         dropoff_census_tract as end_tract,
                                         count(trip_id) as rides
                                         """,
                                         where_start=iso_to_ymd(data_start_iso), 
                                         where_end=iso_to_ymd(data_end_iso), 
                                         group="end_date, end_tract",
                                         pickup=False)

In [12]:
# Put pickups and dropoffs on shared key names (date, tract) so they can be
# joined, keeping the two counts apart as start_rides / end_rides.
uber_pickups = uber_pickups.rename(columns={
    'start_date': 'date',
    'start_tract': 'tract',
    'rides': 'start_rides',
})
uber_dropoffs = uber_dropoffs.rename(columns={
    'end_date': 'date',
    'end_tract': 'tract',
    'rides': 'end_rides',
})

# Outer join on the shared columns keeps tract/days that only have pickups or
# only have dropoffs; a missing side counts as zero in the total.
uber_rides = uber_pickups.merge(uber_dropoffs, how='outer')
uber_rides['rides'] = (
    uber_rides['start_rides'].fillna(0)
    + uber_rides['end_rides'].fillna(0)
)

# Pipeline out

In [15]:
# Write each ridership table to its CSV destination, without the index column.
for rides_table, csv_destination in (
    (train_rides, train_rides_out),
    (bus_rides, bus_rides_out),
    (bike_rides, bike_rides_out),
    (uber_rides, uber_rides_out),
):
    rides_table.to_csv(csv_destination, index=False)