In [None]:
import geopandas as gpd
import pandas as pd
import os
import numpy as np

import plotly.express as px

from data.constants import DATA_FOLDER, DNC_START
from data.datemath import to_ymd, from_ymd
from datetime import datetime as dt
from functools import partial
from statsmodels.tsa.ar_model import AutoReg
from power import power_uncond, power_reg  # XXX: I think i broke this when i changed delta -> effect size

In [2]:
# Input locations of the final point- and tract-level panels.
final_dir = os.path.join(DATA_FOLDER, "final")
point_panel_in = os.path.join(final_dir, "point_panel.parquet")
tract_panel_in = os.path.join(final_dir, "tract_panel.parquet")

In [3]:
# Read both panels from the parquet files named in the previous cell.
point_panel, tract_panel = (
    pd.read_parquet(panel_path) for panel_path in (point_panel_in, tract_panel_in)
)

In [4]:
# Label of the DNC start at each aggregation level; these are later compared
# against the `date` key of the aggregated series (weekly labels are "YYYY-WW").
dnc_isoc = from_ymd(DNC_START).isocalendar()
dnc_week = "-".join([str(dnc_isoc.year), str(dnc_isoc.week).zfill(2)])
dnc_month = DNC_START[:7]
is_dnc = dict(yearly="2024", monthly=dnc_month, weekly=dnc_week, daily=DNC_START)

In [None]:
DNC_ATTEND = 5e4
# Two trips (one roundtrip) per attendee per day.
daily_delta = DNC_ATTEND * 2
# (days in event + travel days) * roundtrips per day * attendees
weekly_delta = daily_delta * (4 + 2)
# Extra rides attributed to the DNC within one period of each aggregation level:
# any period of a week or longer contains the whole event, a day only one day of it.
cum_delta = {
    freq: (daily_delta if freq == "daily" else weekly_delta)
    for freq in ("yearly", "monthly", "weekly", "daily")
}

# Total Rides Stats

**RQ**

Is DNC attendance enough to notice?

In [7]:
# Derive the string calendar keys used by the aggregations below from the
# "YYYY-MM-DD" `date` column.
assert point_panel.date.str.match(r"\d{4}-\d{2}-\d{2}").all()
date_parts = point_panel['date'].str.extract(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})")
point_panel['year'] = date_parts['year']
point_panel['monthofyear'] = date_parts['month']
# The third component of the date is the day of the MONTH, so store it under
# an accurate name ...
point_panel['dayofmonth'] = date_parts['day']
# ... and keep the historical, misnamed `dayofweek` column as an alias so that
# anything still reading it sees the same values as before.
# NOTE(review): it has never held a weekday; drop it once nothing reads it.
point_panel['dayofweek'] = date_parts['day']
point_panel['year-month'] = date_parts['year'] + "-" + date_parts['month']

# ISO calendar keys; the ISO year can differ from the calendar year around
# New Year, which is why `year-week` is built from the ISO year.
iso_calendar = pd.to_datetime(point_panel['date']).dt.isocalendar()


def datepad(x, n):
    """Return ``x`` converted to strings and left-padded with zeros to width ``n``."""
    return x.astype(str).str.pad(n, 'left', '0')


point_panel['weekofyear'] = iso_calendar['week']
point_panel['year-week'] = iso_calendar['year'].pipe(datepad, 4) + "-" + iso_calendar['week'].pipe(datepad, 2)

In [None]:
# Total rides per transit mode at each aggregation level; `DNC` is True for a
# period as soon as any of its rows is flagged.
yearly_rides, monthly_rides, weekly_rides = (
    point_panel.groupby([period_key, 'transit']).agg({'rides': 'sum', 'DNC': 'any'})
    for period_key in ('year', 'year-month', 'year-week')
)
# The daily series additionally carries the day-of-week and weekend columns.
daily_rides = point_panel.groupby(['date', 'transit']).agg(
    {'rides': 'sum', 'dotw': 'first', 'is_weekend': 'first', 'DNC': 'any'}
)

In [9]:
# Stack the four aggregation levels into one long frame keyed by
# (freq, date, transit), with the DNC flag stored as 0.0 / 1.0.
freqs = ['yearly','monthly','weekly','daily']
total_rides_ts = (
    pd.concat(
        dict(zip(freqs, [yearly_rides, monthly_rides, weekly_rides, daily_rides])),
        names=['freq'],
    )
    .rename_axis(index=['freq','date','transit'])
    .reset_index()
    .assign(DNC=lambda ts: ts['DNC'].astype(float))
)

## Effect vs Y

In [10]:
# Plotting copy of the totals, sorted by transit mode and then by aggregation
# level through ordered categoricals.
plot_data = total_rides_ts.assign(
    torder=lambda ts: pd.Categorical(ts['transit'], categories=['bus','uber','train','bike'], ordered=True),
    forder=lambda ts: pd.Categorical(ts['freq'], categories=freqs, ordered=True),
).sort_values(['torder','forder'])

In [11]:
# Flag the period of each series whose label equals the DNC label for its
# aggregation level, and give only that row a non-zero error-bar height equal
# to the assumed extra rides.
plot_data['is_dnc'] = plot_data['date'].eq(plot_data['freq'].map(is_dnc))
plot_data['err'] = plot_data['is_dnc'].mul(plot_data['freq'].map(cum_delta))

In [12]:
# Compare attendance to distribution of ridership
fig = px.box(plot_data, x='rides', color='transit', facet_row='freq')
# Dashed line on every facet: raw attendance. Uses the DNC_ATTEND constant
# rather than repeating its value, so the line follows the assumption.
fig.add_vline(x=DNC_ATTEND, line_dash="dash", line_color="gray")
# Dotted line per facet: assumed extra rides at that aggregation level.
# Row numbers follow the original hard-coded mapping (yearly -> 4 ... daily -> 1),
# i.e. the first freq sits in the highest-numbered row.
for position, freq in enumerate(freqs):
    fig.add_vline(x=cum_delta[freq], row=len(freqs) - position, line_dash="dot", line_color="gray")
fig = fig.update_yaxes(matches=None)
fig = fig.update_xaxes(matches=None)
fig.show()
# This is an OK representation. It's not exactly apt because we actually want
# a DELTA of 50k. But we can see that the scale of 50k is so small that if
# we shift one datapoint by this amount, it will probably not significantly
# affect the mean.

## Effect vs TS

In [13]:
# Hide yearly since there is no variation in years.
# `zero` is a constant column used as the downward error so that the DNC bar
# only extends upward from the observed value.
this_plot_data = plot_data.loc[plot_data['freq'].ne('yearly')].assign(zero=0)
fig = px.line(
    this_plot_data,
    x='date',
    y='rides',
    color='transit',
    facet_col='freq',
    error_y='err',
    error_y_minus='zero',
)
fig = fig.update_yaxes(matches=None).update_xaxes(matches=None)
fig.update_layout(xaxis2={'type': 'category'})  # disable aggregation
fig.show()
# This better shows the marginal effect. Unfortunately updating the error bar visibilities
# is too complex to be worthwhile. You can see that the bars are noticeable but
# pretty small compared to the overall variation.

In [14]:
# Plot the daily graph a little bigger to double-check too
this_plot_data = plot_data.loc[plot_data['freq'].eq('daily')]
fig = px.line(this_plot_data, x='date', y='rides', color='transit').update_yaxes(matches=None)
fig.update_layout(xaxis={'type': 'category'})  # disable aggregation
fig.show()
# I omit the error bars here but note they would be a 50% change in daily
# bike rides (way out of sample) vs a 12.5-25% change in other transits.
# Controlling for day-of-the-week variation, that is a practically significant effect size!
# BUT only on the one day. Given the amount of data and variation, it has barely
# any effect on the mean.

# This is the first time I noticed that Uber rides are exactly out-of-phase with train rides.
# People take way more ubers on weekends than weekdays!

## Power Analysis on Y

According to a Choose Chicago [report](https://cdn.choosechicago.com/uploads/2024/10/TE-DNC-Impact.pdf), the DNC attracted 50,000 visitors.

Given our sample size of ?? time periods and stations, desired power, and desired alpha,
what effect sizes can we detect?

Following [Bloom (1995)](https://journals.sagepub.com/doi/epdf/10.1177/0193841X9501900504):

the minimum detectable effect (MDE) is computed by adding the one-sided critical
z-score needed to reject the null hypothesis at level $\alpha$ to the z-score needed to
reach the desired power $1-\beta$, i.e. $MDE = (z_{1-\alpha} + z_{1-\beta})\,\sigma_c$. For $\alpha=.05$ and power $1-\beta=.8$ we have $MDE = 2.49\sigma_c$
whereas for $\alpha=.1$ and power $1-\beta=.8$ we have $MDE = 2.12\sigma_c$ where $\sigma_c$
is the standard error of the estimate.

One can easily extend this analysis to two-sided cases. Bloom uses one-sided for its
simplicity in testing whether the *intended* effect happened or not. One-sided
tests have greater statistical power (produce smaller MDE's) than two-sided.
But in practice researchers do use the two-sided test because they can't theoretically rule out
the possibility of observing an unintended effect.

In [None]:
# Effect passed to power_uncond: the weekly DNC ride total spread over every
# row of the stacked totals frame.
# NOTE(review): the power_uncond import is marked as possibly broken since the
# delta -> effect size change; confirm this is still the quantity it expects.
mean_shift = weekly_delta / len(total_rides_ts)
power_stats = power_uncond(total_rides_ts, ['freq','transit'], 'rides', mean_shift)

In [16]:
# Achieved power per transit mode, one panel per aggregation level.
px.bar(
    power_stats.reset_index(),
    x='transit',
    y='power_achieved',
    facet_col='freq',
)
# This validates that we are under-powered to detect a change in the unconditional mean ridership.

In [17]:
# Another visualization of H0 (no DNC effect) vs H1 (DNC effect)
from power_plot import plot_power
# Use a dedicated name here: assigning this subset to `plot_data` would replace
# the rides frame that the plotting cells above read, so re-running any of them
# after this cell would fail.
power_plot_data = power_stats[power_stats['freq'] != 'yearly']
# One label per row, e.g. "weekly bus".
labels = power_plot_data['freq'].str.cat(power_plot_data['transit'], sep=' ').rename('variable')
plot_power(power_plot_data[['mean', 'effect_size', 'std', 'mde']], labels)

## Power Analysis on TS

In [None]:
def compute_ar_power(model_data, model_groups, endog, exog=None, lags=1):
    """Fit one AutoReg model per group and tabulate the ``power_reg`` results.

    Each group of ``model_data`` is indexed by a ``DatetimeIndex`` built from
    its ``date`` column, an ``AutoReg(..., trend='c')`` model is fitted to it,
    and the fitted model is passed to ``power_reg`` together with the effect
    ``cum_delta[freq] / len(group)``.

    Parameters
    ----------
    model_data : pandas.DataFrame
        Long frame holding ``date``, the ``endog`` column, the grouping columns
        and, when given, the ``exog`` column. Groups that are neither
        "monthly" nor "weekly" must also provide ``is_weekend``.
    model_groups : list of str
        Grouping columns. The first one must hold the frequency label: it is
        used to look up ``cum_delta`` and to decide how ``date`` is parsed
        ("monthly" -> "YYYY-MM", "weekly" -> ISO "YYYY-WW", anything else is
        handed to ``DatetimeIndex`` as is).
    endog : str
        Name of the dependent-variable column.
    exog : str, optional
        Name of an additional regressor column. It is combined with the
        per-frequency controls (holiday-week flag for weekly data, weekend
        dummy otherwise) and also forwarded to ``power_reg``.
    lags : int, default 1
        ``lags`` argument of ``AutoReg``.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the group keys as columns named after
        ``model_groups`` followed by the fields returned by ``power_reg``
        (assumed dict-like; see ``power_reg``).

    Notes
    -----
    Reads the notebook-level ``cum_delta`` mapping.
    """
    model_power = {}
    for g, gd in model_data.groupby(model_groups):
        # groupby yields slices of model_data; copy before adding columns so the
        # caller's frame is never touched and pandas raises no copy warnings.
        gd = gd.copy()
        freq = g[0]
        group_exog = []
        delta = cum_delta[freq] / len(gd)
        if freq == "monthly":
            dates = pd.to_datetime(gd['date'] + '-01')
        elif freq == "weekly":
            iso_parts = gd['date'].str.split('-', expand=True)
            # Represent every ISO week by its Monday.
            dates = [
                dt.fromisocalendar(int(iso_year), int(iso_week), 1)
                for iso_year, iso_week in zip(iso_parts[0], iso_parts[1])
            ]
            # Flag ISO weeks 1, 47, 51 and 52 (presumably the New Year,
            # Thanksgiving and Christmas weeks -- TODO confirm).
            week_number = gd['date'].str.extract(r"-(\d{2})", expand=False).astype(int)
            gd['is_holiday'] = week_number.isin([1, 47, 51, 52]).astype(float)
            group_exog.append('is_holiday')
        else:
            dates = gd['date']
            # drop_first: the model already has a constant (trend='c'), so keeping
            # a dummy for every level would be perfectly collinear with it.
            dummies = pd.get_dummies(gd['is_weekend'], 'weekend_dummy', drop_first=True) * 1.0
            gd = pd.concat([gd, dummies], axis=1)
            group_exog.extend(gd.filter(like='weekend_dummy_').columns.values)
        gd = gd.set_index(pd.DatetimeIndex(dates, freq='infer')).sort_index()
        # Parenthesised on purpose: without the parentheses the conditional
        # expression swallows `+ group_exog`, so the holiday/weekend controls
        # were silently dropped whenever `exog` was supplied.
        model_exog = ([exog] if exog is not None else []) + group_exog
        model = AutoReg(endog=gd[endog],
                        exog=gd[model_exog] if model_exog else None,
                        lags=lags,
                        trend='c').fit()
        model_power[g] = power_reg(model, delta, exog)

    model_power = pd.DataFrame.from_dict(model_power, orient='index')
    model_power.index = model_power.index.set_names(model_groups)
    model_power = model_power.reset_index()
    return model_power

In [19]:
# AR(1) power for every non-yearly (freq, transit) series, no extra regressor.
model_data = total_rides_ts.loc[total_rides_ts['freq'].ne('yearly')]
model_power = compute_ar_power(
    model_data,
    model_groups=['freq','transit'],
    endog='rides',
    lags=1,
)

In [20]:
# Achieved power of the AR model per transit mode, one panel per aggregation level.
px.bar(
    model_power.reset_index(),
    x='transit',
    y='power_achieved',
    facet_col='freq',
)
# We are still under-powered even after controlling for some variance via AR(p)

In [21]:
# Same AR(1) power computation, now with the DNC indicator as a regressor.
model_data = total_rides_ts.loc[total_rides_ts['freq'].ne('yearly')]
model_power = compute_ar_power(
    model_data,
    model_groups=['freq','transit'],
    endog='rides',
    exog='DNC',
    lags=1,
)

In [22]:
# Achieved power of the AR model with the DNC regressor, per transit mode and
# aggregation level.
px.bar(
    model_power.reset_index(),
    x='transit',
    y='power_achieved',
    facet_col='freq',
)
# We are still under-powered even after controlling for some variance via AR(p)

But I don't really think this is the right test. First of all, we're comparing
a year's worth of data to the DNC, so any change due to the DNC is attenuated by the 365 other days.

# Tid Rides Stats

We don't see a very obvious jump in the total time series during the DNC week.
We also don't have power to test for it using any modeling I know.

Before we check the panel models, the last thing we can do is check if we see any obvious
signal in the disaggregated time series. Particularly just the POI's.

In [46]:
# Keep the rows flagged by at least one of the point-of-interest indicator columns.
poi_flags = ['uc_1600','mp_1600','airport']
poi_panel = point_panel.loc[point_panel[poi_flags].any(axis=1)]

In [47]:
# Report how many distinct point units (tid) remain after the POI filter.
n_poi_units = poi_panel.tid.nunique()
n_all_units = point_panel.tid.nunique()
print("Taking {} of {} ({:.2%}) of point units".format(
    n_poi_units, n_all_units, n_poi_units / n_all_units))

Taking 219 of 3223 (6.79%) of point units


In [None]:
# Rides per transit mode and POI id at each aggregation level; `DNC` is True
# for a period as soon as any of its rows is flagged.
yearly_rides, monthly_rides, weekly_rides = (
    poi_panel.groupby([period_key, 'transit', 'id']).agg({'rides': 'sum', 'DNC': 'any'})
    for period_key in ('year', 'year-month', 'year-week')
)
# The daily series additionally carries the day-of-week and weekend columns.
daily_rides = poi_panel.groupby(['date', 'transit', 'id']).agg(
    {'rides': 'sum', 'dotw': 'first', 'is_weekend': 'first', 'DNC': 'any'}
)

In [49]:
# Stack the four POI aggregation levels into one long frame keyed by
# (freq, date, transit, id), with the DNC flag stored as 0.0 / 1.0.
freqs = ['yearly','monthly','weekly','daily']
poi_rides_ts = (
    pd.concat(
        dict(zip(freqs, [yearly_rides, monthly_rides, weekly_rides, daily_rides])),
        names=['freq'],
    )
    .rename_axis(index=['freq','date','transit','id'])
    .reset_index()
    .assign(DNC=lambda ts: ts['DNC'].astype(float))
)


## Effect vs Y

In [50]:
# Plotting copy of the POI series with sort helpers: ordered categoricals for
# transit mode and aggregation level, and a rank of each id by its summed rides.
plot_data = poi_rides_ts.assign(
    torder=lambda ts: pd.Categorical(ts['transit'], categories=['bus','uber','train','bike'], ordered=True),
    forder=lambda ts: pd.Categorical(ts['freq'], categories=freqs, ordered=True),
    iorder=lambda ts: ts.groupby('id')['rides'].transform('sum').rank(),
)

In [None]:
# Assumed extra rides per POI: the DNC ride total for the aggregation level,
# split evenly over the `n` distinct POI ids of that transit mode.
# The divisor must be `n`; dividing by len(x) would use the number of
# (transit, freq) rows in this small summary table, which is unrelated to how
# many POIs share the effect.
# NOTE(review): every transit mode is given the full DNC total here, so the
# per-POI figure is an upper bound for each mode -- confirm this is intended.
poi_delta = (
    plot_data.groupby(['transit','freq'])['id'].nunique().rename('n')
    .reset_index()
    .assign(poi_delta=lambda x: x['freq'].map(cum_delta) / x['n'])
    .set_index(['transit','freq'])['poi_delta']
)

In [52]:
# Attach the per-POI delta to every row by (transit, freq), then order the rows
# by date, transit mode, aggregation level and id rank for plotting.
plot_data = plot_data.merge(poi_delta, how='inner', left_on=['transit','freq'], right_index=True)
plot_data = plot_data.sort_values(by=['date','torder','forder','iorder'])

In [53]:
# Compare attendance to distribution of ridership
this_plot_data = plot_data[plot_data['freq'] != 'yearly']
# Pin the facet order instead of relying on the order in which values happen to
# appear in the data; only levels that are actually present get a facet.
present_freqs = set(this_plot_data['freq'].unique())
present_transits = set(this_plot_data['transit'].unique())
shown_freqs = [f for f in this_plot_data['forder'].cat.categories if f in present_freqs]
shown_transits = [t for t in this_plot_data['torder'].cat.categories if t in present_transits]
fig = px.box(this_plot_data, x='rides', color='id', facet_row='freq', facet_col='transit',
             category_orders={'freq': shown_freqs, 'transit': shown_transits})
for t,f,d in this_plot_data[['transit','freq','poi_delta']].drop_duplicates().itertuples(index=False):
    # Subplot coordinates are 1-based, and Plotly Express counts facet rows from
    # the bottom (the first facet_row value is drawn in the highest-numbered row).
    # Raw category positions are 0-based and top-down, which puts the lines on
    # the wrong panels.
    row = len(shown_freqs) - shown_freqs.index(f)
    col = shown_transits.index(t) + 1
    fig.add_vline(x=d, row=row, col=col, line_dash="dot", line_color="gray")
fig = fig.update_yaxes(matches=None)
fig = fig.update_xaxes(matches=None)
fig = fig.update_layout(showlegend=False)
fig.show()

# If we assume that the DNC ridership effect is evenly distributed across all
# POI transits in the city, we see that in most cases that would be a huge
# increase in ridership. This is because first, we are only considering 6% of
# all transit options. Second, the distribution of ridership is very concentrated
# in a few transit ids, so the median ridership is actually quite low.

# We also see that errors are heteroskedastic by freq, transit, and tid

## Effect vs TS

In [73]:
# Monthly rides per POI id, one panel per transit mode.
this_plot_data = (
    plot_data.loc[plot_data['freq'].eq('monthly')]
    .sort_values(['date','transit','id'])
)
fig = px.line(this_plot_data, x='date', y='rides',
              color='id', facet_row='transit')
fig = fig.update_yaxes(matches=None).update_layout(showlegend=False)
fig.show()
# Looks like no significant changes at the monthly level.

In [78]:
# Weekly rides per POI id from the week containing 2024-05-01 onward.
# Weekly `date` labels are ISO "YYYY-WW" strings, so the cutoff has to be an ISO
# week label too: compared as text against "2024-05-01", "05" is read as week 5
# and everything from ISO week 6 (early February) onward would be kept.
cutoff_isoc = dt.strptime("2024-05-01", "%Y-%m-%d").isocalendar()
cutoff_week = "{}-{:02d}".format(cutoff_isoc[0], cutoff_isoc[1])
this_plot_data = plot_data[(plot_data['freq']=='weekly')&(plot_data['date']>=cutoff_week)]
this_plot_data = this_plot_data.sort_values(['date','transit','id'])
fig = px.line(this_plot_data, x='date', y='rides', color='id', facet_row='transit')
# Mark the DNC week on each of the four transit panels.
for row in [1,2,3,4]:
    fig.add_vline(x=dnc_week, row=row, line_dash="dot", line_color="gray")
fig = fig.update_yaxes(matches=None)
fig = fig.update_layout(showlegend=False)
fig.update_layout(xaxis=dict(type='category')) # disable aggregation
fig.show()
# Don't really see any particular spikes here either.

In [None]:
# Daily rides per POI id from 2024-05-01 onward, one panel per transit mode.
in_window = plot_data['freq'].eq('daily') & plot_data['date'].ge("2024-05-01")
this_plot_data = plot_data.loc[in_window].sort_values(['date','transit','id'])
fig = px.line(this_plot_data, x='date', y='rides', color='id', facet_row='transit')
# Mark the first DNC day on each of the four transit panels.
for row in range(1, 5):
    fig.add_vline(x=DNC_START, row=row, line_dash="dot", line_color="gray")
fig = fig.update_yaxes(matches=None).update_layout(showlegend=False)
fig.update_layout(xaxis={'type': 'category'})  # disable aggregation
fig.show()
# Maybe see a spike in one tid.

## Power Analysis on Y

In [None]:
# Select the data explicitly instead of reusing `this_plot_data`: that name is
# overwritten by every plotting cell, so when the notebook runs top to bottom
# it holds only the daily series since May 2024 at this point, while the
# discussion below is about the monthly and weekly aggregations as well.
power_data = plot_data[plot_data['freq'] != 'yearly']
# Effect per (freq, transit, id) series: the sum of `poi_delta` over its rows.
# NOTE(review): confirm that a per-series sum (rather than the per-period
# value) is the effect size power_uncond expects.
poi_effect = power_data.groupby(['freq','transit','id'])['poi_delta'].sum()
power_stats = power_uncond(power_data,
                           ['freq','transit','id'],
                           'rides',
                           poi_effect)

In [65]:
# Distribution of achieved power across POI ids, per transit mode and aggregation level.
px.box(
    power_stats.reset_index(),
    x='transit',
    y='power_achieved',
    facet_col='freq',
)


This is somewhat encouraging. I expected better cases, but it looks like the expected
spike in attendance would be noticeable on most of the transit modes at the monthly
and weekly aggregations.