In [393]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import plotly.express as px
from statsmodels.iolib.summary2 import summary_col
from tabulate import  tabulate
from stats.reg import catvar

In [394]:
# --- Inputs: parquet panels read by the loading cell ---
tract_panel_in = "../../../data/final/tract_panel.parquet"
point_panel_in = "../../../data/final/point_panel.parquet"

# --- Outputs: report artifacts written by the table / chart cells ---
pt_img_out = "../../../reports/replication/parallel_trends_attend.jpeg"
did_table_out = "../../../reports/replication/did-attend.md"
fe_table_out = "../../../reports/replication/fe-attend.md"

# Directory the artifacts are additionally copied into (its uploads/ and img/ subfolders).
cv_dir = "../../../../eric-mc2-cv/static"

In [395]:
# Read both analysis panels from their configured parquet paths.
point_panel, tract_panel = (
    pd.read_parquet(path) for path in (point_panel_in, tract_panel_in)
)

## Preprocessing

In [396]:
def get_model_data(panel, transit):
    """Return the rows of ``panel`` for one transit mode, dropping inactive ids.

    Parameters
    ----------
    panel : pandas.DataFrame
        Must contain the columns ``transit``, ``id`` and ``rides``.
    transit : str
        Value of the ``transit`` column to keep (e.g. ``"train"``).

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (original index preserved),
        without any ``id`` whose rides sum to zero over the selected rows.
    """
    # Plain comparison instead of an f-string query: a value containing a
    # quote character can no longer break (or alter) the filter expression.
    model_data = panel.loc[panel['transit'] == transit]
    # Row-aligned total rides per id; ids that never record a ride are dropped.
    total_rides = model_data.groupby('id')['rides'].transform('sum')
    # .copy() so callers can add columns without touching a slice of ``panel``.
    return model_data.loc[total_rides != 0].copy()

## Fixed Effects

In [443]:
def model_fe(df):
    """Fit the fixed-effects OLS of log rides on DNC and log attendance.

    The specification includes categorical day-of-week (``C(dotw)``),
    ``monthofyear`` and per-``id`` (``C(id)``) terms. Returns the fitted
    statsmodels results object.
    """
    spec = """np.log1p(rides) ~ 
                DNC + np.log(attendance) + 
                C(dotw) + monthofyear + C(id)"""
    return sm.OLS.from_formula(spec, df).fit()

In [444]:
# Fixed-effects fit for train rides, using point_panel.
# Call train_fe_model.summary() to inspect the full coefficient table.
model_data = get_model_data(panel=point_panel, transit="train")
train_fe_model = model_fe(df=model_data)

In [445]:
# Fixed-effects fit for uber rides, using tract_panel.
# Call uber_fe_model.summary() to inspect the full coefficient table.
model_data = get_model_data(panel=tract_panel, transit="uber")
uber_fe_model = model_fe(df=model_data)

In [446]:
# Fixed-effects fit for bike rides, using point_panel.
# Call bike_fe_model.summary() to inspect the full coefficient table.
model_data = get_model_data(panel=point_panel, transit="bike")
bike_fe_model = model_fe(df=model_data)

In [447]:

regressors = ['Intercept','DNC','np.log(attendance)']
    # XXX: mention fixed effects but dont show in table. not interesting. 
    # [f'C(dotw)[T.{i:02}]' for i in range(1,7)] + \
    # [f'monthofyear[T.{i:02}]' for i in range(1,13)]

headers = ["FE (Uber)", "FE (Train)", "FE (Bike)"]
fe_summary = summary_col([uber_fe_model, train_fe_model, bike_fe_model], 
            model_names=headers, 
            info_dict={"N":lambda x:(x.nobs)},
            stars=True,
            regressor_order=regressors,
            drop_omitted=True)

fe_summary.tables[0].index = (fe_summary.tables[0].index
                              .str.replace('np.log(attendance)','log(attendance)')
                              .str.replace('C(dotw)','dotw')
                              .str.replace('DNC','During DNC'))

fe_summary = tabulate(fe_summary.tables[0],
                   headers=[""] + headers,
                   showindex=True,
                   tablefmt='github')
with open(fe_table_out,"w") as f:
    f.write(fe_summary)
!cp $fe_table_out "$cv_dir/uploads"
print(fe_summary)

|                 | FE (Uber)   | FE (Train)   | FE (Bike)   |
|-----------------|-------------|--------------|-------------|
| Intercept       | 4.7565***   | 5.2723***    | 1.4198***   |
|                 | (0.1679)    | (0.3000)     | (0.2086)    |
| During DNC      | -0.0718**   | -0.0476      | 0.0658      |
|                 | (0.0321)    | (0.0547)     | (0.0406)    |
| log(attendance) | 0.1405***   | 0.1853***    | 0.1700***   |
|                 | (0.0146)    | (0.0276)     | (0.0193)    |
| R-squared       | 0.9179      | 0.8956       | 0.8096      |
| R-squared Adj.  | 0.9168      | 0.8930       | 0.8064      |
| N               | 9261.0      | 1473.0       | 8087.0      |


## Difference-in-Differences

In [467]:
def model_did(df):
    """Fit the difference-in-differences OLS with cluster-robust errors.

    The outcome is ``np.log1p(rides)``. ``UCMP * DNC`` expands to both main
    effects plus the ``UCMP:DNC`` interaction (the DiD term). Controls are
    log attendance, day-of-week, month, a quadratic surface in ``lat`` /
    ``long`` and, when both columns exist, ``train_contained`` and
    ``bike_contained``.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel rows for one transit mode.

    Returns
    -------
    Fitted statsmodels OLS results with ``cov_type='cluster'``.
    """
    # Cluster on stadium-by-date: using only stadium is rank deficient.
    clusters = df['stadium'] + df['date']

    terms = ["UCMP * DNC", "np.log(attendance)", "C(dotw)", "monthofyear"]
    # Add the containment controls only when the panel provides both of them.
    # Building the formula from a list avoids the dangling "+ +" an empty
    # placeholder used to leave in the formula text.
    contained = ["train_contained", "bike_contained"]
    if all(col in df.columns for col in contained):
        terms += contained
    terms += ["lat", "long", "I(lat * long)", "I(lat**2)", "I(long**2)"]

    formula = "np.log1p(rides) ~ " + " + ".join(terms)
    did_model = sm.OLS.from_formula(formula, df).fit(
        cov_type='cluster', cov_kwds={'groups': clusters})
    return did_model

In [468]:
# Difference-in-differences fit for bike rides, using point_panel.
model_data = get_model_data(panel=point_panel, transit="bike")
bike_did_model = model_did(df=model_data)
bike_did_model.summary()

0,1,2,3
Dep. Variable:,np.log1p(rides),R-squared:,0.552
Model:,OLS,Adj. R-squared:,0.551
Method:,Least Squares,F-statistic:,252.1
Date:,"Fri, 10 Jan 2025",Prob (F-statistic):,3.3200000000000004e-201
Time:,23:20:07,Log-Likelihood:,-9854.2
No. Observations:,8087,AIC:,19750.0
Df Residuals:,8064,BIC:,19920.0
Df Model:,22,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,4.2424,0.543,7.814,0.000,3.178,5.306
C(dotw)[T.1],0.0764,0.071,1.069,0.285,-0.064,0.216
C(dotw)[T.2],0.0617,0.075,0.828,0.408,-0.084,0.208
C(dotw)[T.3],0.0025,0.070,0.035,0.972,-0.135,0.140
C(dotw)[T.4],0.0697,0.066,1.063,0.288,-0.059,0.198
C(dotw)[T.5],0.1387,0.078,1.789,0.074,-0.013,0.291
C(dotw)[T.6],-0.0076,0.073,-0.104,0.917,-0.151,0.136
monthofyear[T.02],0.6255,0.098,6.388,0.000,0.434,0.817
monthofyear[T.03],0.7615,0.097,7.851,0.000,0.571,0.952

0,1,2,3
Omnibus:,435.803,Durbin-Watson:,1.469
Prob(Omnibus):,0.0,Jarque-Bera (JB):,609.199
Skew:,-0.496,Prob(JB):,5.18e-133
Kurtosis:,3.907,Cond. No.,307.0


In [469]:
# Difference-in-differences fit for train rides, using point_panel.
model_data = get_model_data(panel=point_panel, transit="train")
train_did_model = model_did(df=model_data)
train_did_model.summary()

0,1,2,3
Dep. Variable:,np.log1p(rides),R-squared:,0.502
Model:,OLS,Adj. R-squared:,0.494
Method:,Least Squares,F-statistic:,274.5
Date:,"Fri, 10 Jan 2025",Prob (F-statistic):,5.46e-230
Time:,23:20:07,Log-Likelihood:,-1452.9
No. Observations:,1473,AIC:,2956.0
Df Residuals:,1448,BIC:,3088.0
Df Model:,24,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.8555,0.611,4.677,0.000,1.659,4.052
C(dotw)[T.1],0.1703,0.046,3.723,0.000,0.081,0.260
C(dotw)[T.2],0.2222,0.045,4.926,0.000,0.134,0.311
C(dotw)[T.3],0.1652,0.057,2.915,0.004,0.054,0.276
C(dotw)[T.4],0.1040,0.043,2.429,0.015,0.020,0.188
C(dotw)[T.5],-0.0746,0.062,-1.208,0.227,-0.196,0.046
C(dotw)[T.6],-0.3864,0.057,-6.735,0.000,-0.499,-0.274
monthofyear[T.02],0.3341,0.070,4.748,0.000,0.196,0.472
monthofyear[T.03],0.3354,0.070,4.764,0.000,0.197,0.473

0,1,2,3
Omnibus:,119.71,Durbin-Watson:,2.136
Prob(Omnibus):,0.0,Jarque-Bera (JB):,235.077
Skew:,-0.535,Prob(JB):,8.99e-52
Kurtosis:,4.638,Cond. No.,357.0


In [470]:
# Difference-in-differences fit for uber rides, using tract_panel.
model_data = get_model_data(panel=tract_panel, transit="uber")
uber_did_model = model_did(df=model_data)
uber_did_model.summary()

0,1,2,3
Dep. Variable:,np.log1p(rides),R-squared:,0.594
Model:,OLS,Adj. R-squared:,0.593
Method:,Least Squares,F-statistic:,1455.0
Date:,"Fri, 10 Jan 2025",Prob (F-statistic):,0.0
Time:,23:20:08,Log-Likelihood:,-12852.0
No. Observations:,9261,AIC:,25760.0
Df Residuals:,9233,BIC:,25960.0
Df Model:,27,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,5.8664,0.349,16.830,0.000,5.183,6.550
C(dotw)[T.1],0.1116,0.052,2.150,0.032,0.010,0.213
C(dotw)[T.2],0.2281,0.048,4.764,0.000,0.134,0.322
C(dotw)[T.3],0.4124,0.051,8.121,0.000,0.313,0.512
C(dotw)[T.4],0.6756,0.045,14.961,0.000,0.587,0.764
C(dotw)[T.5],0.9068,0.050,18.152,0.000,0.809,1.005
C(dotw)[T.6],0.3839,0.048,8.020,0.000,0.290,0.478
monthofyear[T.02],0.1554,0.096,1.620,0.105,-0.033,0.343
monthofyear[T.03],0.1820,0.083,2.196,0.028,0.020,0.345

0,1,2,3
Omnibus:,455.547,Durbin-Watson:,1.527
Prob(Omnibus):,0.0,Jarque-Bera (JB):,885.533
Skew:,-0.363,Prob(JB):,5.12e-193
Kurtosis:,4.33,Cond. No.,306.0


In [472]:
headers = ["DiD (Uber)", "DiD (Train)", "DiD (Bike)"]
did_summary = summary_col([uber_did_model, train_did_model, bike_did_model], 
            model_names=headers, 
            info_dict={"N":lambda x:(x.nobs)},
            stars=True,
            regressor_order=['Intercept', 'UCMP','DNC','UCMP:DNC','np.log(attendance)','train_contained','bike_contained'], 
            drop_omitted=True)

did_summary.tables[0].index = (did_summary.tables[0].index
                               .str.replace('DNC','During DNC')
                               .str.replace('UCMP','Near DNC')
                               .str.replace('np.log(attendance)','log(attendance)'))

did_summary = tabulate(did_summary.tables[0],
                   headers=[""] + headers,
                   showindex=True,
                   tablefmt='github')
with open(did_table_out,"w") as f:
    f.write(did_summary)
!cp $did_table_out "$cv_dir/uploads"
print(did_summary)

|                     | DiD (Uber)   | DiD (Train)   | DiD (Bike)   |
|---------------------|--------------|---------------|--------------|
| Intercept           | 5.8664***    | 2.8555***     | 4.2424***    |
|                     | (0.3486)     | (0.6106)      | (0.5429)     |
| Near DNC            | -0.2478***   | -0.6215***    | -0.6044***   |
|                     | (0.0567)     | (0.0797)      | (0.0691)     |
| During DNC          | -0.2359***   | -0.2107***    | -0.1444**    |
|                     | (0.0478)     | (0.0409)      | (0.0644)     |
| Near DNC:During DNC | 0.5758***    | 0.3751***     | 0.3544***    |
|                     | (0.0772)     | (0.1344)      | (0.0915)     |
| log(attendance)     | -0.0048      | 0.5028***     | -0.1037**    |
|                     | (0.0330)     | (0.0606)      | (0.0498)     |
| train_contained     | -0.3028***   |               |              |
|                     | (0.0278)     |               |              |
| bike_contained    

### Parallel Trends

This doesn't look great. Neither group shows a trend, but the spikes in the
treatment and control series are uncorrelated. Especially in summer, there are
clearly factors affecting the control group that do not affect the treatment
group. This will bias the estimated treatment effect.

In [408]:
# Daily ride totals per (date, UCMP group, transit mode), restricted to dates
# before 2024-08-19.
daily_rides = (
    pd.concat([
        get_model_data(tract_panel, "uber"),
        get_model_data(point_panel, "train"),
        get_model_data(point_panel, "bike"),
    ])
    .loc[lambda d: d.date < "2024-08-19"]
    .assign(UCMP=lambda d: d['UCMP'].map({0: False, 1: True}))
    .groupby(['date', 'UCMP', 'transit'])['rides']
    .sum()
    .reset_index()
)

# Mean daily total of each (UCMP, transit) group, used as the normaliser.
means = (
    daily_rides
    .groupby(['UCMP', 'transit'])['rides']
    .mean()
    .rename('mean')
    .reset_index()
)

# 'demean' is the ratio of rides to the group mean (1.0 == an average day).
plot_data = (
    daily_rides
    .merge(means, on=['UCMP', 'transit'])
    .assign(demean=lambda d: d['rides'] / d['mean'])
)

In [409]:
chart = px.line(plot_data, x='date', y='demean', 
        line_dash='UCMP', facet_row='transit',
        color_discrete_sequence=['black'],
        title= 'Rides (normalized to group mean)',
        labels={'date':'', 'demean':'',
                'UCMP':'Near DNC'},
        template='simple_white')
chart.write_image(pt_img_out)
!cp $pt_img_out "$cv_dir/img"
chart


# Parallel Trends Test

In [492]:
def model_pt(df, window_size=4, lookback=31, cutoff="2024-08-19"):
    """Fit the placebo regression behind the parallel-trends test.

    Rows dated before ``cutoff`` are binned into pre-period windows counted
    back from the last observed date, and ``UCMP`` is interacted with the
    window label ``pre``. Callers jointly test the ``UCMP:pre[...]`` terms.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel rows for one transit mode.
    window_size : int, default 4
        Window length, measured in positions among the sorted distinct dates
        present in the data (not calendar days).
    lookback : int, default 31
        ``lookback // window_size`` gives ``n_windows``; the ``n_windows + 1``
        most recent windows get their own label and everything older is
        pooled into label ``"0"``.
    cutoff : str, default "2024-08-19"
        Only rows with ``date < cutoff`` are used.

    Returns
    -------
    Fitted statsmodels OLS results, cluster-robust on stadium-by-date.

    Raises
    ------
    ValueError
        If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    df = df.loc[df.date < cutoff].sort_values('date')
    # Position of each row's date among the sorted distinct dates.
    time = pd.Series(pd.Categorical(df['date']).codes, index=df.index)

    # Compute windows for placebo dates.
    n_windows = lookback // window_size
    windows = (time.max() - time) // window_size  # count backwards
    pre = n_windows + 1 - windows.clip(lower=0, upper=n_windows + 1)  # count forwards
    # Zero-pad so labels still sort numerically once there are ten or more
    # windows; with the defaults the width is 1 and labels are unchanged.
    width = len(str(n_windows + 1))
    df = df.assign(
        time=time,
        clusters=df['stadium'] + df['date'],
        pre=pre.astype(str).str.zfill(width),
    )
    # Keep only windows observed in both UCMP groups; some don't have data.
    df = df.loc[df.groupby('pre')['UCMP'].transform('nunique') == 2]

    terms = ["UCMP * pre", "np.log(attendance)"]
    # Containment controls only when the panel provides both; building the
    # formula from a list avoids a dangling "+ +" when they are absent.
    contained = ["train_contained", "bike_contained"]
    if all(col in df.columns for col in contained):
        terms += contained
    terms += ["C(dotw)", "monthofyear",
              "lat", "long", "I(lat * long)", "I(lat**2)", "I(long**2)"]

    formula = "np.log1p(rides) ~ " + " + ".join(terms)
    model = sm.OLS.from_formula(formula, df).fit(
        cov_type='cluster', cov_kwds={'groups': df['clusters']})
    return model

In [493]:
# Placebo (pre-period) regressions, one per transit mode.
model_data = get_model_data(panel=point_panel, transit="train")
train_pt_model = model_pt(df=model_data)

model_data = get_model_data(panel=point_panel, transit="bike")
bike_pt_model = model_pt(df=model_data)

model_data = get_model_data(panel=tract_panel, transit="uber")
uber_pt_model = model_pt(df=model_data)

In [494]:
# Joint F-test per model that every UCMP:pre interaction coefficient is zero.
# Printed in the order uber, train, bike.
for pt_model in (uber_pt_model, train_pt_model, bike_pt_model):
    constraints = [name for name in pt_model.model.exog_names if 'UCMP:pre' in name]
    hypothesis = ','.join(f"({c} = 0)" for c in constraints)
    print(pt_model.f_test(hypothesis))
    print()

<F test: F=9.413087449110602, p=1.1868428401329914e-10, df_denom=330, df_num=7>
<F test: F=8.470124591895102, p=1.989671471978291e-05, df_denom=314, df_num=3>
<F test: F=8.463298774758721, p=1.565532709974813e-09, df_denom=330, df_num=7>


We reject the null hypothesis (H0) that the `UCMP:pre` interaction
coefficients are jointly zero. In other words, there is evidence that the
treatment and control groups differ significantly during the pre-periods.

In [495]:
# Interaction terms of all three models; a model lacks a window's term when
# that window was dropped from its data.
regs = [
    name
    for pt_model_ in (bike_pt_model, train_pt_model, uber_pt_model)
    for name in pt_model_.model.exog_names
    if 'UCMP:pre' in name
]
headers = ["PT (Uber)", "PT (Train)", "PT (Bike)"]
pt_table = summary_col(
    [uber_pt_model, train_pt_model, bike_pt_model],
    model_names=headers,
    # nobs is a float; format it as an integer so N is not printed with ".0".
    info_dict={"N": lambda x: f"{int(x.nobs)}"},
    stars=True,
    regressor_order=sorted(set(regs)),
    drop_omitted=True)

pt_summary = tabulate(pt_table.tables[0],
                      headers=[""] + headers,
                      showindex=True,
                      tablefmt='github')
# Not persisted: no pt_table_out path is configured. To save it, define
# pt_table_out alongside the other output paths and enable the lines below.
# with open(pt_table_out,"w") as f:
#     f.write(pt_summary)
# !cp $pt_table_out "$cv_dir/uploads"
print(pt_summary)

|                | PT (Uber)   | PT (Train)   | PT (Bike)   |
|----------------|-------------|--------------|-------------|
| UCMP:pre[T.1]  | 0.0267      | 0.4683***    | 0.0253      |
|                | (0.1049)    | (0.1341)     | (0.1191)    |
| UCMP:pre[T.2]  | -0.2457***  | 0.3963***    | 0.2285*     |
|                | (0.0861)    | (0.1078)     | (0.1213)    |
| UCMP:pre[T.3]  | -0.0503     | 0.6690***    | -0.1939*    |
|                | (0.1343)    | (0.1362)     | (0.1077)    |
| UCMP:pre[T.4]  | -0.5912***  |              | -0.7403***  |
|                | (0.1191)    |              | (0.1367)    |
| UCMP:pre[T.5]  | -0.1628***  |              | -0.1093     |
|                | (0.0590)    |              | (0.0731)    |
| UCMP:pre[T.7]  | -0.2996***  |              | -0.0067     |
|                | (0.1033)    |              | (0.1113)    |
| UCMP:pre[T.8]  | -0.4217***  |              | 0.4128***   |
|                | (0.0692)    |              | (0.1203)    |
| R-squa

So the parallel-trends assumption appears to be violated. Does that mean we can't use this model?