In [1]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from tabulate import  tabulate
import numpy as np
import plotly.express as px
from stats.reg import compute_vif

In [25]:
panel_in = "../../../data/final/model_data.parquet"

did_out = "../../../reports/replication/did.md"
fe_out = "../../../reports/replication/fe.md"
parallel_out = "../../../reports/replication/parallel_trends.jpeg"

In [3]:
panel = pd.read_parquet(panel_in)

# Preprocessing

In [4]:
panel['logrides'] = panel['rides'].apply(np.log1p)

In [5]:
dotw_names = {f"C(dotw)[T.{i}]":j for i,j in zip([1,2,3,4],['Tuesday','Wednesday','Thursday','Friday'])}

# Fixed Effects

In [6]:
def mask_colinear(df, groupby, coef):
    test = df.groupby(groupby)[coef].nunique() > 1
    mask = df[groupby].isin(test[test].index)
    if not all(test):
        print(f"Dropping {sum(~test)} {groupby}, {sum(~mask)} obs invariant to {coef}")
    return mask

def model_fe(df):
    ## Community areas dont have separate units for transit.
    ## Since fixed effects cant distringuish transit anyway,
    ## Might as well sum it. Otherwise you have huge variance per unit.
    df = df.groupby(['id','date','dotw','DNC'])['rides'].sum().reset_index()
    df = df.sort_values('date')
    df['time'] = pd.Categorical(df['date']).codes
    # Make sure all units have variance in all coefs.
    mask = np.full_like(df.index, True)
    for coef in ['DNC','time','dotw']:
        mask &= mask_colinear(df, 'id', coef)
    df = df[mask]
    id_formula = "C(id)" #catvar(df, "id", tract_panel.id.min())
    formula = f"""np.log1p(rides) ~ 
                DNC + time + C(dotw) + {id_formula}"""
    model = sm.OLS.from_formula(formula, df).fit()
    return model

In [7]:
def print_fe(model):
    for l in str(model.summary()).split('\n'):
        if not l.startswith('C(id)'):
            print(l)

In [8]:
model_data = panel.loc[panel.transit == 'train']
train_fe_model = model_fe(model_data)
print_fe(train_fe_model)

                            OLS Regression Results                            
Dep. Variable:        np.log1p(rides)   R-squared:                       0.902
Model:                            OLS   Adj. R-squared:                  0.900
Method:                 Least Squares   F-statistic:                     563.2
Date:                Mon, 03 Feb 2025   Prob (F-statistic):               0.00
Time:                        15:46:50   Log-Likelihood:                -1496.0
No. Observations:                7910   AIC:                             3248.
Df Residuals:                    7782   BIC:                             4141.
Df Model:                         127                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          6.6028      0.038    175.

In [9]:
compute_vif(train_fe_model)

Unnamed: 0,coef,vif
0,Intercept,128.332169


In [10]:
model_data = panel.loc[panel.transit == 'uber']
uber_fe_model = model_fe(model_data)
print_fe(uber_fe_model)

Dropping 139 id, 995 obs invariant to DNC
Dropping 25 id, 25 obs invariant to time
Dropping 32 id, 40 obs invariant to dotw
                            OLS Regression Results                            
Dep. Variable:        np.log1p(rides)   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.950
Method:                 Least Squares   F-statistic:                     1097.
Date:                Mon, 03 Feb 2025   Prob (F-statistic):               0.00
Time:                        15:47:23   Log-Likelihood:                -27717.
No. Observations:               64229   AIC:                         5.765e+04
Df Residuals:                   63123   BIC:                         6.768e+04
Df Model:                        1105                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.

In [11]:
# takes forever
# # compute_vif(uber_fe_model)

In [12]:
model_data = panel.loc[panel.transit == 'bike']
bike_fe_model = model_fe(model_data)
print_fe(bike_fe_model)

Dropping 492 id, 3345 obs invariant to DNC
Dropping 127 id, 127 obs invariant to time
Dropping 143 id, 161 obs invariant to dotw
                            OLS Regression Results                            
Dep. Variable:        np.log1p(rides)   R-squared:                       0.947
Model:                            OLS   Adj. R-squared:                  0.946
Method:                 Least Squares   F-statistic:                     831.4
Date:                Mon, 03 Feb 2025   Prob (F-statistic):               0.00
Time:                        15:47:37   Log-Likelihood:                -18357.
No. Observations:               49697   AIC:                         3.881e+04
Df Residuals:                   48651   BIC:                         4.802e+04
Df Model:                        1045                                         
Covariance Type:            nonrobust                                         
                                                                                c

In [13]:
# Takes forever
# # compute_vif(bike_fe_model)

In [14]:
fe_summary = summary_col([uber_fe_model, train_fe_model, bike_fe_model], 
            model_names=["FE (Uber)", "FE (Train)", "FE (Bike)"], 
            info_dict={"N":lambda x:(x.nobs)},
            stars=True,
            regressor_order=['DNC','time'] + list(dotw_names.keys()),
            drop_omitted=True)

fe_summary.tables[0].index = (fe_summary.tables[0].index
                              .map(lambda x: dotw_names[x] if x in dotw_names else x)
                              .str.replace('DNC','During DNC')
                              .str.replace('time','time trend'))

fe_table = tabulate(fe_summary.tables[0],
                   headers=["", "FE (Uber)", "FE (Train)", "FE (Bike)"],
                   showindex=True,
                   tablefmt='github')
print(fe_table)
with open(fe_out,"w") as f:
    f.write(fe_table)
!cp {fe_out} ../../../../eric-mc2-cv/static/uploads/

|                | FE (Uber)   | FE (Train)   | FE (Bike)   |
|----------------|-------------|--------------|-------------|
| During DNC     | -0.0524***  | -0.0814***   | 0.0297***   |
|                | (0.0066)    | (0.0148)     | (0.0069)    |
| time trend     | 0.0013***   | 0.0014***    | -0.0000     |
|                | (0.0001)    | (0.0002)     | (0.0001)    |
| Tuesday        | 0.1059***   | 0.1098***    | 0.0274***   |
|                | (0.0047)    | (0.0105)     | (0.0051)    |
| Wednesday      | 0.2156***   | 0.1172***    | 0.1199***   |
|                | (0.0047)    | (0.0105)     | (0.0051)    |
| Thursday       | 0.3314***   | 0.0893***    | 0.0794***   |
|                | (0.0047)    | (0.0105)     | (0.0051)    |
| Friday         | 0.5977***   | 0.0530***    | 0.1148***   |
|                | (0.0047)    | (0.0106)     | (0.0051)    |
| R-squared      | 0.9505      | 0.9019       | 0.9470      |
| R-squared Adj. | 0.9497      | 0.9003       | 0.9458      |
| N     

In [15]:
main_effect = fe_summary.tables[0].xs('During DNC').str.replace('*','').astype(float)
main_pct = np.exp(main_effect) - 1
main_pct.apply("{:.1%}".format)

FE (Uber)     -5.1%
FE (Train)    -7.8%
FE (Bike)      3.0%
Name: During DNC, dtype: object

# DiffNDiff

In [16]:
def did_colinearity(df):
    mask = np.full_like(df.index, True)
    mask &= mask_colinear(df, 'id', 'DNC')
    mask &= mask_colinear(df, 'date', 'UCMP')
    df = df.loc[mask]
    return df

def model_transit_did(df, transit):
    df = df.sort_values('date')
    df['time'] = pd.Categorical(df['date']).codes
    # contained_terms = ['train_contained','bike_contained','bus_contained']
    # contained_term = " + ".join([c for c in contained_terms if c in df.columns])
    # contained_term = contained_term if contained_term else "1"
    unit_obs = df.groupby('id').size()
    df['weight'] = df['id'].map(unit_obs).apply(lambda x: 1/x)
    dist_terms = ['log_train_distance','log_bike_distance','log_bus_distance']
    dist_terms = [c for c in dist_terms if c in df.columns and transit not in c]
    dist_term = " + ".join(dist_terms)
    dist_term = dist_term if dist_term else "1"
    # Assert that spatial units have time variance (only need to test group inclusion, not every variable.)
    df = did_colinearity(df)
    formula = f"""logrides ~ 
                UCMP * DNC +
                time + C(dotw) +
                {dist_term} +
                lat + long + I(lat * long) 
                + I(lat**2) + I(long**2)
                """
    did_model = sm.WLS.from_formula(formula, df, weights=df['weight']).fit(cov_type='cluster',cov_kwds={'groups': df['id']})
    return did_model

In [17]:
model_data = panel.loc[panel.transit == 'bike']
bike_did_model = model_transit_did(model_data, 'bike')
bike_did_model.summary()

Dropping 492 id, 3345 obs invariant to DNC


0,1,2,3
Dep. Variable:,logrides,R-squared:,0.66
Model:,WLS,Adj. R-squared:,0.66
Method:,Least Squares,F-statistic:,191.3
Date:,"Mon, 03 Feb 2025",Prob (F-statistic):,1.63e-286
Time:,15:47:40,Log-Likelihood:,-69529.0
No. Observations:,49704,AIC:,139100.0
Df Residuals:,49688,BIC:,139200.0
Df Model:,15,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,3.6798,0.184,20.004,0.000,3.319,4.040
C(dotw)[T.1],0.0096,0.015,0.642,0.521,-0.020,0.039
C(dotw)[T.2],0.0922,0.016,5.689,0.000,0.060,0.124
C(dotw)[T.3],0.0587,0.016,3.664,0.000,0.027,0.090
C(dotw)[T.4],0.0602,0.015,3.964,0.000,0.030,0.090
UCMP,0.4222,0.124,3.394,0.001,0.178,0.666
DNC,-0.0335,0.024,-1.391,0.164,-0.081,0.014
UCMP:DNC,0.2998,0.055,5.405,0.000,0.191,0.408
time,-0.0008,0.000,-1.929,0.054,-0.002,1.32e-05

0,1,2,3
Omnibus:,7502.425,Durbin-Watson:,1.685
Prob(Omnibus):,0.0,Jarque-Bera (JB):,35039.937
Skew:,-0.666,Prob(JB):,0.0
Kurtosis:,6.892,Cond. No.,812.0


In [18]:
model_data = panel.loc[panel.transit == 'train']
train_did_model = model_transit_did(model_data, 'train')
train_did_model.summary()

0,1,2,3
Dep. Variable:,logrides,R-squared:,0.294
Model:,WLS,Adj. R-squared:,0.293
Method:,Least Squares,F-statistic:,42.45
Date:,"Mon, 03 Feb 2025",Prob (F-statistic):,6.23e-41
Time:,15:47:40,Log-Likelihood:,-9627.9
No. Observations:,7910,AIC:,19290.0
Df Residuals:,7894,BIC:,19400.0
Df Model:,15,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,6.4759,0.769,8.424,0.000,4.969,7.983
C(dotw)[T.1],0.1099,0.007,15.087,0.000,0.096,0.124
C(dotw)[T.2],0.1178,0.006,19.457,0.000,0.106,0.130
C(dotw)[T.3],0.0911,0.006,15.922,0.000,0.080,0.102
C(dotw)[T.4],0.0564,0.010,5.588,0.000,0.037,0.076
UCMP,-0.7937,0.515,-1.542,0.123,-1.802,0.215
DNC,-0.1221,0.015,-8.367,0.000,-0.151,-0.094
UCMP:DNC,0.8550,0.525,1.629,0.103,-0.174,1.884
time,0.0009,0.000,4.143,0.000,0.000,0.001

0,1,2,3
Omnibus:,4129.53,Durbin-Watson:,2.109
Prob(Omnibus):,0.0,Jarque-Bera (JB):,94650.541
Skew:,-2.008,Prob(JB):,0.0
Kurtosis:,19.464,Cond. No.,622.0


In [19]:
model_data = panel.loc[panel.transit == 'uber']
uber_did_model = model_transit_did(model_data, 'uber')
uber_did_model.summary()

Dropping 139 id, 995 obs invariant to DNC


0,1,2,3
Dep. Variable:,logrides,R-squared:,0.511
Model:,WLS,Adj. R-squared:,0.511
Method:,Least Squares,F-statistic:,175.7
Date:,"Mon, 03 Feb 2025",Prob (F-statistic):,3.2500000000000003e-289
Time:,15:47:40,Log-Likelihood:,-106030.0
No. Observations:,64231,AIC:,212100.0
Df Residuals:,64214,BIC:,212300.0
Df Model:,16,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,8.5576,0.477,17.929,0.000,7.622,9.493
C(dotw)[T.1],0.1085,0.009,12.124,0.000,0.091,0.126
C(dotw)[T.2],0.1924,0.009,21.193,0.000,0.175,0.210
C(dotw)[T.3],0.2866,0.011,26.111,0.000,0.265,0.308
C(dotw)[T.4],0.5041,0.014,36.159,0.000,0.477,0.531
UCMP,0.7979,0.236,3.385,0.001,0.336,1.260
DNC,-0.0666,0.014,-4.644,0.000,-0.095,-0.038
UCMP:DNC,0.1845,0.081,2.279,0.023,0.026,0.343
time,0.0013,0.000,7.666,0.000,0.001,0.002

0,1,2,3
Omnibus:,954.722,Durbin-Watson:,1.404
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1346.564
Skew:,-0.187,Prob(JB):,3.96e-293
Kurtosis:,3.602,Cond. No.,873.0


In [20]:
headers = ["DiD (Uber)", "DiD (Train)", "DiD (Bike)"]
did_summary = summary_col([uber_did_model, train_did_model, bike_did_model], 
            model_names=headers, 
            info_dict={"N":lambda x:(x.nobs)},
            stars=True,
            regressor_order=['UCMP','DNC','UCMP:DNC'] + 
                            #  ['time'] + list(dotw_names.keys()) +
                             ['log_train_distance','log_bike_distance','log_bus_distance'],
                            #  'lat','long','I(lat * long)','I(lat ** 2)', 'I(long ** 2)'],
            drop_omitted=True)

did_summary.tables[0].index = (did_summary.tables[0].index
                               .str.replace('DNC','During DNC')
                               .str.replace('UCMP','Near DNC')
                               .str.replace('time','time trend')
                               .map(lambda x: dotw_names[x] if x in dotw_names else x)
                               .str.replace('log_train_distance','log(dist to train)')
                               .str.replace('log_bike_distance','log(dist to bike)')
                               .str.replace('log_bus_distance','log(dist to bus)')
                               .str.replace('I(lat ** 2)','lat**2')
                               .str.replace('I(long ** 2)','long**2')
                               .str.replace('I(lat * long)','lat*long'))

did_table = tabulate(did_summary.tables[0],
                   headers=[""] + headers,
                   showindex=True,
                   tablefmt='github')
print(did_table)
with open(did_out,"w") as f:
    f.write(did_table)
!cp {did_out} ../../../../eric-mc2-cv/static/uploads/

|                     | DiD (Uber)   | DiD (Train)   | DiD (Bike)   |
|---------------------|--------------|---------------|--------------|
| Near DNC            | 0.7979***    | -0.7937       | 0.4222***    |
|                     | (0.2357)     | (0.5147)      | (0.1244)     |
| During DNC          | -0.0666***   | -0.1221***    | -0.0335      |
|                     | (0.0143)     | (0.0146)      | (0.0240)     |
| Near DNC:During DNC | 0.1845**     | 0.8550        | 0.2998***    |
|                     | (0.0810)     | (0.5248)      | (0.0555)     |
| log(dist to train)  | -0.3231***   |               | -0.1238***   |
|                     | (0.0563)     |               | (0.0215)     |
| log(dist to bike)   | -0.2997***   | 0.2345        |              |
|                     | (0.0553)     | (0.1478)      |              |
| log(dist to bus)    | -0.0108      | -0.0730       | 0.0236       |
|                     | (0.0450)     | (0.0667)      | (0.0275)     |
| R-squared         

## DiD Impact

In [21]:
beta1 = did_summary.tables[0].xs('During DNC').str.replace('*','').astype(float)
main_pct = np.exp(beta1) - 1
pct1 = main_pct.apply("{:.1%}".format)

beta2 = did_summary.tables[0].xs('Near DNC').str.replace('*','').astype(float)
main_pct = np.exp(beta2) - 1
pct2 = main_pct.apply("{:.1%}".format)

beta3 = did_summary.tables[0].xs('Near DNC:During DNC').str.replace('*','').astype(float)
main_pct = np.exp(beta3) - 1
pct3 = main_pct.apply("{:.1%}".format)
pd.concat([pct1,pct2,pct3],axis=1)

Unnamed: 0,During DNC,Near DNC,Near DNC:During DNC
DiD (Uber),-6.4%,122.1%,20.3%
DiD (Train),-11.5%,-54.8%,135.1%
DiD (Bike),-3.3%,52.5%,35.0%


## Contextualized Effect

In [22]:
n_trt = panel[(panel.UCMP==1)&(panel.DNC==1)].groupby('transit').size()
n_ctl = panel[(panel.UCMP==0)&(panel.DNC==1)].groupby('transit').size()
mean_trt = panel[(panel.UCMP==1)&(panel.DNC==0)].groupby(['transit','id'])['rides'].mean().groupby('transit').mean()
mean_ctl = panel[(panel.UCMP==0)&(panel.DNC==0)].groupby(['transit','id'])['rides'].mean().groupby('transit').mean()
# The geom mean vastly under-estimates the scale of the changes. because the total changes
# are in fact driven by the high-ridership tail of the distribution, which is discounted in the geom mean.
geo_mean_trt = panel[(panel.UCMP==1)&(panel.DNC==0)].groupby(['transit','id'])['logrides'].mean().groupby('transit').mean()
geo_mean_ctl = panel[(panel.UCMP==0)&(panel.DNC==0)].groupby(['transit','id'])['logrides'].mean().groupby('transit').mean()
exp_mean_trt = np.exp(geo_mean_trt)
exp_mean_ctl = np.exp(geo_mean_ctl)
pct_trt = (np.exp(beta3) - 1)
pct_ctl = (np.exp(beta1) - 1)
pct_trt.index = pct_trt.index.map(lambda x: x.replace('DiD (','').replace(')','').lower())
pct_ctl.index = pct_ctl.index.map(lambda x: x.replace('DiD (','').replace(')','').lower())
delta_trt = pct_trt * mean_trt * n_trt
delta_ctl = pct_ctl * mean_ctl * n_ctl
tmp = pd.DataFrame({'trt':delta_trt, 'ctl':delta_ctl}).round(0)
delta_trt = pct_trt * geo_mean_trt * n_trt
delta_ctl = pct_ctl * geo_mean_ctl * n_ctl
tmp2 = pd.DataFrame({'geotrt':delta_trt, 'geoctl':delta_ctl}).round(0)
pd.concat([tmp,tmp2],axis=1)

Unnamed: 0,trt,ctl,geotrt,geoctl
bike,2989.0,-2546.0,220.0,-197.0
train,62426.0,-142951.0,287.0,-397.0
uber,24047.0,-42733.0,178.0,-825.0


## Parallel Trends

PT is actually pretty convincing.

In [23]:
plot_data = panel.assign(nearby = panel['UCMP'].map({0:False, 1:True}))
plot_data = did_colinearity(plot_data)
plot_data = plot_data.groupby(['date','nearby','transit'])['rides'].sum().reset_index()
means = plot_data.groupby(['nearby','transit'])['rides'].mean().rename('mean').reset_index()
first = plot_data.sort_values('date')[['nearby','transit','rides']].iloc[:6].rename(columns={'rides':'first'})
plot_data = plot_data.merge(means, on=['nearby','transit']).merge(first, on=['nearby','transit'])
plot_data = plot_data.assign(demean = plot_data['rides'] / plot_data['first'])

Dropping 631 id, 4340 obs invariant to DNC


In [26]:
chart = px.line(plot_data, x='date', y='demean', 
        line_dash='nearby', facet_row='transit',
        color_discrete_sequence=['black'],
        title= 'Rides (normalized to starting date)',
        labels={'date':'', 'demean':'',
                'nearby':'Near DNC'},
        template='simple_white')
chart.add_vrect(x0="2024-08-19", x1="2024-08-22", 
                fillcolor="gray", opacity=.5)
chart.write_image(parallel_out)
!cp {parallel_out} ../../../../eric-mc2-cv/static/img
chart.show()
