In [1]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from tabulate import  tabulate
import numpy as np
import plotly.express as px
from stats.reg import compute_vif

In [2]:
# Path (relative to this notebook) of the parquet file that holds the modelling panel.
panel_in = "../../../data/final/model_data.parquet"

In [3]:
# Load the panel; later cells read columns such as `rides`, `transit`, `DNC` and `UCMP` from it.
panel = pd.read_parquet(panel_in)

## Preprocessing

In [4]:
# log(1 + rides), so observations with zero rides stay defined.
panel['logrides'] = np.log1p(panel['rides'])

In [5]:
# Map the formula's day-of-week dummy labels (levels 1-4) to weekday names for the report tables.
dotw_names = {
    f"C(dotw)[T.{level}]": weekday
    for level, weekday in enumerate(['Tuesday', 'Wednesday', 'Thursday', 'Friday'], start=1)
}

## Fixed Effects

In [6]:
def mask_colinear(df, groupby, coef):
    """Flag the rows whose group shows more than one distinct value of ``coef``.

    Parameters
    ----------
    df : DataFrame containing the ``groupby`` and ``coef`` columns.
    groupby : name of the grouping column (e.g. ``'id'``).
    coef : name of the regressor that must vary within each group.

    Returns
    -------
    Boolean Series aligned with ``df``: True where the row's group has more
    than one distinct ``coef`` value. When any group fails the check, the
    number of dropped groups and rows is printed.
    """
    varies = df.groupby(groupby)[coef].nunique().gt(1)
    keep = df[groupby].isin(varies.index[varies])
    if not varies.all():
        n_groups = (~varies).sum()
        n_rows = (~keep).sum()
        print(f"Dropping {n_groups} {groupby}, {n_rows} obs invariant to {coef}")
    return keep

def model_fe(df):
    """Fit a unit fixed-effects OLS of log1p(rides) on DNC, a time trend and day-of-week.

    Rides are first summed per (id, date, dotw, DNC). ``time`` is the integer
    code of each date after sorting. Units (``id``) with no variation in
    ``DNC``, ``time`` or ``dotw`` are dropped before fitting, because their
    effect cannot be separated from the unit dummy.

    Parameters
    ----------
    df : DataFrame with columns ``id``, ``date``, ``dotw``, ``DNC`` and ``rides``.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    # Community areas don't have separate units for transit.
    # Since fixed effects can't distinguish transit anyway,
    # we might as well sum it; otherwise the variance per unit is huge.
    df = df.groupby(['id','date','dotw','DNC'])['rides'].sum().reset_index()
    df = df.sort_values('date')
    df['time'] = pd.Categorical(df['date']).codes
    # Make sure all units have variance in all coefs.
    # The mask is created with an explicit boolean dtype: np.full_like(df.index, True)
    # would inherit the index's integer dtype, and a 0/1 integer mask is not a
    # boolean row indexer.
    mask = np.ones(len(df), dtype=bool)
    for coef in ['DNC','time','dotw']:
        # Positional AND; mask_colinear returns one value per row of df.
        mask &= mask_colinear(df, 'id', coef).to_numpy()
    df = df[mask]
    id_formula = "C(id)"
    formula = f"""np.log1p(rides) ~ 
                DNC + time + C(dotw) + {id_formula}"""
    model = sm.OLS.from_formula(formula, df).fit()
    return model

In [7]:
def print_fe(model):
    """Print ``model.summary()`` line by line, skipping lines that start with ``C(id)``.

    This hides the per-unit fixed-effect rows so the remaining coefficients
    are readable.
    """
    summary_lines = str(model.summary()).split('\n')
    for line in summary_lines:
        if line.startswith('C(id)'):
            continue
        print(line)

In [8]:
# Train ridership: fit the unit fixed-effects model and print it without the C(id) rows.
model_data = panel[panel['transit'] == 'train']
train_fe_model = model_fe(model_data)
print_fe(train_fe_model)

                            OLS Regression Results                            
Dep. Variable:        np.log1p(rides)   R-squared:                       0.902
Model:                            OLS   Adj. R-squared:                  0.900
Method:                 Least Squares   F-statistic:                     563.2
Date:                Sun, 02 Feb 2025   Prob (F-statistic):               0.00
Time:                        23:48:39   Log-Likelihood:                -1496.0
No. Observations:                7910   AIC:                             3248.
Df Residuals:                    7782   BIC:                             4141.
Df Model:                         127                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          6.6028      0.038    175.

In [9]:
# Variance inflation factors for the train fixed-effects model (helper imported from stats.reg).
compute_vif(train_fe_model)

Unnamed: 0,coef,vif
0,Intercept,128.332169


In [10]:
# Uber ridership: fit the unit fixed-effects model and print it without the C(id) rows.
model_data = panel[panel['transit'] == 'uber']
uber_fe_model = model_fe(model_data)
print_fe(uber_fe_model)

Dropping 139 id, 995 obs invariant to DNC
Dropping 25 id, 25 obs invariant to time
Dropping 32 id, 40 obs invariant to dotw
                            OLS Regression Results                            
Dep. Variable:        np.log1p(rides)   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.950
Method:                 Least Squares   F-statistic:                     1097.
Date:                Sun, 02 Feb 2025   Prob (F-statistic):               0.00
Time:                        23:49:04   Log-Likelihood:                -27717.
No. Observations:               64229   AIC:                         5.765e+04
Df Residuals:                   63123   BIC:                         6.768e+04
Df Model:                        1105                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.

In [11]:
# Skipped: computing VIFs for the Uber model takes too long to run.
# compute_vif(uber_fe_model)

In [12]:
# Bike ridership: fit the unit fixed-effects model and print it without the C(id) rows.
model_data = panel[panel['transit'] == 'bike']
bike_fe_model = model_fe(model_data)
print_fe(bike_fe_model)

Dropping 492 id, 3345 obs invariant to DNC
Dropping 127 id, 127 obs invariant to time
Dropping 143 id, 161 obs invariant to dotw
                            OLS Regression Results                            
Dep. Variable:        np.log1p(rides)   R-squared:                       0.947
Model:                            OLS   Adj. R-squared:                  0.946
Method:                 Least Squares   F-statistic:                     831.4
Date:                Sun, 02 Feb 2025   Prob (F-statistic):               0.00
Time:                        23:49:17   Log-Likelihood:                -18357.
No. Observations:               49697   AIC:                         3.881e+04
Df Residuals:                   48651   BIC:                         4.802e+04
Df Model:                        1045                                         
Covariance Type:            nonrobust                                         
                                                                                c

In [13]:
# Skipped: computing VIFs for the bike model takes too long to run.
# compute_vif(bike_fe_model)

In [14]:
fe_summary = summary_col([uber_fe_model, train_fe_model, bike_fe_model], 
            model_names=["FE (Uber)", "FE (Train)", "FE (Bike)"], 
            info_dict={"N":lambda x:(x.nobs)},
            stars=True,
            regressor_order=['DNC','time'] + list(dotw_names.keys()),
            drop_omitted=True)

fe_summary.tables[0].index = (fe_summary.tables[0].index
                              .map(lambda x: dotw_names[x] if x in dotw_names else x)
                              .str.replace('DNC','During DNC')
                              .str.replace('time','time trend'))

fe_table = tabulate(fe_summary.tables[0],
                   headers=["", "FE (Uber)", "FE (Train)", "FE (Bike)"],
                   showindex=True,
                   tablefmt='github')
with open("../../../reports/replication/fe.md","w") as f:
    f.write(fe_table)
    print(fe_table)
!cp ../../../reports/replication/fe.md ../../../../eric-mc2-cv/static/uploads/

|                | FE (Uber)   | FE (Train)   | FE (Bike)   |
|----------------|-------------|--------------|-------------|
| During DNC     | -0.0524***  | -0.0814***   | 0.0297***   |
|                | (0.0066)    | (0.0148)     | (0.0069)    |
| time trend     | 0.0013***   | 0.0014***    | -0.0000     |
|                | (0.0001)    | (0.0002)     | (0.0001)    |
| Tuesday        | 0.1059***   | 0.1098***    | 0.0274***   |
|                | (0.0047)    | (0.0105)     | (0.0051)    |
| Wednesday      | 0.2156***   | 0.1172***    | 0.1199***   |
|                | (0.0047)    | (0.0105)     | (0.0051)    |
| Thursday       | 0.3314***   | 0.0893***    | 0.0794***   |
|                | (0.0047)    | (0.0105)     | (0.0051)    |
| Friday         | 0.5977***   | 0.0530***    | 0.1148***   |
|                | (0.0047)    | (0.0106)     | (0.0051)    |
| R-squared      | 0.9505      | 0.9019       | 0.9470      |
| R-squared Adj. | 0.9497      | 0.9003       | 0.9458      |
| N     

In [15]:
# Turn the "During DNC" coefficients (log points) into percent changes: exp(b) - 1.
# regex=False: '*' is the literal significance star, never a regex quantifier.
main_effect = fe_summary.tables[0].xs('During DNC').str.replace('*','', regex=False).astype(float)
main_pct = np.exp(main_effect) - 1
main_pct.apply("{:.1%}".format)

FE (Uber)     -5.1%
FE (Train)    -7.8%
FE (Bike)      3.0%
Name: During DNC, dtype: object

## Difference-in-Differences (DiD)

In [16]:
def did_colinearity(df):
    """Drop rows that cannot contribute to the difference-in-differences contrast.

    Keeps only rows whose unit (``id``) has more than one distinct ``DNC``
    value and whose ``date`` has more than one distinct ``UCMP`` value.

    Parameters
    ----------
    df : DataFrame with columns ``id``, ``date``, ``DNC`` and ``UCMP``.

    Returns
    -------
    The filtered DataFrame.
    """
    # Explicit boolean mask: np.full_like(df.index, True) would inherit the
    # index's integer dtype, and a 0/1 integer mask is not a boolean row indexer.
    mask = np.ones(len(df), dtype=bool)
    # Positional AND; mask_colinear returns one value per row of df.
    mask &= mask_colinear(df, 'id', 'DNC').to_numpy()
    mask &= mask_colinear(df, 'date', 'UCMP').to_numpy()
    df = df.loc[mask]
    return df

def model_transit_did(df, transit):
    """Fit the difference-in-differences OLS of ``logrides`` for one transit mode.

    The regression includes ``UCMP * DNC``, a time trend, day-of-week dummies,
    log-distance controls and a quadratic surface in ``lat``/``long``.

    Parameters
    ----------
    df : panel rows for a single transit mode.
    transit : mode name; any distance column whose name contains it is left
        out of the controls.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    df = df.sort_values('date')
    df['time'] = pd.Categorical(df['date']).codes
    # Keep the distance controls that exist in the data and do not refer to this mode.
    candidates = ('log_train_distance','log_bike_distance','log_bus_distance')
    usable = [col for col in candidates if col in df.columns and transit not in col]
    # Fall back to a bare intercept term when no control remains.
    dist_term = " + ".join(usable) or "1"
    # Assert that spatial units have time variance (only need to test group inclusion, not every variable.)
    df = did_colinearity(df)
    formula = f"""logrides ~ 
                UCMP * DNC +
                time + C(dotw) +
                {dist_term} +
                lat + long + I(lat * long) 
                + I(lat**2) + I(long**2)
                """
    return sm.OLS.from_formula(formula, df).fit()

In [17]:
# Bike ridership: fit the difference-in-differences model and display its summary.
model_data = panel[panel['transit'] == 'bike']
bike_did_model = model_transit_did(model_data, 'bike')
bike_did_model.summary()

Dropping 492 id, 3345 obs invariant to DNC


0,1,2,3
Dep. Variable:,logrides,R-squared:,0.667
Model:,OLS,Adj. R-squared:,0.667
Method:,Least Squares,F-statistic:,6640.0
Date:,"Sun, 02 Feb 2025",Prob (F-statistic):,0.0
Time:,23:49:19,Log-Likelihood:,-64010.0
No. Observations:,49704,AIC:,128100.0
Df Residuals:,49688,BIC:,128200.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.8150,0.028,138.036,0.000,3.761,3.869
C(dotw)[T.1],0.0218,0.013,1.741,0.082,-0.003,0.046
C(dotw)[T.2],0.1156,0.012,9.258,0.000,0.091,0.140
C(dotw)[T.3],0.0701,0.013,5.603,0.000,0.046,0.095
C(dotw)[T.4],0.0963,0.013,7.632,0.000,0.072,0.121
UCMP,0.1523,0.018,8.365,0.000,0.117,0.188
DNC,0.0118,0.018,0.673,0.501,-0.023,0.046
UCMP:DNC,0.2386,0.071,3.363,0.001,0.100,0.378
time,-3.105e-05,0.000,-0.138,0.890,-0.000,0.000

0,1,2,3
Omnibus:,672.035,Durbin-Watson:,1.729
Prob(Omnibus):,0.0,Jarque-Bera (JB):,730.179
Skew:,-0.259,Prob(JB):,2.78e-159
Kurtosis:,3.291,Cond. No.,688.0


In [18]:
# Train ridership: fit the difference-in-differences model and display its summary.
model_data = panel[panel['transit'] == 'train']
train_did_model = model_transit_did(model_data, 'train')
train_did_model.summary()

0,1,2,3
Dep. Variable:,logrides,R-squared:,0.289
Model:,OLS,Adj. R-squared:,0.288
Method:,Least Squares,F-statistic:,214.3
Date:,"Sun, 02 Feb 2025",Prob (F-statistic):,0.0
Time:,23:49:19,Log-Likelihood:,-9326.8
No. Observations:,7910,AIC:,18690.0
Df Residuals:,7894,BIC:,18800.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.6702,0.075,88.650,0.000,6.523,6.818
C(dotw)[T.1],0.1104,0.028,3.941,0.000,0.055,0.165
C(dotw)[T.2],0.1183,0.028,4.222,0.000,0.063,0.173
C(dotw)[T.3],0.0910,0.028,3.244,0.001,0.036,0.146
C(dotw)[T.4],0.0556,0.028,1.965,0.049,0.000,0.111
UCMP,-0.6559,0.039,-16.805,0.000,-0.732,-0.579
DNC,-0.1214,0.041,-2.981,0.003,-0.201,-0.042
UCMP:DNC,0.7023,0.149,4.718,0.000,0.411,0.994
time,0.0009,0.001,1.755,0.079,-0.000,0.002

0,1,2,3
Omnibus:,3009.495,Durbin-Watson:,2.11
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38524.904
Skew:,-1.46,Prob(JB):,0.0
Kurtosis:,13.41,Cond. No.,636.0


In [19]:
# Uber ridership: fit the difference-in-differences model and display its summary.
model_data = panel[panel['transit'] == 'uber']
uber_did_model = model_transit_did(model_data, 'uber')
uber_did_model.summary()

Dropping 139 id, 995 obs invariant to DNC


0,1,2,3
Dep. Variable:,logrides,R-squared:,0.452
Model:,OLS,Adj. R-squared:,0.452
Method:,Least Squares,F-statistic:,3312.0
Date:,"Sun, 02 Feb 2025",Prob (F-statistic):,0.0
Time:,23:49:20,Log-Likelihood:,-104940.0
No. Observations:,64231,AIC:,209900.0
Df Residuals:,64214,BIC:,210100.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.7795,0.070,111.107,0.000,7.642,7.917
C(dotw)[T.1],0.1091,0.016,7.001,0.000,0.079,0.140
C(dotw)[T.2],0.2057,0.016,13.239,0.000,0.175,0.236
C(dotw)[T.3],0.3096,0.016,19.970,0.000,0.279,0.340
C(dotw)[T.4],0.5604,0.016,35.887,0.000,0.530,0.591
UCMP,0.7552,0.026,28.855,0.000,0.704,0.806
DNC,-0.0621,0.022,-2.794,0.005,-0.106,-0.019
UCMP:DNC,0.1939,0.103,1.877,0.061,-0.009,0.396
time,0.0013,0.000,4.613,0.000,0.001,0.002

0,1,2,3
Omnibus:,154.932,Durbin-Watson:,1.39
Prob(Omnibus):,0.0,Jarque-Bera (JB):,197.937
Skew:,-0.01,Prob(JB):,1.0400000000000001e-43
Kurtosis:,3.271,Cond. No.,825.0


In [20]:
headers = ["DiD (Uber)", "DiD (Train)", "DiD (Bike)"]
did_summary = summary_col([uber_did_model, train_did_model, bike_did_model], 
            model_names=headers, 
            info_dict={"N":lambda x:(x.nobs)},
            stars=True,
            regressor_order=['UCMP','DNC','UCMP:DNC'] + 
                            #  ['time'] + list(dotw_names.keys()) +
                             ['log_train_distance','log_bike_distance','log_bus_distance'],
                            #  'lat','long','I(lat * long)','I(lat ** 2)', 'I(long ** 2)'],
            drop_omitted=True)

did_summary.tables[0].index = (did_summary.tables[0].index
                               .str.replace('DNC','During DNC')
                               .str.replace('UCMP','Near DNC')
                               .str.replace('time','time trend')
                               .map(lambda x: dotw_names[x] if x in dotw_names else x)
                               .str.replace('log_train_distance','log(dist to train)')
                               .str.replace('log_bike_distance','log(dist to bike)')
                               .str.replace('log_bus_distance','log(dist to bus)')
                               .str.replace('I(lat ** 2)','lat**2')
                               .str.replace('I(long ** 2)','long**2')
                               .str.replace('I(lat * long)','lat*long'))

did_table = tabulate(did_summary.tables[0],
                   headers=[""] + headers,
                   showindex=True,
                   tablefmt='github')
with open("../../../reports/replication/did.md","w") as f:
    f.write(did_table)
    print(did_table)
!cp ../../../reports/replication/did.md ../../../../eric-mc2-cv/static/uploads/

|                     | DiD (Uber)   | DiD (Train)   | DiD (Bike)   |
|---------------------|--------------|---------------|--------------|
| Near DNC            | 0.7552***    | -0.6559***    | 0.1523***    |
|                     | (0.0262)     | (0.0390)      | (0.0182)     |
| During DNC          | -0.0621***   | -0.1214***    | 0.0118       |
|                     | (0.0222)     | (0.0407)      | (0.0175)     |
| Near DNC:During DNC | 0.1939*      | 0.7023***     | 0.2386***    |
|                     | (0.1033)     | (0.1489)      | (0.0710)     |
| log(dist to train)  | -0.2663***   |               | -0.0865***   |
|                     | (0.0072)     |               | (0.0029)     |
| log(dist to bike)   | -0.2517***   | 0.1948***     |              |
|                     | (0.0073)     | (0.0132)      |              |
| log(dist to bus)    | -0.0040      | -0.0714***    | -0.0073*     |
|                     | (0.0060)     | (0.0105)      | (0.0043)     |
| R-squared         

In [21]:
# Turn the three DiD coefficients (log points) into percent changes: exp(b) - 1.
# One loop replaces three copies of the same steps; each resulting Series keeps
# the term as its name, so the concatenated frame has one column per term.
pct_columns = []
for term in ['During DNC', 'Near DNC', 'Near DNC:During DNC']:
    # regex=False: '*' is the literal significance star, never a regex quantifier.
    main_effect = did_summary.tables[0].xs(term).str.replace('*','', regex=False).astype(float)
    main_pct = np.exp(main_effect) - 1
    pct_columns.append(main_pct.apply("{:.1%}".format))
pd.concat(pct_columns, axis=1)

Unnamed: 0,During DNC,Near DNC,Near DNC:During DNC
DiD (Uber),-6.0%,112.8%,21.4%
DiD (Train),-11.4%,-48.1%,101.8%
DiD (Bike),1.2%,16.5%,26.9%


### Parallel Trends

The parallel-trends (PT) assumption looks fairly convincing in the plot below.

In [22]:
# Build the parallel-trends series: mean daily rides per (nearby, transit),
# each divided by that series' earliest value.
plot_data = panel.assign(nearby = panel['UCMP'].map({0:False, 1:True}))
plot_data = did_colinearity(plot_data)
plot_data = plot_data.groupby(['date','nearby','transit'])['rides'].mean().reset_index()
means = plot_data.groupby(['nearby','transit'])['rides'].mean().rename('mean').reset_index()
# Baseline = the earliest row of every (nearby, transit) series. Taking the first
# six sorted rows instead would hard-code 2 groups x 3 modes and would pick the
# wrong rows whenever a series has no observation on the very first date.
first = (plot_data.sort_values('date')
         .drop_duplicates(['nearby','transit'])[['nearby','transit','rides']]
         .rename(columns={'rides':'first'}))
plot_data = plot_data.merge(means, on=['nearby','transit']).merge(first, on=['nearby','transit'])
# Despite its name, `demean` is the ratio of rides to the series' first value.
plot_data = plot_data.assign(demean = plot_data['rides'] / plot_data['first'])

Dropping 631 id, 4340 obs invariant to DNC


In [23]:
# Normalized ridership over time: one facet row per transit mode, line dash by `nearby`.
chart = px.line(plot_data, x='date', y='demean', 
        line_dash='nearby', facet_row='transit',
        color_discrete_sequence=['black'],
        title= 'Rides (normalized to starting date)',
        labels={'date':'', 'demean':'',
                'nearby':'Near DNC'},
        template='simple_white')
# Shade 2024-08-19 through 2024-08-22 -- presumably the DNC event window; confirm.
chart.add_vrect(x0="2024-08-19", x1="2024-08-22", 
                fillcolor="gray", opacity=.5)
# Save a static JPEG to the report folder, then copy it into ../../../../eric-mc2-cv/static/img.
chart.write_image("../../../reports/replication/parallel_trends.jpeg")
!cp ../../../reports/replication/parallel_trends.jpeg ../../../../eric-mc2-cv/static/img
chart.show()
