In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import plotly.express as px
from statsmodels.regression.linear_model import RegressionResults
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.iolib.table import SimpleTable
from statsmodels.iolib.summary2 import summary_col
from tabulate import  tabulate
from stats.reg import coefplot, prettify_coefs, main_effects, compute_vif, joint_effect, cond_effect, catvar

In [2]:
# Locations of the final panel datasets (parquet), relative to this notebook.
point_panel_in = "../../../data/final/point_panel.parquet"
line_panel_in = "../../../data/final/line_panel.parquet"
tract_panel_in = "../../../data/final/tract_panel.parquet"
comm_panel_in = "../../../data/final/comm_panel.parquet"

In [3]:
# Load each panel into its own DataFrame.
point_panel = pd.read_parquet(point_panel_in)
line_panel = pd.read_parquet(line_panel_in)
tract_panel = pd.read_parquet(tract_panel_in)
comm_panel = pd.read_parquet(comm_panel_in)

## Preprocessing

In [4]:
# Keep only the rows not flagged as weekend, in every panel.
comm_panel, tract_panel, line_panel, point_panel = (
    panel.loc[~panel.is_weekend]
    for panel in (comm_panel, tract_panel, line_panel, point_panel)
)

In [5]:
def get_model_data(panel, transit):
    """Return the rows of ``panel`` for one transit mode, without all-zero units.

    Parameters
    ----------
    panel : pandas.DataFrame
        Must contain the columns ``transit``, ``id`` and ``rides``.
    transit : str
        Value of the ``transit`` column to keep.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``transit`` equals ``transit``, excluding every ``id``
        whose ``rides`` sum to zero within that selection. The original
        index labels are preserved.
    """
    # A boolean mask instead of a query string assembled with an f-string:
    # a quote character inside ``transit`` would break the query expression.
    model_data = panel.loc[panel['transit'] == transit]
    # Per-row flag: True when the row's unit never records a ride.
    exclude_tracts = model_data.groupby('id')['rides'].transform('sum') == 0
    return model_data.loc[~exclude_tracts]

In [6]:
# def pretty_summary(model, exclude=None):
#     s = model.summary()
#     data = pd.DataFrame(s.tables[1].data)
#     data.columns = ['name'] + list(data.iloc[0,1:])
#     data = data.iloc[1:, :].reset_index(drop=True)
#     if exclude is not None:
#         data = data[~data.name.str.contains(exclude)]
#     s.tables = [s.tables[0], data, s.tables[2]]
#     return s

## Fixed Effects

In [7]:
def model_transit_fe(df):
    """Fit an OLS model of log1p(rides) with one fixed effect per transit/unit pair.

    ``df`` needs the columns ``rides``, ``DNC``, ``dotw``, ``date``,
    ``transit`` and ``id``; it is not modified. Two columns are derived:
    ``tid`` (``transit`` and ``id`` joined with ":") and ``time`` (the
    integer category code of each row's date). Returns the fitted
    statsmodels results object.
    """
    panel = (
        df.assign(tid=df['transit'] + ":" + df['id'].astype(str))
        .sort_values('date')
    )
    panel = panel.assign(time=pd.Categorical(panel['date']).codes)
    formula = f"""np.log1p(rides) ~ 
                DNC + time + dotw + tid"""
    return sm.OLS.from_formula(formula, panel).fit()

def model_area_fe(df):
    """Fit an OLS model of log1p(rides) with one fixed effect per area ``id``.

    Rides are first summed over every (id, date, dotw, DNC) combination, so
    any other column of ``df`` (including ``transit``) is collapsed. ``time``
    is the integer category code of each date. ``dotw`` enters linearly and
    squared, and ``id`` enters as a categorical term. Returns the fitted
    statsmodels results object.
    """
    # Community areas do not have separate units per transit mode. Since the
    # fixed effects cannot distinguish transit anyway, rides are summed per
    # unit and date; otherwise the variance per unit would be huge.
    totals = (
        df.groupby(['id','date','dotw','DNC'])['rides']
        .sum()
        .reset_index()
        .sort_values('date')
    )
    totals = totals.assign(time=pd.Categorical(totals['date']).codes)
    id_formula = "C(id)" #catvar(df, "id", tract_panel.id.min())
    formula = f"""np.log1p(rides) ~ 
                DNC + time + dotw + I(dotw**2) + {id_formula}"""
    return sm.OLS.from_formula(formula, totals).fit()

In [8]:
# Fit the area fixed-effects model on the "train" rows of the point panel.
# `model_data` is a shared scratch name that later cells rebind.
model_data = get_model_data(point_panel, "train")
train_fe_model = model_area_fe(model_data)
# train_fe_model.summary()

In [9]:
# Display whatever `model_data` currently holds (set by the most recently run fit cell).
model_data

Unnamed: 0,date,id,transit,UCMP,airport,lat,long,stadium,rides,attendance,DNC,is_weekend,dotw,monthofyear
0,2024-01-01,41000,train,1,0.0,-0.759441,0.696228,HYATT REGENCY MCCORMICK PLACE 1600m,2162.0,45000.0,0.0,False,0,01
1,2024-01-01,41690,train,1,0.0,-0.761557,0.914950,HYATT REGENCY MCCORMICK PLACE 1600m,324.0,45000.0,0.0,False,0,01
21,2024-01-02,41000,train,1,0.0,-0.759441,0.696228,HYATT REGENCY MCCORMICK PLACE 1600m,2256.0,45000.0,0.0,False,1,01
22,2024-01-02,41690,train,1,0.0,-0.761557,0.914950,HYATT REGENCY MCCORMICK PLACE 1600m,742.0,45000.0,0.0,False,1,01
40,2024-01-03,41000,train,1,0.0,-0.759441,0.696228,HYATT REGENCY MCCORMICK PLACE 1600m,2249.0,45000.0,0.0,False,2,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9555,2024-10-30,40220,train,1,0.0,-0.241575,-2.056627,UNITED CENTER 1600m,653.0,19015.0,0.0,False,2,10
9556,2024-10-30,40470,train,1,0.0,-0.231298,-0.668511,UNITED CENTER 1600m,934.0,19015.0,0.0,False,2,10
9557,2024-10-30,40810,train,1,0.0,-0.236274,-1.361850,UNITED CENTER 1600m,1636.0,19015.0,0.0,False,2,10
9558,2024-10-30,41030,train,1,0.0,-0.332886,-1.150984,UNITED CENTER 1600m,2658.0,19015.0,0.0,False,2,10


In [10]:
# Fit the area fixed-effects model on the "uber" rows of the tract panel.
model_data = get_model_data(tract_panel, "uber")
uber_fe_model = model_area_fe(model_data)
# uber_fe_model.summary()

In [11]:
# Fit the area fixed-effects model on the "bike" rows of the point panel.
model_data = get_model_data(point_panel, "bike")
bike_fe_model = model_area_fe(model_data)
# bike_fe_model.summary()

In [12]:
fe_summary = summary_col([uber_fe_model, train_fe_model, bike_fe_model], 
            model_names=["FE (Uber)", "FE (Train)", "FE (Bike)"], 
            info_dict={"N":lambda x:(x.nobs)},
            stars=True,
            regressor_order=['DNC','time','dotw','I(dotw ** 2)'], 
            drop_omitted=True)

fe_summary.tables[0].index = (fe_summary.tables[0].index
                              .str.replace('I(dotw ** 2)','dotw**2')
                              .str.replace('DNC','During DNC'))

fe_summary = tabulate(fe_summary.tables[0],
                   headers=["", "FE (Uber)", "FE (Train)", "FE (Bike)"],
                   showindex=True,
                   tablefmt='github')
with open("../../../reports/replication/fe.md","w") as f:
    f.write(fe_summary)
!cp ../../../reports/replication/fe.md ../../../../eric-mc2-cv/static/uploads/
print(fe_summary)

|                | FE (Uber)   | FE (Train)   | FE (Bike)   |
|----------------|-------------|--------------|-------------|
| During DNC     | -0.0139     | -0.0777**    | -0.3525***  |
|                | (0.0263)    | (0.0323)     | (0.0355)    |
| time           | 0.0001      | 0.0022***    | 0.0133***   |
|                | (0.0001)    | (0.0002)     | (0.0002)    |
| dotw           | 0.0499***   | 0.1334***    | 0.0629***   |
|                | (0.0135)    | (0.0174)     | (0.0174)    |
| dotw**2        | 0.0296***   | -0.0272***   | -0.0147***  |
|                | (0.0031)    | (0.0040)     | (0.0040)    |
| R-squared      | 0.9222      | 0.9362       | 0.8249      |
| R-squared Adj. | 0.9209      | 0.9347       | 0.8209      |
| N              | 6295.0      | 978.0        | 5586.0      |


## Difference-in-Differences

In [13]:
def model_did(df, transit_ref: str):
    """Fit a pooled OLS difference-in-differences model of log1p(rides).

    The formula interacts UCMP, DNC and the transit term, and adds a time
    trend, a day-of-week by transit interaction, the ``train_contained`` and
    ``bike_contained`` columns, and latitude/longitude terms. ``time`` is the
    integer category code of each row's date.

    The transit term is produced by ``catvar(df, "transit", transit_ref)``,
    a helper imported from ``stats.reg`` (presumably a categorical term with
    ``transit_ref`` as the reference level -- TODO confirm).

    Returns the fitted statsmodels results object.
    """
    transit = catvar(df, "transit", transit_ref)
    ordered = df.sort_values('date')
    ordered = ordered.assign(time=pd.Categorical(ordered['date']).codes)
    # Alternative terms that were tried and are currently left out:
    #   (UCMP * DNC * {transit}) - (DNC : {transit}) +
    #   airport * DNC +
    #   + I(lat**2) + I(long**2)
    formula = f"""np.log1p(rides) ~ 
                (UCMP * DNC * {transit}) +
                time + dotw * {transit} +
                train_contained + bike_contained +
                lat + long + I(lat * long) 
                """
    return sm.OLS.from_formula(formula, ordered).fit()

def model_transit_did(df):
    """Fit an OLS difference-in-differences model of log1p(rides) for one transit mode.

    The specification is ``UCMP * DNC`` plus a time trend, linear and
    squared day-of-week, and a quadratic surface in ``lat``/``long``.
    ``train_contained`` and ``bike_contained`` are added only when ``df``
    has a ``train_contained`` column. ``time`` is the integer category code
    of each row's date. Returns the fitted statsmodels results object.
    """
    ordered = df.sort_values('date')
    ordered = ordered.assign(time=pd.Categorical(ordered['date']).codes)
    if 'train_contained' in ordered.columns:
        contained_term = "train_contained + bike_contained +"
    else:
        contained_term = ""
    # An `airport * DNC` term was tried and is currently left out.
    formula = f"""np.log1p(rides) ~ 
                UCMP * DNC +
                time + dotw + I(dotw**2) +
                {contained_term}
                lat + long + I(lat * long) 
                + I(lat**2) + I(long**2)
                """
    return sm.OLS.from_formula(formula, ordered).fit()

In [14]:
# Fit the per-mode DiD model on the "bike" rows of the point panel and show its summary.
model_data = get_model_data(point_panel, "bike")
bike_did_model = model_transit_did(model_data)
bike_did_model.summary()

0,1,2,3
Dep. Variable:,np.log1p(rides),R-squared:,0.516
Model:,OLS,Adj. R-squared:,0.515
Method:,Least Squares,F-statistic:,540.6
Date:,"Sat, 11 Jan 2025",Prob (F-statistic):,0.0
Time:,00:13:14,Log-Likelihood:,-6830.8
No. Observations:,5586,AIC:,13690.0
Df Residuals:,5574,BIC:,13770.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.4933,0.071,49.280,0.000,3.354,3.632
UCMP,-0.7144,0.051,-13.962,0.000,-0.815,-0.614
DNC,-0.4747,0.080,-5.909,0.000,-0.632,-0.317
UCMP:DNC,0.3581,0.104,3.440,0.001,0.154,0.562
time,0.0116,0.000,31.365,0.000,0.011,0.012
dotw,0.0484,0.029,1.691,0.091,-0.008,0.105
I(dotw ** 2),-0.0108,0.007,-1.633,0.102,-0.024,0.002
lat,0.9195,0.017,53.803,0.000,0.886,0.953
long,0.4978,0.019,26.574,0.000,0.461,0.535

0,1,2,3
Omnibus:,313.724,Durbin-Watson:,1.656
Prob(Omnibus):,0.0,Jarque-Bera (JB):,440.053
Skew:,-0.509,Prob(JB):,2.78e-96
Kurtosis:,3.925,Cond. No.,1050.0


In [15]:
# Fit the per-mode DiD model on the "train" rows of the point panel and show its summary.
model_data = get_model_data(point_panel, "train")
train_did_model = model_transit_did(model_data)
train_did_model.summary()

0,1,2,3
Dep. Variable:,np.log1p(rides),R-squared:,0.516
Model:,OLS,Adj. R-squared:,0.511
Method:,Least Squares,F-statistic:,93.77
Date:,"Sat, 11 Jan 2025",Prob (F-statistic):,5.01e-144
Time:,00:13:14,Log-Likelihood:,-823.54
No. Observations:,978,AIC:,1671.0
Df Residuals:,966,BIC:,1730.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.9014,0.137,57.526,0.000,7.632,8.171
UCMP,-0.1585,0.103,-1.535,0.125,-0.361,0.044
DNC,-0.2570,0.129,-1.995,0.046,-0.510,-0.004
UCMP:DNC,0.2242,0.168,1.336,0.182,-0.105,0.554
time,0.0020,0.000,4.189,0.000,0.001,0.003
dotw,0.1470,0.048,3.080,0.002,0.053,0.241
I(dotw ** 2),-0.0312,0.011,-2.851,0.004,-0.053,-0.010
lat,0.3016,0.046,6.606,0.000,0.212,0.391
long,0.0906,0.058,1.562,0.119,-0.023,0.204

0,1,2,3
Omnibus:,53.171,Durbin-Watson:,2.101
Prob(Omnibus):,0.0,Jarque-Bera (JB):,61.275
Skew:,-0.568,Prob(JB):,4.95e-14
Kurtosis:,3.46,Cond. No.,1200.0


In [16]:
# Fit the per-mode DiD model on the "uber" rows of the tract panel and show its summary.
model_data = get_model_data(tract_panel, "uber")
uber_did_model = model_transit_did(model_data)
uber_did_model.summary()

0,1,2,3
Dep. Variable:,np.log1p(rides),R-squared:,0.553
Model:,OLS,Adj. R-squared:,0.552
Method:,Least Squares,F-statistic:,598.0
Date:,"Sat, 11 Jan 2025",Prob (F-statistic):,0.0
Time:,00:13:15,Log-Likelihood:,-8806.6
No. Observations:,6295,AIC:,17640.0
Df Residuals:,6281,BIC:,17740.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.6909,0.088,65.021,0.000,5.519,5.862
UCMP,-0.1049,0.049,-2.136,0.033,-0.201,-0.009
DNC,-0.1700,0.092,-1.840,0.066,-0.351,0.011
UCMP:DNC,0.3725,0.122,3.047,0.002,0.133,0.612
time,-0.0005,0.000,-1.728,0.084,-0.001,6.28e-05
dotw,0.0607,0.032,1.883,0.060,-0.003,0.124
I(dotw ** 2),0.0266,0.007,3.571,0.000,0.012,0.041
train_contained,-0.2073,0.047,-4.434,0.000,-0.299,-0.116
bike_contained,0.3111,0.007,44.604,0.000,0.297,0.325

0,1,2,3
Omnibus:,325.597,Durbin-Watson:,1.676
Prob(Omnibus):,0.0,Jarque-Bera (JB):,607.463
Skew:,-0.39,Prob(JB):,1.2300000000000001e-132
Kurtosis:,4.306,Cond. No.,1340.0


In [17]:
headers = ["DiD (Uber)", "DiD (Train)", "DiD (Bike)"]
did_summary = summary_col([uber_did_model, train_did_model, bike_did_model], 
            model_names=headers, 
            info_dict={"N":lambda x:(x.nobs)},
            stars=True,
            regressor_order=['UCMP','DNC','UCMP:DNC',
                             'time','dotw','I(dotw ** 2)'],
            drop_omitted=True)
                            #  'lat','long','I(lat * long)','I(lat ** 2)','I(long ** 2)'],

did_summary.tables[0].index = (did_summary.tables[0].index
                               .str.replace('DNC','During DNC')
                               .str.replace('UCMP','Near DNC')
                               .str.replace('I(dotw ** 2)','dotw**2')
                               .str.replace('I(dotw ** 2)','dotw**2')
                               .str.replace('I(lat ** 2)','lat**2')
                               .str.replace('I(long ** 2)','long**2')
                               .str.replace('I(lat * long)','lat*long'))

did_summary = tabulate(did_summary.tables[0],
                   headers=[""] + headers,
                   showindex=True,
                   tablefmt='github')
with open("../../../reports/replication/did.md","w") as f:
    f.write(did_summary)
!cp ../../../reports/replication/did.md ../../../../eric-mc2-cv/static/uploads/
print(did_summary)

|                     | DiD (Uber)   | DiD (Train)   | DiD (Bike)   |
|---------------------|--------------|---------------|--------------|
| Near DNC            | -0.1049**    | -0.1585       | -0.7144***   |
|                     | (0.0491)     | (0.1033)      | (0.0512)     |
| During DNC          | -0.1700*     | -0.2570**     | -0.4747***   |
|                     | (0.0924)     | (0.1288)      | (0.0803)     |
| Near DNC:During DNC | 0.3725***    | 0.2242        | 0.3581***    |
|                     | (0.1222)     | (0.1678)      | (0.1041)     |
| time                | -0.0005*     | 0.0020***     | 0.0116***    |
|                     | (0.0003)     | (0.0005)      | (0.0004)     |
| dotw                | 0.0607*      | 0.1470***     | 0.0484*      |
|                     | (0.0323)     | (0.0477)      | (0.0286)     |
| dotw**2             | 0.0266***    | -0.0312***    | -0.0108      |
|                     | (0.0074)     | (0.0110)      | (0.0066)     |
| R-squared         

### Parallel Trends

The parallel-trends (PT) assumption actually looks pretty convincing.

In [18]:
# Pool the three transit modes into one frame.
plot_data = pd.concat([
    get_model_data(tract_panel, "uber"),
    get_model_data(point_panel, "train"),
    get_model_data(point_panel, "bike")])
plot_data['UCMP'] = plot_data['UCMP'].map({0:False, 1:True})
# Average rides per date, group and mode over the rows with DNC == 0.
plot_data = plot_data[plot_data.DNC==0].groupby(['date','UCMP','transit'])['rides'].mean().reset_index()
means = plot_data.groupby(['UCMP','transit'])['rides'].mean().rename('mean').reset_index()
# Earliest observed value of every (UCMP, transit) series. Taken per group
# rather than by slicing the first six date-sorted rows, which is only
# correct when all six series have a row on the very first date.
first = (plot_data.sort_values('date')
         .groupby(['UCMP','transit'], as_index=False)['rides'].first()
         .rename(columns={'rides':'first'}))
plot_data = plot_data.merge(means, on=['UCMP','transit']).merge(first, on=['UCMP','transit'])
# Despite its name, `demean` is rides divided by the series' first value
# (the `mean` column is merged in but not used here).
plot_data = plot_data.assign(demean = plot_data['rides'] / plot_data['first'])

In [19]:
# fig, ax = plt.subplots(1,1, figsize=(6,6))
chart = px.line(plot_data, x='date', y='demean', 
        line_dash='UCMP', facet_row='transit',
        color_discrete_sequence=['black'],
        title= 'Rides (normalized to group mean)',
        labels={'date':'', 'demean':'',
                'UCMP':'Near DNC'},
        template='simple_white')
chart.write_image("../../../reports/replication/parallel_trends.jpeg")
!cp ../../../reports/replication/parallel_trends.jpeg ../../../../eric-mc2-cv/static/img
chart


This looks pretty good. We don't need to take the literal derivative.

In [20]:
# But just to check, there is no sustained difference in slopes.
# Step 1: day-over-day change of the normalized series within every
# (UCMP, transit) group. Rows are sorted by date *before* differencing so
# the result does not depend on the incoming row order.
diff_data = plot_data.sort_values(['transit','UCMP','date'])
diff_data = diff_data.assign(
    diff = diff_data.groupby(['UCMP','transit'])['demean'].diff(1))
# Step 2: within every (transit, date) pair, subtract the UCMP=False change
# from the UCMP=True change. The value lands on the UCMP=True row; the
# UCMP=False row is NaN.
diff_data = diff_data.sort_values(['transit','date','UCMP'])
diff_data = diff_data.assign(
    ddiff = diff_data.groupby(['transit','date'])['diff'].diff(1))
chart = px.line(diff_data,
         x='date', y='ddiff',
        line_dash='UCMP', facet_row='transit',
        color_discrete_sequence=['black'],
        title= 'difference in slopes',
        labels={'date':'', 'ddiff':'',
                'UCMP':'Near DNC'},
        template='simple_white')
chart








# Parallel Trends Test

In [71]:
def model_pt(df, dnc_start="2024-08-19", window_size=4, horizon=31):
    """Fit a placebo (pre-period) OLS model used to test parallel trends.

    Only rows dated before ``dnc_start`` are used. They are bucketed into
    windows of ``window_size`` time steps counted backwards from the last
    remaining date, where a time step is one distinct date present in the
    data. The most recent ``horizon // window_size + 1`` windows each get
    their own level; everything older is pooled into a single level. The
    model interacts ``UCMP`` with the window label, so the ``UCMP:pre_label``
    coefficients measure how the UCMP gap differs between windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``date``, ``rides``, ``UCMP``, ``dotw``, ``monthofyear``,
        ``lat`` and ``long``; ``train_contained`` and ``bike_contained`` are
        added to the formula only when ``train_contained`` is a column.
    dnc_start : str, default "2024-08-19"
        Rows with ``date`` on or after this value are dropped.
    window_size : int, default 4
        Number of time steps per placebo window.
    horizon : int, default 31
        Determines the number of separate windows (``horizon // window_size``).

    Returns
    -------
    The fitted statsmodels results object.

    Notes
    -----
    ``pre_label`` is a string column. NOTE(review): the reference level of
    the interaction is therefore presumably chosen by string ordering rather
    than by recency -- confirm if individual coefficients are interpreted.
    """
    df = df.loc[df.date < dnc_start]
    df = df.sort_values('date')
    df['time'] = pd.Categorical(df['date']).codes
    # Compute windows for placebo dates
    n_windows = horizon // window_size
    df['windows'] = (df['time'].max() - df['time']) // window_size # count backwards
    pre = n_windows + 1 - df['windows'].clip(lower=0, upper=n_windows+1) # count forwards
    df['pre'] = pre.astype(str)
    # Label each window by how many time steps before the last date it starts.
    df['pre_label'] = (df['time'].max() - df.groupby('pre')['time'].transform('min')).astype(str)
    # Keep only windows in which both UCMP groups are observed.
    df = df.loc[df.groupby('pre')['UCMP'].transform('nunique') == 2]

    # The trailing "+" belongs to the optional term so that an empty term
    # does not leave a dangling "+ +" in the formula.
    contained_term = "train_contained + bike_contained +" if 'train_contained' in df.columns else ""
    formula = f"""np.log1p(rides) ~ 
                UCMP * pre_label +
                {contained_term}
                C(dotw) + monthofyear +
                lat + long + I(lat * long) +
                I(lat**2) + I(long**2)
                """
    model = sm.OLS.from_formula(formula, df).fit()
    return model

In [72]:
# Fit the parallel-trends (placebo) model once per transit mode.
# `model_data` is rebound for each mode and ends up holding the uber rows.
model_data = get_model_data(point_panel, "train")
train_pt_model = model_pt(model_data)
model_data = get_model_data(point_panel, "bike")
bike_pt_model = model_pt(model_data)
model_data = get_model_data(tract_panel, "uber")
uber_pt_model = model_pt(model_data)

In [67]:
# Joint F-test, one per model, that every UCMP x pre-period interaction is zero.
for pt_model in (uber_pt_model, train_pt_model, bike_pt_model):
    constraints = [x for x in pt_model.model.exog_names if 'UCMP:pre' in x]
    hypothesis = ','.join([f"({c} = 0)" for c in constraints])
    print(pt_model.f_test(hypothesis))

<F test: F=2.336237658922619, p=0.016755019908966794, df_denom=4.83e+03, df_num=8>
<F test: F=0.22394612470265718, p=0.9250631711935731, df_denom=695, df_num=4>
<F test: F=0.8760114963897017, p=0.5358335014437943, df_denom=5.17e+03, df_num=8>


This looks better, though I'm not sure which Uber coefficient is causing the test to reject H0.

In [68]:
# Collect the UCMP x pre-period interaction names that appear in any model.
regs = [
    name
    for fitted in (bike_pt_model, train_pt_model, uber_pt_model)
    for name in fitted.model.exog_names
    if 'UCMP:pre' in name
]
headers = ["PT (Uber)", "PT (Train)", "PT (Bike)"]
interaction_order = sorted(set(regs))
pt_summary = summary_col([uber_pt_model, train_pt_model, bike_pt_model],
            model_names=headers,
            info_dict={"N":lambda x:(x.nobs)},
            stars=True,
            regressor_order=interaction_order,
            drop_omitted=True)

# Render as a GitHub-flavoured markdown table; `pt_summary` now holds the text.
pt_summary = tabulate(pt_summary.tables[0],
                   headers=[""] + headers,
                   showindex=True,
                   tablefmt='github')
# Writing the table to disk is currently disabled:
# with open(pt_table_out,"w") as f:
#     f.write(pt_summary)
# !cp $pt_table_out "$cv_dir/uploads"
print(pt_summary)

|                       | PT (Uber)   | PT (Train)   | PT (Bike)   |
|-----------------------|-------------|--------------|-------------|
| UCMP:pre_label[T.152] | 0.3461      | -0.0037      | 0.2096      |
|                       | (0.3062)    | (0.3084)     | (0.4290)    |
| UCMP:pre_label[T.15]  | -0.0013     |              | 0.1644      |
|                       | (0.3511)    |              | (0.4515)    |
| UCMP:pre_label[T.19]  | 0.2734      | 0.1134       | 0.5318      |
|                       | (0.3790)    | (0.4353)     | (0.4575)    |
| UCMP:pre_label[T.23]  | -0.0154     | 0.1518       | 0.3482      |
|                       | (0.3678)    | (0.5300)     | (0.4692)    |
| UCMP:pre_label[T.27]  | 0.8687      | 0.3315       | 0.1391      |
|                       | (0.5432)    | (0.5035)     | (0.4850)    |
| UCMP:pre_label[T.31]  | -0.3381     |              | -0.1304     |
|                       | (0.3674)    |              | (0.5236)    |
| UCMP:pre_label[T.3]   | -0.0024 