In [118]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import plotly.express as px
from statsmodels.regression.linear_model import RegressionResults
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.iolib.table import SimpleTable
from statsmodels.iolib.summary2 import summary_col
from tabulate import  tabulate
from stats.reg import coefplot, prettify_coefs, main_effects, compute_vif, joint_effect, cond_effect, catvar

In [119]:
# Input locations of the final panel datasets (parquet files).
# The paths are relative, so they resolve against the kernel's working directory.
point_panel_in = "../../../data/final/point_panel.parquet"
line_panel_in = "../../../data/final/line_panel.parquet"
tract_panel_in = "../../../data/final/tract_panel.parquet"
comm_panel_in = "../../../data/final/comm_panel.parquet"

In [120]:
# Read the four panels from parquet in one pass; the order of the targets
# matches the order of the input paths.
point_panel, line_panel, tract_panel, comm_panel = [
    pd.read_parquet(panel_path)
    for panel_path in (point_panel_in, line_panel_in, tract_panel_in, comm_panel_in)
]

## Preprocessing

In [121]:
# Keep weekday observations only: drop every row whose is_weekend flag is set.
comm_panel = comm_panel[~comm_panel['is_weekend']]
tract_panel = tract_panel[~tract_panel['is_weekend']]
line_panel = line_panel[~line_panel['is_weekend']]
point_panel = point_panel[~point_panel['is_weekend']]

In [122]:
def get_model_data(panel, transit):
    """Return the rows of ``panel`` for one transit mode, without all-zero units.

    Parameters
    ----------
    panel : pandas.DataFrame
        Frame with at least the columns ``transit``, ``id`` and ``rides``.
    transit : str
        Value of the ``transit`` column to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows, excluding every ``id`` whose rides sum to zero
        within the selected transit mode.
    """
    subset = panel.query(f'transit == "{transit}"')
    # Broadcast each unit's total rides back onto its rows so it can be used as a mask.
    rides_per_unit = subset.groupby('id')['rides'].transform('sum')
    return subset[rides_per_unit != 0]

In [123]:
# def pretty_summary(model, exclude=None):
#     s = model.summary()
#     data = pd.DataFrame(s.tables[1].data)
#     data.columns = ['name'] + list(data.iloc[0,1:])
#     data = data.iloc[1:, :].reset_index(drop=True)
#     if exclude is not None:
#         data = data[~data.name.str.contains(exclude)]
#     s.tables = [s.tables[0], data, s.tables[2]]
#     return s

## Fixed Effects

In [124]:
def model_transit_fe(df):
    """Fit an OLS model of log1p(rides) with one fixed effect per transit/unit pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel with the columns ``transit``, ``id``, ``date``, ``rides``,
        ``DNC`` and ``dotw``. The caller's frame is not modified.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    # 'tid' is a string key combining transit mode and unit id; it enters the
    # formula as the unit fixed effect.
    panel = (
        df.assign(tid=df['transit'] + ":" + df['id'].astype(str))
          .sort_values('date')
    )
    # Linear time index: the integer code of each distinct date.
    panel['time'] = pd.Categorical(panel['date']).codes
    formula = f"""np.log1p(rides) ~ 
                DNC + time + dotw + tid"""
    return sm.OLS.from_formula(formula, panel).fit()

def model_area_fe(df):
    """Fit an OLS model of log1p(rides) with one fixed effect per unit ``id``.

    Rides are first summed over every row sharing the same
    (id, date, dotw, DNC) combination, so each unit contributes one
    observation per date.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel with the columns ``id``, ``date``, ``dotw``, ``DNC`` and ``rides``.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    # Community areas don't have separate units per transit mode. Since the
    # fixed effects can't distinguish transit anyway, sum the rides; otherwise
    # the variance per unit is huge.
    unit_totals = (
        df.groupby(['id','date','dotw','DNC'])['rides']
          .sum()
          .reset_index()
          .sort_values('date')
    )
    # Linear time index: the integer code of each distinct date.
    unit_totals['time'] = pd.Categorical(unit_totals['date']).codes
    id_formula = "C(id)" #catvar(df, "id", tract_panel.id.min())
    formula = f"""np.log1p(rides) ~ 
                DNC + time + dotw + I(dotw**2) + {id_formula}"""
    return sm.OLS.from_formula(formula, unit_totals).fit()

In [125]:
# Fixed-effects model for the "train" rows of the point panel.
# Note: `model_data` is a shared name that later cells overwrite.
model_data = get_model_data(point_panel, "train")
train_fe_model = model_area_fe(model_data)
# Uncomment to inspect the full regression table:
# train_fe_model.summary()

In [126]:
# Display whatever `model_data` currently holds (set by the most recently run fitting cell).
model_data

Unnamed: 0,date,id,transit,UCMP,airport,lat,long,rides,DNC,is_weekend,dotw
2,2024-06-03,40030,train,0,0.0,-0.115786,-1.133519,936.0,0.0,False,0
3,2024-06-03,40040,train,0,0.0,-0.208408,0.736134,4520.0,0.0,False,0
4,2024-06-03,40060,train,0,0.0,0.614223,-0.867442,3088.0,0.0,False,0
5,2024-06-03,40070,train,0,0.0,-0.215886,0.826778,3330.0,0.0,False,0
6,2024-06-03,40080,train,0,0.0,0.830830,0.303946,2404.0,0.0,False,0
...,...,...,...,...,...,...,...,...,...,...,...
82582,2024-08-30,41660,train,0,0.0,-0.124136,0.857026,11129.0,0.0,False,4
82583,2024-08-30,41670,train,0,0.0,-0.122820,-0.952374,668.0,0.0,False,4
82584,2024-08-30,41690,train,1,0.0,-0.563000,0.885806,1750.0,0.0,False,4
82585,2024-08-30,41700,train,0,0.0,-0.146139,0.890151,6749.0,0.0,False,4


In [127]:
# Fixed-effects model for the "uber" rows of the tract panel.
# Note: this overwrites the shared `model_data` name.
model_data = get_model_data(tract_panel, "uber")
uber_fe_model = model_area_fe(model_data)
# Uncomment to inspect the full regression table:
# uber_fe_model.summary()

In [128]:
# Fixed-effects model for the "bike" rows of the point panel.
# Note: this overwrites the shared `model_data` name.
model_data = get_model_data(point_panel, "bike")
bike_fe_model = model_area_fe(model_data)
# Uncomment to inspect the full regression table:
# bike_fe_model.summary()

In [129]:
fe_summary = summary_col([uber_fe_model, train_fe_model, bike_fe_model], 
            model_names=["FE (Uber)", "FE (Train)", "FE (Bike)"], 
            info_dict={"N":lambda x:(x.nobs)},
            stars=True,
            regressor_order=['DNC','time','dotw','I(dotw ** 2)'], 
            drop_omitted=True)

fe_summary.tables[0].index = (fe_summary.tables[0].index
                              .str.replace('I(dotw ** 2)','dotw**2')
                              .str.replace('DNC','During DNC'))

fe_summary = tabulate(fe_summary.tables[0],
                   headers=["", "FE (Uber)", "FE (Train)", "FE (Bike)"],
                   showindex=True,
                   tablefmt='github')
with open("../../../reports/replication/fe.md","w") as f:
    f.write(fe_summary)
!cp ../../../reports/replication/fe.md ../../../../eric-mc2-cv/static/uploads/
print(fe_summary)

|                | FE (Uber)   | FE (Train)   | FE (Bike)   |
|----------------|-------------|--------------|-------------|
| During DNC     | -0.0588***  | -0.0841***   | 0.0287***   |
|                | (0.0066)    | (0.0148)     | (0.0069)    |
| time           | 0.0013***   | 0.0014***    | -0.0001     |
|                | (0.0001)    | (0.0002)     | (0.0001)    |
| dotw           | 0.0485***   | 0.1022***    | 0.0591***   |
|                | (0.0037)    | (0.0083)     | (0.0039)    |
| dotw**2        | 0.0229***   | -0.0234***   | -0.0080***  |
|                | (0.0009)    | (0.0020)     | (0.0009)    |
| R-squared      | 0.9515      | 0.9017       | 0.9488      |
| R-squared Adj. | 0.9506      | 0.9001       | 0.9473      |
| N              | 65226.0     | 7910.0       | 53053.0     |


## Difference-in-Differences (DiD)

In [130]:
def model_did(df, transit_ref: str):
    """Fit a pooled OLS model of log1p(rides) with UCMP x DNC x transit interactions.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel containing the columns referenced by the formula below:
        ``rides``, ``UCMP``, ``DNC``, ``transit``, ``date``, ``dotw``,
        ``train_contained``, ``bike_contained``, ``lat`` and ``long``.
    transit_ref : str
        Passed to ``catvar`` together with the ``transit`` column; presumably
        the reference level of the categorical encoding -- TODO confirm
        against ``stats.reg.catvar``.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    # catvar is a project helper; its result is interpolated into the formula
    # wherever the transit term is needed.
    transit = catvar(df, "transit", transit_ref)
    # sort_values returns a new frame, so adding 'time' below does not touch
    # the caller's frame.
    df = df.sort_values('date')
    # Linear time index: the integer code of each distinct date.
    df['time'] = pd.Categorical(df['date']).codes
    formula = f"""np.log1p(rides) ~ 
                (UCMP * DNC * {transit}) +
                time + dotw * {transit} +
                train_contained + bike_contained +
                lat + long + I(lat * long) 
                """
                # Alternative terms that are currently disabled:
                # (UCMP * DNC * {transit}) - (DNC : {transit}) +
                # airport * DNC + 
                # + I(lat**2) + I(long**2)
    did_model = sm.OLS.from_formula(formula, df).fit()
    return did_model

def model_transit_did(df):
    """Fit a single-mode difference-in-differences OLS model of log1p(rides).

    The formula contains the UCMP x DNC interaction, a linear time index,
    a quadratic in ``dotw`` and a quadratic surface in ``lat``/``long``.
    The containment controls are added when the frame has a
    ``train_contained`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel for one transit mode with the columns ``rides``, ``UCMP``,
        ``DNC``, ``date``, ``dotw``, ``lat`` and ``long``.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    # sort_values returns a new frame, so the caller's frame is left untouched.
    panel = df.sort_values('date')
    # Linear time index: the integer code of each distinct date.
    panel['time'] = pd.Categorical(panel['date']).codes
    # NOTE(review): only 'train_contained' is checked, yet both containment
    # columns are added -- assumes 'bike_contained' is present whenever
    # 'train_contained' is.
    if 'train_contained' in panel.columns:
        contained_term = "train_contained + bike_contained +"
    else:
        contained_term = ""
    formula = f"""np.log1p(rides) ~ 
                UCMP * DNC +
                time + dotw + I(dotw**2) +
                {contained_term}
                lat + long + I(lat * long) 
                + I(lat**2) + I(long**2)
                """
                # Disabled alternative term:
                # airport * DNC + 
    return sm.OLS.from_formula(formula, panel).fit()

In [131]:
# Difference-in-differences model for the "bike" rows of the point panel.
# Note: this overwrites the shared `model_data` name.
model_data = get_model_data(point_panel, "bike")
bike_did_model = model_transit_did(model_data)
# The last expression renders the full regression summary as the cell output.
bike_did_model.summary()

0,1,2,3
Dep. Variable:,np.log1p(rides),R-squared:,0.664
Model:,OLS,Adj. R-squared:,0.664
Method:,Least Squares,F-statistic:,9511.0
Date:,"Mon, 30 Dec 2024",Prob (F-statistic):,0.0
Time:,10:51:41,Log-Likelihood:,-69539.0
No. Observations:,53053,AIC:,139100.0
Df Residuals:,53041,BIC:,139200.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.1474,0.011,277.358,0.000,3.125,3.170
UCMP,0.2384,0.018,13.371,0.000,0.203,0.273
DNC,0.0223,0.018,1.253,0.210,-0.013,0.057
UCMP:DNC,0.2430,0.072,3.356,0.001,0.101,0.385
time,0.0002,0.000,0.905,0.365,-0.000,0.001
dotw,0.0601,0.010,6.169,0.000,0.041,0.079
I(dotw ** 2),-0.0095,0.002,-4.045,0.000,-0.014,-0.005
lat,0.7246,0.006,128.765,0.000,0.714,0.736
long,1.3466,0.006,231.459,0.000,1.335,1.358

0,1,2,3
Omnibus:,468.917,Durbin-Watson:,1.722
Prob(Omnibus):,0.0,Jarque-Bera (JB):,481.476
Skew:,-0.23,Prob(JB):,2.81e-105
Kurtosis:,3.074,Cond. No.,699.0


In [132]:
# Difference-in-differences model for the "train" rows of the point panel.
# Note: this overwrites the shared `model_data` name.
model_data = get_model_data(point_panel, "train")
train_did_model = model_transit_did(model_data)
# The last expression renders the full regression summary as the cell output.
train_did_model.summary()

0,1,2,3
Dep. Variable:,np.log1p(rides),R-squared:,0.269
Model:,OLS,Adj. R-squared:,0.267
Method:,Least Squares,F-statistic:,263.6
Date:,"Mon, 30 Dec 2024",Prob (F-statistic):,0.0
Time:,10:51:41,Log-Likelihood:,-9441.2
No. Observations:,7910,AIC:,18910.0
Df Residuals:,7898,BIC:,18990.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.3506,0.026,284.597,0.000,7.300,7.401
UCMP,-0.6558,0.039,-16.757,0.000,-0.732,-0.579
DNC,-0.1222,0.041,-2.961,0.003,-0.203,-0.041
UCMP:DNC,0.6846,0.151,4.534,0.000,0.389,0.981
time,0.0008,0.001,1.610,0.107,-0.000,0.002
dotw,0.1026,0.022,4.579,0.000,0.059,0.147
I(dotw ** 2),-0.0234,0.005,-4.336,0.000,-0.034,-0.013
lat,0.3832,0.012,32.889,0.000,0.360,0.406
long,0.6321,0.015,42.793,0.000,0.603,0.661

0,1,2,3
Omnibus:,3608.617,Durbin-Watson:,2.089
Prob(Omnibus):,0.0,Jarque-Bera (JB):,61236.013
Skew:,-1.755,Prob(JB):,0.0
Kurtosis:,16.171,Cond. No.,635.0


In [133]:
# Difference-in-differences model for the "uber" rows of the tract panel.
# Note: this overwrites the shared `model_data` name.
model_data = get_model_data(tract_panel, "uber")
uber_did_model = model_transit_did(model_data)
# The last expression renders the full regression summary as the cell output.
uber_did_model.summary()

0,1,2,3
Dep. Variable:,np.log1p(rides),R-squared:,0.502
Model:,OLS,Adj. R-squared:,0.502
Method:,Least Squares,F-statistic:,5062.0
Date:,"Mon, 30 Dec 2024",Prob (F-statistic):,0.0
Time:,10:51:42,Log-Likelihood:,-104170.0
No. Observations:,65226,AIC:,208400.0
Df Residuals:,65212,BIC:,208500.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.1575,0.021,150.343,0.000,3.116,3.199
UCMP,0.6664,0.025,26.478,0.000,0.617,0.716
DNC,-0.0608,0.021,-2.845,0.004,-0.103,-0.019
UCMP:DNC,0.1855,0.100,1.863,0.062,-0.010,0.381
time,0.0014,0.000,5.255,0.000,0.001,0.002
dotw,0.0475,0.012,4.063,0.000,0.025,0.070
I(dotw ** 2),0.0202,0.003,7.219,0.000,0.015,0.026
train_contained,0.2960,0.017,17.915,0.000,0.264,0.328
bike_contained,0.2246,0.003,81.916,0.000,0.219,0.230

0,1,2,3
Omnibus:,662.155,Durbin-Watson:,1.349
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1099.187
Skew:,-0.05,Prob(JB):,2.06e-239
Kurtosis:,3.628,Cond. No.,805.0


In [134]:
headers = ["DiD (Uber)", "DiD (Train)", "DiD (Bike)"]
did_summary = summary_col([uber_did_model, train_did_model, bike_did_model], 
            model_names=headers, 
            info_dict={"N":lambda x:(x.nobs)},
            stars=True,
            regressor_order=['UCMP','DNC','UCMP:DNC',
                             'time','dotw','I(dotw ** 2)'],
            drop_omitted=True)
                            #  'lat','long','I(lat * long)','I(lat ** 2)','I(long ** 2)'],

did_summary.tables[0].index = (did_summary.tables[0].index
                               .str.replace('DNC','During DNC')
                               .str.replace('UCMP','Near DNC')
                               .str.replace('I(dotw ** 2)','dotw**2')
                               .str.replace('I(dotw ** 2)','dotw**2')
                               .str.replace('I(lat ** 2)','lat**2')
                               .str.replace('I(long ** 2)','long**2')
                               .str.replace('I(lat * long)','lat*long'))

did_summary = tabulate(did_summary.tables[0],
                   headers=[""] + headers,
                   showindex=True,
                   tablefmt='github')
with open("../../../reports/replication/did.md","w") as f:
    f.write(did_summary)
!cp ../../../reports/replication/did.md ../../../../eric-mc2-cv/static/uploads/
print(did_summary)

|                     | DiD (Uber)   | DiD (Train)   | DiD (Bike)   |
|---------------------|--------------|---------------|--------------|
| Near DNC            | 0.6664***    | -0.6558***    | 0.2384***    |
|                     | (0.0252)     | (0.0391)      | (0.0178)     |
| During DNC          | -0.0608***   | -0.1222***    | 0.0223       |
|                     | (0.0214)     | (0.0413)      | (0.0178)     |
| Near DNC:During DNC | 0.1855*      | 0.6846***     | 0.2430***    |
|                     | (0.0996)     | (0.1510)      | (0.0724)     |
| time                | 0.0014***    | 0.0008        | 0.0002       |
|                     | (0.0003)     | (0.0005)      | (0.0002)     |
| dotw                | 0.0475***    | 0.1026***     | 0.0601***    |
|                     | (0.0117)     | (0.0224)      | (0.0098)     |
| dotw**2             | 0.0202***    | -0.0234***    | -0.0095***   |
|                     | (0.0028)     | (0.0054)      | (0.0023)     |
| R-squared         

### Parallel Trends

The parallel-trends (PT) assumption is actually pretty convincing.

In [135]:
# Pool the model data of the three transit modes into one frame.
plot_data = pd.concat([
    get_model_data(tract_panel, "uber"),
    get_model_data(point_panel, "train"),
    get_model_data(point_panel, "bike")])
plot_data['UCMP'] = plot_data['UCMP'].map({0:False, 1:True})
# Daily mean rides per (UCMP, transit) group, using rows with DNC == 0 only.
plot_data = plot_data[plot_data.DNC==0].groupby(['date','UCMP','transit'])['rides'].mean().reset_index()
means = plot_data.groupby(['UCMP','transit'])['rides'].mean().rename('mean').reset_index()
# Baseline = each group's value on its own earliest date. Taking one row per
# group (instead of the first six rows overall) stays correct even when a
# group has no observation on the very first date of the panel.
first = (plot_data.sort_values('date')
         .drop_duplicates(['UCMP','transit'])
         [['UCMP','transit','rides']]
         .rename(columns={'rides':'first'}))
plot_data = plot_data.merge(means, on=['UCMP','transit']).merge(first, on=['UCMP','transit'])
# Despite its name, 'demean' is the ratio of daily mean rides to the group's baseline value.
plot_data = plot_data.assign(demean = plot_data['rides'] / plot_data['first'])

In [136]:
# fig, ax = plt.subplots(1,1, figsize=(6,6))
chart = px.line(plot_data, x='date', y='demean', 
        line_dash='UCMP', facet_row='transit',
        color_discrete_sequence=['black'],
        title= 'Rides (normalized to group mean)',
        labels={'date':'', 'demean':'',
                'UCMP':'Near DNC'},
        template='simple_white')
chart.write_image("../../../reports/replication/parallel_trends.jpeg")
!cp ../../../reports/replication/parallel_trends.jpeg ../../../../eric-mc2-cv/static/img
chart


This looks pretty good. We don't need to compute the literal derivative of each series.

In [None]:
# But just to check, there is no sustained difference in slopes.
# 'diff'  : day-over-day change of 'demean' within each (UCMP, transit) series.
# 'ddiff' : gap in that change between the two UCMP groups on the same
#           transit/date (NaN on the first row of every pair).
# Each frame is sorted *before* differencing so the differences follow date
# (then UCMP) order rather than the incoming row order. Grouped .diff() also
# avoids groupby.apply, which no longer passes the grouping columns to the
# function in recent pandas.
diff_data = (
    plot_data
    .sort_values(['transit', 'UCMP', 'date'])
    .assign(diff=lambda d: d.groupby(['UCMP', 'transit'])['demean'].diff(1))
    .sort_values(['transit', 'date', 'UCMP'])
    .assign(ddiff=lambda d: d.groupby(['transit', 'date'])['diff'].diff(1))
)
chart = px.line(diff_data,
         x='date', y='ddiff', 
        line_dash='UCMP', facet_row='transit',
        color_discrete_sequence=['black'],
        title= 'difference in slopes',
        # The plotted column is 'ddiff', so that is the axis label to blank out.
        labels={'date':'', 'ddiff':'',
                'UCMP':'Near DNC'},
        template='simple_white')
chart






