In [None]:
import pandas as pd
import re
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.linear_model import RegressionResults
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.iolib.table import SimpleTable
from statsmodels.iolib.summary2 import summary_col
from tabulate import  tabulate
from stats.reg import coefplot, prettify_coefs, main_effects, compute_vif, joint_effect, cond_effect, catvar

In [None]:
# Locations of the four input panel datasets (parquet files), written as
# relative paths, so they resolve against the notebook's working directory.
point_panel_in = "../../../data/final/point_panel.parquet"
line_panel_in = "../../../data/final/line_panel.parquet"
tract_panel_in = "../../../data/final/tract_panel.parquet"
comm_panel_in = "../../../data/final/comm_panel.parquet"

In [None]:
# Load each panel from its parquet file into a DataFrame.
point_panel = pd.read_parquet(path=point_panel_in)
line_panel = pd.read_parquet(path=line_panel_in)
tract_panel = pd.read_parquet(path=tract_panel_in)
comm_panel = pd.read_parquet(path=comm_panel_in)

## Preprocessing

In [None]:
# Keep only the rows that are not flagged as weekend observations.
# Each panel name is rebound to its filtered version.
comm_panel = comm_panel[~comm_panel['is_weekend']]
tract_panel = tract_panel[~tract_panel['is_weekend']]
line_panel = line_panel[~line_panel['is_weekend']]
point_panel = point_panel[~point_panel['is_weekend']]

In [None]:
# def pretty_summary(model, exclude=None):
#     s = model.summary()
#     data = pd.DataFrame(s.tables[1].data)
#     data.columns = ['name'] + list(data.iloc[0,1:])
#     data = data.iloc[1:, :].reset_index(drop=True)
#     if exclude is not None:
#         data = data[~data.name.str.contains(exclude)]
#     s.tables = [s.tables[0], data, s.tables[2]]
#     return s

## Fixed Effects

In [None]:
def model_transit_fe(df):
    """Fit an OLS model of log(1 + rides) with a separate unit term per transit/id pair.

    Two columns are derived on a copy of ``df`` (the caller's frame is left
    untouched):

    * ``tid``  -- the ``transit`` label and the stringified ``id`` joined by ":".
    * ``time`` -- the integer category code of each row's ``date``
      (from ``pd.Categorical``).

    Parameters
    ----------
    df : DataFrame
        Must provide the columns ``transit``, ``id``, ``date``, ``rides``,
        ``DNC`` and ``dotw``.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    unit_label = df['transit'] + ":" + df['id'].astype(str)
    panel = df.assign(tid=unit_label).sort_values('date')
    panel['time'] = pd.Categorical(panel['date']).codes
    # `tid` holds strings, so it presumably enters the formula as a categorical term.
    formula = """np.log1p(rides) ~ 
                DNC + time + dotw + tid"""
    return sm.OLS.from_formula(formula, panel).fit()

def model_area_fe(df):
    """Fit an OLS fixed-effects model of log(1 + rides) with one dummy set per ``id``.

    Rides are first summed within each (id, date, dotw, DNC) group. Community
    areas don't have separate units per transit mode, and the fixed effects
    can't distinguish transit anyway, so the rides are pooled; otherwise the
    variance per unit is huge.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns ``id``, ``date``, ``dotw``, ``DNC`` and ``rides``.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    group_keys = ['id', 'date', 'dotw', 'DNC']
    totals = df.groupby(group_keys)['rides'].sum().reset_index()
    totals = totals.sort_values('date')
    # `time` is the integer category code of each row's date.
    totals['time'] = pd.Categorical(totals['date']).codes
    # Unit fixed effect; an alternative that was left disabled:
    # catvar(df, "id", tract_panel.id.min())
    unit_term = "C(id)"
    formula = f"""np.log1p(rides) ~ 
                DNC + time + dotw + I(dotw**2) + {unit_term}"""
    return sm.OLS.from_formula(formula, totals).fit()

In [None]:
# Train rows of the point panel; ids whose rides sum to zero are dropped
# before fitting the fixed-effects model.
model_data = point_panel.query('transit == "train"')
exclude_tracts = model_data.groupby('id')['rides'].transform('sum').eq(0)
model_data = model_data.loc[~exclude_tracts]
train_fe_model = model_area_fe(model_data)
train_fe_model.summary()

In [None]:
# Uber rows of the tract panel; ids whose rides sum to zero are dropped
# before fitting the fixed-effects model.
model_data = tract_panel.query('transit == "uber"')
exclude_tracts = model_data.groupby('id')['rides'].transform('sum').eq(0)
model_data = model_data.loc[~exclude_tracts]
uber_fe_model = model_area_fe(model_data)
uber_fe_model.summary()

In [None]:
# Bike rows of the point panel; ids whose rides sum to zero are dropped
# before fitting the fixed-effects model.
model_data = point_panel.query('transit == "bike"')
exclude_tracts = model_data.groupby('id')['rides'].transform('sum').eq(0)
model_data = model_data.loc[~exclude_tracts]
bike_fe_model = model_area_fe(model_data)
bike_fe_model.summary()

In [53]:
fe_summary = summary_col([uber_fe_model, train_fe_model, bike_fe_model], 
            model_names=["FE (Uber)", "FE (Train)", "FE (Bike)"], 
            info_dict={"N":lambda x:(x.nobs)},
            stars=True,
            regressor_order=['DNC','time','dotw','I(dotw ** 2)'], 
            drop_omitted=True)

fe_summary.tables[0].index = (fe_summary.tables[0].index
                              .str.replace('I(dotw ** 2)','dotw**2')
                              .str.replace('DNC','During DNC'))

fe_summary = tabulate(fe_summary.tables[0],
                   headers=["", "FE (Uber)", "FE (Train)", "FE (Bike)"],
                   showindex=True,
                   tablefmt='github')
with open("../../../reports/replication/fe.md","w") as f:
    f.write(fe_summary)
!cp ../../../reports/replication/fe.md ../../../../eric-mc2-cv/static/uploads/
print(fe_summary)

|                | FE (Uber)   | FE (Train)   | FE (Bike)   |
|----------------|-------------|--------------|-------------|
| During DNC     | -0.0588***  | -0.0841***   | 0.0287***   |
|                | (0.0066)    | (0.0148)     | (0.0069)    |
| time           | 0.0013***   | 0.0014***    | -0.0001     |
|                | (0.0001)    | (0.0002)     | (0.0001)    |
| dotw           | 0.0485***   | 0.1022***    | 0.0591***   |
|                | (0.0037)    | (0.0083)     | (0.0039)    |
| dotw**2        | 0.0229***   | -0.0234***   | -0.0080***  |
|                | (0.0009)    | (0.0020)     | (0.0009)    |
| R-squared      | 0.9515      | 0.9017       | 0.9488      |
| R-squared Adj. | 0.9506      | 0.9001       | 0.9473      |
| N              | 65226.0     | 7910.0       | 53053.0     |


## Difference-in-Differences (DiD)

In [None]:
def model_did(df, transit_ref: str):
    """Fit a pooled OLS model of log(1 + rides) with UCMP x DNC x transit interactions.

    Parameters
    ----------
    df : DataFrame
        Must provide ``rides``, ``UCMP``, ``DNC``, ``transit``, ``date``,
        ``dotw``, ``train_contained``, ``bike_contained``, ``lat`` and ``long``.
    transit_ref : str
        Passed to ``catvar`` together with the ``transit`` column name;
        presumably the reference level of the transit categorical -- confirm
        against ``stats.reg.catvar``.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    transit_term = catvar(df, "transit", transit_ref)
    # sort_values returns a new frame, so adding `time` does not touch the caller's df.
    panel = df.sort_values('date')
    panel['time'] = pd.Categorical(panel['date']).codes
    formula = f"""np.log1p(rides) ~ 
                (UCMP * DNC * {transit_term}) +
                time + dotw * {transit_term} +
                train_contained + bike_contained +
                lat + long + I(lat * long) 
                """
    # Alternative terms kept for reference (not part of the fitted formula):
    #   (UCMP * DNC * {transit}) - (DNC : {transit}) +
    #   airport * DNC +
    #   + I(lat**2) + I(long**2)
    return sm.OLS.from_formula(formula, panel).fit()

def model_transit_did(df):
    """Fit a single-mode difference-in-differences OLS model of log(1 + rides).

    The specification is ``UCMP * DNC`` plus a time code, day-of-week terms and
    a quadratic surface in ``lat``/``long``. The ``train_contained`` and
    ``bike_contained`` covariates are added only when the frame has BOTH
    columns.

    Parameters
    ----------
    df : DataFrame
        Must provide ``rides``, ``UCMP``, ``DNC``, ``date``, ``dotw``, ``lat``
        and ``long``; ``train_contained`` / ``bike_contained`` are optional.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    # sort_values returns a new frame, so adding `time` does not touch the caller's df.
    df = df.sort_values('date')
    df['time'] = pd.Categorical(df['date']).codes
    # The optional term references two columns, so require both. Checking only
    # `train_contained` would make the formula fail on a frame that lacks
    # `bike_contained`.
    has_contained = {'train_contained', 'bike_contained'}.issubset(df.columns)
    contained_term = "train_contained + bike_contained +" if has_contained else ""
    formula = f"""np.log1p(rides) ~ 
                UCMP * DNC +
                time + dotw + I(dotw**2) +
                {contained_term}
                lat + long + I(lat * long) 
                + I(lat**2) + I(long**2)
                """
                # airport * DNC + 
    did_model = sm.OLS.from_formula(formula, df).fit()
    return did_model

In [43]:
# Bike rows of the point panel; ids whose rides sum to zero are dropped
# before fitting the difference-in-differences model.
model_data = point_panel.query('transit == "bike"')
exclude_tracts = model_data.groupby('id')['rides'].transform('sum').eq(0)
model_data = model_data.loc[~exclude_tracts]
bike_did_model = model_transit_did(model_data)
bike_did_model.summary()

0,1,2,3
Dep. Variable:,np.log1p(rides),R-squared:,0.663
Model:,OLS,Adj. R-squared:,0.663
Method:,Least Squares,F-statistic:,10460.0
Date:,"Tue, 10 Dec 2024",Prob (F-statistic):,0.0
Time:,17:50:45,Log-Likelihood:,-69548.0
No. Observations:,53053,AIC:,139100.0
Df Residuals:,53042,BIC:,139200.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.1669,0.010,307.955,0.000,3.147,3.187
UCMP,0.2384,0.018,13.368,0.000,0.203,0.273
DNC,0.0282,0.018,1.591,0.112,-0.007,0.063
UCMP:DNC,0.2428,0.072,3.353,0.001,0.101,0.385
time,0.0002,0.000,0.784,0.433,-0.000,0.001
dotw,0.0224,0.003,8.014,0.000,0.017,0.028
lat,0.7246,0.006,128.740,0.000,0.714,0.736
long,1.3464,0.006,231.406,0.000,1.335,1.358
I(lat * long),0.5476,0.006,89.776,0.000,0.536,0.560

0,1,2,3
Omnibus:,468.366,Durbin-Watson:,1.721
Prob(Omnibus):,0.0,Jarque-Bera (JB):,480.897
Skew:,-0.23,Prob(JB):,3.75e-105
Kurtosis:,3.074,Cond. No.,692.0


In [44]:
# Train rows of the point panel; ids whose rides sum to zero are dropped
# before fitting the difference-in-differences model.
model_data = point_panel.query('transit == "train"')
exclude_tracts = model_data.groupby('id')['rides'].transform('sum').eq(0)
model_data = model_data.loc[~exclude_tracts]
train_did_model = model_transit_did(model_data)
train_did_model.summary()

0,1,2,3
Dep. Variable:,np.log1p(rides),R-squared:,0.267
Model:,OLS,Adj. R-squared:,0.266
Method:,Least Squares,F-statistic:,287.4
Date:,"Tue, 10 Dec 2024",Prob (F-statistic):,0.0
Time:,17:50:48,Log-Likelihood:,-9450.6
No. Observations:,7910,AIC:,18920.0
Df Residuals:,7899,BIC:,19000.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.3979,0.023,315.597,0.000,7.352,7.444
UCMP,-0.6557,0.039,-16.738,0.000,-0.733,-0.579
DNC,-0.1079,0.041,-2.622,0.009,-0.189,-0.027
UCMP:DNC,0.6845,0.151,4.529,0.000,0.388,0.981
time,0.0008,0.001,1.486,0.137,-0.000,0.002
dotw,0.0095,0.006,1.479,0.139,-0.003,0.022
lat,0.3832,0.012,32.852,0.000,0.360,0.406
long,0.6321,0.015,42.745,0.000,0.603,0.661
I(lat * long),0.3399,0.017,19.577,0.000,0.306,0.374

0,1,2,3
Omnibus:,3591.157,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,60442.507
Skew:,-1.746,Prob(JB):,0.0
Kurtosis:,16.084,Cond. No.,628.0


In [45]:
# Uber rows of the tract panel; ids whose rides sum to zero are dropped
# before fitting the difference-in-differences model.
model_data = tract_panel.query('transit == "uber"')
exclude_tracts = model_data.groupby('id')['rides'].transform('sum').eq(0)
model_data = model_data.loc[~exclude_tracts]
uber_did_model = model_transit_did(model_data)
uber_did_model.summary()

0,1,2,3
Dep. Variable:,np.log1p(rides),R-squared:,0.502
Model:,OLS,Adj. R-squared:,0.502
Method:,Least Squares,F-statistic:,5476.0
Date:,"Tue, 10 Dec 2024",Prob (F-statistic):,0.0
Time:,17:50:50,Log-Likelihood:,-104190.0
No. Observations:,65226,AIC:,208400.0
Df Residuals:,65213,BIC:,208500.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.1159,0.020,154.218,0.000,3.076,3.155
UCMP,0.6665,0.025,26.470,0.000,0.617,0.716
DNC,-0.0729,0.021,-3.420,0.001,-0.115,-0.031
UCMP:DNC,0.1854,0.100,1.861,0.063,-0.010,0.381
time,0.0015,0.000,5.462,0.000,0.001,0.002
dotw,0.1285,0.003,38.474,0.000,0.122,0.135
train_contained,0.2961,0.017,17.912,0.000,0.264,0.328
bike_contained,0.2245,0.003,81.875,0.000,0.219,0.230
lat,0.7125,0.006,129.244,0.000,0.702,0.723

0,1,2,3
Omnibus:,657.005,Durbin-Watson:,1.348
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1088.957
Skew:,-0.05,Prob(JB):,3.4300000000000003e-237
Kurtosis:,3.625,Cond. No.,796.0


In [52]:
headers = ["DiD (Uber)", "DiD (Train)", "DiD (Bike)"]
did_summary = summary_col([uber_did_model, train_did_model, bike_did_model], 
            model_names=headers, 
            info_dict={"N":lambda x:(x.nobs)},
            stars=True,
            regressor_order=['UCMP','DNC','UCMP:DNC',
                             'time','dotw','I(dotw ** 2)'],
            drop_omitted=True)
                            #  'lat','long','I(lat * long)','I(lat ** 2)','I(long ** 2)'],

did_summary.tables[0].index = (did_summary.tables[0].index
                               .str.replace('DNC','During DNC')
                               .str.replace('UCMP','Near DNC')
                               .str.replace('I(dotw ** 2)','dotw**2')
                               .str.replace('I(dotw ** 2)','dotw**2')
                               .str.replace('I(lat ** 2)','lat**2')
                               .str.replace('I(long ** 2)','long**2')
                               .str.replace('I(lat * long)','lat*long'))

did_summary = tabulate(did_summary.tables[0],
                   headers=[""] + headers,
                   showindex=True,
                   tablefmt='github')
with open("../../../reports/replication/did.md","w") as f:
    f.write(did_summary)
!cp ../../../reports/replication/did.md ../../../../eric-mc2-cv/static/uploads/
print(did_summary)

|                     | DiD (Uber)   | DiD (Train)   | DiD (Bike)   |
|---------------------|--------------|---------------|--------------|
| Near DNC            | 0.6665***    | -0.6557***    | 0.2384***    |
|                     | (0.0252)     | (0.0392)      | (0.0178)     |
| During DNC          | -0.0729***   | -0.1079***    | 0.0282       |
|                     | (0.0213)     | (0.0412)      | (0.0177)     |
| Near DNC:During DNC | 0.1854*      | 0.6845***     | 0.2428***    |
|                     | (0.0996)     | (0.1512)      | (0.0724)     |
| time                | 0.0015***    | 0.0008        | 0.0002       |
|                     | (0.0003)     | (0.0005)      | (0.0002)     |
| dotw                | 0.1285***    | 0.0095        | 0.0224***    |
|                     | (0.0033)     | (0.0064)      | (0.0028)     |
| R-squared           | 0.5019       | 0.2668        | 0.6635       |
| R-squared Adj.      | 0.5018       | 0.2658        | 0.6634       |
| N                 

In [None]:

# TODO: Get the Chicago Cubs 2024 game schedule (dates and scores) from:
# https://www.baseball-reference.com/teams/CHC/2024-schedule-scores.shtml#all_team_schedule