**Purpose**

*This notebook introduces substantive modeling choices, e.g.:*
- data aggregation, 
- dropping missing data, 
- and coding more features

In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import geopandas as gpd
from datetime import timedelta

from data.constants import DNC_START, DNC_END, LOCAL_CRS, WORLD_CRS
from data.datemath import from_ymd, date_aggs, to_yw
from power import power_reg

In [2]:
# Every panel is read from data/interim and its processed version is written
# to data/final under the same file name.
point_panel_in, point_panel_out = (
    "../data/interim/point_panel.parquet",
    "../data/final/point_panel.parquet",
)
line_panel_in, line_panel_out = (
    "../data/interim/line_panel.parquet",
    "../data/final/line_panel.parquet",
)
tract_panel_in, tract_panel_out = (
    "../data/interim/tract_panel.parquet",
    "../data/final/tract_panel.parquet",
)
comm_panel_in, comm_panel_out = (
    "../data/interim/comm_panel.parquet",
    "../data/final/comm_panel.parquet",
)

# Read-only inputs: these have no counterpart under data/final.
tracts_in = "../data/interim/tracts.geoparquet"
attend_in = "../data/interim/sports.csv"


# Pipeline In

In [3]:
# Load the four interim panels with geopandas, in the same order as the paths above.
point_panel, line_panel, tract_panel, comm_panel = (
    gpd.read_parquet(path)
    for path in (point_panel_in, line_panel_in, tract_panel_in, comm_panel_in)
)

# Only the 'geoid10' column of the tract file is kept.
tracts = gpd.read_parquet(tracts_in)['geoid10']

# Attendance table; label_attendance reads its 'date', 'stadium' and 'attendance' columns.
# NOTE(review): 'date' is loaded without parse_dates; confirm its dtype matches the
# panels' 'date' column, since the two are merged on it later.
attend = pd.read_csv(attend_in)

# Code Features

In [4]:
# Run the shared date_aggs helper on the 'date' column of every panel.
# NOTE(review): presumably this adds the calendar columns used later
# (e.g. 'year-week') — confirm in data.datemath.
point_panel, line_panel, tract_panel, comm_panel = (
    panel.pipe(date_aggs, 'date')
    for panel in (point_panel, line_panel, tract_panel, comm_panel)
)

In [5]:
# Substantive!
# There aren't enough data points within 400m or even 800m.
# In panel.ipynb we show that UC and MP have higher ridership than other
# places, and furthermore that the signal is stronger closer to UC and MP
# and reverts to the mean as the buffer size grows.
# Thus, using the largest buffer size attenuates and possibly confounds our
# signal with irrelevant rides, but is necessary to achieve a minimal sample
# of treated units.

def label_ucmp(df):
    """Flag rows that fall inside a 1600m United Center / McCormick buffer.

    Parameters
    ----------
    df : DataFrame
        Panel holding buffer columns whose names contain ``UNITED`` or
        ``MCCORMICK`` and end in ``1600m``.

    Returns
    -------
    DataFrame
        A new frame with an integer ``UCMP`` column: 1 where any matching
        buffer column is > 0, otherwise 0 (also 0 when no column matches).
        The input frame is left untouched.
    """
    in_buffer = df.filter(regex=r'(UNITED|MCCORMICK).*1600m$').gt(0).any(axis=1)
    # assign() builds a new frame, so re-running the cell (or keeping a reference
    # to the unlabelled panel) never sees a silently modified input.
    return df.assign(UCMP=np.where(in_buffer, 1, 0))

# Add the UCMP indicator to every panel.
point_panel, line_panel, tract_panel, comm_panel = (
    panel.pipe(label_ucmp)
    for panel in (point_panel, line_panel, tract_panel, comm_panel)
)

In [6]:
def label_attendance(df, attend):
    """Reshape a panel to one row per nearby stadium and attach game attendance.

    Parameters
    ----------
    df : DataFrame
        Panel with a ``date`` column and buffer columns whose names end in
        ``00m``; only the ``1600m`` buffers are used.
    attend : DataFrame
        Attendance records with ``date``, ``stadium`` and ``attendance``
        columns. It is not modified.

    Returns
    -------
    DataFrame
        The rows of ``df`` that lie within 1600m of a stadium, one row per
        (original row, nearby stadium), with a ``stadium`` column holding the
        buffer column name and an ``attendance`` column that is null when no
        game matched that date and stadium.
    """
    stadiums = {
        'soldier': 'SOLDIER FIELD 1600m',
        'guaranteed rate': 'GUARANTEED RATE FIELD 1600m',
        'wrigley': 'WRIGLEY FIELD 1600m',
        'united center': 'UNITED CENTER 1600m',
    }
    # Total attendance per date and stadium, with stadium names rewritten to the
    # panel's buffer column names. Built as a single chain so no column is
    # assigned on a filtered slice (which triggers pandas' chained-assignment warning).
    attend = (
        attend.groupby(['date', 'stadium'], as_index=False)['attendance'].sum()
        .loc[lambda games: games['stadium'] != 'seatgeek']  # no transit data outside chicago
        .assign(stadium=lambda games: games['stadium'].map(stadiums))
    )

    short_cols = df.filter(regex=r'00m$').columns
    stad_cols = df.filter(regex=r'1600m$').columns
    df = df.drop(columns=list(short_cols.difference(stad_cols)))  # only need 1600m
    df = df.melt(id_vars=list(df.columns.difference(stad_cols)),
                 value_vars=list(stad_cols),
                 var_name='stadium',
                 value_name='near_stadium')
    # XXX: Eventually the regression will drop non-attendance rows so might as well do it now.
    #      This is necessary for the merge on date/stadium to work, because melt has
    #      already added date/stadium combos where nearby==0 and those would match on the merge.
    df = df.loc[df['near_stadium'] > 0].drop(columns=['near_stadium'])
    # XXX: Left join, because non-game-days are not dropped at this stage.
    return pd.merge(df, attend, on=['date', 'stadium'], how='left')

In [7]:
# Attach stadium attendance to every panel.
point_panel, line_panel, tract_panel, comm_panel = (
    panel.pipe(label_attendance, attend)
    for panel in (point_panel, line_panel, tract_panel, comm_panel)
)

In [8]:
def fillna(df, fills: dict):
    """Return a copy of ``df`` with nulls replaced column by column.

    ``fills`` maps a column name to the value that replaces nulls in that
    column. Columns not listed are left as they are; a listed column that is
    missing from ``df`` raises ``KeyError``.
    """
    replacements = {}
    for column, default in fills.items():
        replacements[column] = df[column].fillna(default)
    return df.assign(**replacements)

# Before imputing, check where nulls in 'UCMP' / 'airport' are allowed to occur:
#   * tract panel: only on rows whose 'id' is not one of the loaded tracts
#     (i.e. non-chicago tracts);
#   * point panel: only on uber rows that carry a non-null 'tract'.
# These rows are imputed instead of dropped, so units outside of chicago
# (ie ubers and bike stations) stay in the panels.
assert (tract_panel['UCMP'].notna() | ~tract_panel['id'].isin(tracts)).all()
assert (tract_panel['airport'].notna() | ~tract_panel['id'].isin(tracts)).all()
point_features_present = point_panel[['UCMP', 'airport']].notna().all(axis=1)
point_uber_with_tract = (point_panel['transit'] == 'uber') & point_panel['tract'].notna()
assert (point_features_present | point_uber_with_tract).all()

# A null indicator is filled with 0 on every panel.
ucmp_airport_defaults = {'UCMP': 0, 'airport': 0}
point_panel, line_panel, tract_panel, comm_panel = (
    panel.pipe(fillna, ucmp_airport_defaults)
    for panel in (point_panel, line_panel, tract_panel, comm_panel)
)

In [9]:
def label_latlong(df):
    """Attach standardized centroid coordinates as ``lat`` and ``long`` columns.

    Parameters
    ----------
    df : GeoDataFrame
        Panel whose active geometry is used to locate each row.

    Returns
    -------
    GeoDataFrame
        A new frame with ``lat`` (centroid y) and ``long`` (centroid x), each
        z-scored over all rows of ``df``. The input frame is not modified.
    """
    def zscore(values):
        return (values - values.mean()) / values.std()

    # Centroids are taken after converting to LOCAL_CRS and then converted to
    # WORLD_CRS. The reprojection is done once and shared by both coordinates
    # instead of being repeated per axis.
    # NOTE(review): an earlier comment here said the coordinates are "measured in
    # millions of feet", but x/y are read after the conversion to WORLD_CRS —
    # confirm the units. Standardizing keeps the two axes on a comparable scale either way.
    centroids = df.geometry.to_crs(LOCAL_CRS).centroid.to_crs(WORLD_CRS)
    return df.assign(lat=zscore(centroids.y), long=zscore(centroids.x))

# Add standardized lat/long to every panel.
point_panel, line_panel, tract_panel, comm_panel = (
    panel.pipe(label_latlong)
    for panel in (point_panel, line_panel, tract_panel, comm_panel)
)

In [10]:
def binarize(df, col):
    """Return a copy of ``df`` in which ``col`` has been multiplied by 1.0.

    For a boolean column this yields a float column of 0.0 / 1.0.
    """
    as_float = df[col] * 1.0
    return df.assign(**{col: as_float})

# Turn the DNC flag into a float column on every panel.
point_panel, line_panel, tract_panel, comm_panel = (
    panel.pipe(binarize, 'DNC')
    for panel in (point_panel, line_panel, tract_panel, comm_panel)
)

# Drop Data

In [11]:
# NaN check: apart from the attendance labels, every column must be fully populated.
# ('attendance' is null wherever the left join in label_attendance found no game.)
label_cols = ['attendance', 'stadium']
for panel_name, panel in (('point_panel', point_panel),
                          ('line_panel', line_panel),
                          ('tract_panel', tract_panel),
                          ('comm_panel', comm_panel)):
    features = panel.filter(items=panel.columns.difference(label_cols))
    # Keyword axis, and a message naming the offending columns, so a failure is
    # diagnosable from the traceback alone.
    null_cols = features.columns[features.isna().any(axis=0)].tolist()
    assert not null_cols, f"{panel_name} has unexpected nulls in: {null_cols}"

In [12]:
# XXX Keep only the rows that have an attendance figure
#     (rows with nulls would be dropped by the model anyway).
point_panel, line_panel, tract_panel, comm_panel = (
    panel.loc[panel['attendance'].notna()]
    for panel in (point_panel, line_panel, tract_panel, comm_panel)
)

In [13]:
# XXX: Substantive!!
#      The model window runs from MODEL_PRE_WEEKS before the DNC start to
#      MODEL_POST_WEEKS after the DNC end (taking all weeks in June, July, August).
MODEL_PRE_WEEKS, MODEL_POST_WEEKS = 11, 1

model_start_yearweek, model_end_yearweek = (
    to_yw(from_ymd(DNC_START) - timedelta(weeks=MODEL_PRE_WEEKS)),
    to_yw(from_ymd(DNC_END) + timedelta(weeks=MODEL_POST_WEEKS)),
)

# XXX: For the attendance model the window filter below is deliberately NOT applied:
#       it 1/2's the data and we have already reduced it to just a few locations.
#       The extra data will not hurt our estimates of the DNC effect; it will only
#       make the estimates of the non-DNC baseline more precise.
#       But we do need to add month fixed effects now.
# point_panel = point_panel[point_panel['year-week'].between(model_start_yearweek, model_end_yearweek)]
# line_panel = line_panel[line_panel['year-week'].between(model_start_yearweek, model_end_yearweek)]
# tract_panel = tract_panel[tract_panel['year-week'].between(model_start_yearweek, model_end_yearweek)]
# comm_panel = comm_panel[comm_panel['year-week'].between(model_start_yearweek, model_end_yearweek)]

# Aggregate over Time

In [14]:
def model_agg(df):
    """Collapse a panel to weekly and to daily observations per unit.

    Rows are grouped by (period, ``id``, ``transit``), where the period is
    ``year-week`` for the weekly frame and ``date`` for the daily frame.
    Unit-level columns keep their first value; ``rides`` and ``attendance``
    are summed; ``DNC`` and the calendar columns take their maximum. Columns
    missing from ``df`` are skipped, and columns not listed are dropped.

    Returns
    -------
    tuple of DataFrame
        ``(weekly, daily)``.
    """
    unit_level = ['UCMP', 'airport', 'lat', 'long', 'stadium',
                  'train_distance', 'train_contained',
                  'bike_distance', 'bike_contained',
                  'uber_distance', 'uber_contained']
    time_varying = {'rides': 'sum', 'attendance': 'sum', 'DNC': 'max',
                    'is_weekend': 'max', 'dotw': 'max', 'monthofyear': 'max'}
    spec = dict.fromkeys(unit_level, 'first')
    spec.update(time_varying)
    present = {column: how for column, how in spec.items() if column in df.columns}

    # NOTE(review): when a unit has several rows for the same period (e.g. one per
    # nearby stadium), 'rides' is summed across all of them — confirm that is intended.
    def collapse(period):
        return df.groupby([period, 'id', 'transit'], as_index=False).agg(present)

    # weekend is constant for the weekly model; it is simply left out of that model spec
    return collapse('year-week'), collapse('date')

# Build the weekly and daily version of every panel.
(
    (point_panel_weekly, point_panel_daily),
    (line_panel_weekly, line_panel_daily),
    (tract_panel_weekly, tract_panel_daily),
    (comm_panel_weekly, comm_panel_daily),
) = (model_agg(panel) for panel in (point_panel, line_panel, tract_panel, comm_panel))

# Choose and Justify Agg

Monthly: not viable — it attenuates the treatment effect too much.

Weekly: the DNC covers only 4 of the 7 days in its week (and no weekend days),
so weekly aggregation attenuates our signal by 3/7.

Daily: we have more trouble modeling day-of-the-week (dotw) variation (maybe not an issue after eliminating weekends).

## with non-treatment model

In [15]:
def partial_r2(full, reduced, full_name, reduced_name):
    """Print which of two fitted models has the higher R2, and by how much.

    ``full`` and ``reduced`` only need an ``rsquared`` attribute. The gain is
    reported as the share of the weaker model's unexplained variance that the
    stronger model explains: ``(better - worse) / (1 - worse)``. On a tie,
    ``reduced_name`` is reported. Nothing is returned.
    """
    r2_full, r2_reduced = full.rsquared, reduced.rsquared
    top = max(r2_full, r2_reduced)
    bottom = min(r2_full, r2_reduced)
    if r2_full > r2_reduced:
        winner = full_name
    else:
        winner = reduced_name
    # Proportion of unexplained variance explained by the better model.
    share = (top - bottom) / (1 - bottom)
    print("better model = {}, R2 = {:.3f}, partial R2 = {:.3f}".format(winner, top, share))

In [16]:
# Compute the partial R2 of weekly vs daily. Only include non-treatment regressors.
# The same formula is fitted on the daily and on the weekly point panel:
# `full` holds the daily fit and `reduced` the weekly fit. They are fitted on
# different aggregations of the data, not as nested models.
formula = "np.log1p(rides) ~ transit + (lat + long)**2 + I(lat**2) + I(long**2)"
full = sm.OLS.from_formula(formula, point_panel_daily).fit()
reduced = sm.OLS.from_formula(formula, point_panel_weekly).fit()
partial_r2(full, reduced, "daily", "weekly")
# NOTE(review): the recorded output reports the daily model as the better one
# (R2 = 0.803). That contradicts the earlier remark here that the weekly model
# has a slightly higher R2, meaning we're not great at explaining DOTW variation.
# The remark looks stale — re-run and confirm before relying on it.

better model = daily, R2 = 0.803, partial R2 = 0.168


In [17]:
# Power analysis: spread a hypothesised total number of extra rides over the
# observations of each panel and ask whether each fitted model could detect it.
DNC_ATTEND = 5e4
DNC_DAYS = 4                 # days in event
RIDES_PER_ATTENDEE_DAY = 2   # rides per day
effect_sum = DNC_DAYS * RIDES_PER_ATTENDEE_DAY * DNC_ATTEND  # days in event * rides per day * attendees

daily_power = power_reg(full, np.log1p(effect_sum / len(point_panel_daily))).rename('daily')
weekly_power = power_reg(reduced, np.log1p(effect_sum / len(point_panel_weekly))).rename('weekly')
pd.concat([daily_power, weekly_power], axis=1).assign(ratio=lambda x: x['weekly'] / x['daily'])
# The daily model has smaller residual variance but identical power achieved.

Unnamed: 0,daily,weekly,ratio
mean,4.289281,5.197714,1.211791
std,0.835855,0.980399,1.17293
mde,0.029292,0.059044,2.015709
effect_size,4.152425,5.224902,1.258277
power_achieved,1.0,1.0,1.0
t_stat,397.150807,247.91559,0.624235
is_detectable,True,True,1.0


## with treatment model

In [18]:
# Compute the partial R2 of weekly vs daily. 
# Same comparison as the non-treatment model, with a DNC * transit interaction added.
# `full` holds the daily fit and `reduced` the weekly fit; both names are rebound here.
formula = "np.log1p(rides) ~ DNC * transit + (lat + long)**2 + I(lat**2) + I(long**2)"
full = sm.OLS.from_formula(formula, point_panel_daily).fit()
reduced = sm.OLS.from_formula(formula, point_panel_weekly).fit()
partial_r2(full, reduced, "daily", "weekly")
# NOTE(review): the recorded output reports the daily model as the better one
# (R2 = 0.806). That contradicts the earlier remark here that the weekly model
# has a slightly higher R2, meaning we're not great at explaining DOTW variation.
# The remark looks stale — re-run and confirm before relying on it.

better model = daily, R2 = 0.806, partial R2 = 0.172


In [19]:
# Same power check, now for the "DNC" term of the treatment models.
daily_dnc_power = power_reg(full, np.log1p(effect_sum / len(point_panel_daily)), "DNC").rename('daily')
weekly_dnc_power = power_reg(reduced, np.log1p(effect_sum / len(point_panel_weekly)), "DNC").rename('weekly')
pd.concat([daily_dnc_power, weekly_dnc_power], axis=1)
# The standard errors are smaller on the daily model but the power achieved is the same,
# probably because we didn't change the effect delta but scaled down the nobs and effect size
# by the same amount.

Unnamed: 0,daily,weekly
mean,0.0,0.0
std,0.057656,0.123643
mde,0.002021,0.007446
effect_size,4.152425,5.224902
power_achieved,1.0,1.0
t_stat,5757.646155,1965.784433
is_detectable,True,True


We'll keep the daily observations because they give us more flexibility to 
switch to alternative treatment specifications (buffer or event study) later on.

# Pipeline Out

In [20]:
# Write the daily version of each panel to its data/final path, without the index.
for daily_panel, out_path in (
    (point_panel_daily, point_panel_out),
    (line_panel_daily, line_panel_out),
    (tract_panel_daily, tract_panel_out),
    (comm_panel_daily, comm_panel_out),
):
    daily_panel.to_parquet(out_path, index=False)