In [23]:
import pandas as pd
import os
import numpy as np

from tableone import TableOne
import plotly.graph_objects as go

from data.constants import DATA_FOLDER

In [24]:
# Relative path of the markdown report that the final cell of this notebook writes
# the tract balance table to.
table_out = "../../../reports/replication/tract-balance-attend.md"

In [25]:
# Input parquet files, both located in the "final" subfolder of DATA_FOLDER.
point_panel_in = os.path.join(DATA_FOLDER, "final", "point_panel.parquet")
tract_panel_in = os.path.join(DATA_FOLDER, "final", "tract_panel.parquet")

In [26]:
# Load both panels from parquet into DataFrames.
point_panel = pd.read_parquet(point_panel_in)
tract_panel = pd.read_parquet(tract_panel_in)

In [27]:
def _outside_excluded_window(panel, start="2024-08-19", end="2024-08-22"):
    """Return the rows of ``panel`` dated strictly before ``start`` or strictly after ``end``."""
    before_window = panel.date < start
    after_window = panel.date > end
    return panel[before_window | after_window]


# Drop the 2024-08-19 .. 2024-08-22 window from both panels.
# NOTE(review): presumably these are the DNC dates themselves -- confirm.
tract_panel = _outside_excluded_window(tract_panel)
point_panel = _outside_excluded_window(point_panel)

In [28]:
# Sanity check: after filtering, the tract panel must still contain exactly five
# distinct values in its `stadium` column.
assert tract_panel.stadium.nunique() == 5

# Baseline descriptive stats:

## Space-like

In [29]:
def spacelike_stats(df):
    """Summarise the per-unit (spatial) attributes of a panel, by transit mode.

    The listed columns are dropped and the remaining rows de-duplicated before
    grouping by ``transit``. The result has one column per transit value and
    nine rows: number of distinct ids, sum of ``UCMP``, number of distinct
    stadiums, and min / max / mean of ``lat`` and ``long``. Values are rounded
    to two decimals.
    """
    # Presumably the fields that vary by date rather than by unit -- TODO confirm.
    dropped_cols = ['date', 'DNC', 'is_weekend', 'dotw', 'rides', 'attendance', 'monthofyear']
    units = df.drop(columns=dropped_cols).drop_duplicates()

    aggregations = {
        "id": "nunique",
        "UCMP": "sum",
        'stadium': 'nunique',
        'lat': ["min", "max", "mean"],
        'long': ["min", "max", "mean"],
    }
    summary = units.groupby(['transit']).agg(aggregations).T

    # Flat, human-readable labels replace the (column, statistic) MultiIndex;
    # the order mirrors the aggregation spec above.
    summary.index = [
        'n units.',
        'near DNC',
        'n. stadiums',
        'lat: min', 'lat: max', 'lat: mean',
        'lon: min', 'lon: max', 'lon: mean',
    ]
    return summary.round(2)

In [30]:
plot_data = spacelike_stats(tract_panel)

# Render the summary as a plotly table. The cell columns are derived from
# plot_data.columns (rather than hard-coded .bike/.train/.uber attributes) so
# they always line up with the header, whatever transit modes are present.
fig = go.Figure(data=[go.Table(
    header=dict(values=['transit'] + list(plot_data.columns),
                align='right'),
    cells=dict(values=[plot_data.index] + [plot_data[col] for col in plot_data.columns],
               align='right'))
])
fig.update_layout(width=600)

In [31]:
def timelike_stats(df):
    """Summarise the per-date attributes of a panel, by transit mode.

    The listed columns are dropped and the remaining rows de-duplicated before
    grouping by ``transit``. The result has one column per transit value and
    four rows: number of distinct dates and min / max / mean of ``rides``.
    Values are rounded to two decimals.
    """
    ignored_cols = ['DNC', 'is_weekend', 'dotw', 'monthofyear']
    observations = df.drop(columns=ignored_cols).drop_duplicates()

    per_transit = observations.groupby(['transit']).agg(
        {'date': 'nunique', 'rides': ['min', 'max', 'mean']}
    )
    summary = per_transit.T

    # NOTE(review): the first label says "weekdays" but the value is the count
    # of distinct dates -- confirm the panel really excludes weekends.
    summary.index = ["n. weekdays", "rides: min", "rides: max", "rides: mean"]
    return summary.round(2)

In [32]:
# Stack the unit-level summary on top of the date-level summary.
plot_data = pd.concat([spacelike_stats(tract_panel), timelike_stats(tract_panel)])

# Render the combined summary as a plotly table. The cell columns are derived
# from plot_data.columns (rather than hard-coded .bike/.train/.uber attributes)
# so they always line up with the header, whatever transit modes are present.
fig = go.Figure(data=[go.Table(
    header=dict(values=['transit'] + list(plot_data.columns),
                align='right'),
    cells=dict(values=[plot_data.index] + [plot_data[col] for col in plot_data.columns],
               align='right'))
])
fig.update_layout(width=700)
fig.show()

# Balance

In [33]:
def fix_count_rows(tone, count_row):
    """Strip the ', mean (SD)' suffix from the count rows of a TableOne table.

    Parameters
    ----------
    tone : object
        Anything exposing a ``tableone`` DataFrame whose index has two levels
        (a TableOne instance in this notebook). It is modified in place.
    count_row : str
        Pattern identifying the count rows: every level-0 label that contains
        it (``str.contains`` semantics) has the suffix removed.

    Returns
    -------
    The same ``tone`` object, with ``tone.tableone.index`` rebuilt.
    """
    row_labels = tone.tableone.index.get_level_values(0)
    is_count = row_labels.str.contains(count_row)
    # regex=False makes this a literal replacement on every pandas version.
    # Under the old regex default, "(SD)" is parsed as a capture group, so the
    # pattern never matches the literal suffix and nothing is stripped.
    stripped = row_labels.str.replace(', mean (SD)', '', regex=False)
    new_labels = np.where(is_count, stripped, row_labels)
    # from_arrays (unlike from_tuples on a zip) also works for an empty table.
    tone.tableone.index = pd.MultiIndex.from_arrays(
        [new_labels, tone.tableone.index.get_level_values(1).values]
    )
    return tone

def balance_table(df, unit_name, unit_abbr):
    """Build a combined TableOne balance table comparing the two ``UCMP`` groups.

    Two TableOne tables are computed, both grouped by the ``UCMP`` column
    (relabelled 'Near DNC') with p-values requested:

    * a unit-level table from one row per (transit, id), covering whichever of
      transit, lat, long, stadium, train_distance, train_contained,
      bike_distance and bike_contained are present in ``df``;
    * a unit-day table from one row per (transit, id, date), covering rides,
      attendance, is_weekend and monthofyear.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel with at least the columns transit, id, date, UCMP, rides,
        attendance, is_weekend and monthofyear.
    unit_name : str
        Label for the count row of the unit-level table (e.g. 'tracts').
    unit_abbr : str
        Prefix for the count row of the unit-day table, rendered as
        '<unit_abbr>-days' (e.g. 'tract-days').

    Returns
    -------
    TableOne
        The unit-level TableOne object, whose ``tableone`` frame has been
        replaced by the unit-level rows followed by the unit-day rows.
    """
    # Unit-level data: first row seen for each (transit, id) pair.
    space_data = df.drop_duplicates(['transit','id']).copy()
    spacelike_cols = ['transit','lat','long','stadium','train_distance','train_contained','bike_distance','bike_contained']
    # Not every panel carries every column; keep only the ones available.
    spacelike_cols = [c for c in spacelike_cols if c in df.columns]
    space_table = TableOne(space_data,
         columns=spacelike_cols,
         groupby='UCMP',
         rename={'UCMP':'Near DNC', 'n':unit_name},
         pval=True,
         missing=False,
         overall=False)

    space_table = fix_count_rows(space_table, unit_name)

    # Unit-day data: first row seen for each (transit, id, date) triple.
    time_data = df.drop_duplicates(['transit','id','date'])
    time_table = TableOne(time_data,
                      columns=['rides','attendance','is_weekend','monthofyear'],
                      groupby='UCMP',
                      rename={'UCMP':'Near DNC',
                              'n':f'{unit_abbr}-days',
                              'rides': 'daily rides'},
                      pval=True,
                      missing=False,
                      overall=False)

    time_table = fix_count_rows(time_table, unit_abbr)

    # Rows are left in concatenation order: unit-level rows, then unit-day rows.
    bal_table = pd.concat([space_table.tableone, time_table.tableone])

    # Hack to concat TableOne tables: reuse the unit-level TableOne object as
    # the carrier for the combined frame and return that object.
    space_table.tableone = bal_table
    return space_table

In [34]:
# Build one balance table per panel; the string arguments label the count rows
# ('tracts' / 'tract-days' and 'stations' / 'station-days').
tract_balance = balance_table(tract_panel, 'tracts', 'tract')
point_balance = balance_table(point_panel, 'stations', 'station')

In [35]:
# Plain-text preview of the station-level (point panel) balance table.
print(point_balance.tabulate(tablefmt='simple'))

                                                             0                  1                  P-Value
----------------------  -----------------------------------  -----------------  -----------------  ---------
stations                                                     87                 55
transit, n (%)          bike                                 75 (86.2)          47 (85.5)          1.000
                        train                                12 (13.8)          8 (14.5)
lat, mean (SD)                                               0.3 (1.3)          -0.4 (0.4)         <0.001
long, mean (SD)                                              0.0 (0.8)          -0.3 (1.2)         0.068
stadium, n (%)          GUARANTEED RATE FIELD 1600m          25 (28.7)                             <0.001
                        SOLDIER FIELD 1600m                  14 (16.1)
                        WRIGLEY FIELD 1600m                  48 (55.2)
                        HYATT REGENCY MCCORMICK 

In [36]:
# Plain-text preview of the tract-level balance table.
print(tract_balance.tabulate(tablefmt='simple'))

                                                                 0                  1                  P-Value
--------------------------  -----------------------------------  -----------------  -----------------  ---------
tracts                                                           125                91
transit, n (%)              bike                                 46 (36.8)          37 (40.7)          0.541
                            train                                15 (12.0)          14 (15.4)
                            uber                                 64 (51.2)          40 (44.0)
lat, mean (SD)                                                   0.4 (1.2)          -0.4 (0.4)         <0.001
long, mean (SD)                                                  0.0 (0.7)          -0.4 (1.2)         0.006
stadium, n (%)              GUARANTEED RATE FIELD 1600m          37 (29.6)                             <0.001
                            SOLDIER FIELD 1600m                

In [37]:
# XXX: Don't overwrite the dnc-effect table. Make new table for attendance model if you want to report it.

# Write the tract balance table to `table_out` in tabulate's "github" table format,
# with explicit column headers in place of the raw group labels.
with open(table_out,"w") as f:
    f.write(tract_balance.tabulate(headers=['Not Near DNC', 'Near DNC', 'P-Value'],tablefmt="github"))
# IPython shell escape: copy the written report to the relative directory below.
# NOTE(review): the target lies outside this repository tree -- presumably a separate
# site checkout; confirm it exists on the machine running this notebook.
!cp $table_out ../../../../eric-mc2-cv/static/uploads/