In [84]:
import pandas as pd
import numpy as np

from tableone import TableOne
from tabulate import tabulate, SEPARATING_LINE


In [85]:
# Input panels (parquet files), given relative to this notebook's directory.
tract_panel_in = "../../../data/final/tract_panel.parquet"
model_panel_in = "../../../data/final/model_data.parquet"

# Destination files for the rendered balance tables.
tract_balance_out = "../../../reports/replication/tract-balance.md"
model_balance_out = "../../../reports/replication/balance.md"

In [86]:
# Read both panels from their configured parquet paths (tract panel first).
tract_panel, model_panel = (
    pd.read_parquet(panel_path)
    for panel_path in (tract_panel_in, model_panel_in)
)

In [87]:
# Baseline / balance tests only use PRE-TREATMENT days:
# keep the rows of each panel dated strictly before 2024-08-19.
tract_panel = tract_panel.loc[tract_panel["date"] < "2024-08-19"]
model_panel = model_panel.loc[model_panel["date"] < "2024-08-19"]

# Balance

In [88]:
def fix_count_rows(tone, count_row):
    """Strip the ', mean (SD)' suffix from the count rows of a summary table.

    Parameters
    ----------
    tone : object
        Any object exposing a ``tableone`` DataFrame whose row index has two
        levels (in this notebook, a ``TableOne`` instance). It is modified in
        place.
    count_row : str
        Literal substring identifying the level-0 labels to clean up.

    Returns
    -------
    object
        The same ``tone`` object, with ``tone.tableone.index`` rebuilt.
    """
    row_labels = tone.tableone.index.get_level_values(0)
    sub_labels = tone.tableone.index.get_level_values(1)

    # regex=False on both calls: the suffix contains parentheses and the unit
    # name is arbitrary text, so both must be matched literally regardless of
    # the pandas version's default for `regex`.
    is_count = row_labels.str.contains(count_row, regex=False)
    stripped = row_labels.str.replace(', mean (SD)', '', regex=False)
    new_labels = np.where(is_count, stripped, row_labels)

    # No level names are passed, so the rebuilt index is unnamed.
    tone.tableone.index = pd.MultiIndex.from_tuples(
        list(zip(new_labels, sub_labels.values)))
    return tone

def balance_table(df, unit_name, unit_abbr):
    """Build one balance table comparing units across the ``UCMP`` groups.

    Two ``TableOne`` summaries are computed, both grouped by ``UCMP``:

    * a per-unit table over rows de-duplicated on ``['transit', 'id']``,
      covering ``transit``, ``lat``, ``long``, ``landarea`` and the three
      ``*_distance`` columns;
    * a per-unit-day table over rows de-duplicated on
      ``['transit', 'id', 'UCMP', 'date', 'rides']``, covering ``rides``.

    Their ``tableone`` frames are concatenated and the rows reordered to
    follow ``key_order`` below.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel containing the columns named above.
    unit_name : str
        Label for the unit-count row (e.g. ``'tracts'``).
    unit_abbr : str
        Prefix for the unit-day count row, rendered as ``f'{unit_abbr}-days'``.

    Returns
    -------
    TableOne
        The per-unit ``TableOne`` object, with its ``tableone`` attribute
        replaced by the combined, reordered table.
    """
    space_data = df.drop_duplicates(['transit','id']).copy()
    dist_cols = ['train_distance','bus_distance','bike_distance']
    space_table = TableOne(space_data,
         categorical=['transit'],
         continuous=['lat','long','landarea'] + dist_cols,
         groupby='UCMP',
         rename={'UCMP':'Near DNC', 'n':unit_name, 'landarea':'sqrt(area)'},
         pval=True,
         missing=False,
         overall=False)

    space_table = fix_count_rows(space_table, unit_name)

    time_data = df.drop_duplicates(['transit','id','UCMP','date','rides']).copy()
    time_table = TableOne(time_data,
                      columns=['rides'],
                      groupby='UCMP',
                      rename={'UCMP':'Near DNC',
                              'n':f'{unit_abbr}-days',
                              'rides': 'daily rides'},
                      pval=True,
                      missing=False,
                      overall=False)

    time_table = fix_count_rows(time_table, unit_abbr)

    def rank_labels(labels, keys):
        """Rank each label by the position of the FIRST key it contains.

        Labels containing no key get rank 0. Using the first match (instead
        of summing the positions of all matches) keeps the ``unit_name`` row,
        which usually also contains ``unit_abbr``, from sharing a rank with
        the ``-days`` row.
        """
        ranks = np.zeros(len(labels), dtype=int)
        unranked = np.ones(len(labels), dtype=bool)
        for position, key in enumerate(keys):
            # Literal (non-regex) substring match; missing labels never match.
            hit = np.asarray(labels.str.contains(key, regex=False, na=False),
                             dtype=bool) & unranked
            ranks[hit] = position
            unranked &= ~hit
        return ranks

    key_order = [unit_name, unit_abbr, 'rides', 'transit',
                 'train_distance','bus_distance','bike_distance',
                 'area','lat','long']
    bal_table = pd.concat([space_table.tableone, time_table.tableone])
    # sort_index applies the key to each index level separately.
    bal_table = bal_table.sort_index(key=lambda labels: rank_labels(labels, key_order))

    # Hack to concat TableOne tables: reuse the first object as the container
    # so the caller can still use its rendering methods.
    space_table.tableone = bal_table
    return space_table

## Tract-level Model

In [89]:
# Tract-level balance table; count rows are labelled 'tracts' and 'tract-days'.
tract_balance = balance_table(df=tract_panel, unit_name='tracts', unit_abbr='tract')
print(tract_balance.tabulate(tablefmt='simple'))

                                  0                  1                P-Value
-------------------------  -----  -----------------  ---------------  ---------
tracts                            1898               91
tract-days                        113662             6950
daily rides, mean (SD)            358.3 (1549.0)     756.3 (1699.2)   <0.001
transit, n (%)             bike   600 (31.6)         37 (40.7)        <0.001
                           train  96 (5.1)           14 (15.4)
                           uber   1202 (63.3)        40 (44.0)
train_distance, mean (SD)         12163.8 (17080.3)  2216.0 (1015.3)  <0.001
bus_distance, mean (SD)           6383.0 (15019.5)   522.3 (346.5)    <0.001
bike_distance, mean (SD)          7302.1 (15832.1)   852.6 (514.5)    <0.001
sqrt(area), mean (SD)             1021.8 (607.8)     841.1 (328.1)    <0.001
lat, mean (SD)                    -0.1 (1.2)         -0.1 (0.2)       0.208
long, mean (SD)                   -0.3 (1.4)         0.5 (0.3)  

In [90]:
# Render the tract-level balance table in tabulate's "github" format and write
# it to the configured output path.
with open(tract_balance_out,"w") as f:
    f.write(tract_balance.tabulate(headers=['Not Near DNC', 'Near DNC', 'P-Value'],tablefmt="github"))
# Shell copy of the written file into a sibling checkout's uploads folder.
# NOTE(review): the destination lies outside this repository; presumably it
# must already exist on the machine running the notebook. Confirm.
!cp {tract_balance_out} ../../../../eric-mc2-cv/static/uploads/

## Nominal Model

In [91]:
def balance_transit(df, transit, unit_name):
    """Balance table restricted to a single transit mode.

    Rows of ``df`` whose ``transit`` equals ``transit`` are passed to
    ``balance_table`` with ``unit_name + 's'`` as the unit label and
    ``unit_name`` as the abbreviation. Rows whose level-0 label contains
    'transit' are then dropped, and level 1 of the row index is replaced by
    the single value ``transit``.

    Returns the object produced by ``balance_table`` with its ``tableone``
    attribute updated accordingly.
    """
    mode_rows = df.loc[df.transit == transit]
    mode_table = balance_table(mode_rows, unit_name + 's', unit_name)

    level0_labels = mode_table.tableone.index.get_level_values(0)
    is_transit_row = level0_labels.str.contains('transit')
    mode_table.tableone = mode_table.tableone[~is_transit_row]

    relabelled = mode_table.tableone.index.set_levels(levels=[transit], level=1)
    mode_table.tableone.index = relabelled
    return mode_table

In [100]:
# One balance table per transit mode. For train and bike, rows whose level-0
# label contains the mode's own name are dropped (presumably the mode's own
# *_distance row; confirm against the rendered table).
train_balance = balance_transit(model_panel, 'train', 'station').tableone
train_balance = train_balance.loc[~train_balance.index.get_level_values(0).str.contains('train')]

bike_balance = balance_transit(model_panel, 'bike', 'dock').tableone
bike_balance = bike_balance.loc[~bike_balance.index.get_level_values(0).str.contains('bike')]

uber_balance = balance_transit(model_panel, 'uber', 'tract').tableone

# Stack the three tables, drop the outer column level, and move the transit
# mode to the first column.
tbl = pd.concat([train_balance, bike_balance, uber_balance])
tbl.columns = tbl.columns.droplevel(0)
tbl = tbl.swaplevel(axis=0).reset_index()

# Show each transit mode only on its first row; blank out the repeats.
tbl['level_0'] = tbl['level_0'].mask(tbl['level_0'].duplicated(), '')
tabulate(tbl, headers=['','','Not Near DNC','Near DNC','P-Value'],tablefmt='html',showindex=False)

Unnamed: 0,Unnamed: 1,Not Near DNC,Near DNC,P-Value
train,stations,114,8,
,station-days,6270,420,
,"daily rides, mean (SD)",2718.4 (2337.9),1482.8 (995.9),<0.001
,"bus_distance, mean (SD)",305.0 (1472.7),165.3 (167.7),0.354
,"bike_distance, mean (SD)",387.6 (1535.0),288.5 (281.6),0.573
,"sqrt(area), mean (SD)",2521.0 (3723.0),3123.8 (3564.8),0.657
,"lat, mean (SD)",-0.1 (0.9),-0.3 (0.2),0.027
,"long, mean (SD)",-0.0 (1.0),0.2 (0.4),0.296
bike,docks,1459,47,
,dock-days,42316,2568,


In [97]:
# Flatten the table into plain row lists, inserting a tabulate SEPARATING_LINE
# before every row (other than the first) whose first cell is non-empty.
tbl_data = []
for position, cells in enumerate(tbl.to_numpy().tolist()):
    if position != 0 and cells[0] != '':
        tbl_data.append(SEPARATING_LINE)
    tbl_data.append(cells)

In [98]:
# Render the per-mode rows (with their separator entries) in tabulate's
# "github" format and write the result to the configured output path.
with open(model_balance_out,"w") as f:
    f.write(tabulate(tbl_data, headers=['','','Not Near DNC','Near DNC','P-Value'],tablefmt='github',showindex=False))
# Shell copy of the written file into a sibling checkout's uploads folder.
# NOTE(review): the destination lies outside this repository; presumably it
# must already exist on the machine running the notebook. Confirm.
!cp {model_balance_out} ../../../../eric-mc2-cv/static/uploads/