In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import statsmodels.api as sm
from pysal.lib import weights
from pysal.model import spreg
from scipy.stats.mstats import zscore
from statsmodels.iolib.summary2 import summary_col
from statsmodels.tools.tools import add_constant
np.random.seed(0)

shp_path = 'data/tracts_shapefile/tracts_shapefile.shp'
indicators_path = 'data/tracts_indicators_grades_eras_index.csv'
response = 'grid_index'

spat_diag = False

In [2]:
df = pd.read_csv(indicators_path, dtype={'geoid':str, 'state':str, 'county':str})
df.shape

(72663, 180)

In [3]:
gdf = gpd.read_file(shp_path).set_index('GEOID')
gdf.shape

(74133, 12)

In [4]:
# restrict modeling to only urban tracts
df = df[df['is_urban'] == 1]
df.shape

(46362, 180)

In [5]:
df.corr()[response].abs().sort_values(ascending=False).head(10)

grid_index             1.000000
grid_index_norm        0.999998
grid_index_quant       0.929088
prop_4way              0.903783
orientation_entropy    0.869148
orientation_order      0.867404
length_entropy_log     0.708974
prop_deadend           0.690686
straightness           0.660091
circuity_avg           0.604696
Name: grid_index, dtype: float64

## Modeling

In [6]:
# identify the era dummies in the dataframe
era_primary_dummies = sorted([c for c in df.columns if 'dummy_primary_' in c and '_1939_earlier' not in c])

# get the state dummies
states = df['state_abbrev'].unique()
state_dummies = states[1:].tolist()
#state_dummies = sorted([s for s in states if s != 'CA']) #all but CA
len(state_dummies) # there are 51 states + DC

50

In [7]:
%%time
# create county dummies
df['st_county'] = df['state'].astype(str) + df['county'].astype(str)
counties = df['st_county'].unique()
for county in counties:
    df[county] = df['st_county'].map(lambda x: 1 if x==county else 0)

county_dummies = counties[1:].tolist()
#county_dummies = sorted([c for c in counties if c != '06037']) #all but LA county
print(len(county_dummies))

1382
Wall time: 28.2 s


In [8]:
# define which dummies to use as the spatial fixed effects
# if including both county + state, you'll get colinearity unless you drop one county from each state?
fixed_effects = county_dummies #+ state_dummies
len(fixed_effects)

1382

In [9]:
def get_response_and_design(df, response, predictors, condition_number=True):
    
    # select predictors and drop any rows with nulls in the response or predictors
    df_model = df.replace([np.inf, -np.inf], np.nan).dropna(subset=predictors + [response])

    # create design matrix and response vector (and response as matrix for pysal)
    X = df_model[predictors]
    y = df_model[response]

    # drop columns that are constants (to prevent perfect colinearity)
    # this happens if a county has no observations, for instance
    X = X.loc[:, X.nunique() != 1]
    
    # what are the geoids of the observations retained in the response vector + design matrix?
    geoids = df_model['geoid'].values
    
    if condition_number:
        cn = np.linalg.cond(zscore(X))
        return y, X, geoids, cn
    else:
        return y, X, geoids

In [10]:
def make_pysal_table(model, precision=4, ignore=None):
    
    try:
        idx = model.name_z
    except:
        idx = model.name_x
    
    z_stat = np.array(model.z_stat)
    table = pd.DataFrame({'beta' : model.betas.flatten(),
                          's.e.' : model.std_err,
                          'z'    : z_stat[:, 0],
                          'p'    : z_stat[:, 1]}, 
                          index=idx)
    
    if ignore is not None:
        to_drop = [c for c in ignore if c in table.index]
        table = table.drop(to_drop, axis='rows')
    
    return table.round(precision)

## Model 0

check the grid index vs its components to check its validity

In [11]:
%%time
regressors0 = ['orientation_order', 'prop_4way', 'straightness']
predictors0 = regressors0
y, X, geoids, cn = get_response_and_design(df, response, predictors0)
print(cn)

2.4164221613078656
Wall time: 1.8 s


In [12]:
%%time
# estimate the model with OLS
result0 = sm.OLS(y, add_constant(X)).fit()
print(result0.rsquared)

0.9722283850827019
Wall time: 43 ms


## Model 1a: OLS

predict grid index from era dummies, spatial fixed effects, and controls

In [13]:
%%time
regressors1 = ['aland', 'pop_density', 'prop_single_fam', 'med_rooms_per_home', #settlement density/scale
               'intersect_density', 'length_mean', #street spatial scale
               'elevations_iqr', 'grade_mean'] #hilliness
predictors1 = era_primary_dummies + regressors1 + fixed_effects
y, X, geoids, cn = get_response_and_design(df, response, predictors1)
print(cn)

186.07665469560646
Wall time: 9.64 s


In [14]:
%%time
# estimate the model with OLS
result1 = sm.OLS(y, add_constant(X)).fit(cov_type='cluster', cov_kwds={'groups':df.reindex(X.index)['st_county']}) #cluster robust SEs
print(result1.rsquared)

0.6759778264157833
Wall time: 16.2 s


In [15]:
regressors = pd.Series(['const'] + era_primary_dummies + regressors1).drop_duplicates(keep='first').tolist()
print(summary_col(results=result1, regressor_order=regressors, drop_omitted=True, stars=True))


                              grid_index
----------------------------------------
const                         0.6727*** 
                              (0.0183)  
dummy_primary_prop_1940_49    -0.0396***
                              (0.0073)  
dummy_primary_prop_1950_59    -0.0848***
                              (0.0045)  
dummy_primary_prop_1960_69    -0.1422***
                              (0.0064)  
dummy_primary_prop_1970_79    -0.1746***
                              (0.0071)  
dummy_primary_prop_1980_89    -0.1984***
                              (0.0070)  
dummy_primary_prop_1990_99    -0.1952***
                              (0.0072)  
dummy_primary_prop_2000_09    -0.1566***
                              (0.0080)  
dummy_primary_prop_2010_later -0.1160***
                              (0.0120)  
aland                         -5.8740***
                              (0.7031)  
pop_density                   0.0048*** 
                              (0.0012)  
prop_single_fam

## Model 1b: spatially explicit version

In [16]:
%%time
# calculate spatial weights matrix for spatially-explicit alternative specification
W1 = weights.Queen.from_dataframe(gdf.loc[geoids], silence_warnings=True)
W1.transform = 'r'

Wall time: 51.5 s


In [17]:
%%time
# check ols diagnostics to see nature of spatial dependence
Y = pd.DataFrame(y)
if spat_diag:
    ols = spreg.ols.OLS(y=Y.values, x=X.values, w=W1, spat_diag=True, moran=True)
    print(ols.moran_res)
    print(ols.rlm_lag, ols.rlm_error)

Wall time: 12 ms


In [18]:
%%time
# spatial lag model uses w*Y as endogenous var
wY = weights.lag_spatial(W1, Y)

# use w*X and w*w*X as instruments for 2SLS
# do not include spatial fixed effects (as w*X wouldn't make sense with them in it)
wX = weights.lag_spatial(W1, X[regressors1 + era_primary_dummies])
wwX = weights.lag_spatial(W1, wX)
q = np.concatenate([wX, wwX], axis=1)

Wall time: 270 ms


In [19]:
%%time
# spatial lag model via TSLS
# can't use GM_Lag here, because it doesn't let you use a reduced set of X's columns as instruments (to avoid including spatial fixed effects)
tsls_model1 = spreg.twosls.TSLS(y=Y.values, x=X.values, w=W1, name_w='W1', yend=wY, q=q, robust='white', spat_diag=True,
                                name_x=X.columns.tolist(), name_y=response)

table1 = make_pysal_table(tsls_model1, ignore=fixed_effects)
print(tsls_model1.pr2)
print(tsls_model1.n)
print(table1)

0.7371895795439457
46208
                                 beta    s.e.        z       p
CONSTANT                       0.4633  0.0421  11.0146  0.0000
dummy_primary_prop_1940_49    -0.0352  0.0036  -9.7613  0.0000
dummy_primary_prop_1950_59    -0.0686  0.0021 -33.0310  0.0000
dummy_primary_prop_1960_69    -0.1113  0.0025 -44.4440  0.0000
dummy_primary_prop_1970_79    -0.1340  0.0025 -53.5477  0.0000
dummy_primary_prop_1980_89    -0.1513  0.0028 -54.0873  0.0000
dummy_primary_prop_1990_99    -0.1488  0.0030 -49.0817  0.0000
dummy_primary_prop_2000_09    -0.1184  0.0033 -36.1642  0.0000
dummy_primary_prop_2010_later -0.0901  0.0077 -11.6387  0.0000
aland                         -5.0168  0.2490 -20.1480  0.0000
pop_density                    0.0039  0.0003  13.7185  0.0000
prop_single_fam                0.0457  0.0036  12.7644  0.0000
med_rooms_per_home            -0.0220  0.0008 -27.4103  0.0000
intersect_density              0.0009  0.0000  20.8408  0.0000
length_mean                   

Above estimation uses 'white' method to compute robust std errors. This is important as heterogeneity otherwise will make the SEs unrealistically small and artificially inflate our precision. White is robust against heterogeneity across the dataset, but may need cluster-robust method instead to account for heterogeneity within/between groups (as our fixed effects are county-level)? statsmodels offer cluster-robust std err calculation (see earlier model above), but pysal doesn't seem to?

In [20]:
# if significant, spatial error autocorrelation is present
tsls_model1.ak_test

# you might want a combo model if ak stat is significant
# and if the combo model parameter estimates differ greatly from the TSLS spatial lag model
# you can't run GM_Combo_Het with spatial fixed effects 
# because it doesn't let you use a reduced set of X's columns as instruments (to avoid including spatial fixed effects)
# instead, mimic the TSLS approach above, via GM_Endog_Error_Het
# (but if you run the line below, you see similar parameter estimates, so this is unnecessary)
#combo_model = spreg.GM_Endog_Error_Het(y=Y.values, x=X.values, w=W1, name_w='W1', yend=wY, q=q,
#                                       name_x=X.columns.tolist(), name_y=response)

(361.6331636474842, 1.2415650508681393e-80)

In [21]:
table1.to_csv('data/table1.csv', index=True)

#### Interpret total, direct, and indirect effects of spatial lag model

Spatial lag model coefficients are not purely marginal effects:

  - we're also interested in the total effect of a unit change in predictor k on the response
  - total effect = direct effect (k's estimated coefficient) + indirect effect (spatial spillover)
  - total effect is the change in response if you make a unit change in k at all locations simultaneously
  - direct effect is what happens locally if you make a change (in k) at that one location
  - indirect effect is the local effect of spillover from making that change at all other locations

In [22]:
def impacts(variable, model):
    idx = model.name_x.index(variable)
    direct_effect = model.betas[idx][0]
    rho = tsls_model1.betas[-1, 0]
    total_effect = direct_effect / (1 - rho)
    indirect_effect = total_effect - direct_effect
    return total_effect, direct_effect, indirect_effect

effects = {}
for variable in [c for c in tsls_model1.name_x if c not in fixed_effects]:
    effects[variable] = impacts(variable, tsls_model1)
pd.DataFrame(effects, index=['TE', 'DE', 'IE']).T

Unnamed: 0,TE,DE,IE
CONSTANT,0.693217,0.463314,0.229904
dummy_primary_prop_1940_49,-0.052741,-0.03525,-0.017491
dummy_primary_prop_1950_59,-0.102676,-0.068624,-0.034052
dummy_primary_prop_1960_69,-0.1665,-0.111281,-0.055219
dummy_primary_prop_1970_79,-0.200532,-0.134026,-0.066506
dummy_primary_prop_1980_89,-0.226336,-0.151273,-0.075064
dummy_primary_prop_1990_99,-0.222687,-0.148834,-0.073854
dummy_primary_prop_2000_09,-0.177219,-0.118445,-0.058774
dummy_primary_prop_2010_later,-0.134873,-0.090143,-0.04473
aland,-7.506238,-5.016816,-2.489422
