Spatial error model: the coefficients can be interpreted the same way as in a standard linear OLS model. Spatial lag or combined (lag + error) model: the coefficients cannot be interpreted this way, because of diffusion/spillover effects.

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import statsmodels.api as sm
from pysal.lib import weights
from pysal.model import spreg
from scipy.stats.mstats import zscore
from statsmodels.iolib.summary2 import summary_col
from statsmodels.tools.tools import add_constant

# input files: tract geometries (shapefile) and tract-level indicators (CSV)
shp_path = 'data/tracts_shapefile/tracts_shapefile.shp'
indicators_path = 'data/tracts_indicators_grades_eras_index.csv'
# name of the response (dependent variable) column used by every model below
response = 'vehicles_per_household'

# set to True to run the spatial-diagnostics cells (pysal OLS with Moran's I and robust LM tests)
spat_diag = False

In [2]:
# load the tract indicators; the identifier columns are read as strings rather than numbers
id_dtypes = {'geoid': str, 'state': str, 'county': str}
df = pd.read_csv(indicators_path, dtype=id_dtypes)
df.shape

(72663, 180)

In [3]:
# load the tract geometries, then key them by GEOID so tracts can be selected by id
gdf = gpd.read_file(shp_path)
gdf = gdf.set_index('GEOID')
gdf.shape

(74133, 12)

In [4]:
# restrict modeling to only urban tracts
# take an explicit copy: later cells add columns to df, and writing to a slice of the
# original frame triggers pandas' SettingWithCopyWarning
df = df.loc[df['is_urban'] == 1].copy()
df.shape

(46362, 180)

In [5]:
# ten variables most strongly correlated (in absolute value) with the response
# correlate only numeric/boolean columns: df also holds string columns (e.g. the
# identifier columns read as str), which make DataFrame.corr raise on pandas >= 2.0
df.select_dtypes(include=['number', 'bool']).corr()[response].abs().sort_values(ascending=False).head(10)

vehicles_per_household    1.000000
prop_single_fam           0.726769
vehicles_per_capita       0.711982
prop_drive_alone          0.659034
med_rooms_per_home        0.657760
available_vehicles        0.535651
year_median               0.525279
pop_density               0.520453
year_mean                 0.512673
prop_deadend              0.506544
Name: vehicles_per_household, dtype: float64

## Modeling

In [6]:
# era dummies: every 'dummy_ztrax_' column except the '_1939_earlier' one
era_primary_dummies = [col for col in df.columns
                       if 'dummy_ztrax_' in col and '_1939_earlier' not in col]

# state dummies: every state abbreviation present in the data except CA, sorted
states = df['state_abbrev'].unique()
state_dummies = sorted(set(states) - {'CA'})
len(state_dummies)

50

In [7]:
%%time
# create county dummies
# each county is identified by its state code + county code, concatenated as strings
df['st_county'] = df['state'].astype(str) + df['county'].astype(str)
counties = df['st_county'].unique()

# build every 0/1 indicator column in one vectorized call (one column per county, named
# by its code, in first-seen order) instead of inserting them one at a time
dummies = pd.get_dummies(df['st_county'], dtype=np.int64)[list(counties)]

# attach them with a single concat; any dummy columns left over from a previous run of
# this cell are dropped first so that re-running it does not duplicate columns
df = pd.concat([df.drop(columns=list(counties), errors='ignore'), dummies], axis=1)

# omit the first county encountered (the reference category) from the list of dummies
county_dummies = counties[1:].tolist()
#county_dummies = sorted([c for c in counties if c != '06037']) #all but LA county
print(len(county_dummies))

1382
Wall time: 27.6 s


In [8]:
# define which dummies to use as the spatial fixed effects
# if including both county + state, you'll get collinearity unless you drop one county from each state?
# (the state dummies are currently commented out, so only the county dummies are used)
fixed_effects = county_dummies #+ state_dummies
len(fixed_effects)

1382

In [9]:
def get_response_and_design(df, response, predictors, condition_number=True):
    """
    Assemble the response vector and design matrix for one model specification.

    Rows with a null or infinite value in the response or in any predictor are
    dropped, and predictors that are constant across the remaining rows are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        must contain `response`, every column in `predictors`, and a 'geoid' column
    response : str
        name of the response column
    predictors : list of str
        names of the candidate predictor columns
    condition_number : bool
        if True, also return the condition number of the z-scored design matrix

    Returns
    -------
    tuple
        (y, X, geoids, cn) if condition_number is True, otherwise (y, X, geoids),
        where geoids holds the 'geoid' values of the retained rows, in row order
    """
    # treat +/-inf as missing; only the model's own columns are cleaned, so the
    # rest of the dataframe is never copied
    X = df[list(predictors)].replace([np.inf, -np.inf], np.nan)
    y = df[response].replace([np.inf, -np.inf], np.nan)

    # keep only the rows that are complete in the response and in every predictor
    complete = (X.notnull().all(axis=1) & y.notnull()).values
    X = X[complete]
    y = y[complete]

    # drop columns that are constants (to prevent perfect collinearity)
    # this happens if a county has no observations, for instance
    X = X.loc[:, X.nunique() != 1]

    # what are the geoids of the observations retained in the response vector + design matrix?
    geoids = df['geoid'].values[complete]

    if condition_number:
        cn = np.linalg.cond(zscore(X))
        return y, X, geoids, cn
    else:
        return y, X, geoids

In [10]:
def make_pysal_table(model, precision=4, ignore=None):
    """
    Summarize a fitted pysal spreg model as a coefficient table.

    Parameters
    ----------
    model : fitted spreg model
        must expose betas, std_err, z_stat and name_x (name_z is preferred if present)
    precision : int
        number of decimal places to round every column to
    ignore : list of str, optional
        row labels (e.g. fixed-effect dummies) to leave out of the table; labels
        that are not in the table are skipped silently

    Returns
    -------
    pandas.DataFrame
        one row per coefficient with columns 'beta', 's.e.', 'z' and 'p'
    """
    # use name_z for the row labels when the model has it, otherwise fall back to name_x
    # only a missing attribute triggers the fallback; any other error propagates
    try:
        idx = model.name_z
    except AttributeError:
        idx = model.name_x

    # z_stat is a sequence of (statistic, p-value) pairs, one per coefficient
    z_stat = np.array(model.z_stat)
    table = pd.DataFrame({'beta' : model.betas.flatten(),
                          's.e.' : model.std_err,
                          'z'    : z_stat[:, 0],
                          'p'    : z_stat[:, 1]},
                          index=idx)

    if ignore is not None:
        to_drop = [c for c in ignore if c in table.index]
        table = table.drop(to_drop, axis='rows')

    return table.round(precision)

## Model 2

grid index + spatial fixed effects

In [11]:
%%time
# baseline specification: grid_index is the only regressor
regressors1 = ['grid_index']
# NOTE: the fixed-effect dummies are commented out here, so this model is estimated without them
predictors1 = regressors1 #+ fixed_effects
# y / X: response vector and design matrix; geoids: ids of the retained tracts; cn: condition number
y, X, geoids, cn = get_response_and_design(df, response, predictors1)
# response as a single-column DataFrame; its .values are what get passed to spreg in later cells
Y = pd.DataFrame(y)
print(cn)

1.0
Wall time: 2.38 s


In [12]:
%%time
# fit ordinary least squares; add_constant supplies the intercept column
result1 = sm.OLS(endog=y, exog=add_constant(X)).fit()
print(result1.rsquared)

0.22778378998518856
Wall time: 49 ms


In [13]:
%%time
# calculate spatial weights matrix for spatially-explicit alternative specification
# gdf.loc[geoids] selects the modeled tracts in the same order as the rows of X and y
W1 = weights.Queen.from_dataframe(gdf.loc[geoids], silence_warnings=True)
# 'r' = row-standardize the weights
W1.transform = 'r'

Wall time: 57.3 s


In [14]:
%%time
# first check ols diagnostics to see nature of spatial dependence
# (skipped unless spat_diag was set to True in the configuration cell)
if spat_diag:
    ols = spreg.ols.OLS(y=Y.values, x=X.values, w=W1, spat_diag=True, moran=True)
    # Moran's I on the OLS residuals
    print(ols.moran_res)
    # robust LM tests for a spatial lag vs. a spatial error process
    print(ols.rlm_lag, ols.rlm_error)

Wall time: 0 ns


In [15]:
%%time
# estimate the spatial error model on the same response, design matrix and weights
sp_er_model1 = spreg.GM_Error_Het(y=Y.values, x=X.values, w=W1, name_w='W1',
                                  name_x=X.columns.tolist(), name_y=response)
# coefficient table, leaving out any fixed-effect dummy rows
table1 = make_pysal_table(sp_er_model1, ignore=fixed_effects)
# pr2 is the model's pseudo R-squared
print('n =', len(X), 'R2 =', round(sp_er_model1.pr2, 3))
print('response =', y.name)
print(table1)

n = 45638 R2 = 0.228
response = vehicles_per_household
              beta    s.e.         z    p
CONSTANT    1.9049  0.0074  258.4514  0.0
grid_index -0.4867  0.0111  -43.8692  0.0
lambda      0.7844  0.0026  302.1510  0.0
Wall time: 850 ms


## Model 2a

grid index + additional controls + spatial fixed effects

In [16]:
# only 2000+ tracts
# (currently disabled: uncomment the two lines below to keep only tracts whose
# 2000-09 or 2010-later primary-era dummy equals 1)
#mask = (df['dummy_primary_prop_2000_09'] == 1) | (df['dummy_primary_prop_2010_later'] == 1)
#df = df[mask]
len(df)

46362

In [17]:
%%time
# Model 2a specification: grid index plus the control variables, grouped by theme
regressors2 = ['grid_index', #griddedness
               'pop_density', 'prop_single_fam', 'med_rooms_per_home', 'mean_household_size', #settlement density/scale
               'med_hh_income', 'mean_commute_time', #economic (income and job proximity)
               'intersect_density', 'length_mean', #street spatial scale
               'grade_mean'] #hilliness
# the fixed-effect dummies are appended after the substantive regressors
predictors2 = regressors2 + fixed_effects
# y / X: response vector and design matrix; geoids: ids of the retained tracts; cn: condition number
y, X, geoids, cn = get_response_and_design(df, response, predictors2)
# response as a single-column DataFrame; its .values are what get passed to spreg in later cells
Y = pd.DataFrame(y)
print(cn)

185.89881386489756
Wall time: 10.3 s


In [18]:
%%time
# fit ordinary least squares; add_constant supplies the intercept column
result2 = sm.OLS(endog=y, exog=add_constant(X)).fit()
print(result2.rsquared)

0.8580558126882654
Wall time: 15 s


In [19]:
%%time
# calculate spatial weights matrix for spatially-explicit alternative specification
# gdf.loc[geoids] selects the modeled tracts in the same order as the rows of X and y
W2 = weights.Queen.from_dataframe(gdf.loc[geoids], silence_warnings=True)
# 'r' = row-standardize the weights
W2.transform = 'r'

Wall time: 52.3 s


In [20]:
%%time
# check ols diagnostics to see nature of spatial dependence
# (skipped unless spat_diag was set to True in the configuration cell)
if spat_diag:
    ols = spreg.ols.OLS(y=Y.values, x=X.values, w=W2, spat_diag=True, moran=True)
    # Moran's I on the OLS residuals
    print(ols.moran_res)
    # robust LM tests for a spatial lag vs. a spatial error process
    print(ols.rlm_lag, ols.rlm_error)

Wall time: 0 ns


In [21]:
%%time
# estimate the spatial error model on the same response, design matrix and weights
sp_er_model2 = spreg.GM_Error_Het(y=Y.values, x=X.values, w=W2, name_w='W2',
                                  name_x=X.columns.tolist(), name_y=response)
# coefficient table, leaving out the fixed-effect dummy rows
table2 = make_pysal_table(sp_er_model2, ignore=fixed_effects)
# pr2 is the model's pseudo R-squared
print('n =', len(X), 'R2 =', round(sp_er_model2.pr2, 3))
print('response =', y.name)
print(table2)

n = 45594 R2 = 0.853
response = vehicles_per_household
                       beta    s.e.         z       p
CONSTANT             0.8150  0.1798    4.5314  0.0000
grid_index          -0.1809  0.0069  -26.1874  0.0000
pop_density         -0.0055  0.0003  -16.7989  0.0000
prop_single_fam      0.4679  0.0065   71.7981  0.0000
med_rooms_per_home   0.0269  0.0018   15.3162  0.0000
mean_household_size  0.1885  0.0031   60.6500  0.0000
med_hh_income        0.0036  0.0001   56.0603  0.0000
mean_commute_time   -0.0031  0.0002  -12.6230  0.0000
intersect_density   -0.0006  0.0001  -12.7654  0.0000
length_mean          0.0003  0.0000    7.4102  0.0000
grade_mean          -0.2339  0.1150   -2.0342  0.0419
lambda               0.6319  0.0046  137.7427  0.0000
Wall time: 15.5 s


In [22]:
%%time
# re-estimate as standardized regression
# both the response and every design-matrix column are z-scored before fitting, so each
# beta is expressed in standard-deviation units; the weights matrix W2 is reused unchanged
sp_er_model2_std = spreg.GM_Error_Het(y=zscore(Y), x=zscore(X), w=W2, name_w='W2',
                                      name_x=X.columns.tolist(), name_y=response)
# coefficient table, leaving out the fixed-effect dummy rows
table2_std = make_pysal_table(sp_er_model2_std, ignore=fixed_effects)
print('n =', len(X), 'R2 =', round(sp_er_model2_std.pr2, 3))
print('response =', y.name)
print(table2_std)

n = 45594 R2 = 0.853
response = vehicles_per_household
                       beta    s.e.         z       p
CONSTANT             0.0091  0.0039    2.3232  0.0202
grid_index          -0.0812  0.0031  -26.1874  0.0000
pop_density         -0.0635  0.0038  -16.7989  0.0000
prop_single_fam      0.2996  0.0042   71.7981  0.0000
med_rooms_per_home   0.0725  0.0047   15.3162  0.0000
mean_household_size  0.2352  0.0039   60.6500  0.0000
med_hh_income        0.2738  0.0049   56.0603  0.0000
mean_commute_time   -0.0513  0.0041  -12.6230  0.0000
intersect_density   -0.0443  0.0035  -12.7654  0.0000
length_mean          0.0219  0.0030    7.4102  0.0000
grade_mean          -0.0066  0.0032   -2.0342  0.0419
lambda               0.6319  0.0046  137.7427  0.0000
Wall time: 14.8 s


## Model 2b

grid index components + additional controls + spatial fixed effects

In [23]:
%%time
# Model 2b specification: the grid index is replaced by its three components,
# with the same control variables as Model 2a
regressors3 = ['orientation_order', 'prop_4way', 'straightness', #grid index components
               'pop_density', 'prop_single_fam', 'med_rooms_per_home', 'mean_household_size', #settlement density/scale
               'med_hh_income', 'mean_commute_time', #economic (income and job proximity)
               'intersect_density', 'length_mean', #street spatial scale
               'grade_mean'] #hilliness
# the fixed-effect dummies are appended after the substantive regressors
predictors3 = regressors3 + fixed_effects
# y / X: response vector and design matrix; geoids: ids of the retained tracts; cn: condition number
y, X, geoids, cn = get_response_and_design(df, response, predictors3)
# response as a single-column DataFrame; its .values are what get passed to spreg in later cells
Y = pd.DataFrame(y)
print(cn)

201.72532394781297
Wall time: 9.22 s


In [24]:
%%time
# fit ordinary least squares; add_constant supplies the intercept column
result3 = sm.OLS(endog=y, exog=add_constant(X)).fit()
print(result3.rsquared)

0.8589490401813067
Wall time: 14.9 s


In [25]:
%%time
# calculate spatial weights matrix for spatially-explicit alternative specification
# gdf.loc[geoids] selects the modeled tracts in the same order as the rows of X and y
W3 = weights.Queen.from_dataframe(gdf.loc[geoids], silence_warnings=True)
# 'r' = row-standardize the weights
W3.transform = 'r'

Wall time: 1min 1s


In [26]:
%%time
# check ols diagnostics to see nature of spatial dependence
# (skipped unless spat_diag was set to True in the configuration cell)
if spat_diag:
    ols = spreg.ols.OLS(y=Y.values, x=X.values, w=W3, spat_diag=True, moran=True)
    # Moran's I on the OLS residuals
    print(ols.moran_res)
    # robust LM tests for a spatial lag vs. a spatial error process
    print(ols.rlm_lag, ols.rlm_error)

Wall time: 0 ns


In [27]:
%%time
# estimate the spatial error model on the same response, design matrix and weights
sp_er_model3 = spreg.GM_Error_Het(y=Y.values, x=X.values, w=W3, name_w='W3',
                                  name_x=X.columns.tolist(), name_y=response)
# coefficient table, leaving out the fixed-effect dummy rows
table3 = make_pysal_table(sp_er_model3, ignore=fixed_effects)
# pr2 is the model's pseudo R-squared
print('n =', len(X), 'R2 =', round(sp_er_model3.pr2, 3))
print('response =', y.name)
print(table3)

n = 45594 R2 = 0.854
response = vehicles_per_household
                       beta    s.e.         z       p
CONSTANT             0.9868  0.1806    5.4634  0.0000
orientation_order   -0.0336  0.0045   -7.4113  0.0000
prop_4way           -0.1263  0.0073  -17.4022  0.0000
straightness        -0.2215  0.0239   -9.2577  0.0000
pop_density         -0.0053  0.0003  -16.2116  0.0000
prop_single_fam      0.4716  0.0065   72.6821  0.0000
med_rooms_per_home   0.0272  0.0018   15.5083  0.0000
mean_household_size  0.1880  0.0031   60.5506  0.0000
med_hh_income        0.0036  0.0001   55.8659  0.0000
mean_commute_time   -0.0031  0.0002  -12.6301  0.0000
intersect_density   -0.0006  0.0001  -12.7543  0.0000
length_mean          0.0003  0.0000    7.1797  0.0000
grade_mean          -0.2202  0.1149   -1.9167  0.0553
lambda               0.6286  0.0046  136.3195  0.0000
Wall time: 15.3 s


In [28]:
%%time
# re-estimate as standardized regression
# both the response and every design-matrix column are z-scored before fitting, so each
# beta is expressed in standard-deviation units; the weights matrix W3 is reused unchanged
sp_er_model3_std = spreg.GM_Error_Het(y=zscore(Y), x=zscore(X), w=W3, name_w='W3',
                                      name_x=X.columns.tolist(), name_y=response)
# coefficient table, leaving out the fixed-effect dummy rows
table3_std = make_pysal_table(sp_er_model3_std, ignore=fixed_effects)
print('n =', len(X), 'R2 =', round(sp_er_model3_std.pr2, 3))
print('response =', y.name)
print(table3_std)

n = 45594 R2 = 0.854
response = vehicles_per_household
                       beta    s.e.         z       p
CONSTANT             0.0087  0.0039    2.2450  0.0248
orientation_order   -0.0226  0.0031   -7.4113  0.0000
prop_4way           -0.0526  0.0030  -17.4022  0.0000
straightness        -0.0222  0.0024   -9.2577  0.0000
pop_density         -0.0612  0.0038  -16.2116  0.0000
prop_single_fam      0.3020  0.0042   72.6821  0.0000
med_rooms_per_home   0.0733  0.0047   15.5083  0.0000
mean_household_size  0.2346  0.0039   60.5506  0.0000
med_hh_income        0.2743  0.0049   55.8659  0.0000
mean_commute_time   -0.0513  0.0041  -12.6301  0.0000
intersect_density   -0.0439  0.0034  -12.7543  0.0000
length_mean          0.0209  0.0029    7.1797  0.0000
grade_mean          -0.0062  0.0032   -1.9167  0.0553
lambda               0.6286  0.0046  136.3195  0.0000
Wall time: 14.8 s


## Log-Log (for elasticity)

In [29]:
# work on a copy of df that carries natural-log versions of the grid index and the response
# (np.log yields -inf/NaN for non-positive inputs; get_response_and_design drops those rows)
log_columns = {'grid_index_log': np.log(df['grid_index']),
               response + '_log': np.log(df[response])}
df_log = df.assign(**log_columns)

In [30]:
%%time
# log-log specification: logged grid index and logged response, with the
# control variables entered in levels
regressors_log = ['grid_index_log', #griddedness
               'pop_density', 'prop_single_fam', 'med_rooms_per_home', 'mean_household_size', #settlement density/scale
               'med_hh_income', 'mean_commute_time', #economic (income and job proximity)
               'intersect_density', 'length_mean', #street spatial scale
               'grade_mean'] #hilliness
# the fixed-effect dummies are appended after the substantive regressors
predictors_log = regressors_log + fixed_effects
# note: geoids and cn are overwritten here with the values for the log-log sample
y_log, X_log, geoids, cn = get_response_and_design(df_log, response + '_log', predictors_log)
# response as a single-column DataFrame; its .values are what get passed to spreg in later cells
Y_log = pd.DataFrame(y_log)
print(cn)

185.30444398688346
Wall time: 9.71 s


In [31]:
%%time
# calculate spatial weights matrix for spatially-explicit alternative specification
# gdf.loc[geoids] selects the modeled tracts in the same order as the rows of X_log and y_log
W_log = weights.Queen.from_dataframe(gdf.loc[geoids], silence_warnings=True)
# 'r' = row-standardize the weights
W_log.transform = 'r'

Wall time: 54.9 s


In [32]:
%%time
# estimate the spatial error model for the log-log specification
sp_er_model_log = spreg.GM_Error_Het(y=Y_log.values, x=X_log.values, w=W_log, name_w='W_log',
                                     name_x=X_log.columns.tolist(), name_y=response + '_log')
# coefficient table, leaving out the fixed-effect dummy rows
table_log = make_pysal_table(sp_er_model_log, ignore=fixed_effects)
# pr2 is the model's pseudo R-squared
print(sp_er_model_log.pr2)
print(table_log)

0.8207154638120562
                       beta    s.e.         z       p
CONSTANT            -0.0810  0.1083   -0.7479  0.4545
grid_index_log      -0.0316  0.0024  -13.0004  0.0000
pop_density         -0.0097  0.0006  -15.2286  0.0000
prop_single_fam      0.2735  0.0050   54.1870  0.0000
med_rooms_per_home   0.0115  0.0015    7.6990  0.0000
mean_household_size  0.1287  0.0026   49.9644  0.0000
med_hh_income        0.0026  0.0001   41.1741  0.0000
mean_commute_time   -0.0031  0.0002  -12.7671  0.0000
intersect_density   -0.0003  0.0001   -6.2019  0.0000
length_mean          0.0002  0.0000    4.9316  0.0000
grade_mean           0.1588  0.1057    1.5026  0.1329
lambda               0.6815  0.0059  115.3337  0.0000
Wall time: 14.7 s


## Results table

In [33]:
# number of observations and pseudo R-squared for the two spatial error models
for fitted in (sp_er_model2, sp_er_model3):
    print(fitted.n, fitted.pr2)

45594 0.8534081829955339
45594 0.8543088196769005


In [34]:
# spatially explicit estimates
def str_format(x):
    """
    Format one table cell as text.

    Nulls become a single space, values whose magnitude is below 0.0001 become
    '<0.0001', and everything else is rendered with four decimal places.
    """
    if pd.isnull(x):
        return ' '
    if np.abs(x) < 0.0001:
        return '<0.0001'
    return f'{x:0.4f}'
    
# row order for the combined table: intercept, each regressor once (first-seen order), then lambda
regressors = list(dict.fromkeys(['CONSTANT'] + regressors2 + regressors3 + ['lambda']))
# put the Model 2a and Model 2b estimates side by side, keeping rows that appear in either table
table = table2.merge(table3, left_index=True, right_index=True, how='outer')
table = table.reindex(regressors).applymap(str_format)
table.to_csv('data/table2.csv', index=True)
table

Unnamed: 0,beta_x,s.e._x,z_x,p_x,beta_y,s.e._y,z_y,p_y
CONSTANT,0.815,0.1798,4.5314,<0.0001,0.9868,0.1806,5.4634,<0.0001
grid_index,-0.1809,0.0069,-26.1874,<0.0001,,,,
pop_density,-0.0055,0.0003,-16.7989,<0.0001,-0.0053,0.0003,-16.2116,<0.0001
prop_single_fam,0.4679,0.0065,71.7981,<0.0001,0.4716,0.0065,72.6821,<0.0001
med_rooms_per_home,0.0269,0.0018,15.3162,<0.0001,0.0272,0.0018,15.5083,<0.0001
mean_household_size,0.1885,0.0031,60.65,<0.0001,0.188,0.0031,60.5506,<0.0001
med_hh_income,0.0036,0.0001,56.0603,<0.0001,0.0036,0.0001,55.8659,<0.0001
mean_commute_time,-0.0031,0.0002,-12.623,<0.0001,-0.0031,0.0002,-12.6301,<0.0001
intersect_density,-0.0006,0.0001,-12.7654,<0.0001,-0.0006,0.0001,-12.7543,<0.0001
length_mean,0.0003,<0.0001,7.4102,<0.0001,0.0003,<0.0001,7.1797,<0.0001


In [35]:
# plain old OLS estimates and R2s
results = [result1, result2, result3]
# every regressor once, in first-seen order, with the intercept first
regressors = list(dict.fromkeys(['const'] + regressors1 + regressors2 + regressors3))
for fitted_ols in results:
    print(round(fitted_ols.rsquared, 4))
summary_col(results=results, regressor_order=regressors, drop_omitted=True, stars=True)

0.2278
0.8581
0.8589


0,1,2,3
,vehicles_per_household I,vehicles_per_household II,vehicles_per_household III
const,2.1651***,0.8271***,1.1625***
,(0.0048),(0.0775),(0.0809)
grid_index,-1.0639***,-0.3002***,
,(0.0092),(0.0063),
pop_density,,-0.0082***,-0.0079***
,,(0.0003),(0.0003)
prop_single_fam,,0.4958***,0.5009***
,,(0.0055),(0.0055)
med_rooms_per_home,,0.0210***,0.0220***
