In [1]:
import geopandas as gpd
import json
import numpy as np
import pandas as pd
from scipy.stats.mstats import zscore
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from statsmodels.tools.tools import add_constant
import matplotlib.pyplot as plt

# path to the input CSV of indicators that the load-data cell reads
indicators_path = 'data/tracts_indicators_grades_eras_index.csv'
# NOTE(review): crs is not referenced in any cell visible here -- confirm it is still needed
crs = {'init':'epsg:4326'}

# column name of the dependent variable used by every regression below
response = 'grid_index_geom'

## Load data

In [2]:
# force geoid to be read as a string instead of letting pandas infer its type
df = pd.read_csv(indicators_path, dtype={'geoid':str})
# row count of the raw table
len(df)

72663

In [3]:
# rank variables by absolute correlation with the response. restrict to numeric/bool
# columns explicitly: df holds string columns (geoid, state_abbrev) and pandas >= 2.0
# raises on those in corr() instead of silently skipping them
df.select_dtypes(include=['number', 'bool']).corr()[response].abs().sort_values(ascending=False).head(10)

grid_index_geom        1.000000
grid_index             0.954403
prop_4way              0.900373
rho                    0.899478
orientation_entropy    0.891207
prop_deadend           0.735248
length_entropy_log     0.690278
straightness           0.663511
k_avg                  0.627165
circuity_avg           0.608322
Name: grid_index_geom, dtype: float64

## Prepare for modeling

In [4]:
# helper function for model output
def make_table(result):
    """
    Build a compact, string-formatted coefficient table from a fitted model.

    Parameters
    ----------
    result : fitted regression results object
        Must expose `params` and `pvalues` as pandas Series sharing one index,
        plus either `bse` (standard errors) or `tvalues`.

    Returns
    -------
    pandas.DataFrame
        Indexed like `result.params`, with two string columns:
        'params' -- coefficient to 4 decimals followed by a 3-character
                    significance marker ('***' p<0.001, '** ' p<0.01,
                    '*  ' p<0.05, otherwise three spaces)
        'se'     -- standard error to 4 decimals
        Any value whose magnitude is below 0.0001 is shown as '<0.0001',
        regardless of its sign.
    """
    def fmt(x):
        # values that would round to +/-0.0000 are shown as '<0.0001' instead
        return '<0.0001' if abs(x) < 0.0001 else f'{x:.4f}'

    def significance(p):
        if p < 0.001:
            return '***'
        elif p < 0.01:
            return '** '
        elif p < 0.05:
            return '*  '
        else:
            return '   '

    # read the standard errors directly when available: recovering them as
    # params / tvalues yields NaN (0/0) for any coefficient that is exactly zero
    se = getattr(result, 'bse', None)
    if se is None:
        se = result.params / result.tvalues

    stars = result.pvalues.map(significance)
    table = pd.DataFrame({'params': result.params.map(fmt) + stars,
                          'se': se.map(fmt)})
    return table.reindex(columns=['params', 'se'])

In [5]:
# identify the state dummies in the dataframe
# RI is left out of the list so the remaining dummies are not perfectly collinear
states = df['state_abbrev'].unique()
state_dummies = [abbrev for abbrev in states if abbrev != 'RI']
print(len(states), len(state_dummies))

51 50


In [6]:
# identify the era dummies in the dataframe: one list per era-assignment scheme,
# each leaving out the 1939-or-earlier dummy
def _era_dummy_columns(marker):
    """Return the df column names that contain `marker` but not '_1939_earlier'."""
    return [col for col in df.columns if marker in col and '_1939_earlier' not in col]

era_primary_dummies = _era_dummy_columns('dummy_primary_')
era_plurality_dummies = _era_dummy_columns('dummy_plurality_')
era_majority_dummies = _era_dummy_columns('dummy_majority_')
era_earliest_dummies = _era_dummy_columns('dummy_earliest_')
len(df)

72663

In [7]:
# treat +/-inf as missing everywhere, then drop rows that have no response value
df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=[response])
len(df)

72659

## Regression model 0

Regress the grid index on its components to check its validity.

In [8]:
# the index components used as the only predictors in this validity check
predictors = ['rho', 'prop_4way', 'straightness']
# treat +/-inf as missing, then keep only rows where every predictor is present
df_model = df.replace([np.inf, -np.inf], np.nan).dropna(subset=predictors)
# design matrix and response vector for the rows that survive
X = df_model[predictors]
y = df_model[response]

In [9]:
# estimate the model by OLS across all tracts that have complete predictors
# add_constant supplies the intercept column (reported as 'const' in the summary)
Xc = add_constant(X)
model = sm.OLS(y, Xc)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:        grid_index_geom   R-squared:                       0.972
Model:                            OLS   Adj. R-squared:                  0.972
Method:                 Least Squares   F-statistic:                 8.471e+05
Date:                Tue, 15 Jan 2019   Prob (F-statistic):               0.00
Time:                        17:42:54   Log-Likelihood:             1.4082e+05
No. Observations:               72659   AIC:                        -2.816e+05
Df Residuals:                   72655   BIC:                        -2.816e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -0.1338      0.003    -40.495   

## Regression model 1

by primary decade

In [10]:
physical_predictors = ['aland', 'total_pop_k', #city size, spatial extent
                       'is_urban', 'prop_single_fam', 'med_rooms_per_home', #settlement density/scale
                       'intersect_density', 'length_mean', #'length_entropy_log', #street spatial scale
                       'prop_deadend', 'k_avg', #urban fabric connectivity
                       'elevations_iqr', 'grade_mean'] #hilliness

# put the length_mean regression coefficient in units of km (assumes the CSV stores meters -- confirm)
# the raw values are preserved in their own column so that re-running this cell
# always rescales from the original data instead of dividing by 1000 a second time
if 'length_mean_raw' not in df.columns:
    df['length_mean_raw'] = df['length_mean']
df['length_mean'] = df['length_mean_raw'] / 1000

In [11]:
# physical predictors + primary-decade era dummies + state dummies
predictors = physical_predictors + sorted(era_primary_dummies) + sorted(state_dummies)
# treat +/-inf as missing, then keep only rows where every predictor is present
df_model = df.replace([np.inf, -np.inf], np.nan).dropna(subset=predictors)
X = df_model[predictors]
y = df_model[response]

In [12]:
# condition number <20 means low multicollinearity
# computed on the z-scored predictors rather than on the raw columns
print(np.linalg.cond(zscore(X)))

35.3570366005539


In [13]:
# estimate the model by OLS on every tract that has complete predictors
# add_constant supplies the intercept column (reported as 'const' in the summary)
Xc = add_constant(X) 
model = sm.OLS(y, Xc)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:        grid_index_geom   R-squared:                       0.731
Model:                            OLS   Adj. R-squared:                  0.731
Method:                 Least Squares   F-statistic:                     2836.
Date:                Tue, 15 Jan 2019   Prob (F-statistic):               0.00
Time:                        17:42:56   Log-Likelihood:                 57942.
No. Observations:               72082   AIC:                        -1.157e+05
Df Residuals:                   72012   BIC:                        -1.151e+05
Df Model:                          69                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const         

In [14]:
# full-precision AIC and log-likelihood (the summary above rounds them)
print(result.aic) 
print(result.llf)

-115744.3318350598
57942.1659175299


In [15]:
# formatted coefficient table for model 1 (primary-decade dummies)
table1 = make_table(result)

## Regression model 2

by plurality decade

In [16]:
# physical predictors + plurality-decade era dummies + state dummies
predictors = physical_predictors + sorted(era_plurality_dummies) + sorted(state_dummies)
# treat +/-inf as missing, then keep only rows where every predictor is present
df_model = df.replace([np.inf, -np.inf], np.nan).dropna(subset=predictors)
X = df_model[predictors]
y = df_model[response]

In [17]:
# condition number <20 means low multicollinearity
# computed on the z-scored predictors rather than on the raw columns
print(np.linalg.cond(zscore(X)))

35.499726604191835


In [18]:
# estimate the model by OLS on every tract that has complete predictors
# add_constant supplies the intercept column (reported as 'const' in the summary)
Xc = add_constant(X)
model = sm.OLS(y, Xc)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:        grid_index_geom   R-squared:                       0.732
Model:                            OLS   Adj. R-squared:                  0.732
Method:                 Least Squares   F-statistic:                     2848.
Date:                Tue, 15 Jan 2019   Prob (F-statistic):               0.00
Time:                        17:42:58   Log-Likelihood:                 58046.
No. Observations:               72082   AIC:                        -1.160e+05
Df Residuals:                   72012   BIC:                        -1.153e+05
Df Model:                          69                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

In [19]:
# full-precision AIC and log-likelihood (the summary above rounds them)
print(result.aic)
print(result.llf)

-115952.53219918517
58046.26609959258


In [20]:
# formatted coefficient table for model 2 (plurality-decade dummies)
table2 = make_table(result)

## Regression model 3

only tracts with a majority built in one decade

In [21]:
# flag every tract in which some single decade's share exceeds one half
# NOTE: df_model at this point is still the frame prepared for the previous model
c = ['prop_1939_earlier', 'prop_1940_49', 'prop_1950_59',
     'prop_1960_69', 'prop_1970_79', 'prop_1980_89',
     'prop_1990_99', 'prop_2000_09', 'prop_2010_later']
exceeds_half = df_model[c] > 0.5
majority_mask = exceeds_half.any(axis=1)
majority_mask.sum()

12840

In [22]:
# physical predictors + majority-decade era dummies + state dummies
predictors = physical_predictors + sorted(era_majority_dummies) + sorted(state_dummies)
df_model = df.replace([np.inf, -np.inf], np.nan).dropna(subset=predictors)

# rebuild the majority mask on THIS model's rows: the mask from the earlier cell was
# computed on the previous model's df_model, and a boolean Series whose index differs
# from the frame it filters cannot be aligned reliably
majority_mask = (df_model[c] > 0.5).any(axis=1)

# create design matrix containing predictors (drop nulls), and a response variable vector
# .loc selects rows and columns in one step instead of chained indexing
X = df_model.loc[majority_mask, predictors]
y = df_model.loc[majority_mask, response]

# must remove any vars that are constant (ie, all zeros) as pysal demands no constant vectors
# this happens because some states have no units with majority built in single decade
drop = X.columns[X.sum()==0]
X = X.drop(columns=drop)
print(drop)

Index([], dtype='object')


In [23]:
# condition number <20 means low multicollinearity
# computed on the z-scored predictors rather than on the raw columns
print(np.linalg.cond(zscore(X)))

28.414593473661082


In [24]:
# estimate the model by OLS on the majority-decade subset only
# (X and y were filtered by majority_mask above, so this is not the full data set)
Xc = add_constant(X)
model = sm.OLS(y, Xc)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:        grid_index_geom   R-squared:                       0.717
Model:                            OLS   Adj. R-squared:                  0.715
Method:                 Least Squares   F-statistic:                     467.9
Date:                Tue, 15 Jan 2019   Prob (F-statistic):               0.00
Time:                        17:43:01   Log-Likelihood:                 9051.0
No. Observations:               12840   AIC:                        -1.796e+04
Df Residuals:                   12770   BIC:                        -1.744e+04
Df Model:                          69                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

In [25]:
# full-precision AIC and log-likelihood (the summary above rounds them)
print(result.aic)
print(result.llf)

-17962.004148677406
9051.002074338703


In [26]:
# formatted coefficient table for model 3 (majority-decade subset)
table3 = make_table(result)

## Regression model 4

only tracts with at least one decade of at least 20%, and identified by earliest such decade

In [27]:
# True for tracts that have a non-null earliest_decade value
# NOTE(review): df_model here is still the frame prepared for the previous model, so this
# mask only lines up with the next cell's df_model if both keep exactly the same rows
earliest_mask = pd.notnull(df_model['earliest_decade'])

In [28]:
# physical predictors + earliest-decade era dummies + state dummies
predictors = physical_predictors + sorted(era_earliest_dummies) + sorted(state_dummies)
df_model = df.replace([np.inf, -np.inf], np.nan).dropna(subset=predictors)

# rebuild the mask on THIS model's rows: the mask from the earlier cell was computed on
# the previous model's df_model, and a boolean Series whose index differs from the
# frame it filters cannot be aligned reliably
earliest_mask = pd.notnull(df_model['earliest_decade'])

# create design matrix containing predictors (drop nulls), and a response variable vector
# .loc selects rows and columns in one step instead of chained indexing
X = df_model.loc[earliest_mask, predictors]
y = df_model.loc[earliest_mask, response]

# must remove any vars that are constant (ie, all zeros) as pysal demands no constant vectors
# a dummy can end up all zeros when none of its tracts remain in the masked subset
drop = X.columns[X.sum()==0]
X = X.drop(columns=drop)
print(drop)

Index([], dtype='object')


In [29]:
# condition number <20 means low multicollinearity
# computed on the z-scored predictors rather than on the raw columns
print(np.linalg.cond(zscore(X)))

36.26069943278977


In [30]:
# estimate the model by OLS on the subset of tracts that have an earliest decade
# (X and y were filtered by earliest_mask above, so this is not the full data set)
Xc = add_constant(X)
model = sm.OLS(y, Xc)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:        grid_index_geom   R-squared:                       0.731
Model:                            OLS   Adj. R-squared:                  0.730
Method:                 Least Squares   F-statistic:                     2684.
Date:                Tue, 15 Jan 2019   Prob (F-statistic):               0.00
Time:                        17:43:03   Log-Likelihood:                 54699.
No. Observations:               68306   AIC:                        -1.093e+05
Df Residuals:                   68236   BIC:                        -1.086e+05
Df Model:                          69                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

In [31]:
# full-precision AIC and log-likelihood (the summary above rounds them)
print(result.aic)
print(result.llf)

-109257.53624000354
54698.76812000177


In [32]:
# formatted coefficient table for model 4 (earliest-decade subset)
table4 = make_table(result)

## Merge the results tables and save to disk

In [33]:
# place the four model tables side by side, in model order; each contributes its own
# 'params' and 'se' columns, and a term missing from a model is left as an empty string
model_tables = (table1, table2, table3, table4)
merged = pd.concat(model_tables, axis=1, sort=False)
table = merged.fillna('')
table.to_csv('data/table-ols-models.csv', index=True, encoding='utf-8')