In [1]:
import geopandas as gpd
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
clip = False

## Load the data

In [2]:
census_path = 'data/census_data.csv'
indicators_path = 'data/tracts_indicators_grades.csv'
tracts_path = 'data/tracts_shapefile'
output_path = 'data/tracts_indicators_grades_eras_index.csv'
crs = {'init':'epsg:4326'}

In [3]:
indicators = pd.read_csv(indicators_path, dtype={'geoid':str})
cd = pd.read_csv(census_path, dtype={'GEOID10':str, 'state':str, 'county':str})
indicators = pd.merge(indicators, cd, left_on='geoid', right_on='GEOID10')
len(indicators)

72663

In [4]:
tracts = gpd.read_file(tracts_path, crs=crs).rename(columns={'ALAND':'aland'})[['GEOID', 'aland']]

In [5]:
gdf = gpd.GeoDataFrame(pd.merge(indicators, tracts, left_on='geoid', right_on='GEOID'), crs=crs)
gdf = gdf.drop(columns=['GEOID', 'GEOID10'])
len(gdf)

72663

In [6]:
with open('data/states_by_fips.json') as f:
    fips_to_state = json.load(f)
gdf['state_abbrev'] = gdf['state'].map(lambda x: fips_to_state[x]['abbreviation'])

## Create and convert variables

In [7]:
# convert land area and densities to square kilometers
gdf['aland'] = gdf['aland'] / 1e6 #convert m2 to km2
gdf['intersect_density'] = (gdf['n'] / gdf['aland']) * (1 - gdf['prop_deadend']) #per km2
gdf['pop_density'] = gdf['total_pop'] / gdf['aland'] #per km2

In [8]:
# population in units of 1,000 persons
gdf['total_pop_k'] = gdf['total_pop'] / 1000

In [9]:
# log of mean street segment length
gdf['length_mean_log'] = np.log(gdf['length_mean'])

In [10]:
# straightness is inverse of circuity
gdf['straightness'] = 1 / gdf['circuity_avg']

In [11]:
# create state dummies: one 0/1 indicator column per state abbreviation
states = gdf['state_abbrev'].unique()
for state in states:
    # vectorised equality test instead of a per-element lambda
    gdf[state] = (gdf['state_abbrev'] == state).astype('int64')

In [12]:
# dummy for if tract is rural vs urban
# census bureau considers a block urban if it has at least 1000 people per sq mile
urban_density = 1000 / 2.59 # 1000 people per sq mile converted to sq km
gdf['is_urban'] = (gdf['pop_density'] > urban_density).astype(int)
gdf['is_urban'].value_counts()

1    46311
0    26352
Name: is_urban, dtype: int64

## Create grid index

In [13]:
index_components = ['rho', 'straightness', 'prop_4way']
gdf[index_components].describe()

Unnamed: 0,rho,straightness,prop_4way
count,72659.0,72663.0,72663.0
mean,0.484327,0.935759,0.210324
std,0.316234,0.045987,0.17177
min,0.002183,0.0,0.0
25%,0.185824,0.911671,0.092486
50%,0.465526,0.939706,0.156627
75%,0.772004,0.969004,0.271676
max,1.0,1.000007,1.0


In [14]:
# clip vectors to 4 std devs above/below mean to make variances more similar
if clip:
    sigma = 4
    for col in index_components:
        lower = gdf[col].mean() - gdf[col].std() * sigma
        upper = gdf[col].mean() + gdf[col].std() * sigma
        gdf[col] = gdf[col].clip(lower, upper)

    # min-max scale to get them back into (0,1) range
    gdf[index_components] = (gdf[index_components]-gdf[index_components].min())/(gdf[index_components].max()-gdf[index_components].min())
    print(gdf[index_components].describe())

In [15]:
# fix any rounding errors so all three components are in range 0 to 1
gdf[index_components] = gdf[index_components].clip(lower=0, upper=1)
gdf[index_components].describe()

Unnamed: 0,rho,straightness,prop_4way
count,72659.0,72663.0,72663.0
mean,0.484327,0.935759,0.210324
std,0.316234,0.045987,0.17177
min,0.002183,0.0,0.0
25%,0.185824,0.911671,0.092486
50%,0.465526,0.939706,0.156627
75%,0.772004,0.969004,0.271676
max,1.0,1.0,1.0


In [16]:
# or standardized (mean-normalized) version with mean=0 and std=1
gdf_norm = (gdf[index_components] - gdf[index_components].mean()) / gdf[index_components].std()
gdf_norm.describe()

Unnamed: 0,rho,straightness,prop_4way
count,72659.0,72663.0,72663.0
mean,-1.533683e-14,-3.959005e-14,-4.116245e-14
std,1.0,1.0,1.0
min,-1.524643,-20.34848,-1.224456
25%,-0.9439324,-0.5238102,-0.6860279
50%,-0.05945437,0.08582047,-0.3126153
75%,0.9096941,0.7229103,0.3571758
max,1.630668,1.396937,4.597294


Create the index. The component indicators are non-substitutable (non-compensatory), so ideally we would combine them with a geometric mean. That is only possible when every component is positive (because of the cube root), and the mean-zero standardized data contain negative values. So we take the arithmetic mean of the standardized components and the geometric mean of the 0-to-1 components (which are essentially min-max normalized).

In [17]:
# arithmetic mean, even-weighting of mean-normalized components (gdf_norm)
# skipna=False: a tract missing any component gets NaN instead of having the gap
# silently counted as 0 (the all-tract mean) as .sum(axis=1) / 3 did; using
# .mean() also removes the hard-coded component count
gdf['grid_index'] = gdf_norm[index_components].mean(axis=1, skipna=False)

# geometric mean, even-weighting of min-max-normalized components (gdf)
gdf['grid_index_geom'] = stats.mstats.gmean(gdf[index_components], axis=1)

In [18]:
sample = gdf.sample(n=6, random_state=2)
sample[['geoid', 'state_abbrev', 'grid_index', 'grid_index_geom'] + index_components]

Unnamed: 0,geoid,state_abbrev,grid_index,grid_index_geom,rho,straightness,prop_4way
25412,18177000800,IN,0.687643,0.600108,0.743488,0.968931,0.3
31086,25001014700,MA,-0.84145,0.336798,0.267217,0.863212,0.165625
71361,55025011506,WI,-0.305263,0.401471,0.32396,0.91482,0.218341
9300,6073005700,CA,1.97666,0.905874,0.991945,0.98785,0.758621
17211,12099007833,FL,-0.006759,0.524014,0.768401,0.893731,0.209524
12809,8123002018,CO,0.258005,0.463627,0.835084,0.942767,0.126582


In [19]:
# want component indicators that are relevant but not too redundant (ie, strongly correlated)
# here, we see each of our indicators is more strongly correlated with the index than with each other: good
gdf[['grid_index', 'grid_index_geom'] + index_components].corr()

Unnamed: 0,grid_index,grid_index_geom,rho,straightness,prop_4way
grid_index,1.0,0.954403,0.870568,0.835172,0.873673
grid_index_geom,0.954403,1.0,0.899478,0.663511,0.900373
rho,0.870568,0.899478,1.0,0.576998,0.669655
straightness,0.835172,0.663511,0.576998,1.0,0.582524
prop_4way,0.873673,0.900373,0.669655,0.582524,1.0


## Make era dummies then inspect our columns

In [20]:
cols = ['prop_1939_earlier', 'prop_1940_49', 'prop_1950_59', 'prop_1960_69', 
        'prop_1970_79', 'prop_1980_89', 'prop_1990_99', 'prop_2000_09', 'prop_2010_later']

# jitter so we don't get 2 eras with equal value and both are the plurality
# each cell gets its own uniform draw from [0, 1) scaled by 1e-6; the fixed seed makes
# the draws repeatable on a fresh run
# NOTE: this overwrites gdf[cols] in place, so re-running the cell adds jitter again
np.random.seed(0)
gdf[cols] = gdf[cols].applymap(lambda x: x + np.random.random() * 1e-6)

In [21]:
%%time
# identify decade in which majority of tract's structures were built (where a majority exists)
def determine_majority_decade(row):
    """Return the first era column in `cols` whose value in `row` exceeds 0.5, or None if no column does."""
    return next((era for era in cols if row[era] > 0.5), None)

gdf['majority_decade'] = gdf.apply(determine_majority_decade, axis='columns')
majority_dummies = pd.get_dummies(gdf['majority_decade'], prefix='dummy_majority')
gdf = pd.concat([gdf, majority_dummies], axis='columns')

Wall time: 2.95 s


In [22]:
%%time
# identify decade in which plurality of tract's structures were built
def determine_plurality_decade(row):
    """
    Return the era column in `cols` whose value in `row` is strictly greater than
    the value of every other era column, or None if no column qualifies
    (e.g. an exact tie for the top value, or NaN values in the row).
    """
    # read the values once as plain scalars: building and comparing a pandas
    # Series for every candidate column of every row is what made this slow
    shares = [row[c] for c in cols]
    for i, share in enumerate(shares):
        # comparisons against NaN are False, so NaN can neither win nor be beaten
        if all(share > other for j, other in enumerate(shares) if j != i):
            return cols[i]
    return None

gdf['plurality_decade'] = gdf.apply(determine_plurality_decade, axis='columns')
plurality_dummies = pd.get_dummies(gdf['plurality_decade'], prefix='dummy_plurality')
gdf = pd.concat([gdf, plurality_dummies], axis='columns')

Wall time: 3min 42s


In [23]:
%%time
# identify earliest decade in which >20% of tract's structures were built
def determine_earliest_decade(row, threshold=0.20):
    """
    Scan `cols` in list order and return the first column whose value in `row`
    is greater than `threshold`; return None if none exceeds it.
    """
    over_threshold = (era for era in cols if row[era] > threshold)
    return next(over_threshold, None)
    
gdf['earliest_decade'] = gdf.apply(determine_earliest_decade, axis='columns')
earliest_dummies = pd.get_dummies(gdf['earliest_decade'], prefix='dummy_earliest')
gdf = pd.concat([gdf, earliest_dummies], axis='columns')

Wall time: 1.87 s


In [24]:
%%time
def find_earliest_threshold(row, cols, threshold):
    """
    Return the first name in `cols` (in the order given) whose value in `row`
    is strictly greater than `threshold`, or None if there is no such name.
    """
    for name in cols:
        if not row[name] > threshold:
            continue
        return name
    return None

def determine_primary_decade(row, cols=cols):
    """
    Try the thresholds 0.5, 0.4, 0.3, 0.2, 0.1 in that order and return the
    result of find_earliest_threshold for the first threshold that yields a
    column; return None if none of them does.
    """
    thresholds = (0.5, 0.4, 0.3, 0.2, 0.1)
    # lazy generator: stops calling the helper as soon as one threshold matches
    candidates = (find_earliest_threshold(row, cols, t) for t in thresholds)
    return next((era for era in candidates if era is not None), None)

gdf['primary_decade'] = gdf.apply(determine_primary_decade, axis='columns')
primary_dummies = pd.get_dummies(gdf['primary_decade'], prefix='dummy_primary')
gdf = pd.concat([gdf, primary_dummies], axis='columns')

Wall time: 7.14 s


In [25]:
decades = ['majority_decade', 'plurality_decade', 'earliest_decade', 'primary_decade']
gdf[decades].apply(lambda x: x.value_counts())

Unnamed: 0,majority_decade,plurality_decade,earliest_decade,primary_decade
prop_1939_earlier,5790,18079,19673,21222
prop_1940_49,77,1159,1651,1501
prop_1950_59,1211,8512,8564,9219
prop_1960_69,550,5335,5726,5584
prop_1970_79,1181,12557,12814,12428
prop_1980_89,980,7345,7067,6855
prop_1990_99,1082,9022,8255,7813
prop_2000_09,2046,9954,4632,7435
prop_2010_later,40,274,76,180


In [26]:
def fstr(x):
    """
    Format `x` with three decimal places for display.

    Values that do not support the 'f' format code (e.g. strings, None) are
    returned unchanged.
    """
    try:
        return f'{x:0.3f}'
    except (TypeError, ValueError):
        # str raises ValueError and None/other objects raise TypeError on the
        # 'f' format code; anything else is a real error and should surface
        return x
    
gdf[cols + decades].sample(n=5, random_state=2).applymap(fstr)

Unnamed: 0,prop_1939_earlier,prop_1940_49,prop_1950_59,prop_1960_69,prop_1970_79,prop_1980_89,prop_1990_99,prop_2000_09,prop_2010_later,majority_decade,plurality_decade,earliest_decade,primary_decade
25412,0.213,0.135,0.273,0.069,0.188,0.076,0.041,0.005,0.0,,prop_1950_59,prop_1939_earlier,prop_1939_earlier
31086,0.047,0.045,0.186,0.21,0.266,0.121,0.046,0.076,0.003,,prop_1970_79,prop_1960_69,prop_1960_69
71361,0.04,0.024,0.032,0.054,0.109,0.136,0.227,0.295,0.083,,prop_2000_09,prop_1990_99,prop_1990_99
9300,0.386,0.066,0.071,0.08,0.264,0.086,0.029,0.019,0.0,,prop_1939_earlier,prop_1939_earlier,prop_1939_earlier
17211,0.0,0.0,0.0,0.003,0.121,0.509,0.238,0.13,0.0,prop_1980_89,prop_1980_89,prop_1980_89,prop_1980_89


In [27]:
gdf[gdf['primary_decade'] != gdf['earliest_decade']][cols + decades].applymap(fstr).head()

Unnamed: 0,prop_1939_earlier,prop_1940_49,prop_1950_59,prop_1960_69,prop_1970_79,prop_1980_89,prop_1990_99,prop_2000_09,prop_2010_later,majority_decade,plurality_decade,earliest_decade,primary_decade
4,0.0,0.0,0.0,0.029,0.027,0.11,0.338,0.448,0.049,,prop_2000_09,prop_1990_99,prop_2000_09
7,0.048,0.021,0.029,0.011,0.156,0.188,0.214,0.3,0.033,,prop_2000_09,prop_1990_99,prop_2000_09
8,0.02,0.015,0.017,0.053,0.107,0.101,0.235,0.318,0.136,,prop_2000_09,prop_1990_99,prop_2000_09
18,0.004,0.003,0.021,0.085,0.107,0.206,0.353,0.156,0.066,,prop_1990_99,prop_1980_89,prop_1990_99
19,0.005,0.0,0.008,0.013,0.011,0.077,0.203,0.518,0.164,prop_2000_09,prop_2000_09,prop_1990_99,prop_2000_09


In [28]:
# not every tract has residential structures
pd.isnull(gdf['primary_decade']).sum()

426

In [29]:
str(gdf.columns.sort_values().tolist())

"['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'aland', 'circuity_avg', 'county', 'dummy_earliest_prop_1939_earlier', 'dummy_earliest_prop_1940_49', 'dummy_earliest_prop_1950_59', 'dummy_earliest_prop_1960_69', 'dummy_earliest_prop_1970_79', 'dummy_earliest_prop_1980_89', 'dummy_earliest_prop_1990_99', 'dummy_earliest_prop_2000_09', 'dummy_earliest_prop_2010_later', 'dummy_majority_prop_1939_earlier', 'dummy_majority_prop_1940_49', 'dummy_majority_prop_1950_59', 'dummy_majority_prop_1960_69', 'dummy_majority_prop_1970_79', 'dummy_majority_prop_1980_89', 'dummy_majority_prop_1990_99', 'dummy_majority_prop_2000_09', 'dummy_majority_prop_2010_later', 'dummy_plurality_prop_1939_earlier', 'dummy_plurality_prop_1940_49', 'dummy_plurali

In [30]:
gdf.to_csv(output_path, index=False, encoding='utf-8')

## One median tract from different eras

Center a square mile on each and visualize

In [None]:
import math
import osmnx as ox
ox.config(use_cache=True)
graphs_folder = 'G:\\Geoff\\osmnx\\data\\tracts\\graphml'

In [None]:
# get coords of median tract from each decade
# the plurality-era dummies are the columns pd.get_dummies added with the
# 'dummy_plurality' prefix; derive them from gdf so this cell does not depend on
# a variable that is never defined in the notebook
cols_plurality = [c for c in gdf.columns if str(c).startswith('dummy_plurality_')]

for col in cols_plurality:
    # urban tracts with n > 10 whose plurality era is `col`
    mask = (gdf['is_urban']==1) & (gdf['n'] > 10) & (gdf[col] == 1)
    gdf_tmp = gdf[mask]
    if len(gdf_tmp) == 0:
        # nothing to take a median of; .iloc on an empty frame would raise
        print(col, 'no matching tracts')
        continue

    # tract at the middle rank of grid_index_geom within this era
    row = gdf_tmp.sort_values('grid_index_geom').iloc[len(gdf_tmp) // 2]

    # load that tract's saved street network and take the centroid of its nodes
    filename = '{}.graphml'.format(row['geoid'])
    folder = '{}/{}_{}'.format(graphs_folder, row['state'], row['state_abbrev'])
    G = ox.get_largest_component(ox.load_graphml(filename, folder))
    nodes = ox.graph_to_gdfs(G, edges=False, node_geometry=True)
    x, y = nodes.unary_union.centroid.coords[0]

    print(col, x, y)
    #fig, ax = ox.plot_figure_ground(point=(y, x), dist=805, network_type='drive', 
    #                                save=True, show=True, filename=f'{col}.png')

In [None]:
# move square mile around a bit to find a well-framed set of streets
points = {'1980s' : (-84.537, 39.287),
          #'1990s' : (-93.177, 44.676),
          #'1970s' : (-121.266, 38.692),
          '1950s' : (-104.654, 38.248),
          '1930s' : (-73.831, 40.909),
          '2010s' : (-111.7802, 33.2261)}

for decade, (x, y) in points.items():
    fig, ax = ox.plot_figure_ground(point=(y, x), dist=805, network_type='drive', 
                                    save=True, show=False, filename=f'decade_{decade}.png')

## Visualize pairwise regressions

In [None]:
fontsize = 14
fontname = 'Century Gothic'
scatter_c = 'none'
scatter_ec = '#003366'
scatter_s = 30
scatter_lw = 1.5
scatter_a = 0.7
scatter_m = 'o'
line_c = 'k'
line_a = 0.25
line_ls = '--'
line_lw = 2.5
line_z = -1
filename_template = 'images/regression_{}_{}.png'

In [None]:
def regress_plot(gdf, x_var, y_var, x_min=None, x_max=None, y_min=None, y_max=None, xlabel='', ylabel='', color_regions=False):
    """
    Regress `y_var` on `x_var` with scipy.stats.linregress, print the fit
    statistics, and draw a scatter plot with the fitted line.

    Parameters
    ----------
    gdf : DataFrame
        table containing both columns
    x_var, y_var : str
        names of the columns to use as x and y
    x_min, x_max, y_min, y_max : float, optional
        axis limits; when None they are derived from the data as min / 1.1 and
        max * 1.1 (which only pads outward for positive values)
    xlabel, ylabel : str
        axis labels
    color_regions : bool
        not used by this function; kept so existing calls keep working

    Returns
    -------
    None
        The figure is closed before returning and the savefig call is commented
        out, so the printed statistics are the only visible output.

    Styling comes from the module-level scatter_*, line_*, fontsize and fontname
    settings.
    """
    x = gdf[x_var]
    y = gdf[y_var]

    # a single NaN turns every linregress statistic into NaN, so fit and plot
    # only the rows where both variables are present
    valid = x.notnull() & y.notnull()
    x = x[valid]
    y = y[valid]

    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    print('r={:.3f}, r^2={:.3f}, slope={:.3f}, intercept={:.3f}, p={:.4f}'.format(r_value, r_value ** 2, slope, intercept, p_value))

    fig, ax = plt.subplots(figsize=(6,6))
    ax.scatter(x=x, y=y, c=scatter_c, edgecolor=scatter_ec, s=scatter_s,
               linewidth=scatter_lw, alpha=scatter_a, marker=scatter_m)

    # set x and y limits
    if x_min is None:
        x_min = x.min() / 1.1
    if x_max is None:
        x_max = x.max() * 1.1
    if y_min is None:
        y_min = y.min() / 1.1
    if y_max is None:
        y_max = y.max() * 1.1
    ax.set_xlim((x_min, x_max))
    ax.set_ylim((y_min, y_max))

    # create a line of best fit spanning the visible x range
    x_line = np.array([x_min, x_max])
    Y_est = x_line * slope + intercept
    ax.plot(x_line, Y_est, c=line_c, alpha=line_a, linestyle=line_ls, linewidth=line_lw, zorder=line_z)

    ax.set_xlabel(xlabel, fontsize=fontsize, fontname=fontname)
    ax.set_ylabel(ylabel, fontsize=fontsize, fontname=fontname)

    # report very small p-values as "p < 0.001" rather than "p = 0.000"
    if p_value < 0.001:
        p_value = 0.001
        symbol = '<'
    else:
        symbol = '='

    ax.set_title('r = {:.3f}, p {} {:.3f}'.format(r_value, symbol, p_value), fontdict={'fontsize':fontsize, 'fontname':fontname})

    #fig.savefig(filename_template.format(x_var, y_var), dpi=600, bbox_inches='tight')
    plt.close(fig)

In [None]:
# rank variables by absolute correlation with the grid index
# select numeric/boolean columns explicitly: gdf also holds string columns
# (geoid, state_abbrev, *_decade), which newer pandas refuses to drop silently in corr()
gdf.select_dtypes(include=['number', 'bool']).corr()['grid_index_geom'].abs().sort_values(ascending=False)

In [None]:
regress_plot(gdf, x_var='k_avg', y_var='grid_index_geom',# y_min=0, y_max=1, x_min=2.2, x_max=3.6,
             xlabel='Average Node Degree', ylabel='grid_index_geom')

In [None]:
# median segment length vs. grid index; the y axis is grid_index_geom, so label it as such
regress_plot(gdf, x_var='length_median', y_var='grid_index_geom',# x_min=30, x_max=140, y_min=0, y_max=1,
             xlabel='Median Street Segment Length (m)', ylabel='grid_index_geom')

In [None]:
# segment length entropy vs. grid index; the y axis is grid_index_geom, so label it as such
regress_plot(gdf, x_var='length_entropy_log', y_var='grid_index_geom', #x_min=2.0, x_max=3.2, y_min=0, y_max=1,
             xlabel='Street Segment Length (log) Entropy', ylabel='grid_index_geom')

In [None]:
# dead-end proportion vs. grid index; the y axis is grid_index_geom, so label it as such
regress_plot(gdf, x_var='prop_deadend', y_var='grid_index_geom', #x_min=0.0, x_max=0.4, y_min=0, y_max=1,
             xlabel='Dead-End Proportion', ylabel='grid_index_geom')

In [None]:
# four-way intersection proportion vs. grid index; the y axis is grid_index_geom, so label it as such
regress_plot(gdf, x_var='prop_4way', y_var='grid_index_geom', #x_min=0.0, x_max=0.5, y_min=0, y_max=1,
             xlabel='Four-Way Intersection Proportion', ylabel='grid_index_geom')

In [None]:
# average straightness vs. grid index; the y axis is grid_index_geom, so label it as such
regress_plot(gdf, x_var='straightness', y_var='grid_index_geom', #x_min=1, x_max=1.16, y_min=0, y_max=1,
             xlabel='Average Straightness', ylabel='grid_index_geom')

In [None]:
regress_plot(gdf, x_var='prop_4way', y_var='straightness', #x_min=0, x_max=0.6, y_min=1, y_max=1.16,
             xlabel='Four-Way Intersection Proportion', ylabel='Average Straightness')

In [None]:
regress_plot(gdf, x_var='k_avg', y_var='straightness', #x_min=2.2, x_max=3.6, y_min=1, y_max=1.16,
             xlabel='Average Node Degree', ylabel='Average Straightness')

## Look at individual stats

In [None]:
gdf['grid_index_geom'].describe()

In [None]:
print(gdf['grid_index_geom'].sort_values().head())
print(gdf['grid_index_geom'].sort_values().tail())

In [None]:
ax = gdf['grid_index_geom'].hist(bins=100)
ax.set_xlim((0,1))
plt.show()

In [None]:
y = gdf['grid_index_geom'].sort_values()
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(x=range(len(y)), y=y, s=20, marker='o', edgecolor='b', color='none', alpha=0.7)
xmax = int(len(gdf) * 1.02)
xmin = int(len(gdf) * -0.02)
ymax = 1.02
ymin = -0.02
plt.plot([xmin, xmax], [ymin, ymax], c='#999999', ls=':', zorder=-1)
ax.set_xlim((xmin,xmax))
ax.set_ylim((ymin,ymax))
ax.set_ylabel('grid_index_geom')
ax.set_xlabel('Tract Rank')
plt.show()

In [None]:
print(gdf.groupby('state_abbrev')[['grid_index_geom', 'prop_4way']].median().sort_values('grid_index_geom').head())
print(gdf.groupby('state_abbrev')[['grid_index_geom', 'prop_4way']].median().sort_values('grid_index_geom').tail())

In [None]:
# dataset-wide totals, printed in the order `m` then `n`
# `n` is the per-tract count used for intersection density earlier in this notebook;
# `m` is presumably the edge count - TODO confirm against the indicators file
print('{:,}'.format(gdf['m'].sum()))
print('{:,}'.format(gdf['n'].sum()))