In [1]:
import geopandas as gpd
import json
import matplotlib.pyplot as plt
import numpy as np
import osmnx as ox
import pandas as pd
from geopandas.plotting import _flatten_multi_geoms as flatten
from scipy import stats

# input files: the tract indicators table (read with pandas below) and the
# tract geometries (read with geopandas below)
indicators_path = 'data/tracts_indicators_grades_eras_index.csv'
tracts_path = 'data/tracts_shapefile'

# coordinate reference system handed to the geopandas reader/constructor below (EPSG:4326)
crs = {'init':'epsg:4326'}

In [2]:
# load the FIPS-code -> state-record lookup
with open('data/states_by_fips.json') as f:
    fips_to_state = json.load(f)

# keep the FIPS codes whose abbreviation is not one of the listed territories
states = [fips for fips, record in fips_to_state.items()
          if record['abbreviation'] not in ('GU', 'AS', 'CZ', 'VI', 'PR')]
len(states)

51

In [3]:
# read the indicators table; geoid is forced to str so it is not parsed as a number
indicators = pd.read_csv(indicators_path, dtype={'geoid':str})
len(indicators)

72663

In [4]:
# read the tract geometries and order them by their GEOID column
tracts = gpd.read_file(tracts_path, crs=crs).sort_values('GEOID')
len(tracts)

74133

In [5]:
# right-join the indicators onto the tracts so that every tract row is kept,
# including tracts that have no matching indicators record
gdf = gpd.GeoDataFrame(
    indicators.merge(tracts, how='right', left_on='geoid', right_on='GEOID'),
    crs=crs,
)
len(gdf)

74133

In [6]:
# keep only the tracts whose state FIPS code is in `states` (50 states + DC)
gdf = gdf.loc[gdf['STATEFP'].isin(states)]

# optional, currently disabled: also require at least 1 network node per tract
#gdf = gdf[gdf['n'] > 0]
len(gdf)

73056

In [7]:
# the two response columns, and how many tracts are missing a value for each
responses = ['grid_index', 'grid_index_geom']
gdf[responses].isnull().sum()

grid_index         393
grid_index_geom    397
dtype: int64

In [8]:
# treat +/- infinity as missing, then recount the gaps in the response columns
gdf = gdf.replace(to_replace=[np.inf, -np.inf], value=np.nan)
gdf[responses].isnull().sum()

grid_index         393
grid_index_geom    397
dtype: int64

In [9]:
# order the rows by geoid, then fill response gaps from neighbouring rows in that order
# TODO: a spatial interpolation (e.g. a pysal spatial lag) would be better than this
gdf = gdf.sort_values(by='geoid')
gdf[responses] = gdf[responses].interpolate()
gdf[responses].isnull().sum()

grid_index         0
grid_index_geom    0
dtype: int64

In [10]:
# fallback: whatever interpolation left empty gets the column's median value
gdf[responses] = gdf[responses].fillna(value=gdf[responses].median())
gdf[responses].isnull().sum()

grid_index         0
grid_index_geom    0
dtype: int64

In [11]:
# sanity check: the gap filling above should not have changed the number of rows
len(gdf)

73056

## Trend over time

In [12]:
# 'majority' era dummies: per-decade counts, then the total number of flagged tracts
cols_majority = [name for name in gdf.columns if 'dummy_majority' in name]
print(gdf[cols_majority].sum())
gdf[cols_majority].sum().sum()

dummy_majority_prop_1939_earlier    5790.0
dummy_majority_prop_1940_49           77.0
dummy_majority_prop_1950_59         1211.0
dummy_majority_prop_1960_69          550.0
dummy_majority_prop_1970_79         1181.0
dummy_majority_prop_1980_89          980.0
dummy_majority_prop_1990_99         1082.0
dummy_majority_prop_2000_09         2046.0
dummy_majority_prop_2010_later        40.0
dtype: float64


12957.0

In [13]:
# 'plurality' era dummies: per-decade counts, then the total number of flagged tracts
cols_plurality = [name for name in gdf.columns if 'dummy_plurality' in name]
print(gdf[cols_plurality].sum())
gdf[cols_plurality].sum().sum()

dummy_plurality_prop_1939_earlier    18079.0
dummy_plurality_prop_1940_49          1159.0
dummy_plurality_prop_1950_59          8512.0
dummy_plurality_prop_1960_69          5335.0
dummy_plurality_prop_1970_79         12557.0
dummy_plurality_prop_1980_89          7345.0
dummy_plurality_prop_1990_99          9022.0
dummy_plurality_prop_2000_09          9954.0
dummy_plurality_prop_2010_later        274.0
dtype: float64


72237.0

In [14]:
# 'earliest' era dummies: per-decade counts, then the total number of flagged tracts
cols_earliest = [name for name in gdf.columns if 'dummy_earliest' in name]
print(gdf[cols_earliest].sum())
gdf[cols_earliest].sum().sum()

dummy_earliest_prop_1939_earlier    19673.0
dummy_earliest_prop_1940_49          1651.0
dummy_earliest_prop_1950_59          8564.0
dummy_earliest_prop_1960_69          5726.0
dummy_earliest_prop_1970_79         12814.0
dummy_earliest_prop_1980_89          7067.0
dummy_earliest_prop_1990_99          8255.0
dummy_earliest_prop_2000_09          4632.0
dummy_earliest_prop_2010_later         76.0
dtype: float64


68458.0

In [15]:
# 'cumulative' era dummies: per-decade counts, then the total number of flagged tracts
cols_cumulative = [name for name in gdf.columns if 'dummy_cumulative' in name]
print(gdf[cols_cumulative].sum())
gdf[cols_cumulative].sum().sum()

dummy_cumulative_prop_1939_earlier    19673.0
dummy_cumulative_prop_1940_49          7563.0
dummy_cumulative_prop_1950_59         14389.0
dummy_cumulative_prop_1960_69         11448.0
dummy_cumulative_prop_1970_79         11199.0
dummy_cumulative_prop_1980_89          4813.0
dummy_cumulative_prop_1990_99          2367.0
dummy_cumulative_prop_2000_09           783.0
dummy_cumulative_prop_2010_later          2.0
dtype: float64


72237.0

In [16]:
# 'primary' era dummies: per-decade counts, then the total number of flagged tracts
cols_primary = [name for name in gdf.columns if 'dummy_primary' in name]
print(gdf[cols_primary].sum())
gdf[cols_primary].sum().sum()

dummy_primary_prop_1939_earlier    21222.0
dummy_primary_prop_1940_49          1501.0
dummy_primary_prop_1950_59          9219.0
dummy_primary_prop_1960_69          5584.0
dummy_primary_prop_1970_79         12428.0
dummy_primary_prop_1980_89          6855.0
dummy_primary_prop_1990_99          7813.0
dummy_primary_prop_2000_09          7435.0
dummy_primary_prop_2010_later        180.0
dtype: float64


72237.0

In [17]:
# which family of era dummies designates a tract's era of development: this is the
# token between 'dummy_' and '_prop_' in the column names. 'primary' is used here;
# 'majority', 'plurality', 'earliest' and 'cumulative' columns are also selected above.
type_primary = 'primary'
# the dummy columns to iterate over; must be the family named by type_primary
cols_primary = cols_primary
# the response column that is plotted as 'Grid Index' and mapped further below
response = 'grid_index_geom'

# what variables to visualize time series trends for, and their labels
# (keys are gdf column names, values are the subplot titles)
variables = {response : 'Grid Index',
             'rho' : 'Orientation-Order',
             'straightness' : 'Average Straightness',
             'prop_4way' : '4-Way Intersection Proportion',
             'prop_deadend' : 'Dead-End Proportion',
             'intersect_density' : 'Intersection Density',
             'length_mean' : 'Average Street Length',
             'elevations_iqr' : 'Node Elevations IQR'}

def summarize_decades(gdf, cols_primary, type_primary, variables=None):
    """
    Compute each variable's mean and 95% confidence interval per decade.

    Parameters
    ----------
    gdf : pandas.DataFrame
        table holding the decade dummy columns and the variable columns
    cols_primary : iterable of str
        dummy column names, one per decade; rows where the column equals 1
        belong to that decade
    type_primary : str
        the era-type token in the dummy column names; the prefix
        'dummy_{type_primary}_prop_' is stripped to get the decade label
    variables : dict or iterable of str, optional
        names of the columns to summarize. If None, the notebook-level
        `variables` is looked up at call time, so re-running the cell that
        defines it takes effect without redefining this function.

    Returns
    -------
    tuple of pandas.DataFrame
        (df_means, df_lowers, df_uppers), each with one row per decade label
        and one column per variable
    """
    if variables is None:
        # the parameter shadows the global of the same name, so fetch it explicitly
        variables = globals()['variables']

    # a concrete list (rather than a dict view) is what dropna's `subset` expects
    var_names = list(variables)
    prefix = f'dummy_{type_primary}_prop_'

    means = {}
    lowers = {}
    uppers = {}

    for col in cols_primary:
        decade = col.replace(prefix, '')

        # rows flagged for this decade that have a value for every variable, so
        # all variables of a decade are summarized over the same rows
        subset = gdf[gdf[col]==1].dropna(subset=var_names)

        means[decade] = {}
        lowers[decade] = {}
        uppers[decade] = {}
        for var in var_names:
            values = subset[var]
            mean = values.mean()

            # t-based interval around the mean, scaled by the standard error
            conf_lower, conf_upper = stats.t.interval(0.95, len(values)-1, loc=mean, scale=stats.sem(values))
            means[decade][var] = mean
            lowers[decade][var] = conf_lower
            uppers[decade][var] = conf_upper

    # the dicts are keyed decade -> variable, so transpose to get decades as rows
    df_means = pd.DataFrame(means).T
    df_lowers = pd.DataFrame(lowers).T
    df_uppers = pd.DataFrame(uppers).T
    return df_means, df_lowers, df_uppers

In [18]:
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(15, 8))

# calculate decade/variable means and 95% confidence intervals once, up front:
# the summary covers every variable, so it does not change from subplot to subplot
df_means, df_lowers, df_uppers = summarize_decades(gdf[gdf['is_urban']==1], cols_primary, type_primary)
xticklabels = ['Pre-1940', '1940s', '1950s', '1960s', '1970s', '1980s', '1990s', '2000s', '2010–']

for col, ax in zip(variables.keys(), axes.flat):

    # plot conf intervals, or don't: this only makes sense if you've sampled from a population but we're looking at the entire population of urban tracts
    # ('none' is matplotlib's spelling for "no edge"; an empty string is not a valid color)
    ax.fill_between(df_means.index, df_uppers[col], df_lowers[col], color='#1f77b4', alpha=0.3, edgecolor='none')

    # plot means' lines
    df_means[col].plot(ax=ax, lw=2, marker='o', markerfacecolor='w', c='#1f77b4', markeredgewidth=1.5)

    # style the plot: pad the y-range by 20% of the spread of the means
    ax.set_xlim((-0.5, len(df_means.index) - 0.5))
    margin = (df_means[col].max() - df_means[col].min()) * 0.2
    ax.set_ylim((df_means[col].min() - margin, df_means[col].max() + margin))

    # set tick marks and labels
    ax.set_xticks(range(len(df_means.index)))
    ax.set_xticklabels(xticklabels, rotation=45, rotation_mode='anchor', ha='right')
    for tick in ax.get_xticklabels() + ax.get_yticklabels():
        tick.set_fontname('Arial')
        tick.set_fontsize(11)

    # title the subplot and set the grid
    ax.set_title(variables[col], fontdict={'family':'Arial', 'size':14})
    ax.grid(True, ls=':', lw=0.5)

fig.subplots_adjust(wspace=1, hspace=0.5)
fig.tight_layout()
fig.savefig(f'images/decades-{type_primary}-urban.png', bbox_inches='tight', dpi=300)
plt.close(fig)

In [19]:
# plot earliest, majority, and plurality together as line plots
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(15, 8))

# summarize each era type once, up front: the summaries cover every variable,
# so recomputing them inside the subplot loop only repeats the same work
urban = gdf[gdf['is_urban']==1]
means_plurality, _, _ = summarize_decades(urban, cols_plurality, 'plurality')
means_earliest, _, _ = summarize_decades(urban, cols_earliest, 'earliest')
df_means, df_lowers, df_uppers = summarize_decades(urban, cols_primary, 'primary')

# optional, currently disabled: the majority and cumulative series
#means_majority, _, _ = summarize_decades(urban, cols_majority, 'majority')
#means_cumulative, _, _ = summarize_decades(urban, cols_cumulative, 'cumulative')

xticklabels = ['Pre-1940', '1940s', '1950s', '1960s', '1970s', '1980s', '1990s', '2000s', '2010–']

for col, ax in zip(variables.keys(), axes.flat):

    # plot the points and line
    #means_majority[col].plot(ax=ax, lw=1.5, markersize=0, c='#999999', alpha=1, ls='--', dashes=(5, 5))
    means_plurality[col].plot(ax=ax, lw=1.5, markersize=0, c='#666666', alpha=1, ls='--', dashes=(1, 1))
    means_earliest[col].plot(ax=ax, lw=1.5, markersize=0, c='#999999', alpha=1, ls='--', dashes=(4, 2))
    #means_cumulative[col].plot(ax=ax, lw=1.5, markersize=0, c='r', alpha=1, ls='--', dashes=(4, 2))

    # the primary series is drawn last, on top, with markers
    df_means[col].plot(ax=ax, lw=2, marker='o', markerfacecolor='w', c='#1f77b4', markeredgewidth=1.5)
    #ax.fill_between(df_means.index, df_uppers[col], df_lowers[col], color='#1f77b4', alpha=0.3, edgecolor='none')

    # set the x and y limits (y-range follows the primary series, padded by 20% of its spread)
    ax.set_xlim((-0.5, len(df_means.index) - 0.5))
    margin = (df_means[col].max() - df_means[col].min()) * 0.2
    ax.set_ylim((df_means[col].min() - margin, df_means[col].max() + margin))

    # set tick marks and labels
    ax.set_xticks(range(len(df_means.index)))
    ax.set_xticklabels(xticklabels, rotation=45, rotation_mode='anchor', ha='right')
    for tick in ax.get_xticklabels() + ax.get_yticklabels():
        tick.set_fontname('Arial')
        tick.set_fontsize(11)

    # title the subplot and set the grid
    ax.set_title(variables[col], fontdict={'family':'Arial', 'size':14})
    ax.grid(True, ls=':', lw=0.5)

fig.subplots_adjust(wspace=1, hspace=0.5)
fig.tight_layout()
fig.savefig('images/decades-together-urban.png', bbox_inches='tight', dpi=300)
plt.close(fig)

In [20]:
# show the column -> label mapping used for the trend plots
variables

{'grid_index_geom': 'Grid Index',
 'rho': 'Orientation-Order',
 'straightness': 'Average Straightness',
 'prop_4way': '4-Way Intersection Proportion',
 'prop_deadend': 'Dead-End Proportion',
 'intersect_density': 'Intersection Density',
 'length_mean': 'Average Street Length',
 'elevations_iqr': 'Node Elevations IQR'}

In [21]:
# tabulate the per-decade means of each variable for urban tracts, using the 'primary' era dummies
df_means, df_lowers, df_uppers = summarize_decades(gdf[gdf['is_urban']==1], cols_primary, 'primary')
df_means

Unnamed: 0,elevations_iqr,grid_index_geom,intersect_density,length_mean,prop_4way,prop_deadend,rho,straightness
1939_earlier,9.192524,0.626427,71.14293,125.880418,0.393305,0.093247,0.718848,0.973833
1940_49,7.386466,0.597724,59.897015,136.531948,0.348836,0.103374,0.706818,0.965558
1950_59,9.408356,0.508234,48.798367,146.225319,0.257869,0.142686,0.620567,0.952374
1960_69,9.934734,0.436275,44.191733,150.18917,0.200684,0.17847,0.533701,0.937236
1970_79,10.080302,0.408858,40.781459,148.555196,0.17765,0.210777,0.503545,0.924384
1980_89,10.265055,0.359391,38.994149,148.832423,0.157424,0.233137,0.407344,0.912033
1990_99,11.721154,0.34186,35.290291,151.321673,0.148827,0.24299,0.372621,0.910992
2000_09,10.636515,0.386152,37.846942,150.37555,0.182855,0.208004,0.430551,0.92204
2010_later,9.941,0.47171,56.381659,136.712248,0.258812,0.140052,0.53197,0.94216


## Map study sites

In [22]:
# load the state boundaries and drop Alaska, Hawaii and Puerto Rico
usa = gpd.read_file('data/cb_2017_us_state_20m')
mask = ~usa['STUSPS'].isin(['AK', 'HI', 'PR'])
usa = usa[mask]

# project the remaining states with the '+proj=aea' definition below
crs_proj = '+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs '
usa_proj = usa.to_crs(crs_proj)

# total_bounds is the bounding box of the whole layer; it gives the same extent as
# the bounds of the dissolved union without having to dissolve every polygon first
xmin, ymin, xmax, ymax = usa_proj.total_bounds

# width / height of the projected extent, used to size the map figure
aspect_ratio = (xmax - xmin) / (ymax - ymin)

In [23]:
# FIPS codes of everything except Alaska and Hawaii, for the map below
# NOTE: this rebinds `states`, which the 50-states-plus-DC filter near the top of the
# notebook also reads; re-running that filter after this cell gives a different result
states = [fips for fips, record in fips_to_state.items()
          if record['abbreviation'] not in ('AK', 'HI')]

In [24]:
%%time
# project the gdf to crs_proj, the same CRS the state outlines (usa_proj) were projected to
gdf_proj = gdf.to_crs(crs_proj)

Wall time: 19.9 s


In [25]:
# keep only the tracts whose 'state' value is in `states` (rebuilt above without AK and HI)
# NOTE(review): `states` holds the keys of fips_to_state; this assumes the 'state'
# column stores the same codes -- confirm against the indicators file
mask = gdf_proj['state'].isin(states)
gdf_proj = gdf_proj[mask]
len(gdf_proj)

72175

In [26]:
# order the tracts by the response value so that row order and color order line up
gdf_proj = gdf_proj.sort_values(response) #low to high
# one inferno color per tract, in the same order as the sorted rows
colors = ox.get_colors(len(gdf_proj), cmap='inferno', start=0.15) #dark to light
len(colors)

72175

In [27]:
# split multi-part geometries into their parts, with a matching color for every part
# (the two printed lengths should be equal)
# NOTE(review): `flatten` is geopandas' private _flatten_multi_geoms helper; being
# private, it may change or disappear between geopandas releases
flat_geoms, flat_colors = flatten(gdf_proj['geometry'], colors)
flat_geoms = gpd.GeoSeries(flat_geoms)
print(len(flat_geoms))
print(len(flat_colors))

72416
72416


In [28]:
%%time
# size the figure so it has the same width/height ratio as the projected extent
width = 14
fig, ax = plt.subplots(figsize=(width, width/aspect_ratio), facecolor='k')

# tract polygons first, then the white state outlines on top
ax = flat_geoms.plot(ax=ax, facecolor=flat_colors, edgecolor=flat_colors, linewidth=0.2)
ax = usa_proj.plot(ax=ax, facecolor='none', edgecolor='w', linewidth=0.4)

# clip the view to the extent of the state outlines and hide the axes
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
ax.set_axis_off()

fig.tight_layout()
fig.savefig(
    'images/grid_index-choropleth.png',
    facecolor=fig.get_facecolor(),
    edgecolor='none',
    bbox_inches='tight',
    dpi=300,
)
plt.close()

Wall time: 57.7 s


In [29]:
# draw a horizontal inferno ramp to use as the map's colorbar:
# 10 identical rows, each running from 0 to 99
fig, ax = plt.subplots()
raster = np.tile(np.arange(100, dtype=float), (10, 1))
ax.imshow(raster, cmap='inferno')
ax.set_axis_off()
fig.tight_layout()
fig.savefig('images/grid_index-colorbar.png', bbox_inches='tight', dpi=300)
plt.close()

## Visualize KDE

In [30]:
# shared settings for the KDE plots below
# bandwidth is passed to the kde plots as bw_method
bandwidth = 0.7
# font sizes for the legend and the axis labels, and the font family for both
legend_fontsize = 6.25
label_fontsize = 14
fontname = 'Arial'

In [31]:
# map each tract's state FIPS code to its abbreviation (None when the code is not in the lookup)
gdf['state_abbrev'] = gdf['STATEFP'].map(lambda x: fips_to_state[x]['abbreviation'] if x in fips_to_state else None)

In [32]:
# group the tracts by state abbreviation and get one hsv_r color per group
groups = gdf.sort_values('state_abbrev').groupby('state_abbrev')
colors = ox.get_colors(len(groups), cmap='hsv_r', start=0.2)

In [33]:
# state abbreviations ordered by their median response value, lowest first
# NOTE: this rebinds `states`, which earlier cells use for lists of FIPS codes
states = groups[response].median().sort_values().index

In [34]:
fig, ax = plt.subplots(figsize=(7, 7))

# one density curve per state, going from the highest median down to the lowest
for state, c in zip(reversed(states), reversed(colors)):
    group = groups.get_group(state)
    ax = group[response].plot.kde(
        ax=ax,
        label=state,
        color=c,
        ls='-',
        linewidth=1.5,
        alpha=0.6,
        bw_method=bandwidth,
    )

# fixed view window, axis labels, and a legend placed outside the axes on the right
ax.set_ylim(0, 5.5)
ax.set_xlim(-0.19, 1.19)
ax.set_xlabel('Grid Index', fontsize=label_fontsize, fontname=fontname)
ax.set_ylabel('Probability Density', fontsize=label_fontsize, fontname=fontname)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), prop={'size':legend_fontsize, 'family':fontname})

fig.tight_layout()
fig.savefig('images/kde-grid_index_geom.png', bbox_inches='tight', dpi=600)
plt.close()

In [35]:
fig, ax = plt.subplots(figsize=(7, 7))

# one orientation-entropy density curve per state, in the same state/color order
# as the grid index KDE (highest median response first)
for state, c in zip(reversed(states), reversed(colors)):
    group = groups.get_group(state)
    ax = group['orientation_entropy'].plot.kde(
        ax=ax,
        label=state,
        color=c,
        ls='-',
        linewidth=1.5,
        alpha=0.6,
        bw_method=bandwidth,
    )

# fixed view window, axis labels, and a legend placed outside the axes on the right
ax.set_ylim(0, 2)
ax.set_xlim(0.8, 4.3)
ax.set_xlabel(r'$H_O$', fontsize=label_fontsize, fontname=fontname)
ax.set_ylabel('Probability Density', fontsize=label_fontsize, fontname=fontname)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), prop={'size':legend_fontsize, 'family':fontname})

fig.tight_layout()
fig.savefig('images/kde-orientation-entropy.png', bbox_inches='tight', dpi=600)
plt.close()