In [1]:
import osmnx as ox, networkx as nx, pandas as pd, geopandas as gpd, time, matplotlib.pyplot as plt, math
import re
import statsmodels.api as sm
from geopy.distance import great_circle
from shapely.geometry import Polygon
%matplotlib inline
# configure OSMnx: turn on the HTTP cache plus file and console logging, and
# point the data and cache folders at this machine's G: drive
ox.config(
    data_folder='G:/Geoff/osmnx/urbanized-areas-usa',
    cache_folder='G:/Geoff/osmnx/cache',
    use_cache=True,
    log_file=True,
    log_console=True,
    log_filename='calc_stats_every_us_urban_area',
)

In [2]:
# read the tl_2016_us_uac10 dataset and order it by ALAND10, largest first
gdf = (
    gpd.read_file('input_data/tl_2016_us_uac10')
       .sort_values(by='ALAND10', ascending=False)
)
len(gdf)

3601

In [3]:
# keep only the "urban areas": discard every row whose NAMELSAD10 marks it
# as one of the tiny census "urban clusters"
gdf = gdf.loc[~gdf['NAMELSAD10'].str.contains('Urban Cluster')]
len(gdf)

497

In [4]:
# preview the name, area, id and geometry columns for the first rows
# (gdf was sorted by ALAND10 descending when it was loaded)
gdf.head()[['NAMELSAD10', 'ALAND10', 'AWATER10', 'GEOID10', 'geometry']]

Unnamed: 0,NAMELSAD10,ALAND10,AWATER10,GEOID10,geometry
2321,"New York--Newark, NY--NJ--CT Urbanized Area",8937429045,532939320,63217,"(POLYGON ((-74.896562 40.561084, -74.896255 40..."
3002,"Atlanta, GA Urbanized Area",6850045152,94712176,3817,"(POLYGON ((-85.04216699999999 33.714332, -85.0..."
3250,"Chicago, IL--IN Urbanized Area",6325255332,106765178,16264,"(POLYGON ((-88.471932 42.120298, -88.472899 42..."
2546,"Philadelphia, PA--NJ--DE--MD Urbanized Area",5132095000,127546905,69076,"(POLYGON ((-76.02113299999999 39.981897, -76.0..."
1593,"Boston, MA--NH--RI Urbanized Area",4852285339,202265876,9271,"(POLYGON ((-71.64691599999999 42.644126, -71.6..."


In [5]:
def load_graph_get_stats(row):
    """
    Load one urbanized area's saved street network and compute its stats.

    The graph is read from '<name>_<geoid>.graphml' (commas and spaces in the
    name replaced by underscores) via ox.load_graphml, then summarized with
    ox.basic_stats and ox.extended_stats.

    Parameters
    ----------
    row : pandas.Series or dict-like
        One record of the urbanized-areas table. Must provide 'NAMELSAD10'
        (area name), 'GEOID10' (area id) and 'ALAND10' (land area, passed to
        ox.basic_stats as `area`; presumably square meters, since it is
        divided by 1e6 to produce 'area_km' -- confirm against the census
        data dictionary).

    Returns
    -------
    pandas.Series or None
        All basic stats, the unpacked per-k intersection counts/proportions
        ('int_<k>_streets_count' / 'int_<k>_streets_prop'), the extended stats
        that are not per-node dicts, plus 'name', 'geoid', 'area', 'area_km'
        and 'time' (elapsed seconds for this row). Returns None, after
        printing the error, if anything fails for this area, so one bad area
        does not abort the whole run.
    """
    start_time = time.time()
    name = row['NAMELSAD10']
    geoid = row['GEOID10']
    land_area = row['ALAND10']
    name_geoid = '{}_{}'.format(name, geoid).replace(',', '_').replace(' ', '_')

    # extended stats that hold one value per node: too large to keep in a
    # one-row-per-area summary table, so they are left out of the result
    per_node_keys = {'avg_neighbor_degree', 'avg_weighted_neighbor_degree', 'clustering_coefficient',
                     'clustering_coefficient_weighted', 'degree_centrality', 'pagerank'}

    try:
        G = ox.load_graphml('{}.graphml'.format(name_geoid))
        stats = ox.basic_stats(G, area=land_area)
        stats['name'] = name
        stats['geoid'] = geoid
        stats['area'] = land_area

        # unpack k-counts and k-proportion dicts into individual keys:values
        for k, count in stats['streets_per_node_counts'].items():
            stats['int_{}_streets_count'.format(k)] = count
        for k, proportion in stats['streets_per_node_proportion'].items():
            stats['int_{}_streets_prop'.format(k)] = proportion

        # copy over every extended stat except the per-node ones; filtering by
        # membership (rather than dropping labels) means a per-node key that is
        # absent from the result cannot fail the whole area
        extended_stats = ox.extended_stats(G)
        for key, value in extended_stats.items():
            if key not in per_node_keys:
                stats[key] = value

        stats['area_km'] = land_area / 1e6
        stats['time'] = time.time() - start_time
        return pd.Series(stats)

    except Exception as e:
        # best-effort: report the failure and move on to the next area
        print('{} failed: {}'.format(name, e))
        return None

In [6]:
# run the per-area stats function once for every row of gdf
stats = gdf.apply(load_graph_get_stats, axis='columns')
stats.head()



Unnamed: 0,area,area_km,avg_neighbor_degree_avg,avg_weighted_neighbor_degree_avg,circuity_avg,clustering_coefficient_avg,clustering_coefficient_weighted_avg,count_intersections,degree_centrality_avg,edge_density_km,...,pagerank_min_node,self_loop_proportion,street_density_km,street_length_avg,street_length_total,street_segments_count,streets_per_node_avg,streets_per_node_counts,streets_per_node_proportion,time
2321,8937429045,8937.429045,2.891812,0.038259,1.060343,0.037874,0.00026,307848,1.4e-05,15925.250512,...,254806534,0.007042,8844.471431,148.095174,79046840.0,533757,2.855058,"{0: 1, 1: 65460, 2: 1466, 3: 229681, 4: 75294,...","{0: 2.6787460254105847e-06, 1: 0.1753507148233...",1327.774858
3002,6850045152,6850.045152,2.758075,0.026858,1.09868,0.036286,0.000526,125948,2.5e-05,11463.474331,...,72357514,0.010281,6163.986938,185.660215,42223590.0,227424,2.451515,"{0: 0, 1: 59346, 2: 464, 3: 108342, 4: 16781, ...","{0: 0.0, 1: 0.32028020335251006, 2: 0.00250412...",384.065474
3250,6325255332,6325.255332,2.98027,0.028692,1.065626,0.035814,0.000646,171122,2.7e-05,14235.536574,...,367529557,0.011058,7772.813508,163.248939,49165030.0,301166,2.919495,"{0: 0, 1: 34405, 2: 954, 3: 117716, 4: 51783, ...","{0: 0.0, 1: 0.16739893055413643, 2: 0.00464172...",419.556416
2546,5132095000,5132.095,2.840241,0.099789,1.080482,0.048361,0.000374,136789,3.2e-05,13253.791234,...,103809222,0.009984,7302.505354,159.192725,37477150.0,235420,2.873055,"{0: 0, 1: 27140, 2: 672, 3: 102796, 4: 32576, ...","{0: 0.0, 1: 0.16555948001878862, 2: 0.00409933...",336.567767
1593,4852285339,4852.285339,2.776859,0.030162,1.089688,0.050355,0.000591,117553,3.4e-05,11768.84538,...,70780317,0.023745,6435.435076,153.9559,31226570.0,202828,2.709323,"{0: 0, 1: 30030, 2: 601, 3: 99830, 4: 16538, 5...","{0: 0.0, 1: 0.2034787204488322, 2: 0.004072284...",271.580517


In [7]:
# write the results table to disk without its index, then total the
# per-area 'time' column
stats.to_csv('data/usa_stats.csv', index=False, encoding='utf-8')
stats.loc[:, 'time'].sum()

11414.738343954086