In [1]:
import osmnx as ox, networkx as nx, pandas as pd, geopandas as gpd, time, matplotlib.pyplot as plt, math
import re
import statsmodels.api as sm
from geopy.distance import great_circle
from shapely.geometry import Polygon
%matplotlib inline
# configure OSMnx: enable the cache, and log to both a file and the console
# NOTE(review): data_folder and cache_folder are absolute paths on a G: drive,
# so they are specific to one machine; adjust them before running elsewhere
ox.config(
    data_folder='G:/Geoff/osmnx/urbanized-areas-usa',
    cache_folder='G:/Geoff/osmnx/cache',
    use_cache=True,
    log_file=True,
    log_console=True,
    log_filename='calc_stats_every_us_urban_area',
)

In [2]:
# read the input file, then order the rows from largest to smallest ALAND10
gdf = gpd.read_file('input_data/tl_2016_us_uac10')
gdf = gdf.sort_values(by='ALAND10', ascending=False)
len(gdf)

3601

In [3]:
# keep only the census "urban areas": flag every row whose name marks it as
# one of the tiny "urban clusters", then retain the rows that are not flagged
is_urban_cluster = gdf['NAMELSAD10'].str.contains('Urban Cluster')
gdf = gdf.loc[~is_urban_cluster]
len(gdf)

497

In [4]:
# preview the first rows, showing only the columns of interest
gdf.head()[[
    'NAMELSAD10',
    'ALAND10',
    'AWATER10',
    'GEOID10',
    'geometry',
]]

Unnamed: 0,NAMELSAD10,ALAND10,AWATER10,GEOID10,geometry
2321,"New York--Newark, NY--NJ--CT Urbanized Area",8937429045,532939320,63217,"(POLYGON ((-74.896562 40.561084, -74.896255 40..."
3002,"Atlanta, GA Urbanized Area",6850045152,94712176,3817,"(POLYGON ((-85.04216699999999 33.714332, -85.0..."
3250,"Chicago, IL--IN Urbanized Area",6325255332,106765178,16264,"(POLYGON ((-88.471932 42.120298, -88.472899 42..."
2546,"Philadelphia, PA--NJ--DE--MD Urbanized Area",5132095000,127546905,69076,"(POLYGON ((-76.02113299999999 39.981897, -76.0..."
1593,"Boston, MA--NH--RI Urbanized Area",4852285339,202265876,9271,"(POLYGON ((-71.64691599999999 42.644126, -71.6..."


In [5]:
def load_graph_get_stats(row):
    """
    Load the saved street network for one urban area and compute its stats.

    Parameters
    ----------
    row : mapping-like
        One record providing the keys 'NAMELSAD10' (the area name), 'GEOID10'
        and 'ALAND10' (passed to ox.basic_stats as the area).

    Returns
    -------
    pandas.Series or None
        The basic stats, the per-k intersection counts and proportions unpacked
        into their own keys, the extended stats minus the per-node ones, plus
        'name', 'geoid', 'area', 'area_km' and 'time' (seconds spent in this
        call). If anything inside the try block raises, the failure is printed
        and None is returned implicitly.
    """
    t0 = time.time()
    name, geoid, land_area = row['NAMELSAD10'], row['GEOID10'], row['ALAND10']

    # graph files are named "<name>_<geoid>" with commas and spaces turned into underscores
    file_stem = '{}_{}'.format(name, geoid).replace(',', '_').replace(' ', '_')

    try:
        graph = ox.load_graphml('{}.graphml'.format(file_stem))
        result = ox.basic_stats(graph, area=land_area)
        result.update({'name': name, 'geoid': geoid, 'area': land_area})

        # unpack the k-counts and k-proportion dicts into individual keys:values
        # (the original dicts stay in the result alongside the unpacked keys)
        node_counts = result['streets_per_node_counts']
        result.update({'int_{}_streets_count'.format(k): n for k, n in node_counts.items()})
        node_props = result['streets_per_node_proportion']
        result.update({'int_{}_streets_prop'.format(k): p for k, p in node_props.items()})

        # compute the extended stats, then discard the ones that have values per node
        per_node_keys = ['avg_neighbor_degree', 'avg_weighted_neighbor_degree', 'clustering_coefficient',
                         'clustering_coefficient_weighted', 'degree_centrality', 'pagerank']
        extended = pd.Series(ox.extended_stats(graph)).drop(per_node_keys)
        result.update(extended.to_dict())

        # presumably ALAND10 is in square meters, making this square kilometers -- confirm
        result['area_km'] = land_area / 1e6
        result['time'] = time.time() - t0
        return pd.Series(result)

    except Exception as e:
        # best-effort: report the failure and fall through, returning None for this row
        print('{} failed: {}'.format(name, e))

In [6]:
# compute the stats for every urban area, one row at a time; each row's graph is
# loaded from disk, and rows that fail are reported by load_graph_get_stats itself
stats = gdf.apply(load_graph_get_stats, axis='columns')
stats.head()



Unnamed: 0,area,area_km,avg_neighbor_degree_avg,avg_weighted_neighbor_degree_avg,circuity_avg,clustering_coefficient_avg,clustering_coefficient_weighted_avg,count_intersections,degree_centrality_avg,edge_density_km,...,pagerank_min_node,self_loop_proportion,street_density_km,street_length_avg,street_length_total,street_segments_count,streets_per_node_avg,streets_per_node_counts,streets_per_node_proportion,time
2321,8937429045,8937.429045,2.861807,0.040511,1.066086,0.040215,0.000298,341346,1.2e-05,16817.534433,...,254806534,0.007695,9328.808276,140.77646,83375560.0,592255,2.817923,"{0: 266, 1: 75958, 2: 5100, 3: 256206, 4: 7848...","{0: 0.0006370189429317241, 1: 0.18190483032784...",418.834448
3002,6850045152,6850.045152,2.73314,0.033545,1.102635,0.039578,0.000491,149645,2.1e-05,12111.9628,...,4002944297,0.010398,6515.632037,165.005023,44632370.0,270491,2.443332,"{0: 188, 1: 69470, 2: 2920, 3: 126910, 4: 1931...","{0: 0.000857261414572532, 1: 0.316776332289115...",197.60005
3250,6325255332,6325.255332,2.984439,0.036827,1.068456,0.037757,0.000557,253789,1.8e-05,17215.929666,...,2405958737,0.009053,9309.736282,132.673476,58886460.0,443845,2.941457,"{0: 192, 1: 45678, 2: 2969, 3: 174493, 4: 7541...","{0: 0.0006407282944947423, 1: 0.15243326581213...",279.593899
2546,5132095000,5132.095,2.815747,0.096915,1.085334,0.049217,0.000336,166828,2.6e-05,14489.100116,...,103809222,0.009928,7957.536227,141.996745,40838830.0,287604,2.823101,"{0: 193, 1: 35253, 2: 3335, 3: 125862, 4: 3670...","{0: 0.0009541512997221591, 1: 0.17428339776738...",180.17589
1593,4852285339,4852.285339,2.763284,0.037621,1.099111,0.051735,0.000614,151213,2.6e-05,13228.795154,...,70780317,0.021174,7210.417686,133.734702,34987000.0,261615,2.662112,"{0: 155, 1: 41382, 2: 3249, 3: 127445, 4: 1975...","{0: 0.0008041504539559014, 1: 0.21469260700389...",188.587862


In [7]:
# save the stats table without its index, then report the summed per-row
# processing time (in seconds, as recorded by load_graph_get_stats)
stats.to_csv(path_or_buf='usa_stats.csv', index=False, encoding='utf-8')
stats.loc[:, 'time'].sum()

8105.622082948685