In [1]:
import config
import geopandas as gpd
import json
import networkx as nx
import numpy as np
import os
import osmnx as ox
import pandas as pd
import time

# Configure OSMnx: enable its cache (stored in the project's cache folder) and
# turn on logging to both the console and a log file.
ox.config(
    cache_folder=config.urbanized_areas_cache_folder,
    use_cache=True,
    log_console=True,
    log_file=True,
    log_filename='calculate-urbanized_areas',
)

# Record the OSMnx and NetworkX versions used for this run, one per line.
print(ox.__version__, nx.__version__, sep='\n')

0.7.4
2.1


In [2]:
# TIGER urban areas shapefiles (this path is hard-coded rather than read from config)
uas_folder = 'input_data/urbanized_areas'

# project folders taken from config: where the stats CSV is written, and where
# the saved GraphML street networks are read from
stats_folder = config.urbanized_areas_stats_folder
graphml_folder = config.urbanized_areas_graphml_folder

In [3]:
# Build one record per saved GraphML file. The parsing below expects the layout
#   <graphml_folder>/<state_fips>_<state>/<geoid>_<name with underscores>.graphml
urbanized_areas = []
for state_folder in os.listdir(graphml_folder):
    state_path = os.path.join(graphml_folder, state_folder)
    if not os.path.isdir(state_path):
        # a stray file next to the state folders would make os.listdir raise
        continue

    state_parts = state_folder.split('_')

    for urbanized_area_file in os.listdir(state_path):
        # splitext cuts at the LAST dot, so dots inside the area name survive
        stem, extension = os.path.splitext(urbanized_area_file)
        if extension != '.graphml':
            # ignore anything that is not a saved graph
            continue

        geoid = stem.split('_')[0]

        # Remove the '<geoid>_' prefix by slicing. str.strip() must not be used
        # for this: it strips a *set of characters* from both ends, so it would
        # also eat leading digits/underscores that belong to the area name.
        area_name = stem[len(geoid) + 1:].replace('_', ' ')

        data = {}
        data['state_folder'] = state_folder
        data['state_fips'] = state_parts[0]
        data['state'] = state_parts[1]
        data['urbanized_area_file'] = urbanized_area_file
        data['geoid'] = geoid
        data['urbanized_area'] = area_name
        urbanized_areas.append(data)

df = pd.DataFrame(urbanized_areas)
len(df)

497

## Load the UAs shapefile

In [4]:
# read the urban areas shapefile and order it by land area, smallest first
gdf = gpd.read_file(uas_folder)
gdf = gdf.sort_values(by='ALAND10', ascending=True)

# keep urbanized areas only: discard every row whose name marks it as an urban cluster
is_urban_cluster = gdf['NAMELSAD10'].str.contains('Urban Cluster')
gdf = gdf[~is_urban_cluster]

# only the join key and the land area are used from here on
gdf = gdf[['GEOID10', 'ALAND10']]
len(gdf)

497

In [5]:
# attach each area's land area (ALAND10) by GEOID; the left join keeps every
# graph file even when the shapefile has no matching row
df = df.merge(gdf, how='left', left_on='geoid', right_on='GEOID10')

# GEOID10 only served as the join key and duplicates 'geoid'
df = df.drop(columns=['GEOID10'])

## Calculate the stats

In [6]:
def load_graph_get_stats(row):
    """
    Load one urbanized area's saved street network and compute its stats.

    Meant to be applied row-wise, e.g. ``df.apply(load_graph_get_stats, axis=1)``.

    Parameters
    ----------
    row : pandas.Series or other mapping
        Must provide 'urbanized_area', 'geoid', 'ALAND10', 'state_folder' and
        'urbanized_area_file'. 'ALAND10' is passed to ``ox.basic_stats`` as the
        area and divided by 1e6 to produce 'area_km', so it is presumably in
        square meters -- TODO confirm against the shapefile documentation.

    Returns
    -------
    pandas.Series
        On success: the basic stats, the flattened streets-per-node counts and
        proportions ('int_<k>_streets_count' / 'int_<k>_streets_prop'), the
        extended stats that are not per-node dicts, plus 'name', 'geoid',
        'area', 'area_km' and 'time' (seconds spent on this row).
        On failure: an empty Series. The error is printed, not raised, so one
        bad graph does not abort the whole run; the empty result leaves
        'geoid' missing for that row so it can be filtered out afterwards.
    """
    # extended stats that hold one value per node (dicts), not a single number
    per_node_keys = {'avg_neighbor_degree', 'avg_weighted_neighbor_degree',
                     'clustering_coefficient', 'clustering_coefficient_weighted',
                     'degree_centrality', 'pagerank'}

    start_time = time.time()
    name = row['urbanized_area']
    geoid = row['geoid']
    land_area = row['ALAND10']

    try:
        folder = '{}/{}'.format(graphml_folder, row['state_folder'])
        filename = row['urbanized_area_file']
        G = ox.load_graphml(folder=folder, filename=filename)

        stats = ox.basic_stats(G, area=land_area)
        stats['name'] = name
        stats['geoid'] = geoid
        stats['area'] = land_area

        # unpack k-counts and k-proportion dicts into individual keys:values
        for k, count in stats['streets_per_node_counts'].items():
            stats['int_{}_streets_count'.format(k)] = count
        for k, proportion in stats['streets_per_node_proportion'].items():
            stats['int_{}_streets_prop'.format(k)] = proportion

        # keep only the scalar extended stats; filtering by key (rather than
        # dropping a fixed list) does not fail if a per-node key is absent
        for key, value in ox.extended_stats(G).items():
            if key not in per_node_keys:
                stats[key] = value

        stats['area_km'] = land_area / 1e6
        stats['time'] = time.time() - start_time
        return pd.Series(stats)

    except Exception as e:
        print('{} failed: {}'.format(name, e))
        # return the same type as the success path instead of an implicit None
        return pd.Series(dtype='object')

In [7]:
# compute the stats once per urbanized area (one call per row of df)
stats_temp = df.apply(load_graph_get_stats, axis='columns')
stats_temp.shape

(497, 66)

In [8]:
# total seconds spent computing stats, summed over all rows
stats_temp.loc[:, 'time'].sum()

6119.933447599411

## Clean up the dataframe

In [9]:
# work on a copy so the raw results in stats_temp stay available and the
# clean-up cells can be re-run without recomputing the stats
stats = stats_temp.copy(deep=True)

In [10]:
# stuff to drop
cols_to_drop = ['area', 'time', 'streets_per_node_counts', 'streets_per_node_proportion', 
                'pagerank_max_node', 'pagerank_min_node', 'clean_intersection_count',
                'clean_intersection_density_km']

In [11]:
# Decide what happens to each unpacked 'int_<k>_streets_count' /
# 'int_<k>_streets_prop' column: k = 1, 3, 4 are kept under a friendlier name,
# every other k is dropped.
cols_to_rename = {}
int_cols_to_drop = []  # kept separate so re-running this cell never grows cols_to_drop
for col in stats.columns:
    # prefix test, not substring test: only the unpacked per-k columns qualify
    if not col.startswith('int_'):
        continue
    n = col.split('_')[1]
    if n in ('1', '3', '4'):
        suffix = 'count' if 'count' in col else 'proportion'
        cols_to_rename[col] = 'intersect_{}way_{}'.format(n, suffix)
    else:
        int_cols_to_drop.append(col)

stats = stats.drop(cols_to_drop + int_cols_to_drop, axis=1)

In [12]:
# rename these to friendlier names
cols_to_rename['clustering_coefficient_avg'] = 'cluster_coeff_avg'
cols_to_rename['clustering_coefficient_weighted_avg'] = 'cluster_coeff_weighted_avg'
cols_to_rename['intersection_density_km'] = 'intersect_density_km'
cols_to_rename['intersect_1way_count'] = 'dead_end_count'
cols_to_rename['intersect_1way_proportion'] = 'dead_end_proportion'
cols_to_rename['m'] = 'edge_count'
cols_to_rename['n'] = 'node_count'
cols_to_rename['name'] = 'urbanized_area'
stats = stats.rename(columns=cols_to_rename)
stats = stats.rename(columns=cols_to_rename)

In [13]:
# drop anything lacking a GEOID
stats = stats.dropna(subset=['geoid'])

In [14]:
# make these integers
cols_int = ['intersection_count', 'edge_length_total', 'edge_count', 'node_count', 'street_segments_count']
stats[cols_int] = stats[cols_int].astype(np.int64)

In [15]:
# make city, state, geoid at left of df
cols = stats.columns.tolist()
cols.insert(0, cols.pop(cols.index('urbanized_area')))
cols.insert(1, cols.pop(cols.index('geoid')))
stats = stats.reindex(columns=cols)

## View the results

In [16]:
# (rows, columns) of the cleaned stats table
stats.shape

(497, 32)

In [17]:
# final column names, in the order they will be written to the CSV
stats.columns

Index(['urbanized_area', 'geoid', 'area_km', 'avg_neighbor_degree_avg',
       'avg_weighted_neighbor_degree_avg', 'circuity_avg', 'cluster_coeff_avg',
       'cluster_coeff_weighted_avg', 'degree_centrality_avg',
       'edge_density_km', 'edge_length_avg', 'edge_length_total',
       'dead_end_count', 'dead_end_proportion', 'intersect_3way_count',
       'intersect_3way_proportion', 'intersect_4way_count',
       'intersect_4way_proportion', 'intersection_count',
       'intersect_density_km', 'k_avg', 'edge_count', 'node_count',
       'node_density_km', 'pagerank_max', 'pagerank_min',
       'self_loop_proportion', 'street_density_km', 'street_length_avg',
       'street_length_total', 'street_segments_count', 'streets_per_node_avg'],
      dtype='object')

In [18]:
# the five rows with the largest area_km, in ascending order
stats.sort_values(by='area_km', ascending=True).tail(n=5)

Unnamed: 0,urbanized_area,geoid,area_km,avg_neighbor_degree_avg,avg_weighted_neighbor_degree_avg,circuity_avg,cluster_coeff_avg,cluster_coeff_weighted_avg,degree_centrality_avg,edge_density_km,...,node_count,node_density_km,pagerank_max,pagerank_min,self_loop_proportion,street_density_km,street_length_avg,street_length_total,street_segments_count,streets_per_node_avg
225,Boston MA--NH--RI Urbanized Area,9271,4852.213624,2.763338,0.030155,1.089643,0.050951,0.00059,3.4e-05,11651.631841,...,146866,30.267835,2.6e-05,1.025312e-06,0.023923,6388.85741,153.702487,31000100.0,201689,2.708571
364,Philadelphia PA--NJ--DE--MD Urbanized Area,69076,5131.722319,2.830751,0.064147,1.081114,0.048946,0.000385,3.1e-05,13232.578155,...,164575,32.07013,2.7e-05,9.152862e-07,0.009938,7300.928116,158.685731,37466340.0,236104,2.870788
162,Chicago IL--IN Urbanized Area,16264,6323.669758,2.962034,0.028812,1.065826,0.036737,0.000655,2.6e-05,14267.905207,...,209800,33.176938,1.7e-05,7.176539e-07,0.010864,7841.565631,161.445929,49587470.0,307146,2.917193
136,Atlanta GA Urbanized Area,3817,6850.603579,2.736249,0.028541,1.099988,0.041198,0.000514,2.4e-05,11582.6265,...,195352,28.516027,2.1e-05,7.706407e-07,0.010834,6254.523663,177.669303,42847260.0,241163,2.464275
297,New York--Newark NY--NJ--CT Urbanized Area,63217,8934.39532,2.859609,0.034802,1.060897,0.039323,0.000264,1.4e-05,15760.267611,...,373953,41.855435,1.9e-05,4.044022e-07,0.007004,8835.681865,147.527686,78941470.0,535096,2.857816


## Save to disk

In [19]:
# Write the cleaned stats table to <stats_folder>/urbanized_areas-stats.csv.
# exist_ok=True creates the folder only when it is missing, replacing the
# separate exists-check-then-create steps with a single call.
os.makedirs(stats_folder, exist_ok=True)
output_path = os.path.join(stats_folder, 'urbanized_areas-stats.csv')
stats.to_csv(output_path, encoding='utf-8', index=False)