In [1]:
import config
import geopandas as gpd
import json
import networkx as nx
import numpy as np
import os
import osmnx as ox
import pandas as pd
import time

# configure OSMnx: turn on its cache (stored in the project's counties cache
# folder) and log to both the console and a log file named 'calculate-counties'
ox.config(use_cache=True,
          log_file=True,
          log_console=True,
          log_filename='calculate-counties',
          cache_folder=config.counties_cache_folder)

# record the library versions this notebook was run with
print(ox.__version__)
print(nx.__version__)

0.8dev
2.1


In [2]:
# input: one sub-folder per state, each holding one saved .graphml file per county
graphml_folder = config.counties_graphml_folder
counties_folder = 'input_data/counties' # TIGER counties shapefiles
# output: folder the final stats CSV is written to
stats_folder = config.counties_stats_folder

In [3]:
# Build one record per saved county graph by walking
# graphml_folder/<state folder>/<county file>. From the way the names are split
# below, state folders look like '<state fips>_<state>' and county files like
# '<geoid>_<county name>.graphml'.
graphml_ext = '.graphml'
counties = []
for state_folder in os.listdir(graphml_folder):
    state_path = '{}/{}'.format(graphml_folder, state_folder)
    if not os.path.isdir(state_path):
        # a stray file next to the state folders would make the listdir below fail
        continue

    for county_file in os.listdir(state_path):

        geoid = county_file.split('_')[0]

        # Remove the '<geoid>_' prefix and the '.graphml' suffix as exact
        # substrings. str.strip() would treat its argument as a *set of
        # characters* and could eat leading/trailing characters of the name.
        county = county_file[len(geoid):]
        if county.startswith('_'):
            county = county[1:]
        if county.endswith(graphml_ext):
            county = county[:-len(graphml_ext)]
        county = county.replace('_', ' ')

        data = {}
        data['state_folder'] = state_folder
        data['state_fips'] = state_folder.split('_')[0]
        data['state'] = state_folder.split('_')[1]
        data['county_file'] = county_file
        data['geoid'] = geoid
        data['county'] = county
        counties.append(data)

df = pd.DataFrame(counties)
len(df)

8

## Load the counties shapefile

In [4]:
# read the counties shapefile, order the rows by land area (smallest first)
# and keep only the two columns needed to join land area onto df
gdf = (gpd.read_file(counties_folder)
          .sort_values(by='ALAND', ascending=True)
          [['GEOID', 'ALAND']])
len(gdf)

3233

In [5]:
# attach each county's ALAND by matching geoid to the shapefile's GEOID; the
# left join keeps every county in df, and the duplicate key column is dropped
df = df.merge(gdf, how='left', left_on='geoid', right_on='GEOID').drop(columns=['GEOID'])

## Calculate the stats

In [6]:
def load_graph_get_stats(row):
    """
    Load one county's saved street network and compute its summary stats.

    Intended to be passed to ``df.apply(..., axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        Must provide 'county', 'geoid', 'ALAND', 'state_folder' and
        'county_file'. The graph is read from
        ``graphml_folder/<state_folder>/<county_file>``.

    Returns
    -------
    pandas.Series
        OSMnx basic stats plus the scalar extended stats, together with
        'name', 'geoid', 'area', 'area_km', 'time' (seconds spent on this
        row) and one 'int_<k>_streets_count' / 'int_<k>_streets_prop' entry
        per key of the streets-per-node dicts.
        If anything fails, the error is printed and an *empty* Series is
        returned. That way ``df.apply`` always produces a DataFrame in which
        a failed county is an all-NaN row (no 'geoid'), regardless of which
        row failed; implicitly returning None made the result's shape depend
        on whether the first row succeeded.
    """
    start_time = time.time()
    name = row['county']
    geoid = row['geoid']
    land_area = row['ALAND']

    try:
        folder = '{}/{}'.format(graphml_folder, row['state_folder'])
        filename = row['county_file']
        G = ox.load_graphml(folder=folder, filename=filename)

        stats = ox.basic_stats(G, area=land_area)
        stats['name'] = name
        stats['geoid'] = geoid
        stats['area'] = land_area

        # unpack k-counts and k-proportion dicts into individual keys:values
        for k, count in stats['streets_per_node_counts'].items():
            stats['int_{}_streets_count'.format(k)] = count
        for k, proportion in stats['streets_per_node_proportion'].items():
            stats['int_{}_streets_prop'.format(k)] = proportion

        # calculate the extended stats, then drop the ones that hold a value
        # per node (they cannot be stored in a single dataframe cell)
        extended_stats = ox.extended_stats(G)
        se = pd.Series(extended_stats)
        se = se.drop(['avg_neighbor_degree', 'avg_weighted_neighbor_degree', 'clustering_coefficient',
                      'clustering_coefficient_weighted', 'degree_centrality', 'pagerank'])
        stats.update(se.to_dict())

        # ALAND scaled by 1e-6 (presumably square metres -> square km; confirm
        # against the shapefile's documentation)
        stats['area_km'] = land_area / 1e6
        stats['time'] = time.time()-start_time
        return pd.Series(stats)

    except Exception as e:
        print('{} failed: {}'.format(name, e))
        # explicit, consistently-typed "no result" so apply() still builds a frame
        return pd.Series(dtype=object)

In [7]:
# run the stats function once per county row (axis=1); the Series returned for
# each row are assembled into the rows of stats_temp
stats_temp = df.apply(load_graph_get_stats, axis=1)
stats_temp.shape

(8, 48)

In [8]:
# total of the per-county 'time' values recorded by load_graph_get_stats (seconds)
stats_temp['time'].sum()

10.729074239730835

## Clean up the dataframe

In [9]:
# work on a copy so the raw results in stats_temp stay available for a redo of the cleanup
stats = stats_temp.copy()

In [10]:
# columns to drop from the final table:
#  - 'area' duplicates the land area already kept as 'area_km'
#  - 'time' is only the per-county computation time
#  - the two 'streets_per_node_*' columns hold dicts that were already
#    unpacked into the individual 'int_<k>_streets_*' columns
#  - NOTE(review): the pagerank '*_node' and 'clean_intersection_*' columns are
#    dropped as well; the reason is not visible here -- confirm they are unused
cols_to_drop = ['area', 'time', 'streets_per_node_counts', 'streets_per_node_proportion', 
                'pagerank_max_node', 'pagerank_min_node', 'clean_intersection_count',
                'clean_intersection_density_km']

In [11]:
# Sort the unpacked 'int_<k>_streets_*' columns: keep and rename the 1-, 3- and
# 4-way ones, and queue every other k for dropping.
cols_to_rename = {}
for column in stats.columns:
    if 'int_' not in column:
        continue
    ways = column.split('_')[1]
    if ways in ['1', '3', '4']:
        kind = 'count' if 'count' in column else 'proportion'
        cols_to_rename[column] = 'intersect_{}way_{}'.format(ways, kind)
    else:
        cols_to_drop.append(column)

stats = stats.drop(cols_to_drop, axis=1)

In [12]:
# add friendlier names for the remaining columns to the rename mapping
cols_to_rename.update({
    'clustering_coefficient_avg': 'cluster_coeff_avg',
    'clustering_coefficient_weighted_avg': 'cluster_coeff_weighted_avg',
    'intersection_density_km': 'intersect_density_km',
    'intersect_1way_count': 'dead_end_count',
    'intersect_1way_proportion': 'dead_end_proportion',
    'm': 'edge_count',
    'n': 'node_count',
    'name': 'county',
})

# The rename is deliberately applied twice: a single pass does not chain
# mappings, so the first pass turns 'int_1_streets_*' into 'intersect_1way_*'
# and only the second pass turns those into 'dead_end_*'.
stats = stats.rename(columns=cols_to_rename).rename(columns=cols_to_rename)

In [13]:
# drop anything lacking a GEOID (rows for which load_graph_get_stats produced
# no stats have no 'geoid' value)
stats = stats.dropna(subset=['geoid'])

In [14]:
# store the count/total columns as 64-bit integers (the cast discards any
# fractional part)
cols_int = ['intersection_count', 'edge_length_total', 'edge_count', 'node_count', 'street_segments_count']
stats = stats.astype({col_name: np.int64 for col_name in cols_int})

In [15]:
# move the county name and geoid to the left of the dataframe, in that order
cols = stats.columns.tolist()
for position, label in enumerate(['county', 'geoid']):
    cols.remove(label)
    cols.insert(position, label)
stats = stats.reindex(columns=cols)

## View the results

In [16]:
# (rows, columns) of the cleaned table
stats.shape

(8, 32)

In [17]:
# final column names, after the drops, renames and reordering above
stats.columns

Index(['county', 'geoid', 'area_km', 'avg_neighbor_degree_avg',
       'avg_weighted_neighbor_degree_avg', 'circuity_avg', 'cluster_coeff_avg',
       'cluster_coeff_weighted_avg', 'degree_centrality_avg',
       'edge_density_km', 'edge_length_avg', 'edge_length_total',
       'dead_end_count', 'dead_end_proportion', 'intersect_3way_count',
       'intersect_3way_proportion', 'intersect_4way_count',
       'intersect_4way_proportion', 'intersection_count',
       'intersect_density_km', 'k_avg', 'edge_count', 'node_count',
       'node_density_km', 'pagerank_max', 'pagerank_min',
       'self_loop_proportion', 'street_density_km', 'street_length_avg',
       'street_length_total', 'street_segments_count', 'streets_per_node_avg'],
      dtype='object')

In [18]:
# sorted ascending by area_km, so tail() shows the counties with the largest area
stats.sort_values('area_km').tail()

Unnamed: 0,county,geoid,area_km,avg_neighbor_degree_avg,avg_weighted_neighbor_degree_avg,circuity_avg,cluster_coeff_avg,cluster_coeff_weighted_avg,degree_centrality_avg,edge_density_km,...,node_count,node_density_km,pagerank_max,pagerank_min,self_loop_proportion,street_density_km,street_length_avg,street_length_total,street_segments_count,streets_per_node_avg
6,Valdez-Cordova Census Area,2261,88635.832004,2.77287,0.019196,1.10528,0.03095,0.000236,0.003764,31.30796,...,1217,0.01373,0.00293,0.000123,0.008977,15.698946,987.572164,1391489.179,1409,2.299918
4,Northwest Arctic Borough,2188,92326.483031,2.981047,0.032291,1.140388,0.058271,8.4e-05,0.012943,6.172284,...,401,0.004343,0.010535,0.0004,0.001927,3.090389,548.701448,285324.753,520,2.588529
0,Bethel Census Area,2050,105228.855314,2.840535,0.028834,1.124234,0.058162,0.000263,0.004912,6.508363,...,972,0.009237,0.004078,0.000162,0.0,3.254182,295.456269,342433.816,1159,2.384774
3,North Slope Borough,2185,230056.014591,2.768916,0.018846,1.145277,0.035225,5.6e-05,0.006184,10.410984,...,793,0.003447,0.004759,0.000195,0.00309,5.211794,1231.010724,1199004.445,974,2.450189
7,Yukon-Koyukuk Census Area,2290,377030.936019,2.886327,0.018639,1.176673,0.050583,0.000144,0.003681,8.896541,...,1314,0.003485,0.003075,0.000115,0.003465,4.452588,1053.837611,1678763.315,1593,2.429224


## Save to disk

In [19]:
# create the output folder if it does not exist yet
if not os.path.exists(stats_folder):
    os.makedirs(stats_folder)
output_path = '{}/counties-stats.csv'.format(stats_folder)
# write the cleaned stats as UTF-8 CSV; index=False leaves the row index out of the file
stats.to_csv(output_path, encoding='utf-8', index=False)