In [1]:
import ast
import config
import geopandas as gpd
import json
import networkx as nx
import numpy as np
import os
import osmnx as ox
import pandas as pd
import time

# OSMnx settings for this notebook: caching on (in the project's neighborhoods cache
# folder) and logging to both the console and a 'calculate-neighborhoods' log file
osmnx_settings = {
    'use_cache': True,
    'log_file': True,
    'log_console': True,
    'log_filename': 'calculate-neighborhoods',
    'cache_folder': config.neighborhoods_cache_folder,
}
ox.config(**osmnx_settings)

# record the library versions this notebook was run with
for library in (ox, nx):
    print(library.__version__)

0.7.4
2.1


In [2]:
# input: root folder of the saved neighborhood graphs, walked below as state/city/file
graphml_folder = config.neighborhoods_graphml_folder
# output: folder that receives the final neighborhoods-stats.csv
stats_folder = config.neighborhoods_stats_folder

In [3]:
# Walk the <graphml_folder>/<state>/<city>/<neighborhood>.graphml tree and record one
# dict per saved neighborhood graph. Paths are built with '/' exactly as before so the
# resulting 'path' strings are unchanged.
graphml_ext = '.graphml'
nhoods = []
for state_folder in os.listdir(graphml_folder):
    state_path = '{}/{}'.format(graphml_folder, state_folder)
    if not os.path.isdir(state_path):
        # a stray file next to the state folders would make the nested os.listdir raise
        continue

    # state folders are named '<fips>_<abbreviation>' (e.g. '01_AL')
    state_parts = state_folder.split('_')
    for city_folder in os.listdir(state_path):
        city_path = '{}/{}'.format(state_path, city_folder)
        if not os.path.isdir(city_path):
            continue

        for nhood_file in os.listdir(city_path):
            # match the extension only: a substring test would also accept names
            # such as 'foo.graphml.bak'
            if nhood_file.endswith(graphml_ext):
                data = {}
                data['state_fips'] = state_parts[0]
                data['state'] = state_parts[1]
                data['city'] = city_folder
                # strip just the trailing extension, then turn underscores back into spaces
                data['nhood'] = nhood_file[:-len(graphml_ext)].replace('_', ' ')
                data['path'] = city_path
                data['file'] = nhood_file
                nhoods.append(data)

# one row per neighborhood graph found on disk
df = pd.DataFrame(nhoods)
df.shape

(16579, 6)

In [4]:
# preview the first rows to sanity-check the fields parsed from the folder and file names
df.head()

Unnamed: 0,city,file,nhood,path,state,state_fips
0,Birmingham,Acipco-Finley.graphml,Acipco-Finley,G:/Geoff/osmnx/data/neighborhoods/graphml/01_A...,AL,1
1,Birmingham,Airport_Highlands.graphml,Airport Highlands,G:/Geoff/osmnx/data/neighborhoods/graphml/01_A...,AL,1
2,Birmingham,Arlington-West_End.graphml,Arlington-West End,G:/Geoff/osmnx/data/neighborhoods/graphml/01_A...,AL,1
3,Birmingham,Belview_Heights.graphml,Belview Heights,G:/Geoff/osmnx/data/neighborhoods/graphml/01_A...,AL,1
4,Birmingham,Bridlewood.graphml,Bridlewood,G:/Geoff/osmnx/data/neighborhoods/graphml/01_A...,AL,1


## Load graph and calculate stats for each neighborhood

In [5]:
def load_graph_get_stats(row):
    """
    Load one neighborhood's saved graph and compute its street network stats.

    Parameters
    ----------
    row : pandas.Series or dict-like
        Must provide 'file' and 'path' (the GraphML filename and the folder that
        holds it) plus 'nhood', 'city' and 'state', which are copied into the result.

    Returns
    -------
    pandas.Series
        The values from ox.basic_stats and the non-per-node values from
        ox.extended_stats, plus 'nhood', 'city', 'state', 'area' (the graph's
        'nhood_area_m' attribute as a float), 'area_km' (area / 1e6) and 'time'
        (seconds spent on this row). If anything fails, the error is printed
        rather than raised and an empty Series is returned, so a single bad
        neighborhood cannot abort a whole DataFrame.apply run.
    """
    # extended stats that hold one value per node; they are left out of the result
    per_node_stats = {'avg_neighbor_degree', 'avg_weighted_neighbor_degree', 'clustering_coefficient',
                      'clustering_coefficient_weighted', 'degree_centrality', 'pagerank'}

    try:
        start_time = time.time()
        G = ox.load_graphml(filename=row['file'], folder=row['path'])
        nhood_area_m = float(G.graph['nhood_area_m'])

        stats = ox.basic_stats(G, area=nhood_area_m)
        stats['nhood'] = row['nhood']
        stats['city'] = row['city']
        stats['state'] = row['state']

        # keep every extended stat except the per-node ones; filtering the dict directly
        # tolerates a per-node key being absent, whereas a hard-coded drop would raise
        extended_stats = ox.extended_stats(G)
        stats.update((key, value) for key, value in extended_stats.items()
                     if key not in per_node_stats)

        stats['area_km'] = nhood_area_m / 1e6
        stats['area'] = nhood_area_m
        stats['time'] = time.time() - start_time
        return pd.Series(stats)

    except Exception as e:
        print('{}, {}, {} failed: {}'.format(row['nhood'], row['city'], row['state'], e))
        # explicit dtype: the default dtype of a bare pd.Series() differs between pandas
        # versions (and pandas 1.x warns about it); float64 keeps the original behavior
        return pd.Series(dtype='float64')

In [6]:
# To try the pipeline on roughly 100 evenly spaced rows first, uncomment the two lines
# below (note that they assign to `stats`, not `stats_temp`).
#sample = list(range(0, len(df), int(len(df)/100)))
#stats = df.iloc[sample].apply(load_graph_get_stats, axis=1)
# Full run: one load_graph_get_stats call per neighborhood row. A failed row returns an
# empty Series, so it still occupies a row in the result but carries no stats.
stats_temp = df.apply(load_graph_get_stats, axis=1)
stats_temp.shape

  circuity_avg = edge_length_total / gc_distances.sum()


Arbor Walk, Gilbert, AZ failed: float division by zero
Sombras del Cerro, Tucson, AZ failed: float division by zero
Winding Wood 2, Clearwater, FL failed: float division by zero
Winding Wood No.4 Condominium Association, Clearwater, FL failed: float division by zero
Fair Isle, Miami, FL failed: float division by zero
Cedar Ridge, Fort_Wayne, IN failed: float division by zero
Centaur Acres, Fort_Wayne, IN failed: float division by zero
Foxchase, Fort_Wayne, IN failed: float division by zero
Autumn Chase, Nashville, TN failed: float division by zero
McMurray Court, Nashville, TN failed: float division by zero
Churchill Heights, San_Antonio, TX failed: float division by zero
Wynnwood Condominium, San_Antonio, TX failed: float division by zero
Riverside Manoe, Fredericksburg, VA failed: float division by zero


(16579, 35)

In [7]:
# total processing time in seconds: the sum of the per-row time.time() deltas
stats_temp['time'].sum()

2284.8561108112335

## Clean up the dataframe

In [8]:
# clean up a copy; presumably so the slow-to-compute stats_temp stays untouched and the
# cleanup cells can be re-run starting from this one
stats = stats_temp.copy()

In [9]:
def get_count(s, key):
    """
    Return s[key], or None when the lookup fails.

    Applied below to each 'streets_per_node_counts' value, so `s` is normally a dict-like
    of counts; a missing key or a non-subscriptable value (e.g. NaN) gives None.

    NOTE(review): a missing key yields None rather than 0 - confirm that is the intended
    value for "no nodes with this many streets".
    """
    try:
        return s[key]
    except Exception:
        # Exception, not a bare except, so KeyboardInterrupt/SystemExit still propagate
        return None

def get_prop(s, key):
    """
    Return s[key] as a share of the sum of all values in `s`, or None on failure.

    `s` is expected to be a dict-like of numeric counts. A missing key, a value that is
    not a dict (e.g. NaN) or a zero total all give None.
    """
    try:
        return s[key] / sum(s.values())
    except Exception:
        # Exception, not a bare except, so KeyboardInterrupt/SystemExit still propagate
        return None

In [10]:
# (streets meeting at a node, column prefix), listed in the order the columns are created
node_types = [(1, 'dead_end'), (3, 'intersect_3way'), (4, 'intersect_4way')]
streets_per_node = stats['streets_per_node_counts']
for n_streets, prefix in node_types:
    stats['{}_count'.format(prefix)] = streets_per_node.apply(get_count, key=n_streets)
    stats['{}_proportion'.format(prefix)] = streets_per_node.apply(get_prop, key=n_streets)

In [11]:
# columns to leave out of the final table
cols_to_drop = [
    # helper values set in load_graph_get_stats
    'area',
    'time',
    # per-node street tallies, already summarized into the dead_end_/intersect_ columns
    'streets_per_node_counts',
    'streets_per_node_proportion',
    # remaining stats that are not wanted in the output
    'pagerank_max_node',
    'pagerank_min_node',
    'clean_intersection_count',
    'clean_intersection_density_km',
]

In [12]:
# Sort every column whose name contains 'int_' into "rename" (1-, 3- and 4-way) or "drop"
# (anything else). Assumes such names look like 'int_<n>_<count|proportion>' - TODO confirm.
cols_to_rename = {}
for col in stats.columns:
    if 'int_' not in col:
        continue
    n = col.split('_')[1]
    if n in ('1', '3', '4'):
        suffix = 'count' if 'count' in col else 'proportion'
        cols_to_rename[col] = 'intersect_{}way_{}'.format(n, suffix)
    else:
        # note: this extends the cols_to_drop list defined in the previous cell
        cols_to_drop.append(col)

stats = stats.drop(labels=cols_to_drop, axis=1)

In [13]:
# rename these to friendlier names
cols_to_rename.update({
    'clustering_coefficient_avg': 'cluster_coeff_avg',
    'clustering_coefficient_weighted_avg': 'cluster_coeff_weighted_avg',
    'intersection_density_km': 'intersect_density_km',
    'intersect_1way_count': 'dead_end_count',
    'intersect_1way_proportion': 'dead_end_proportion',
    'm': 'edge_count',
    'n': 'node_count',
    'nhood': 'zillow_neighborhood',
})
# rename() maps each label once per call, so the mapping is applied twice: the second pass
# lets chained entries (int_1_* -> intersect_1way_* -> dead_end_*) resolve completely.
# Presumably intentional - confirm before collapsing this into a single call.
stats = stats.rename(columns=cols_to_rename).rename(columns=cols_to_rename)

In [14]:
# whole-number columns get an integer dtype
cols_int = ['intersection_count', 'edge_length_total', 'edge_count', 'node_count', 'street_segments_count']
# NaN cannot be cast to int64, so rows missing any of these values are dropped first
complete_rows = stats.dropna(subset=cols_int)
stats = complete_rows.astype({name: np.int64 for name in cols_int})

In [15]:
# move the identifying columns to the far left, in this order
cols = stats.columns.tolist()
for position, label in enumerate(['zillow_neighborhood', 'city', 'state']):
    cols.insert(position, cols.pop(cols.index(label)))
stats = stats.reindex(columns=cols)

## View the results

In [16]:
# rows left after the dropna in the integer-cast step; columns left after the drops above
stats.shape

(16566, 33)

In [17]:
# final column names, in the order they are written to the CSV below
stats.columns

Index(['zillow_neighborhood', 'city', 'state', 'area_km',
       'avg_neighbor_degree_avg', 'avg_weighted_neighbor_degree_avg',
       'circuity_avg', 'cluster_coeff_avg', 'cluster_coeff_weighted_avg',
       'degree_centrality_avg', 'edge_density_km', 'edge_length_avg',
       'edge_length_total', 'intersection_count', 'intersect_density_km',
       'k_avg', 'edge_count', 'node_count', 'node_density_km', 'pagerank_max',
       'pagerank_min', 'self_loop_proportion', 'street_density_km',
       'street_length_avg', 'street_length_total', 'street_segments_count',
       'streets_per_node_avg', 'dead_end_count', 'dead_end_proportion',
       'intersect_3way_count', 'intersect_3way_proportion',
       'intersect_4way_count', 'intersect_4way_proportion'],
      dtype='object')

In [18]:
# preview the cleaned table before saving it
stats.head()

Unnamed: 0,zillow_neighborhood,city,state,area_km,avg_neighbor_degree_avg,avg_weighted_neighbor_degree_avg,circuity_avg,cluster_coeff_avg,cluster_coeff_weighted_avg,degree_centrality_avg,...,street_length_avg,street_length_total,street_segments_count,streets_per_node_avg,dead_end_count,dead_end_proportion,intersect_3way_count,intersect_3way_proportion,intersect_4way_count,intersect_4way_proportion
0,Acipco-Finley,Birmingham,AL,4.020022,3.086228,0.03268,1.044219,0.033533,0.0027,0.032609,...,141.604281,34268.236,242,3.047904,29.0,0.173653,73.0,0.437126,64.0,0.383234
1,Airport Highlands,Birmingham,AL,0.785442,2.99187,0.026805,1.024278,0.0,0.0,0.129268,...,134.389113,7122.623,53,2.95122,7.0,0.170732,22.0,0.536585,12.0,0.292683
2,Arlington-West End,Birmingham,AL,4.759317,3.227658,0.033935,1.00682,0.025431,0.003191,0.025228,...,119.870222,41594.967,347,3.137931,28.0,0.12069,117.0,0.50431,83.0,0.357759
3,Belview Heights,Birmingham,AL,2.770418,3.156477,0.035901,1.007829,0.053566,0.014086,0.026354,...,109.565782,39114.984,357,3.393013,11.0,0.048035,107.0,0.467249,110.0,0.480349
4,Bridlewood,Birmingham,AL,0.808771,2.457317,0.024315,1.062857,0.020325,0.009246,0.1,...,140.14878,5746.1,41,2.585366,12.0,0.292683,22.0,0.536585,7.0,0.170732


## Save to disk

In [19]:
# create the output folder if needed; exist_ok replaces the separate exists() check and
# so removes the window in which the folder could appear between the check and the create
os.makedirs(stats_folder, exist_ok=True)
output_path = '{}/neighborhoods-stats.csv'.format(stats_folder)
# index=False keeps the DataFrame's row index out of the file
stats.to_csv(output_path, encoding='utf-8', index=False)