In [1]:
import osmnx as ox
import geopandas as gpd
import pandas as pd
import numpy as np
import networkx as nx
import pathlib
import time
import os
import datetime

# turn response caching on and turn on logging to your terminal window
ox.config(log_console=True, use_cache=True)

ox.__version__

'1.0.1'

# Functions

In [2]:
def rename_stats_columns(stats_dataframe, inplace=False):
    """
    Rename column names for saving to shapefiles.
    """
    
    # columns to be changed
    cols = {'avg_neighbor_degree':'avg_n8d', 
              'avg_neighbor_degree_avg':'avg2_n8d',
              'avg_weighted_neighbor_degree':'avg_w_n8d',
              'avg_weighted_neighbor_degree_avg':'avg2_w_n8d',
              'betweenness_centrality':'btwn_c',
              'betweenness_centrality_avg':'btwn_c_avg',
              'circuity_avg':'cirq_avg',
              'clean_intersection_count':'cl_int_cnt',
              'clean_intersection_density_km':'cl_int_dkm',
              'closeness_centrality':'clos_c',
              'closeness_centrality_avg':'clos_c_avg',
              'clustering_coefficient':'c_cof',
              'clustering_coefficient_avg':'c_cof_avg',
              'clustering_coefficient_weighted':'c_cofw',
              'clustering_coefficient_weighted_avg':'c_cofw_avg',
              'degree_centrality':'degcen',
              'degree_centrality_avg':'degcen_avg',
              'eccentricity':'eccty',
              'edge_connectivity':'m_cvty',
              'edge_density_km':'m_dty_km',
              'edge_length_avg':'m_len_avg',
              'edge_length_total':'m_len_tot',
              'intersection_count':'int_cnt',
              'intersection_density_km':'int_dty_km',
              'node_connectivity':'n_cvty',
              'node_connectivity_avg':'n_cvty_avg',
              'node_density_km':'n_dty_km',
              'pagerank_max':'prnk_max',
              'pagerank_max_node':'prnk_max_n',
              'pagerank_min':'prnk_min',
              'pagerank_min_node':'prnk_min_n',
              'self_loop_proportion':'sloop_prop',
              'street_density_km':'st_dty_km',
              'street_length_avg':'st_len_avg',
              'street_length_total':'st_len_tot',
              'street_segments_count':'st_seg_cnt',
              'streets_per_node_avg':'st_n_avg',
              'streets_per_node_counts':'st_n_cnts',
              'streets_per_node_proportion':'st_n_prop'
             }
    
    if inplace is False:
        new_df = stats_dataframe
    
    if 'pop_density' in stats_dataframe.columns:
        if inplace is False:
            new_df.rename(columns={'pop_density':'popdensity'}, inplace=True)
        else:
            stats_dataframe.rename(columns={'pop_density':'popdensity'}, inplace=True)
            
    if inplace is False:
        new_df.rename(columns=cols, inplace=True)
        return new_df
    else:
        stats_dataframe.rename(columns=cols, inplace=True)

In [3]:
def change_census_data_type(dataframe):
    dataframe = dataframe.astype({
        'POBTOT':'float32',
        'POBFEM':'float32',
        'POBMAS':'float32',
        'P_0A2':'float32',
        'P_0A2_F':'float32',
        'P_0A2_M':'float32',
        'P_3YMAS':'float32',
        'P_3YMAS_F':'float32',
        'P_3YMAS_M':'float32',
        'P_5YMAS':'float32',
        'P_5YMAS_F':'float32',
        'P_5YMAS_M':'float32',
        'P_12YMAS':'float32',
        'P_12YMAS_F':'float32',
        'P_12YMAS_M':'float32',
        'P_15YMAS':'float32',
        'P_15YMAS_F':'float32',
        'P_15YMAS_M':'float32',
        'P_18YMAS':'float32',
        'P_18YMAS_F':'float32',
        'P_18YMAS_M':'float32',
        'P_3A5':'float32',
        'P_3A5_F':'float32',
        'P_3A5_M':'float32',
        'P_6A11':'float32',
        'P_6A11_F':'float32',
        'P_6A11_M':'float32',
        'P_8A14':'float32',
        'P_8A14_F':'float32',
        'P_8A14_M':'float32',
        'P_12A14':'float32',
        'P_12A14_F':'float32',
        'P_12A14_M':'float32',
        'P_15A17':'float32',
        'P_15A17_F':'float32',
        'P_15A17_M':'float32',
        'P_18A24':'float32',
        'P_18A24_F':'float32',
        'P_18A24_M':'float32',
        'P_15A49_F':'float32',
        'P_60YMAS':'float32',
        'P_60YMAS_F':'float32',
        'P_60YMAS_M':'float32',
        'REL_H_M':'float32',
        'POB0_14':'float32',
        'POB15_64':'float32',
        'POB65_MAS':'float32',
        'PROM_HNV':'float32',
        'PNACENT':'float32',
        'PNACENT_F':'float32',
        'PNACENT_M':'float32',
        'PNACOE':'float32',
        'PNACOE_F':'float32',
        'PNACOE_M':'float32',
        'PRES2015':'float32',
        'PRES2015_F':'float32',
        'PRES2015_M':'float32',
        'PRESOE15':'float32',
        'PRESOE15_F':'float32',
        'PRESOE15_M':'float32',
        'P3YM_HLI':'float32',
        'P3YM_HLI_F':'float32',
        'P3YM_HLI_M':'float32',
        'P3HLINHE':'float32',
        'P3HLINHE_F':'float32',
        'P3HLINHE_M':'float32',
        'P3HLI_HE':'float32',
        'P3HLI_HE_F':'float32',
        'P3HLI_HE_M':'float32',
        'P5_HLI':'float32',
        'P5_HLI_NHE':'float32',
        'P5_HLI_HE':'float32',
        'PHOG_IND':'float32',
        'POB_AFRO':'float32',
        'POB_AFRO_F':'float32',
        'POB_AFRO_M':'float32',
        'PCON_DISC':'float32',
        'PCDISC_MOT':'float32',
        'PCDISC_VIS':'float32',
        'PCDISC_LENG':'float32',
        'PCDISC_AUD':'float32',
        'PCDISC_MOT2':'float32',
        'PCDISC_MEN':'float32',
        'PCON_LIMI':'float32',
        'PCLIM_CSB':'float32',
        'PCLIM_VIS':'float32',
        'PCLIM_HACO':'float32',
        'PCLIM_OAUD':'float32',
        'PCLIM_MOT2':'float32',
        'PCLIM_RE_CO':'float32',
        'PCLIM_PMEN':'float32',
        'PSIND_LIM':'float32',
        'P3A5_NOA':'float32',
        'P3A5_NOA_F':'float32',
        'P3A5_NOA_M':'float32',
        'P6A11_NOA':'float32',
        'P6A11_NOAF':'float32',
        'P6A11_NOAM':'float32',
        'P12A14NOA':'float32',
        'P12A14NOAF':'float32',
        'P12A14NOAM':'float32',
        'P15A17A':'float32',
        'P15A17A_F':'float32',
        'P15A17A_M':'float32',
        'P18A24A':'float32',
        'P18A24A_F':'float32',
        'P18A24A_M':'float32',
        'P8A14AN':'float32',
        'P8A14AN_F':'float32',
        'P8A14AN_M':'float32',
        'P15YM_AN':'float32',
        'P15YM_AN_F':'float32',
        'P15YM_AN_M':'float32',
        'P15YM_SE':'float32',
        'P15YM_SE_F':'float32',
        'P15YM_SE_M':'float32',
        'P15PRI_IN':'float32',
        'P15PRI_INF':'float32',
        'P15PRI_INM':'float32',
        'P15PRI_CO':'float32',
        'P15PRI_COF':'float32',
        'P15PRI_COM':'float32',
        'P15SEC_IN':'float32',
        'P15SEC_INF':'float32',
        'P15SEC_INM':'float32',
        'P15SEC_CO':'float32',
        'P15SEC_COF':'float32',
        'P15SEC_COM':'float32',
        'P18YM_PB':'float32',
        'P18YM_PB_F':'float32',
        'P18YM_PB_M':'float32',
        'GRAPROES':'float32',
        'GRAPROES_F':'float32',
        'GRAPROES_M':'float32',
        'PEA':'float32',
        'PEA_F':'float32',
        'PEA_M':'float32',
        'PE_INAC':'float32',
        'PE_INAC_F':'float32',
        'PE_INAC_M':'float32',
        'POCUPADA':'float32',
        'POCUPADA_F':'float32',
        'POCUPADA_M':'float32',
        'PDESOCUP':'float32',
        'PDESOCUP_F':'float32',
        'PDESOCUP_M':'float32',
        'PSINDER':'float32',
        'PDER_SS':'float32',
        'PDER_IMSS':'float32',
        'PDER_ISTE':'float32',
        'PDER_ISTEE':'float32',
        'PAFIL_PDOM':'float32',
        'PDER_SEGP':'float32',
        'PDER_IMSSB':'float32',
        'PAFIL_IPRIV':'float32',
        'PAFIL_OTRAI':'float32',
        'P12YM_SOLT':'float32',
        'P12YM_CASA':'float32',
        'P12YM_SEPA':'float32',
        'PCATOLICA':'float32',
        'PRO_CRIEVA':'float32',
        'POTRAS_REL':'float32',
        'PSIN_RELIG':'float32',
        'TOTHOG':'float32',
        'HOGJEF_F':'float32',
        'HOGJEF_M':'float32',
        'POBHOG':'float32',
        'PHOGJEF_F':'float32',
        'PHOGJEF_M':'float32',
        'VIVTOT':'float32',
        'TVIVHAB':'float32',
        'TVIVPAR':'float32',
        'VIVPAR_HAB':'float32',
        'TVIVPARHAB':'float32',
        'VIVPAR_DES':'float32',
        'VIVPAR_UT':'float32',
        'OCUPVIVPAR':'float32',
        'PROM_OCUP':'float32',
        'PRO_OCUP_C':'float32',
        'VPH_PISODT':'float32',
        'VPH_PISOTI':'float32',
        'VPH_1DOR':'float32',
        'VPH_2YMASD':'float32',
        'VPH_1CUART':'float32',
        'VPH_2CUART':'float32',
        'VPH_3YMASC':'float32',
        'VPH_C_ELEC':'float32',
        'VPH_S_ELEC':'float32',
        'VPH_AGUADV':'float32',
        'VPH_AEASP':'float32',
        'VPH_AGUAFV':'float32',
        'VPH_TINACO':'float32',
        'VPH_CISTER':'float32',
        'VPH_EXCSA':'float32',
        'VPH_LETR':'float32',
        'VPH_DRENAJ':'float32',
        'VPH_NODREN':'float32',
        'VPH_C_SERV':'float32',
        'VPH_NDEAED':'float32',
        'VPH_DSADMA':'float32',
        'VPH_NDACMM':'float32',
        'VPH_SNBIEN':'float32',
        'VPH_REFRI':'float32',
        'VPH_LAVAD':'float32',
        'VPH_HMICRO':'float32',
        'VPH_AUTOM':'float32',
        'VPH_MOTO':'float32',
        'VPH_BICI':'float32',
        'VPH_RADIO':'float32',
        'VPH_TV':'float32',
        'VPH_PC':'float32',
        'VPH_TELEF':'float32',
        'VPH_CEL':'float32',
        'VPH_INTER':'float32',
        'VPH_STVP':'float32',
        'VPH_SPMVPI':'float32',
        'VPH_CVJ':'float32',
        'VPH_SINRTV':'float32',
        'VPH_SINLTC':'float32',
        'VPH_SINCINT':'float32',
        'VPH_SINTIC':'float32'
        })
    return dataframe

In [4]:
def save_shapefiles(geo_df, filename):
    # save data to shapefiles
    folder = './data/' + filename + '_shp'
    filepath = pathlib.Path(folder) / str(filename+'.shp')
    # if save folder does not already exist, create it
    filepath.parent.mkdir(parents=True, exist_ok=True)
    geo_df.to_file(filepath)

# Prepare data

Read data from the Marco Geoestadístico 2020 (see: https://www.inegi.org.mx/app/biblioteca/ficha.html?upc=889463807469)

In order to perform some analysis involving how data metrics change with scale, we import the localidades (i.e., city) from the Marco Geoestadístico to create a graph for each different city.

Geometries we use:

<ul>
    <li>Polígono de localidades urbanas y rurales amanzanadas. Filename: 00<strong>l</strong></li>
</ul>

Note: **00** refers to the numeric key of the federal country.

In [24]:
mexico_MG = gpd.read_file('./input_data/marco_geoestadistico_2020/MG_2020_Integrado/conjunto_de_datos/00l.shp')
mexico_MG.head()

Unnamed: 0,CVEGEO,CVE_ENT,CVE_MUN,CVE_LOC,NOMGEO,AMBITO,geometry
0,10060024,1,6,24,Ojo Zarco [Colonia],Rural,"POLYGON ((2474887.290 1121053.465, 2474847.050..."
1,10060045,1,6,45,Santiago,Rural,"POLYGON ((2465518.761 1122024.712, 2465527.967..."
2,10060233,1,6,233,San Carlos [Fraccionamiento Campestre],Rural,"POLYGON ((2469185.141 1114762.839, 2469185.137..."
3,10060258,1,6,258,El Canal,Rural,"POLYGON ((2464753.818 1124614.112, 2464768.684..."
4,10060287,1,6,287,Las Flores,Rural,"POLYGON ((2469959.082 1126193.767, 2469959.858..."


Read census data from INEGI's Censo de Población y Vivienda 2020 > Principales resultados por localidad (ITER) > Estados Unidos Mexicanos (see: https://www.inegi.org.mx/programas/ccpv/2020/#Datos_abiertos)

Direct download: https://www.inegi.org.mx/contenidos/programas/ccpv/2020/datosabiertos/iter/iter_00_cpv2020_csv.zip

In [25]:
mexico_censo = pd.read_csv('./input_data/censo_mexico_2020_csv/iter_00_cpv2020/conjunto_de_datos/conjunto_de_datos_iter_00CSV20.csv', encoding='utf-8', low_memory=False)

In [26]:
mexico_censo.columns

Index(['ENTIDAD', 'NOM_ENT', 'MUN', 'NOM_MUN', 'LOC', 'NOM_LOC', 'LONGITUD',
       'LATITUD', 'ALTITUD', 'POBTOT',
       ...
       'VPH_CEL', 'VPH_INTER', 'VPH_STVP', 'VPH_SPMVPI', 'VPH_CVJ',
       'VPH_SINRTV', 'VPH_SINLTC', 'VPH_SINCINT', 'VPH_SINTIC', 'TAMLOC'],
      dtype='object', length=232)

In [27]:
#Replace with a 0 the data values that do not have a record
mexico_censo = mexico_censo.replace('*', '0')
mexico_censo = mexico_censo.replace('N/D', '0')

In [28]:
# change data types from string to float32
mexico_censo = change_census_data_type(mexico_censo)

In [29]:
# get data of cdmx as this is a special case (review later)
cdmx = mexico_censo[mexico_censo['ENTIDAD'] == 9]

In [30]:
# drop the different totals there are in the dataframe, which may affect the results when sorting by population
mexico_censo.drop(mexico_censo[(mexico_censo['LOC'] == 0) | (mexico_censo['LOC'] == 9998) | (mexico_censo['LOC'] == 9999)].index, inplace=True)
mexico_censo.reset_index(inplace=True, drop=True)

In [31]:
# sort by population to find the most populated cities
mexico_censo_sorted_by_population = mexico_censo.sort_values(by=['POBTOT'], ascending=False)

In [32]:
# get the first 100 most populated cities
mexico_censo_sorted_by_population_top100 = mexico_censo_sorted_by_population.head(100)

In [33]:
mexico_censo_sorted_by_population_top100.reset_index(inplace=True, drop=True)

In order to merge the data from the Marco Geoestadístico and the Census 2020, we need to create a new column where we took values from ENTIDAD, MUN, LOC, and AGEB to conform the CVEGEO (Clave Geoestadistica Concatenada) in the Census dataframe

The CVEGEO has a length of 9 characters

Check the lengths of the elements that conform the CVEGEO:

In [34]:
# Check the length of the clave geoestadistica that do not count the manzanas
print(f"Clave Geoestadistica: {len(mexico_MG['CVEGEO'].iloc[0])}")

# Entidad
print(f"Entidad: {len(mexico_MG['CVE_ENT'].iloc[0])}")

# Municipio
print(f"Municipio: {len(mexico_MG['CVE_MUN'].iloc[0])}")

# Localidad
print(f"Localidad: {len(mexico_MG['CVE_LOC'].iloc[0])}")

Clave Geoestadistica: 9
Entidad: 2
Municipio: 3
Localidad: 4


In [35]:
# We need to add missing zeros to the values on the elements from the CENSO 2020 dataframe that conform the CVEGEO.
mexico_censo_sorted_by_population_top100['ENTIDAD'] = mexico_censo_sorted_by_population_top100['ENTIDAD'].apply(lambda x: '{0:0>2}'.format(x))
mexico_censo_sorted_by_population_top100['MUN'] = mexico_censo_sorted_by_population_top100['MUN'].apply(lambda x: '{0:0>3}'.format(x))
mexico_censo_sorted_by_population_top100['LOC'] = mexico_censo_sorted_by_population_top100['LOC'].apply(lambda x: '{0:0>4}'.format(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mexico_censo_sorted_by_population_top100['ENTIDAD'] = mexico_censo_sorted_by_population_top100['ENTIDAD'].apply(lambda x: '{0:0>2}'.format(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mexico_censo_sorted_by_population_top100['MUN'] = mexico_censo_sorted_by_population_top100['MUN'].apply(lambda x: '{0:0>3}'.format(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pand

In [36]:
# convert column values into strings
mexico_censo_sorted_by_population_top100 = mexico_censo_sorted_by_population_top100.astype({'ENTIDAD':str, 'MUN':str, 'LOC':str})

In [37]:
# Concatenate columns to create the CVEGEO columns
mexico_censo_sorted_by_population_top100['CVEGEO'] = mexico_censo_sorted_by_population_top100['ENTIDAD'] + mexico_censo_sorted_by_population_top100['MUN'] + mexico_censo_sorted_by_population_top100['LOC']
mexico_censo_sorted_by_population_top100.reset_index(inplace=True, drop=True)

Now we need to merge both dataframes:

In [38]:
#drop columns that are duplicated in the other dataframe
mexico_censo_sorted_by_population_top100.drop(columns=['ENTIDAD', 'MUN', 'LOC'], inplace=True)
mexico_MG.drop(columns=['NOMGEO'], inplace=True)

mexico_pop_top100 = mexico_MG.merge(mexico_censo_sorted_by_population_top100, on=['CVEGEO'], how='inner')

In [39]:
mexico_pop_top100 = mexico_pop_top100.sort_values(by=['POBTOT'], ascending=False)
mexico_pop_top100.reset_index(inplace=True, drop=True)

In [41]:
#view info of the Coordinate Reference System (CRS)
mexico_pop_top100.crs

<Projected CRS: PROJCS["MEXICO_ITRF_2008_LCC",GEOGCS["MEXICO_ITRF_ ...>
Name: MEXICO_ITRF_2008_LCC
Axis Info [cartesian]:
- [east]: Easting (metre)
- [north]: Northing (metre)
Area of Use:
- undefined
Coordinate Operation:
- name: unnamed
- method: Lambert Conic Conformal (2SP)
Datum: International Terrestrial Reference Frame 2008
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [42]:
# Re-project to CRS pseudo mercator for units in meters
mexico_pop_top100 = mexico_pop_top100.to_crs(epsg=3857)
mexico_pop_top100.crs

<Projected CRS: EPSG:3857>
Name: WGS 84 / Pseudo-Mercator
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: World between 85.06°S and 85.06°N.
- bounds: (-180.0, -85.06, 180.0, 85.06)
Coordinate Operation:
- name: Popular Visualisation Pseudo-Mercator
- method: Popular Visualisation Pseudo Mercator
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [44]:
# save geodataframes to shapefiles
save_shapefiles(mexico_pop_top100, 'mexico_cities_top100_census_data')

  geo_df.to_file(filepath)


# Get every city graph

In [5]:
gdf = gpd.read_file('./data/mexico_cities_top100_census_data_shp/mexico_cities_top100_census_data.shp')

In [6]:
# get area of each city, in meters (as it is already in a CRS with meter unit)
def get_area(geometry):
    return geometry.area

gdf['area_m'] = gdf['geometry'].map(get_area)

In [7]:
# project it from CRS 3857 to 4326 for OSM
gdf = gdf.to_crs(epsg=4326)
gdf.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [8]:
# where to save networks
gdf['ENT_folder'] = gdf.apply(lambda row: '{}_{}'.format(row['CVE_ENT'], row['NOM_ENT']).replace(' ', '-'), axis=1)
gdf['MUN_folder'] = gdf.apply(lambda row: '{}_{}'.format(row['CVE_MUN'], row['NOM_MUN']).replace(' ', '-'), axis=1)
gdf['LOC_folder'] = gdf.apply(lambda row: '{}/{}/{}_{}'.format(row['ENT_folder'],
                                                                     row['MUN_folder'],
                                                                     row['CVE_LOC'], row['NOM_LOC']).replace(' ', '-'), axis=1)

In [9]:
# create list of queries
queries = gdf.apply(lambda row: {'LOC_folder':row['LOC_folder'], 
                                 'geometry':row['geometry'],
                                 'area_m':row['area_m'],
                                 'POBTOT':row['POBTOT']}, axis=1).tolist()
queries[0]

{'LOC_folder': '09_Ciudad-de-México/007_Iztapalapa/0001_Iztapalapa',
 'geometry': <shapely.geometry.polygon.Polygon at 0x7fc5bc57ec40>,
 'area_m': 127684813.66466376,
 'POBTOT': 1835486.0}

In [10]:
output_folder = './data/mexico_cities_graphs/' #where to save graph shapefiles and graphml

In [11]:
network_type = 'drive'
retain_all = True
buffer = False

In [12]:
%%time
start_time = time.time()
for query in queries:
    try:
        # load graph and save it if it hasn't already been saved in the output_path
        if not os.path.exists('{}/{}.graphml'.format(output_folder, query['LOC_folder'])):
            geometry = query['geometry'].buffer(0) #fix trivially invalid geometries (nested shells, ring self-intersections)
            G = ox.graph_from_polygon(polygon=geometry, network_type=network_type, retain_all=retain_all)
            G.graph['name'] = query['LOC_folder']
            G.graph['area_m'] = query['area_m']
            G.graph['POBTOT'] = query['POBTOT']
            ox.save_graph_shapefile(G, filepath=output_folder+query['LOC_folder'], directed=True)
            ox.save_graphml(G, filepath=output_folder+'{}.graphml'.format(query['LOC_folder']))
    except Exception as e:
        print('"{}" failed: {}'.format(query['LOC_folder'], e))
print('Finished making graphs in {:,.2f} seconds'.format(time.time()-start_time))

  gdf_nodes.to_file(filepath_nodes, driver="ESRI Shapefile", index=True, encoding=encoding)


Finished making graphs in 3,725.72 seconds
CPU times: user 43min 13s, sys: 51.6 s, total: 44min 4s
Wall time: 1h 2min 5s


# Calculate stats

Make a dataframe with the filepath to every city graphml

In [18]:
data_folder = './data/mexico_cities_graphs'

In [19]:
places = []
for entidad_folder in os.listdir(data_folder):
    for municipio_folder in os.listdir('{}/{}'.format(data_folder, entidad_folder)):
        for localidad_file in os.listdir('{}/{}/{}'.format(data_folder, entidad_folder, municipio_folder)):
                if '.graphml' in localidad_file:
                    data = {}
                    data['CVE_ENT'] = entidad_folder.split('_')[0]
                    data['NOM_ENT'] = entidad_folder.replace('{}_'.format(data['CVE_ENT']), '').replace('-', ' ')
                    data['CVE_MUN'] = municipio_folder.split('_')[0]
                    data['NOM_MUN'] = municipio_folder.replace('{}_'.format(data['CVE_MUN']), '').replace('-', ' ')
                    data['CVE_LOC'] = localidad_file.split('_')[0]
                    data['NOM_LOC'] = localidad_file.replace('{}_'.format(data['CVE_LOC']), '').replace('.graphml', '').replace('-', ' ')
                    data['CVEGEO'] = data['CVE_ENT']+data['CVE_MUN']+data['CVE_LOC']
                    data['path'] = '{}/{}/{}'.format(data_folder, entidad_folder, municipio_folder)
                    data['file'] = localidad_file
                    places.append(data)

df = pd.DataFrame(places)
df

Unnamed: 0,CVE_ENT,NOM_ENT,CVE_MUN,NOM_MUN,CVE_LOC,NOM_LOC,CVEGEO,path,file
0,26,Sonora,018,Cajeme,0001,Ciudad Obregón,260180001,./data/mexico_cities_graphs/26_Sonora/018_Cajeme,0001_Ciudad-Obregón.graphml
1,26,Sonora,030,Hermosillo,0001,Hermosillo,260300001,./data/mexico_cities_graphs/26_Sonora/030_Herm...,0001_Hermosillo.graphml
2,26,Sonora,055,San Luis Río Colorado,0001,San Luis Río Colorado,260550001,./data/mexico_cities_graphs/26_Sonora/055_San-...,0001_San-Luis-Río-Colorado.graphml
3,26,Sonora,043,Nogales,0001,Heroica Nogales,260430001,./data/mexico_cities_graphs/26_Sonora/043_Nogales,0001_Heroica-Nogales.graphml
4,22,Querétaro,016,San Juan del Río,0001,San Juan del Río,220160001,./data/mexico_cities_graphs/22_Querétaro/016_S...,0001_San-Juan-del-Río.graphml
...,...,...,...,...,...,...,...,...,...
95,15,México,104,Tlalnepantla de Baz,0001,Tlalnepantla,151040001,./data/mexico_cities_graphs/15_México/104_Tlal...,0001_Tlalnepantla.graphml
96,15,México,106,Toluca,0001,Toluca de Lerdo,151060001,./data/mexico_cities_graphs/15_México/106_Toluca,0001_Toluca-de-Lerdo.graphml
97,15,México,060,Nicolás Romero,0001,Ciudad Nicolás Romero,150600001,./data/mexico_cities_graphs/15_México/060_Nico...,0001_Ciudad-Nicolás-Romero.graphml
98,21,Puebla,114,Puebla,0001,Heroica Puebla de Zaragoza,211140001,./data/mexico_cities_graphs/21_Puebla/114_Puebla,0001_Heroica-Puebla-de-Zaragoza.graphml


Add population for calculating stats in descendent order:

In [21]:
pobtot = gdf[['CVEGEO', 'POBTOT']]
df = df.merge(pobtot, on=['CVEGEO'], how='inner')
df = df.sort_values(by=['POBTOT'], ascending=False)
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,CVE_ENT,NOM_ENT,CVE_MUN,NOM_MUN,CVE_LOC,NOM_LOC,CVEGEO,path,file,POBTOT
0,09,Ciudad de México,007,Iztapalapa,0001,Iztapalapa,090070001,./data/mexico_cities_graphs/09_Ciudad-de-Méxic...,0001_Iztapalapa.graphml,1835486.0
1,02,Baja California,004,Tijuana,0001,Tijuana,020040001,./data/mexico_cities_graphs/02_Baja-California...,0001_Tijuana.graphml,1810645.0
2,15,México,033,Ecatepec de Morelos,0001,Ecatepec de Morelos,150330001,./data/mexico_cities_graphs/15_México/033_Ecat...,0001_Ecatepec-de-Morelos.graphml,1643623.0
3,11,Guanajuato,020,León,0001,León de los Aldama,110200001,./data/mexico_cities_graphs/11_Guanajuato/020_...,0001_León-de-los-Aldama.graphml,1579803.0
4,21,Puebla,114,Puebla,0001,Heroica Puebla de Zaragoza,211140001,./data/mexico_cities_graphs/21_Puebla/114_Puebla,0001_Heroica-Puebla-de-Zaragoza.graphml,1542232.0
...,...,...,...,...,...,...,...,...,...,...
95,07,Chiapas,078,San Cristóbal de las Casas,0001,San Cristóbal de las Casas,070780001,./data/mexico_cities_graphs/07_Chiapas/078_San...,0001_San-Cristóbal-de-las-Casas.graphml,183509.0
96,30,Veracruz de Ignacio de la Llave,131,Poza Rica de Hidalgo,0001,Poza Rica de Hidalgo,301310001,./data/mexico_cities_graphs/30_Veracruz-de-Ign...,0001_Poza-Rica-de-Hidalgo.graphml,180057.0
97,22,Querétaro,016,San Juan del Río,0001,San Juan del Río,220160001,./data/mexico_cities_graphs/22_Querétaro/016_S...,0001_San-Juan-del-Río.graphml,177719.0
98,26,Sonora,055,San Luis Río Colorado,0001,San Luis Río Colorado,260550001,./data/mexico_cities_graphs/26_Sonora/055_San-...,0001_San-Luis-Río-Colorado.graphml,176685.0


Load each graph and calculate stats:

In [92]:
def load_graph_get_stats(row):
    
    try:
        start_time = time.time()
        print('START {}/{}'.format(row['path'], row['file']))
        G = ox.load_graphml(filepath=row['path']+'/'+row['file'])
        G = ox.project_graph(G, to_crs=3857) # project graph to CRS pseudo-UTM with meter units
        area_m = float(G.graph['area_m'])
        
        # calculate basic stats
        stats = ox.basic_stats(G, area=area_m, clean_intersects=True, circuity_dist='euclidean')
        
        stats['CVEGEO'] = row['CVEGEO']
        stats['CVE_ENT'] = row['CVE_ENT']
        stats['NOM_ENT'] = row['NOM_ENT']
        stats['CVE_MUN'] = row['CVE_MUN']
        stats['NOM_MUN'] = row['NOM_MUN']
        stats['CVE_LOC'] = row['CVE_LOC']
        stats['NOM_LOC'] = row['NOM_LOC']
        
        # calculate extended stats excluding shortest paths length, eccentricity, diameter, radius, center
        
        # Convert MultiDiGraph to DiGraph.
        # Chooses between parallel edges by minimizing weight attribute value.
        D = ox.utils_graph.get_digraph(G, weight="length")
        # create undirected Graph from the DiGraph, for those metrics that need it
        Gu = nx.Graph(D)
        
         # average degree of the neighborhood of each node, and average for graph
        avg_neighbor_degree = nx.average_neighbor_degree(G)
        stats["avg_neighbor_degree"] = avg_neighbor_degree
        stats["avg_neighbor_degree_avg"] = sum(avg_neighbor_degree.values()) / len(avg_neighbor_degree)
        
        # avg weighted degree of neighborhood of each node, and average for graph
        avg_wtd_nbr_deg = nx.average_neighbor_degree(G, weight="length")
        stats["avg_weighted_neighbor_degree"] = avg_wtd_nbr_deg
        stats["avg_weighted_neighbor_degree_avg"] = sum(avg_wtd_nbr_deg.values()) / len(avg_wtd_nbr_deg)
        
        # degree centrality for a node is the fraction of nodes it is connected to
        degree_centrality = nx.degree_centrality(G)
        stats["degree_centrality"] = degree_centrality
        stats["degree_centrality_avg"] = sum(degree_centrality.values()) / len(degree_centrality)
        
        # calculate clustering coefficient for the nodes
        stats["clustering_coefficient"] = nx.clustering(Gu)
        
        # average clustering coefficient for the graph
        stats["clustering_coefficient_avg"] = nx.average_clustering(Gu)
        
        # calculate weighted clustering coefficient for the nodes
        stats["clustering_coefficient_weighted"] = nx.clustering(Gu, weight="length")
        
        # average clustering coefficient (weighted) for the graph
        stats["clustering_coefficient_weighted_avg"] = nx.average_clustering(Gu, weight="length")
        
        # pagerank: a ranking of the nodes in the graph based on the structure of
        # the incoming links
        pagerank = nx.pagerank(D, weight="length")
        stats["pagerank"] = pagerank
        
        # node with the highest page rank, and its value
        pagerank_max_node = max(pagerank, key=lambda x: pagerank[x])
        stats["pagerank_max_node"] = pagerank_max_node
        stats["pagerank_max"] = pagerank[pagerank_max_node]
        
        # node with the lowest page rank, and its value
        pagerank_min_node = min(pagerank, key=lambda x: pagerank[x])
        stats["pagerank_min_node"] = pagerank_min_node
        stats["pagerank_min"] = pagerank[pagerank_min_node]
        
        # betweenness centrality of a node is the sum of the fraction of
        # all-pairs shortest paths that pass through node. nx2.4+
        # implementation cannot run on Multi(Di)Graphs, so use DiGraph
        btwn_cent = nx.betweenness_centrality(D, weight="length")
        stats["betweenness_centrality"] = btwn_cent
        stats["betweenness_centrality_avg"] = sum(btwn_cent.values()) / len(btwn_cent)
        
        # closeness centrality of a node is the reciprocal of the sum of the
        # shortest path distances from u to all other nodes
        close_cent = nx.closeness_centrality(G, distance="length")
        stats["closeness_centrality"] = close_cent
        stats["closeness_centrality_avg"] = sum(close_cent.values()) / len(close_cent)
        
        # extract nodes of the graph
        node_list = [node for node in close_cent]
        stats['nodes'] = node_list
                
        stats['area_km'] = area_m / 1e6
        stats['area'] = area_m
        stats['time'] = time.time()-start_time
        print('FINISH {}/{} in {} seconds'.format(row['path'], row['file'], stats['time']))
        return pd.Series(stats)

    except Exception as e:
        print('{}/{} failed: {}'.format(row['path'], row['file'], e))
        return pd.Series()

Calculate stats of the first half

In [95]:
# first half
first_half = df.head(50)

In [None]:
%%time
stats_first = first_half.apply(load_graph_get_stats, axis=1)

In [None]:
# save stats in a csv for further usage to not recalculate stats again
stats_first.to_csv('./data/stats_mexico_cities_first_half.csv', encoding='utf-8', index=False)

Calculate stats of the second half

In [95]:
# second half
second_half = df.tail(50)

In [None]:
%%time
stats_second = second_half.apply(load_graph_get_stats, axis=1)

In [None]:
# save stats in a csv for further usage to not recalculate stats again
stats_second.to_csv('./data/stats_mexico_cities_second_half.csv', encoding='utf-8', index=False)