In [1]:
import importlib
from pathlib import Path
import sys

from arcgis.features import GeoAccessor, GeoSeriesAccessor
import pandas as pd

In [2]:
project_parent = Path('./').absolute().parent

# import the project package from the project package path
# ideally will be imported using 'from arcgis import da'
sys.path.append(str(project_parent/'src'))
import dm

# load the "autoreload" extension so as you change code in src, it gets reloaded
%load_ext autoreload
%autoreload 2

  class GeoAccessorIO(GeoAccessor):


# Introspectively Examine and Get Geographies

In [3]:
# discover what countries are available, and get dataframe of countries
cntry_df = dm.util.get_countries()

# display the countries dataframe as the cell output
cntry_df

Unnamed: 0,geographic_level,country,year
0,USA_ESRI_2019,USA,2019


In [4]:
# specify a country using the identifier from the country field of the countries dataframe,
# here requesting the 'local' source
usa = dm.Country('USA', source='local')

# sanity check - confirm a Country instance was created
isinstance(usa, dm.Country)

True

In [5]:
# get the geographic resolutions available for the country as a dataframe, ordered from smallest to largest
geos = usa.geographies

# display the available geographies
geos

Unnamed: 0,geo_name,geo_alias,col_id,col_name,feature_class_path
0,block_groups,Block Groups,ID,NAME,D:\arcgis\ba_data\Data\Demographic Data\USA_ES...
1,census_tracts,Census Tracts,ID,NAME,D:\arcgis\ba_data\Data\Demographic Data\USA_ES...
2,cities_and_towns_places,Cities and Towns (Places),ID,NAME,D:\arcgis\ba_data\Data\Demographic Data\USA_ES...
3,zip_codes,ZIP Codes,ID,NAME,D:\arcgis\ba_data\Data\Demographic Data\USA_ES...
4,county_subdivisions,County Subdivisions,ID,NAME,D:\arcgis\ba_data\Data\Demographic Data\USA_ES...
5,counties,Counties,ID,NAME,D:\arcgis\ba_data\Data\Demographic Data\USA_ES...
6,cbsas,CBSAs,ID,NAME,D:\arcgis\ba_data\Data\Demographic Data\USA_ES...
7,congressional_districts,Congressional Districts,ID,NAME,D:\arcgis\ba_data\Data\Demographic Data\USA_ES...
8,dmas,DMAs,ID,NAME,D:\arcgis\ba_data\Data\Demographic Data\USA_ES...
9,states,States,ID,NAME,D:\arcgis\ba_data\Data\Demographic Data\USA_ES...


In [6]:
# look up the CBSA geography matching 'seattle' - the result displays as a table with ID, NAME and SHAPE columns
cbsa_df = usa.cbsas.get('seattle')

cbsa_df

Unnamed: 0,ID,NAME,SHAPE
0,42660,"Seattle-Tacoma-Bellevue, WA Metropolitan Stati...","{""rings"": [[[-13651055.7226, 5968866.240900002..."


In [7]:
# check the location of the geometry on a simple map to make sure it looks correct
webmap = cbsa_df.spatial.plot()
webmap.basemap = 'gray-vector'

# display the map widget as the cell output
webmap

MapView(layout=Layout(height='400px', width='100%'))

In [8]:
# get the geographies falling within an area by the geographic name - here the block groups for the 'seattle' CBSA
bg_df = usa.cbsas.get('seattle').block_groups.get()

# summarize the row count and column types of the result
bg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2474 entries, 0 to 2473
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   ID      2474 non-null   object  
 1   NAME    2474 non-null   object  
 2   SHAPE   2474 non-null   geometry
dtypes: geometry(1), object(2)
memory usage: 58.1+ KB


In [9]:
# get the geographies falling within an area by the index - makes it easier to get the lowest possible geographic resolution
# NOTE(review): level(0) presumably refers to index 0 of usa.geographies (block groups) - the result has the
# same row count as the block_groups query; confirm against the dm package
lvl_df = usa.cbsas.get('seattle').level(0).get()

# summarize the row count and column types of the result
lvl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2474 entries, 0 to 2473
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   ID      2474 non-null   object  
 1   NAME    2474 non-null   object  
 2   SHAPE   2474 non-null   geometry
dtypes: geometry(1), object(2)
memory usage: 58.1+ KB


In [10]:
# check what the first few rows of the results look like as a table
lvl_df.head()

Unnamed: 0,ID,NAME,SHAPE
0,530530714071,530530714.071,"{""rings"": [[[-13618997.0451, 5953796.150899999..."
1,530530714072,530530714.072,"{""rings"": [[[-13621890.9066, 5953114.284599997..."
2,530530714073,530530714.073,"{""rings"": [[[-13622599.9004, 5953136.347999997..."
3,530530714112,530530714.112,"{""rings"": [[[-13627506.4183, 5953782.585000001..."
4,530530729061,530530729.061,"{""rings"": [[[-13654973.1668, 5957970.395400003..."


In [11]:
# check what the results look like as a simple map
webmap02 = lvl_df.spatial.plot()
webmap02.basemap = 'gray-vector'

# display the map widget as the cell output
webmap02

MapView(layout=Layout(height='400px', width='100%'))

In [12]:
# also, many times the area of interest is not a standard geography - a district or possibly a sales territory
# in this case, we can use the within method and pass in either a Spatially Enabled DataFrame, Geometry list, or single geometry to get the smaller geographies for analysis
within_df = usa.level(0).within(cbsa_df)

# summarize the row count and column types of the result
within_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2474 entries, 0 to 2473
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   ID      2474 non-null   object  
 1   NAME    2474 non-null   object  
 2   SHAPE   2474 non-null   geometry
dtypes: geometry(1), object(2)
memory usage: 58.1+ KB


# Enrich

In [13]:
# accessing the enrich variables makes it easy to introspectively see what variables are available
# incidentally, I would also like to simply be able to reference current year and future year in this table to make scripts more future-proof
usa.enrich_variables.head()

Unnamed: 0,name,alias,type,vintage,data_collection,enrich_str,enrich_field_name
0,AGE0_CY,2019 Population Age <1,COUNT,2019,1yearincrements,1yearincrements.AGE0_CY,F1yearincrements_AGE0_CY
1,AGE1_CY,2019 Population Age 1,COUNT,2019,1yearincrements,1yearincrements.AGE1_CY,F1yearincrements_AGE1_CY
2,AGE2_CY,2019 Population Age 2,COUNT,2019,1yearincrements,1yearincrements.AGE2_CY,F1yearincrements_AGE2_CY
3,AGE3_CY,2019 Population Age 3,COUNT,2019,1yearincrements,1yearincrements.AGE3_CY,F1yearincrements_AGE3_CY
4,AGE4_CY,2019 Population Age 4,COUNT,2019,1yearincrements,1yearincrements.AGE4_CY,F1yearincrements_AGE4_CY


In [14]:
# Key US Facts is a common starting point when exploring a solution - listing the variables it
# includes is just a standard boolean-mask filter on the enrich variables dataframe
is_key_us_facts = usa.enrich_variables.data_collection == 'KeyUSFacts'
is_2019_vintage = usa.enrich_variables.vintage == '2019'

usa.enrich_variables[is_key_us_facts & is_2019_vintage]

Unnamed: 0,name,alias,type,vintage,data_collection,enrich_str,enrich_field_name
6450,TOTPOP_CY,2019 Total Population,COUNT,2019,KeyUSFacts,KeyUSFacts.TOTPOP_CY,KeyUSFacts_TOTPOP_CY
6452,GQPOP_CY,2019 Population in Group Quarters,COUNT,2019,KeyUSFacts,KeyUSFacts.GQPOP_CY,KeyUSFacts_GQPOP_CY
6453,DIVINDX_CY,2019 Diversity Index,COUNT,2019,KeyUSFacts,KeyUSFacts.DIVINDX_CY,KeyUSFacts_DIVINDX_CY
6456,TOTHH_CY,2019 Total Households,COUNT,2019,KeyUSFacts,KeyUSFacts.TOTHH_CY,KeyUSFacts_TOTHH_CY
6458,AVGHHSZ_CY,2019 Average Household Size,COUNT,2019,KeyUSFacts,KeyUSFacts.AVGHHSZ_CY,KeyUSFacts_AVGHHSZ_CY
6459,MEDHINC_CY,2019 Median Household Income,CURRENCY,2019,KeyUSFacts,KeyUSFacts.MEDHINC_CY,KeyUSFacts_MEDHINC_CY
6461,AVGHINC_CY,2019 Average Household Income,CURRENCY,2019,KeyUSFacts,KeyUSFacts.AVGHINC_CY,KeyUSFacts_AVGHINC_CY
6463,PCI_CY,2019 Per Capita Income,CURRENCY,2019,KeyUSFacts,KeyUSFacts.PCI_CY,KeyUSFacts_PCI_CY
6467,TOTHU_CY,2019 Total Housing Units,COUNT,2019,KeyUSFacts,KeyUSFacts.TOTHU_CY,KeyUSFacts_TOTHU_CY
6469,OWNER_CY,2019 Owner Occupied HUs,COUNT,2019,KeyUSFacts,KeyUSFacts.OWNER_CY,KeyUSFacts_OWNER_CY


In [15]:
# select the enrich_str column for the 2019 Key US Facts rows - this gives an iterable (pd.Series)
# of the enrichment variables to request
enrich_var_df = usa.enrich_variables
key_facts_2019 = (enrich_var_df.data_collection == 'KeyUSFacts') & (enrich_var_df.vintage == '2019')
enrich_vars = enrich_var_df.loc[key_facts_2019, 'enrich_str']

enrich_vars

6450     KeyUSFacts.TOTPOP_CY
6452      KeyUSFacts.GQPOP_CY
6453    KeyUSFacts.DIVINDX_CY
6456      KeyUSFacts.TOTHH_CY
6458    KeyUSFacts.AVGHHSZ_CY
6459    KeyUSFacts.MEDHINC_CY
6461    KeyUSFacts.AVGHINC_CY
6463        KeyUSFacts.PCI_CY
6467      KeyUSFacts.TOTHU_CY
6469      KeyUSFacts.OWNER_CY
6471     KeyUSFacts.RENTER_CY
6473     KeyUSFacts.VACANT_CY
6475     KeyUSFacts.MEDVAL_CY
6477     KeyUSFacts.AVGVAL_CY
6479    KeyUSFacts.POPGRW10CY
6480     KeyUSFacts.HHGRW10CY
6481    KeyUSFacts.FAMGRW10CY
6487       KeyUSFacts.DPOP_CY
6488    KeyUSFacts.DPOPWRK_CY
6489    KeyUSFacts.DPOPRES_CY
Name: enrich_str, dtype: object

In [19]:
# get the geographies falling within an area by the geographic ids - dramatically speeds up enrichment because do not have to perform apportionment
# alternative starting from the block group dataframe retrieved earlier:
# bg_enrich_df = bg_df.spatial.enrich(enrich_vars)

bg_enrich_df = usa.cbsas.get('seattle').block_groups.get().spatial.enrich(enrich_vars)

# summarize the original columns plus the columns added by enrich
bg_enrich_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2474 entries, 0 to 2473
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   ID                     2474 non-null   object  
 1   NAME                   2474 non-null   object  
 2   SHAPE                  2474 non-null   geometry
 3   OBJECTID               2474 non-null   int64   
 4   HasData                2474 non-null   int64   
 5   aggregationMethod      2474 non-null   object  
 6   KeyUSFacts_TOTPOP_CY   2474 non-null   float64 
 7   KeyUSFacts_GQPOP_CY    2474 non-null   float64 
 8   KeyUSFacts_DIVINDX_CY  2474 non-null   float64 
 9   KeyUSFacts_TOTHH_CY    2474 non-null   float64 
 10  KeyUSFacts_AVGHHSZ_CY  2474 non-null   float64 
 11  KeyUSFacts_MEDHINC_CY  2474 non-null   float64 
 12  KeyUSFacts_AVGHINC_CY  2474 non-null   float64 
 13  KeyUSFacts_PCI_CY      2474 non-null   float64 
 14  KeyUSFacts_TOTHU_CY    2474 non-null   f

In [20]:
# check what the first few enriched rows look like as a table
bg_enrich_df.head()

Unnamed: 0,ID,NAME,SHAPE,OBJECTID,HasData,aggregationMethod,KeyUSFacts_TOTPOP_CY,KeyUSFacts_GQPOP_CY,KeyUSFacts_DIVINDX_CY,KeyUSFacts_TOTHH_CY,...,KeyUSFacts_RENTER_CY,KeyUSFacts_VACANT_CY,KeyUSFacts_MEDVAL_CY,KeyUSFacts_AVGVAL_CY,KeyUSFacts_POPGRW10CY,KeyUSFacts_HHGRW10CY,KeyUSFacts_FAMGRW10CY,KeyUSFacts_DPOP_CY,KeyUSFacts_DPOPWRK_CY,KeyUSFacts_DPOPRES_CY
0,530530714071,530530714.071,"{""rings"": [[[-13618997.0451, 5953796.150899999...",1,1,BlockApportionment:US.BlockGroups,1653.0,0.0,53.5,508.0,...,86.0,13.0,298837.0,402310.0,1.31,1.15,1.08,1082.0,159.0,923.0
1,530530714072,530530714.072,"{""rings"": [[[-13621890.9066, 5953114.284599997...",2,1,BlockApportionment:US.BlockGroups,1530.0,6.0,59.4,490.0,...,73.0,36.0,262662.0,265588.0,-0.14,-0.22,-0.27,901.0,50.0,851.0
2,530530714073,530530714.073,"{""rings"": [[[-13622599.9004, 5953136.347999997...",3,1,BlockApportionment:US.BlockGroups,1657.0,0.0,67.3,520.0,...,78.0,16.0,206471.0,226697.0,0.57,0.49,0.45,1085.0,364.0,721.0
3,530530714112,530530714.112,"{""rings"": [[[-13627506.4183, 5953782.585000001...",4,1,BlockApportionment:US.BlockGroups,1343.0,0.0,71.4,473.0,...,168.0,27.0,232576.0,234836.0,2.87,2.78,2.48,1294.0,498.0,796.0
4,530530729061,530530729.061,"{""rings"": [[[-13654973.1668, 5957970.395400003...",5,1,BlockApportionment:US.BlockGroups,2934.0,2913.0,57.2,7.0,...,0.0,0.0,0.0,0.0,8.82,0.0,0.0,3065.0,3062.0,3.0


In [26]:
# get the geographies falling within an area by the index and just use the geography polygon - makes it easier to get the lowest possible geographic resolution
# only cbsa_df is assigned here - lvl_df is deliberately left alone so it keeps holding the
# block group dataframe created in the earlier level(0) cell instead of being replaced by the CBSA
cbsa_df = usa.cbsas.get('seattle')
bg_lvl = cbsa_df.level(0).get()
lvl_enrich_df = bg_lvl.spatial.enrich(enrich_vars)

# check what the first few enriched rows look like as a table
lvl_enrich_df.head()

Unnamed: 0,ID,NAME,SHAPE,OBJECTID,HasData,aggregationMethod,KeyUSFacts_TOTPOP_CY,KeyUSFacts_GQPOP_CY,KeyUSFacts_DIVINDX_CY,KeyUSFacts_TOTHH_CY,...,KeyUSFacts_RENTER_CY,KeyUSFacts_VACANT_CY,KeyUSFacts_MEDVAL_CY,KeyUSFacts_AVGVAL_CY,KeyUSFacts_POPGRW10CY,KeyUSFacts_HHGRW10CY,KeyUSFacts_FAMGRW10CY,KeyUSFacts_DPOP_CY,KeyUSFacts_DPOPWRK_CY,KeyUSFacts_DPOPRES_CY
0,530530714071,530530714.071,"{""rings"": [[[-13618997.0451, 5953796.150899999...",1,1,BlockApportionment:US.BlockGroups,1653.0,0.0,53.5,508.0,...,86.0,13.0,298837.0,402310.0,1.31,1.15,1.08,1082.0,159.0,923.0
1,530530714072,530530714.072,"{""rings"": [[[-13621890.9066, 5953114.284599997...",2,1,BlockApportionment:US.BlockGroups,1530.0,6.0,59.4,490.0,...,73.0,36.0,262662.0,265588.0,-0.14,-0.22,-0.27,901.0,50.0,851.0
2,530530714073,530530714.073,"{""rings"": [[[-13622599.9004, 5953136.347999997...",3,1,BlockApportionment:US.BlockGroups,1657.0,0.0,67.3,520.0,...,78.0,16.0,206471.0,226697.0,0.57,0.49,0.45,1085.0,364.0,721.0
3,530530714112,530530714.112,"{""rings"": [[[-13627506.4183, 5953782.585000001...",4,1,BlockApportionment:US.BlockGroups,1343.0,0.0,71.4,473.0,...,168.0,27.0,232576.0,234836.0,2.87,2.78,2.48,1294.0,498.0,796.0
4,530530729061,530530729.061,"{""rings"": [[[-13654973.1668, 5957970.395400003...",5,1,BlockApportionment:US.BlockGroups,2934.0,2913.0,57.2,7.0,...,0.0,0.0,0.0,0.0,8.82,0.0,0.0,3065.0,3062.0,3.0


# Get Locations

In [None]:
# get the store locations from the business listings
# NOTE(review): usa_local and gdb_int are not defined in the cells shown in this notebook - confirm
# where they come from before running this cell
loc_brand_df = usa_local.business.search('Ace Hardware')

# ...and since returning a spatially enabled dataframe, can use spatial.to_featureclass to save directly with function chaining
usa_local.business.search('Ace Hardware').spatial.to_featureclass(gdb_int/'loc_brand')

loc_brand_df.head()

In [None]:
# get all the competitors for the area of interest
# NOTE(review): usa_local and gdb_int are not defined in the cells shown in this notebook - confirm
# where they come from before running this cell

# ...by NAICS or SIC code...
loc_comp_df = usa_local.business.get_competitors(
    code=44413005,  # include ability to specify shortened codes since NAICS codes can be shorter to be more general
    code_type='NAICS', 
    brand_exclude='Ace Hardware'
)

# ...or simply by looking up using the existing location brand layer as a template
# (this second assignment replaces the loc_comp_df created by the code lookup)
loc_comp_df = usa_local.business.get_competitors(brand_locations=loc_brand_df)

# ...and since returning a spatially enabled dataframe, can use spatial.to_featureclass to save directly with function chaining
usa_local.business.get_competitors(brand_locations=loc_brand_df).spatial.to_featureclass(gdb_int/'loc_comp')

# Calculate Proximity Metrics

In [None]:
# calculate the origin to nth destinations table for brand locations
# NOTE(review): usa_local and orgin_geo_df are not defined in the cells shown in this notebook - confirm
# where they come from before running this cell
prox_df_brand = usa_local.proximity.get_neareset_nth_locations(
    origin_features=orgin_geo_df,
    origin_id_column='ID',
    origin_centroid_weighting_features='path-to-block-points',  # features used to calculate a population weighted centroid location for routing
    origin_centroid_weighting_column='POP',                     # used to weight each population feature for centroid calculation
    destination_locations=loc_brand_df,
    destination_id_column='STORE_ID',                           # comma was missing here, which made the call a syntax error
    destination_brand_or_concept_column='STORE_CONCEPT',        # think Nike Outlet versus Nike Brand Store
)

In [None]:
# calculate the origin to nth destinations table for competitor locations
# (stored under its own name so the brand proximity results in prox_df_brand are not overwritten)
# NOTE(review): usa_local and orgin_geo_df are not defined in the cells shown in this notebook - confirm
# where they come from before running this cell
prox_df_comp = usa_local.proximity.get_neareset_nth_locations(
    origin_features=orgin_geo_df,
    origin_id_column='ID',
    origin_centroid_weighting_features='path-to-block-points',  # features used to calculate a population weighted centroid location for routing
    origin_centroid_weighting_column='POP',                     # used to weight each population feature for centroid calculation
    destination_locations=loc_comp_df,
    destination_id_column='LOCNUM',                             # comma was missing here, which made the call a syntax error
    destination_brand_or_concept_column='CONAME',               # think Nike Outlet versus Nike Brand Store
)

# ...and can even chain to create output using dataframe to_... functions
# DataFrame.to_csv returns None when given a path, so the chained call is not assigned to a variable
usa_local.proximity.get_neareset_nth_locations(orgin_geo_df, 'ID', 'path-to-block-points', 'POP', loc_comp_df, 'LOCNUM', 'CONAME').to_csv('prox_df_comp.csv')