# Get all tracts within certain cities

Given a CSV file listing cities and their states, find every census tract that intersects those cities' boundaries. Tracts that merely touch a city boundary from the outside are excluded.

In [1]:
import geopandas as gpd
import json
import os
import pandas as pd

# input: census tracts layer, read in one call with gpd.read_file
all_tracts_path = 'data/us_census_tracts_2014'
# input: directory whose entries are each read with gpd.read_file and combined into one places layer
places_path = 'data/us_census_places_2014'
# input: JSON lookup keyed by state FIPS code; each value carries an 'abbreviation' field
states_by_fips_path = 'data/states_by_fips.json'
# input: CSV of study sites with 'city' and 'state' columns
cities_path = 'data/study_sites.csv'
# output: GeoJSON of the tracts matched to the study-site cities
output_path = 'data/tracts_in_cities_study_area.geojson'

In [2]:
# read the CSV of study-site cities; the trailing expression displays the row count
study_sites = pd.read_csv(filepath_or_buffer=cities_path, encoding='utf-8')
study_sites.shape[0]

50

In [3]:
%%time
# read the nationwide census tracts layer; the trailing expression is the tract count
all_tracts = gpd.read_file(all_tracts_path)
all_tracts.shape[0]

Wall time: 16.6 s


In [4]:
%%time
# load all US places (cities/towns): one layer per entry of places_path
# Read every entry first and concatenate once. DataFrame.append was removed in
# pandas 2.0, and growing a frame inside a loop re-copies it on each iteration.
# sorted() makes the row order independent of the OS's directory listing order.
place_frames = [gpd.read_file(os.path.join(places_path, folder))
                for folder in sorted(os.listdir(places_path))]
# pd.concat raises on an empty list, so fall back to an empty GeoDataFrame,
# which is what the append-based loop produced for an empty directory
places = pd.concat(place_frames) if place_frames else gpd.GeoDataFrame()
len(places)

Wall time: 13.6 s


In [5]:
# translate each place's state FIPS code (STATEFP) into the state abbreviation
with open(states_by_fips_path) as fips_file:
    states = json.load(fips_file)

# lookup of FIPS code -> abbreviation, taken from each entry's 'abbreviation' field
fips_state = {}
for fips_code, state_info in states.items():
    fips_state[fips_code] = state_info['abbreviation']

# replace() returns a new Series; codes missing from the lookup are left as they are
places['state'] = places['STATEFP'].replace(fips_state)

In [6]:
# build one "city, state" label per study site
# The two columns are zipped instead of using a row-wise apply(axis=1): on a
# frame with no rows, apply can hand back an empty DataFrame rather than a
# Series, and iterating that would yield column names instead of labels.
cities_states = pd.Series(
    ['{}, {}'.format(city, state)
     for city, state in zip(study_sites['city'], study_sites['state'])],
    index=study_sites.index,
    dtype=object,
)

In [7]:
# find these city names in the GDF of all census places
# Matching runs in up to three passes, strictest first, and stops at the first
# pass that selects exactly one place:
#   1. NAME equals the city name, within the state
#   2. NAME contains the city name, within the state
#   3. as (2), but excluding rows whose NAMELSAD contains 'CDP'
matched_places = []
for city_state in cities_states:
    # split on the last comma only, so a comma inside the city name cannot
    # break the two-way unpacking
    city, state = [item.strip() for item in city_state.rsplit(',', 1)]

    # computed once per city and reused by every pass
    in_state = places['state'] == state

    mask = (places['NAME'] == city) & in_state
    if mask.sum() != 1:
        # regex=False: the city name is literal text, so characters such as
        # '.' or '(' must not be read as pattern syntax; na=False keeps the
        # mask strictly boolean when a name is missing
        name_contains = places['NAME'].str.contains(city, regex=False, na=False)
        mask = name_contains & in_state
        if mask.sum() != 1:
            not_cdp = ~places['NAMELSAD'].str.contains('CDP', regex=False, na=False)
            mask = name_contains & in_state & not_cdp
            if mask.sum() != 1:
                print('Cannot uniquely find "{}"'.format(city_state))

    # NOTE: as before, whatever the final mask selects (possibly zero or
    # several rows) is kept even when the warning above was printed
    matched_places.append(places.loc[mask])

# concatenate once instead of appending in the loop (DataFrame.append was
# removed in pandas 2.0); an empty study-site list yields an empty GeoDataFrame
gdf_cities = pd.concat(matched_places) if matched_places else gpd.GeoDataFrame()

len(gdf_cities)

50

In [8]:
# give every matched place a display name of the form "city, state"
def _place_label(row):
    """Return the row's NAME and state joined as 'NAME, state'."""
    return '{}, {}'.format(row['NAME'], row['state'])

# two long census names are swapped for short forms
short_names = {
    'Indianapolis city (balance), IN': 'Indianapolis, IN',
    'Nashville-Davidson metropolitan government (balance), TN': 'Nashville, TN',
}

gdf_cities['name'] = gdf_cities.apply(_place_label, axis=1).replace(short_names)

In [9]:
# city polygons for the join: indexed by the place GEOID, keeping only the
# display name and the geometry
city_columns = {'GEOID': 'place_geoid', 'name': 'place_name'}
cities = (gdf_cities[['GEOID', 'name', 'geometry']]
          .rename(columns=city_columns)
          .set_index('place_geoid'))

# tract polygons for the join: indexed by the tract GEOID, keeping geometry and ALAND
tract_geoms = all_tracts[['GEOID', 'geometry', 'ALAND']].set_index('GEOID')

In [10]:
%%time
# pull every tract boundary slightly inward so that tracts which only touch a
# city boundary from the outside do not count as intersecting it
# negative distance = shrink; the value is in the layer's coordinate units
# (intended as roughly 1 meter)
shrink_distance = -0.00001
shrunken_geoms = tract_geoms['geometry'].buffer(shrink_distance)

# keep the original polygons in 'geometry'; the shrunken copies become the
# active geometry used by the spatial join
tract_geoms['geom_tmp'] = shrunken_geoms
tract_geoms = tract_geoms.set_geometry('geom_tmp')

Wall time: 29.3 s


In [11]:
%%time
# both layers must share a CRS, otherwise the join would compare coordinates
# from different reference systems
assert tract_geoms.crs == cities.crs, 'tracts and cities must use the same CRS'

# keep every (tract, city) pair whose geometries intersect
try:
    # `predicate` is the current name of this keyword; `op` is deprecated and
    # rejected by recent geopandas releases
    tracts = gpd.sjoin(tract_geoms, cities, how='inner', predicate='intersects')
except TypeError:
    # older geopandas releases only know the keyword under its former name
    tracts = gpd.sjoin(tract_geoms, cities, how='inner', op='intersects')
print(len(tracts))

12505
Wall time: 14.3 s


In [12]:
# switch back to the original tract polygons, discard the shrunken helper
# column, and label the joined city's index column as the place GEOID
tracts = (tracts.set_geometry('geometry')
                .drop(columns=['geom_tmp'])
                .rename(columns={'index_right': 'place_geoid'}))
tracts.head()

Unnamed: 0,geometry,ALAND,place_geoid,place_name
1073005103,"POLYGON ((-86.840604 33.474255, -86.8405339999...",1738959,107000,"Birmingham, AL"
1073004902,"POLYGON ((-86.80923399999999 33.48756, -86.809...",1148589,107000,"Birmingham, AL"
1073005909,"POLYGON ((-86.74520699999999 33.610235, -86.74...",4516001,107000,"Birmingham, AL"
1073012304,"POLYGON ((-86.9791 33.497596, -86.978972 33.49...",7546587,107000,"Birmingham, AL"
1073012305,"POLYGON ((-87.01625 33.481515, -87.016122 33.4...",15730698,107000,"Birmingham, AL"


In [13]:
%%time
# move the tract GEOID out of the index into a regular column before saving
gdf_save = tracts.reset_index().rename(columns={'index':'GEOID'})
# delete any previous output first (works around an overwriting bug in fiona);
# only attempt it when the file exists, so a first run with no prior output
# does not raise FileNotFoundError
if os.path.exists(output_path):
    os.remove(output_path)
gdf_save.to_file(output_path, driver='GeoJSON')
print(output_path)

data/tracts_in_cities_study_area.geojson
Wall time: 17 s
