# Download census tract data from API

  - Available data: https://api.census.gov/data.html
  - Variables (aka fields) you can query: https://api.census.gov/data/2014/acs5/profile/variables.html
  - Browse variables for a place here: https://www.census.gov/acs/www/data/data-tables-and-tools/data-profiles/2014/
  - Sample query: https://api.census.gov/data/2014/acs5/profile?get=DP05_0001E&for=tract:400100&in=state:06+county:001
  - County FIPS codes: https://www.nrcs.usda.gov/wps/portal/nrcs/detail/?cid=nrcs143_013697
  
#### Variable name format

More info: https://www.census.gov/data/developers/data-sets/acs-5year/data-notes.html

variable name format: [TableID]_[RowNumber][VariableType]

Example: Variable DP02_0002PE, "Family households (families)", represents the percent estimate for table DP02 row number 2.

DP (Data Profile): Table type containing broad social, economic, housing, and demographic information in a total of four profiles.

  - DP02: Social Characteristics — includes Education, Marital Status, Relationships, Fertility, Grandparents... 
  - DP03: Economic Characteristics — includes Income, Employment, Occupation, Commuting to Work... 
  - DP04: Housing Characteristics — includes Occupancy and Structure, Housing Value and Costs, Utilities... 
  - DP05: Demographic Characteristics — includes Sex and Age, Race, Hispanic Origin, Housing Units... 

Variable suffixes:

  - E = estimate
  - M = margin of error
  - PE = percent estimate (of total)
  - PM = margin of error for corresponding PE
  - A = annotation

In [1]:
import geopandas as gpd
import getcensus as gc
import os
import pandas as pd
from keys import census_api_key

In [2]:
# census dataset and vintage year to query
dataset = 'acs5'
year = 2014

# census variables to retrieve for each tract, grouped by data profile table
variables = [
    # DP05: demographic characteristics
    'DP05_0001E',   # total population
    'DP05_0017E',   # median age
    'DP05_0008PE',  # pct of population aged 20-24
    'DP05_0009PE',  # pct of population aged 25-34
    'DP05_0066PE',  # pct of population hispanic or latino
    'DP05_0072PE',  # pct of population non-hispanic white alone
    'DP05_0033PE',  # pct of population black
    'DP05_0039PE',  # pct of population asian
    # DP04: housing characteristics
    'DP04_0007PE',  # pct single family detached homes
    'DP04_0088E',   # median value of owner-occupied units (dollars)
    'DP04_0036E',   # median number of rooms in house
    'DP04_0025PE',  # pct of structures built 1939 or earlier
    'DP04_0046PE',  # pct renter-occupied housing units
    'DP04_0046E',   # count of renter-occupied housing units
    'DP04_0005E',   # rental vacancy rate
    'DP04_0048E',   # average household size of renter-occupied housing units
    'DP04_0132E',   # median gross rent (dollars)
    'DP04_0139PE',  # pct with gross rent 30-34.9% of household income
    'DP04_0140PE',  # pct with gross rent 35% or more of household income
    # DP03: economic characteristics
    'DP03_0062E',   # median household income
    'DP03_0025E',   # mean travel time to work
    'DP03_0019PE',  # pct of commuters who drove alone
    'DP03_0128PE',  # pct of people with income below poverty level
    # DP02: social characteristics
    'DP02_0057PE',  # pct who are students currently enrolled in college or grad school
    'DP02_0022PE',  # pct of population in households who are nonrelatives
    'DP02_0079PE',  # pct whose residence 1 year ago was the same house
    'DP02_0067PE',  # pct with bachelor's degree or higher
    'DP02_0111PE',  # pct speaking only english at home
    'DP02_0092PE',  # pct of population foreign born
]

# input tracts file to read and output file to write the merged data to
tracts_path = 'data/tracts_in_cities_study_area.geojson'
output_path = 'data/downloaded_census_data.geojson'

In [3]:
# look up the census description of every requested variable,
# then print each variable code next to its label (tab-separated)
variable_descriptions = gc.get_census_variable_descriptions(
    dataset=dataset,
    year=year,
    variables=variables,
)
for code, info in variable_descriptions.items():
    print(code, info['label'], sep='\t')

DP05_0001E	SEX AND AGE!!Total population
DP05_0017E	SEX AND AGE!!Total population!!Median age (years)
DP05_0008PE	SEX AND AGE!!Total population!!20 to 24 years
DP05_0009PE	SEX AND AGE!!Total population!!25 to 34 years
DP05_0066PE	HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)
DP05_0072PE	HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!White alone
DP05_0033PE	RACE!!One race!!Black or African American
DP05_0039PE	RACE!!One race!!Asian
DP04_0007PE	UNITS IN STRUCTURE!!1-unit, detached
DP04_0088E	VALUE!!Owner-occupied units!!Median (dollars)
DP04_0036E	ROOMS!!Median rooms
DP04_0025PE	YEAR STRUCTURE BUILT!!Built 1939 or earlier
DP04_0046PE	HOUSING TENURE!!Occupied housing units!!Renter-occupied
DP04_0046E	HOUSING TENURE!!Occupied housing units!!Renter-occupied
DP04_0005E	HOUSING OCCUPANCY!!Total housing units!!Rental vacancy rate
DP04_0048E	HOUSING TENURE!!Occupied housing units!!Average household size of renter-occupied unit
DP04_0132E

## Get vars from ACS DP

In [4]:
# read the study-area tracts from disk and order them by GEOID
gdf = gpd.read_file(tracts_path)
gdf = gdf.sort_values(by='GEOID')

# number of tracts loaded
len(gdf)

12505

In [5]:
%%time
df = gc.get_census_tracts_data(tract_fips=gdf['GEOID'], api_key=census_api_key, dataset=dataset,
                               year=year, variables=variables, clean=True)

Downloading 29 census vars in 01073 for 101 tracts.
Downloading 29 census vars in 01117 for 4 tracts.
Downloading 29 census vars in 04013 for 382 tracts.
Downloading 29 census vars in 06037 for 1000 tracts.
Downloading 29 census vars in 06037 for 6 tracts.
Downloading 29 census vars in 06065 for 77 tracts.
Downloading 29 census vars in 06067 for 123 tracts.
Downloading 29 census vars in 06073 for 313 tracts.
Downloading 29 census vars in 06075 for 197 tracts.
Downloading 29 census vars in 06085 for 225 tracts.
Downloading 29 census vars in 08031 for 144 tracts.
Downloading 29 census vars in 09003 for 40 tracts.
Downloading 29 census vars in 11001 for 179 tracts.
Downloading 29 census vars in 12031 for 166 tracts.
Downloading 29 census vars in 12057 for 124 tracts.
Downloading 29 census vars in 12086 for 108 tracts.
Downloading 29 census vars in 12095 for 86 tracts.
Downloading 29 census vars in 13089 for 14 tracts.
Downloading 29 census vars in 13121 for 123 tracts.
Downloading 29 cens

In [6]:
%%time
# get 2012 pct white so we can examine its change over time
dataset = 'acs5'
year = 2012
variables = ['DP05_0072PE'] #pct pop non-hispanic white alone
df_2012 = gc.get_census_tracts_data(tract_fips=gdf['GEOID'], api_key=census_api_key, dataset=dataset,
                                    year=year, variables=variables, clean=True)

Downloading 1 census vars in 01073 for 101 tracts.
Downloading 1 census vars in 01117 for 4 tracts.
Downloading 1 census vars in 04013 for 382 tracts.
Downloading 1 census vars in 06037 for 1000 tracts.
Downloading 1 census vars in 06037 for 6 tracts.
Downloading 1 census vars in 06065 for 77 tracts.
Downloading 1 census vars in 06067 for 123 tracts.
Downloading 1 census vars in 06073 for 313 tracts.
Downloading 1 census vars in 06075 for 197 tracts.
Downloading 1 census vars in 06085 for 225 tracts.
Downloading 1 census vars in 08031 for 144 tracts.
Downloading 1 census vars in 09003 for 40 tracts.
Downloading 1 census vars in 11001 for 179 tracts.
Downloading 1 census vars in 12031 for 166 tracts.
Downloading 1 census vars in 12057 for 124 tracts.
Downloading 1 census vars in 12086 for 108 tracts.
Downloading 1 census vars in 12095 for 86 tracts.
Downloading 1 census vars in 13089 for 14 tracts.
Downloading 1 census vars in 13121 for 123 tracts.
Downloading 1 census vars in 17031 for

In [7]:
%%time
# get 2015 pct white so we can examine its change over time
dataset = 'acs5'
year = 2015
variables = ['DP05_0072PE'] #pct pop non-hispanic white alone
df_2015 = gc.get_census_tracts_data(tract_fips=gdf['GEOID'], api_key=census_api_key, dataset=dataset,
                                    year=year, variables=variables, clean=True)

Downloading 1 census vars in 01073 for 101 tracts.
Downloading 1 census vars in 01117 for 4 tracts.
Downloading 1 census vars in 04013 for 382 tracts.
Downloading 1 census vars in 06037 for 1000 tracts.
Downloading 1 census vars in 06037 for 6 tracts.
Downloading 1 census vars in 06065 for 77 tracts.
Downloading 1 census vars in 06067 for 123 tracts.
Downloading 1 census vars in 06073 for 313 tracts.
Downloading 1 census vars in 06075 for 197 tracts.
Downloading 1 census vars in 06085 for 225 tracts.
Downloading 1 census vars in 08031 for 144 tracts.
Downloading 1 census vars in 09003 for 40 tracts.
Downloading 1 census vars in 11001 for 179 tracts.
Downloading 1 census vars in 12031 for 166 tracts.
Downloading 1 census vars in 12057 for 124 tracts.
Downloading 1 census vars in 12086 for 108 tracts.
Downloading 1 census vars in 12095 for 86 tracts.
Downloading 1 census vars in 13089 for 14 tracts.
Downloading 1 census vars in 13121 for 123 tracts.
Downloading 1 census vars in 17031 for

In [8]:
# drop the state/county columns from both vintages before combining them
df_2012 = df_2012.drop(columns=['state', 'county'])
df_2015 = df_2015.drop(columns=['state', 'county'])

# join the two vintages on their shared index; same-named columns get a year suffix
df_2012_2015 = df_2012.merge(
    df_2015,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=('_2012', '_2015'),
)

In [9]:
# step 1: attach the 2014 variables to the tracts, matching on GEOID
merged = pd.merge(
    left=gdf.set_index('GEOID'),
    right=df,
    how='inner',
    left_index=True,
    right_index=True,
)

# step 2: attach the 2012/2015 variables to that result
merged = pd.merge(
    left=merged,
    right=df_2012_2015,
    how='inner',
    left_index=True,
    right_index=True,
)
merged.head()

Unnamed: 0_level_0,ALAND,place_geoid,place_name,geometry,DP05_0001E,DP05_0017E,DP05_0008PE,DP05_0009PE,DP05_0066PE,DP05_0072PE,...,DP02_0057PE,DP02_0022PE,DP02_0079PE,DP02_0067PE,DP02_0111PE,DP02_0092PE,state,county,DP05_0072PE_2012,DP05_0072PE_2015
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1073000100,7549578,107000,"Birmingham, AL","POLYGON ((-86.74404800000001 33.573646, -86.74...",2970.0,32.2,4.3,16.5,4.6,17.1,...,12.8,3.8,84.7,9.0,94.2,3.1,1,73,15.5,16.9
1073000300,2093104,107000,"Birmingham, AL","POLYGON ((-86.76195 33.540608, -86.761394 33.5...",2494.0,36.5,7.9,17.2,18.0,4.6,...,22.3,3.6,85.9,14.3,77.7,16.5,1,73,7.6,5.4
1073000400,8001582,107000,"Birmingham, AL","POLYGON ((-86.78617300000001 33.5658, -86.7857...",3437.0,30.6,7.0,18.1,0.7,7.9,...,29.5,7.3,78.4,3.3,98.2,0.5,1,73,6.9,9.4
1073000500,4819145,107000,"Birmingham, AL","POLYGON ((-86.791932 33.543298, -86.79187 33.5...",3735.0,35.8,6.1,10.4,1.4,5.0,...,10.3,1.9,82.9,6.9,97.3,0.0,1,73,2.8,3.9
1073000700,3520564,107000,"Birmingham, AL","POLYGON ((-86.815184 33.558129, -86.812606 33....",2562.0,25.4,6.8,14.6,2.5,0.0,...,4.9,4.6,91.9,7.3,98.9,0.5,1,73,0.1,0.0


In [10]:
# the inner merges must not have dropped any tracts: all four frames
# should have the same number of rows, i.e. exactly one distinct length
assert len({len(gdf), len(df), len(df_2012_2015), len(merged)}) == 1

In [11]:
%%time
os.remove(output_path) # due to overwriting bug in fiona
merged.reset_index().to_file(output_path, driver='GeoJSON')
print(output_path)

data/downloaded_census_data.geojson
Wall time: 24.1 s
