Reference for the census variable codes used below (2017 ACS 5-year data profile): https://api.census.gov/data/2017/acs/acs5/profile/variables.html

In [1]:
import geopandas  as gpd
import getcensus as gc
import os
import pandas as pd
from keys import census_api_key

# CSV with no header row; its first column lists the per-tract .graphml file paths
input_path = 'data/sampled_graph_filepaths.csv'

In [2]:
# Census API dataset to query (ACS 5-year)
dataset = 'acs/acs5'

# Vintage year of that dataset
year = 2017

# Census variable codes to retrieve for each tract, grouped by theme.
# The order of this list is kept exactly as originally specified.
variables = [
    # demographics
    'DP05_0001E',   # total pop
    'DP05_0077PE',  # pct pop non-hispanic white alone
    # housing stock
    'DP04_0007PE',  # pct single family detached homes
    'DP04_0089E',   # median value of owner occupied units (dollars)
    'DP04_0037E',   # median number of rooms in house
    # income, education, commuting
    'DP03_0062E',   # median household income (inf-adj 2017 usd)
    'DP02_0067PE',  # pct bachelor's degree or higher
    'DP03_0025E',   # mean travel time to work (minutes)
    'DP03_0019PE',  # pct commute drove alone
    # pct of housing structures built in each period
    'DP04_0026PE',  # 1939 or earlier
    'DP04_0025PE',  # 1940-49
    'DP04_0024PE',  # 1950-59
    'DP04_0023PE',  # 1960-69
    'DP04_0022PE',  # 1970-79
    'DP04_0021PE',  # 1980-89
    'DP04_0020PE',  # 1990-99
    'DP04_0019PE',  # 2000-09
    'DP04_0018PE',  # 2010-13
    'DP04_0017PE',  # 2014 or later
]

In [3]:
# Load the sampled graph file paths (first column of a header-less CSV), sorted.
tract_filepaths = pd.read_csv(input_path, header=None)[0].sort_values()

# Derive each tract's FIPS code from its file name: keep the last path component
# and remove the literal '.graphml' extension. A suffix-anchored pattern is used
# because str.strip('.graphml') treats its argument as a *set of characters* to
# trim from both ends, not as a suffix, and so would eat into any name that
# starts or ends with one of those characters.
tracts = (
    tract_filepaths
    .str.rsplit('/', n=1).str[-1]
    .str.replace(r'\.graphml$', '', regex=True)
)
len(tracts)

72663

In [4]:
%%time
cd = gc.get_census_tracts_data(tract_fips=tracts, api_key=census_api_key, dataset=dataset,
                               year=year, variables=variables, clean=True)

Downloading 19 census vars in 01001 for 12 tracts.
Downloading 19 census vars in 01003 for 31 tracts.
Downloading 19 census vars in 01005 for 9 tracts.
Downloading 19 census vars in 01007 for 4 tracts.
Downloading 19 census vars in 01009 for 9 tracts.
Downloading 19 census vars in 01011 for 3 tracts.
Downloading 19 census vars in 01013 for 9 tracts.
Downloading 19 census vars in 01015 for 31 tracts.
Downloading 19 census vars in 01017 for 9 tracts.
Downloading 19 census vars in 01019 for 6 tracts.
Downloading 19 census vars in 01021 for 9 tracts.
Downloading 19 census vars in 01023 for 4 tracts.
Downloading 19 census vars in 01025 for 9 tracts.
Downloading 19 census vars in 01027 for 4 tracts.
Downloading 19 census vars in 01029 for 4 tracts.
Downloading 19 census vars in 01031 for 14 tracts.
Downloading 19 census vars in 01033 for 14 tracts.
Downloading 19 census vars in 01035 for 5 tracts.
Downloading 19 census vars in 01037 for 3 tracts.
Downloading 19 census vars in 01039 for 14 tr

In [15]:
# Human-readable column names for the census variable codes
cols = {
    # demographics and housing stock
    'DP05_0001E': 'total_pop',
    'DP05_0077PE': 'pct_white',
    'DP04_0007PE': 'pct_single_fam',
    'DP04_0089E': 'med_home_value',
    'DP04_0037E': 'med_rooms_per_home',
    # period in which housing structures were built
    'DP04_0026PE': 'pct_1939_earlier',
    'DP04_0025PE': 'pct_1940_49',
    'DP04_0024PE': 'pct_1950_59',
    'DP04_0023PE': 'pct_1960_69',
    'DP04_0022PE': 'pct_1970_79',
    'DP04_0021PE': 'pct_1980_89',
    'DP04_0020PE': 'pct_1990_99',
    'DP04_0019PE': 'pct_2000_09',
    'DP04_0018PE': 'pct_2010_13',
    'DP04_0017PE': 'pct_2014_later',
    # income, commuting, education
    'DP03_0062E': 'med_hh_income',
    'DP03_0025E': 'mean_commute_time',
    'DP03_0019PE': 'pct_drive_alone',
    'DP02_0067PE': 'pct_bachelors_higher',
}

# rename into a new frame so the downloaded data in `cd` stays untouched
df = cd.rename(columns=cols)
len(df)

72663

In [16]:
# Convert percent columns (0-100) to proportions (0-1): each 'pct_*' column is
# replaced by a 'prop_*' column appended after the remaining columns.
pct_cols = [name for name in df.columns if 'pct_' in name]
proportions = {name.replace('pct_', 'prop_'): df[name] / 100 for name in pct_cols}
df = df.drop(columns=pct_cols).assign(**proportions)

In [17]:
# Collapse the two post-2010 structures-built columns into a single column,
# then discard the two originals.
post_2010_cols = ['prop_2010_13', 'prop_2014_later']
df = df.assign(
    prop_2010_later=df['prop_2010_13'].add(df['prop_2014_later'])
).drop(columns=post_2010_cols)

In [18]:
# preview the first rows to sanity-check the renamed and converted columns
df.head(n=5)

Unnamed: 0_level_0,total_pop,med_home_value,med_rooms_per_home,med_hh_income,mean_commute_time,state,county,prop_white,prop_single_fam,prop_bachelors_higher,prop_drive_alone,prop_1939_earlier,prop_1940_49,prop_1950_59,prop_1960_69,prop_1970_79,prop_1980_89,prop_1990_99,prop_2000_09,prop_2010_later
GEOID10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1001020801,2913.0,299100.0,6.7,80089.0,24.6,1,1,0.863,0.848,0.387,0.895,0.048,0.021,0.029,0.011,0.156,0.188,0.214,0.3,0.033
1001020802,11333.0,163200.0,6.1,64439.0,27.4,1,1,0.818,0.757,0.245,0.891,0.02,0.015,0.017,0.053,0.107,0.101,0.235,0.318,0.136
1001020200,2172.0,96100.0,5.6,41287.0,22.2,1,1,0.416,0.759,0.162,0.905,0.144,0.009,0.047,0.171,0.325,0.028,0.117,0.159,0.0
1001021000,2796.0,96700.0,5.9,46607.0,35.6,1,1,0.715,0.625,0.152,0.839,0.064,0.027,0.128,0.046,0.121,0.179,0.27,0.147,0.017
1001020300,3385.0,98900.0,5.8,46806.0,23.1,1,1,0.614,0.854,0.181,0.883,0.008,0.022,0.028,0.177,0.369,0.075,0.157,0.132,0.031


In [19]:
# Save the cleaned tract-level table; the index is written as the first CSV column
output_path = 'data/census_data.csv'
df.to_csv(output_path, index=True, encoding='utf-8')