# Census ACS
Retrieves data from the Census Bureau's American Community Survey (ACS) 5-year series API for a given area. A specific list of census variables is passed into the script, and those variables are retrieved from the four ACS data profile tables (DP02 through DP05). After some processing, the output is written to a SQLite database. An option to create a metadata table appears at the bottom, but it should only be run in limited instances.

https://www.census.gov/data/developers/data-sets/acs-5year.html

## Variables

In [1]:
import os, requests, json, pandas as pd, numpy as np, sqlite3
from IPython.display import clear_output

In [2]:
# Text file that holds the Census API key
keyfile = 'census_key.txt'

# API variables - UPDATE THE YEAR AND GEO
year = '2017'
dsource = 'acs'
dsource2 = 'acs5'
dname = 'profile'
state = '36'
geo = 'zip code tabulation area'
# geo = 'public use microdata area'

# Variables to read in - UPDATE THE SHEET
worksheet = 'acs1'
# worksheet = 'acs2'
geogs = 'zctas'
# geogs = 'pumas'

# SQL output - UPDATE TABLE NAME
# NOTE(review): the table name says "pumas" while geogs is set to 'zctas' - confirm before running
tabname = 'pumas_2017acs1'
# tabname = 'pumas_2017acs2'
dbname = os.path.join('outputs', 'testdb.sqlite')

# Dump file for API data storage
jsonpath = os.path.join('outputs', f'{tabname}_retrieved_data.json')

## Variable List
Get full list of variables from the API, read in our retrieval list, and compare the ID codes and names to make sure nothing is missing and that nothing has changed since the last iteration. Don't move on to the next block until both lists match.

In [3]:
# Collect the variable metadata of every ACS data profile table into one dictionary
datadict={}
dps=['DP02','DP03','DP04','DP05']
for p in dps:
    vars_url = f'https://api.census.gov/data/{year}/{dsource}/{dsource2}/{dname}/groups/{p}.json'
    # A timeout keeps a stalled connection from hanging the notebook indefinitely
    response=requests.get(vars_url, timeout=60)
    # Stop on an HTTP error here rather than failing later with a confusing JSON or KeyError
    response.raise_for_status()
    var_data=response.json()
    datadict.update(var_data['variables'])
# Show one entry as a quick sanity check of the structure
next(iter(datadict.items() ))

('DP02_0019EA',
 {'label': 'Annotation of Estimate!!RELATIONSHIP!!Population in households!!Spouse',
  'predicateType': 'string',
  'group': 'DP02',
  'limit': 0,
  'predicateOnly': True})

In [4]:
# Load the worksheet that lists the census variables to retrieve
dfexcel = pd.read_excel(
    os.path.join('inputs', 'acs_variables.xlsx'),
    sheet_name=worksheet,
)
dfexcel.head()

Unnamed: 0,db_var,census_var,census_label,dtype
0,HSHD01_E,DP02_0001E,Estimate!!HOUSEHOLDS BY TYPE!!Total households,int
1,HSHD01_M,DP02_0001M,Margin of Error!!HOUSEHOLDS BY TYPE!!Total hou...,int
2,HSHD01_PC,DP02_0001PE,Percent Estimate!!HOUSEHOLDS BY TYPE!!Total ho...,int
3,HSHD01_PM,DP02_0001PM,Percent Margin of Error!!HOUSEHOLDS BY TYPE!!T...,int
4,HSHD02_E,DP02_0002E,Estimate!!HOUSEHOLDS BY TYPE!!Total households...,int


In [5]:
# Put the API labels in a frame indexed by variable ID, then keep only the
# variables that also appear in our retrieval list
dfvars = pd.DataFrame.from_dict(datadict, orient='index', columns=['label'])
dfvars_selected = dfvars[dfvars.index.isin(dfexcel['census_var'])]
dfvars_count = dfvars_selected.shape[0]
dfexcel_count = dfexcel['census_var'].shape[0]

# Report the retrieval-list rows the API did not return when the counts differ
if dfvars_count != dfexcel_count:
    print('There is a mismatch in the number of variables; the api has retrieved', 
          dfvars_count, 'while the original list has',dfexcel_count,'. Missing:')
    nomatch = dfexcel.loc[~dfexcel['census_var'].isin(dfvars_selected.index)]
    print(nomatch)
else:
    print('There are an equal number of variables in both lists:', dfvars_count)

There are an equal number of variables in both lists: 248


In [6]:
# Compare each retrieval-list label with the API label of the SAME variable.
# A plain membership test against all API labels would accept a label that
# exists in the API but belongs to a different variable.
# Variables absent from the API map to NaN and are therefore reported too.
mismatch=dfexcel[
    dfexcel['census_label'] != dfexcel['census_var'].map(dfvars_selected['label'])
]

if mismatch.empty:
    print('All labels match')
else:
    # Pair every mismatching row with the label the API currently reports
    test=pd.merge(mismatch,dfvars_selected, left_on='census_var', right_on=dfvars_selected.index)
    print('These labels do not match:')
    print(test[['census_var','census_label','label']])

All labels match


In [7]:
# Load the geography worksheet; dtype=object reads the cells without type coercion
# (presumably so codes keep their text form - confirm against the sheet)
excelgeo = pd.read_excel(
    os.path.join('inputs', 'acs_variables.xlsx'),
    sheet_name=geogs,
    dtype=object,
)
# Geography codes to request from the API, one per row of the sheet
geoids = excelgeo['GEO'].tolist()
len(geoids)

215

## Retrieve Data
Given the large number of variables in the ACS and the limits of the API, variables must be passed to the URL in separate blocks or chunks. The first chunk that is retrieved is written to an empty data list: first the header row, then one row for each geography. Each subsequent chunk is iterated through by row, so that its values are appended to the matching row in the data list. In both cases, the trailing geography identifiers that the API automatically returns with each call (the last one or two values, depending on the geography type) are not appended.

In [8]:
def chunks(l, n):
    """Yield consecutive slices of ``l`` that each hold at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

In [9]:
# Split the selected variable IDs into groups of 46, one group per API call
reqvars = [*chunks(dfvars_selected.index.tolist(), 46)]
# Lead the first group with the identifier columns: GEO_ID first, NAME second
reqvars[0][:0] = ['GEO_ID', 'NAME']
print('Number of chunks including header row:', len(reqvars))

Number of chunks including header row: 6


In [10]:
# Read the Census API key from the key file, trimming surrounding whitespace
with open(keyfile) as key:
    api_key=key.read().strip()

# Root URL shared by every data request, built from the API variables set earlier
base_url = f'https://api.census.gov/data/{year}/{dsource}/{dsource2}/{dname}'
base_url

'https://api.census.gov/data/2017/acs/acs5/profile'

#### ***THIS BLOCK IS A REQUESTS BLOCK!***

In [13]:
#TESTING

def getdata():
    """Retrieve every variable chunk for one geography request and stitch it into rows.

    Reads the notebook-level names ``reqvars``, ``geogs``, ``geo``, ``state``,
    ``base_url`` and ``api_key``, plus ``g`` (the geography code placed in the
    ``for=`` clause), which the caller must set before calling.

    Returns:
        A list of rows. The first row is the header of variable names and each
        following row holds the values returned by the API, with the trailing
        geography identifier column(s) dropped.

    Raises:
        ValueError: if ``geogs`` is neither 'pumas' nor 'zctas'.
        RuntimeError: if any request does not return HTTP 200. Skipping a
            failed chunk would leave rows with missing columns, so retrieval
            stops instead.
    """
    # The geography clause and the number of trailing identifier columns to
    # drop are the same for every chunk, so work them out once
    if geogs=='pumas':
        geoclause = f'&for={geo}:{g}&in=state:{state}'
        dropvar=-2
    elif geogs=='zctas':
        geoclause = f'&for={geo}:{g}'
        dropvar=-1
    else:
        raise ValueError(f"geogs must be 'pumas' or 'zctas', not {geogs!r}")

    dlist=[]
    for i, v in enumerate(reqvars):
        batchcols=','.join(v)
        data_url = f'{base_url}?get={batchcols}{geoclause}&key={api_key}'
        response=requests.get(data_url)
        if response.status_code!=200:
            raise RuntimeError(
                f'***Problem with retrieval***, response code {response.status_code} for chunk {i}')
        clear_output(wait=True)
        data=response.json()
        for i2, v2 in enumerate(data):
            if i == 0:
                # First chunk creates the rows (header row first)
                dlist.append(v2[:dropvar])
            else:
                # Later chunks widen the row at the same position
                dlist[i2].extend(v2[:dropvar])
    return dlist

# Rows collected so far: the header row first, then one row per geography
datalist=[]

if geogs=='zctas':
    # One request series per ZCTA; the slice limits this test run to the first four
    for g in geoids[0:4]:
        georecord=getdata()
        print('Retrieved data for',g)
        if not datalist:
            # Only the first geography contributes the header row
            datalist.append(georecord[0])
        datalist.append(georecord[1])
elif geogs=='pumas':
    # getdata() reads the geography code from g; '*' requests every PUMA in the state
    g='*'
    georecord=getdata()
    # georecord already holds the header plus one row per PUMA, so merge its
    # rows into datalist instead of nesting the whole list as a single item
    datalist.extend(georecord)
    
dlrows=len(datalist)
dlitems=sum(len(x) for x in datalist)
# Guard against division by zero when nothing was retrieved
dlbyrow=dlitems / dlrows if dlrows else 0

print('Done - retrieved', dlrows, 'records and', dlitems,'data points with', dlbyrow, 'data points for each record')
        
# Dump the raw rows so later cells can be rerun without calling the API again
with open(jsonpath, 'w') as f:
    json.dump(datalist, f)
print('Data dumped to json file')


Retrieved data for 10004
Done - retrieved 5 records and 1250 data points with 250.0 data points for each record
Data dumped to json file


In [None]:
#XXX
#If this block is successful and there are subsequent problems, do not rerun it - start from the following block.

def getdata(limit=4):
    """Retrieve all variable chunks for each geography and return them as rows.

    Reads the notebook-level names ``geoids``, ``reqvars``, ``geogs``, ``geo``,
    ``state``, ``base_url`` and ``api_key``.

    Args:
        limit: number of leading entries of ``geoids`` to request. The default
            of 4 keeps the original test-sized run; pass ``None`` to request
            every geography in the list.

    Returns:
        A list of rows: the header row of variable names followed by one row
        per geography, with the trailing geography identifier column(s) dropped.

    Raises:
        ValueError: if ``geogs`` is neither 'pumas' nor 'zctas'.
        RuntimeError: if any request does not return HTTP 200. Skipping a
            failed chunk would leave that geography's row shorter than the
            header, so retrieval stops instead.
    """
    # The state qualifier and the number of trailing identifier columns to
    # drop depend only on the geography type, so work them out once
    if geogs=='pumas':
        geotail = f'&in=state:{state}'
        dropvar=-2
    elif geogs=='zctas':
        geotail = ''
        dropvar=-1
    else:
        raise ValueError(f"geogs must be 'pumas' or 'zctas', not {geogs!r}")

    dlist=[]
    for g in geoids[:limit]:
        georecord=[]
        for i, v in enumerate(reqvars):
            batchcols=','.join(v)
            data_url = f'{base_url}?get={batchcols}&for={geo}:{g}{geotail}&key={api_key}'
            response=requests.get(data_url)
            if response.status_code!=200:
                raise RuntimeError(
                    f'***Problem with retrieval***, response code {response.status_code} for {g}, chunk {i}')
            clear_output(wait=True)
            data=response.json()
            for i2, v2 in enumerate(data):
                if i == 0:
                    # First chunk creates the rows (header row first)
                    georecord.append(v2[:dropvar])
                else:
                    # Later chunks widen the row at the same position
                    georecord[i2].extend(v2[:dropvar])
        if not dlist:
            # Only the first geography contributes the header row
            dlist.append(georecord[0])
        dlist.append(georecord[1])
        print('Retrieved all variables for', g)
    return dlist

# Run the retrieval, then summarize how much came back
datalist = getdata()

dlrows = len(datalist)
dlitems = sum(map(len, datalist))
dlbyrow = dlitems / dlrows

print('Done - retrieved', dlrows, 'records and', dlitems,'data points with', dlbyrow, 'data points for each record')

# Store the raw rows on disk so later cells can start from the file
with open(jsonpath, 'w') as f:
    f.write(json.dumps(datalist))
print('Data dumped to json file')


In [None]:
#If this block is successful and there are subsequent problems, do not rerun it - start from the following block.
# datalist=[]
# c=0

# for i, v in enumerate(reqvars):
#     batchcols=','.join(v)
#     data_url = f'{base_url}?get={batchcols}&for={geo}:*&in=state:{state}&key={api_key}'
#     response=requests.get(data_url)
#     if response.status_code==200:
#         clear_output(wait=True)
#         data=response.json()
#         for i2, v2 in enumerate(data):
#             if i == 0:
#                 datalist.append(v2[:-2])
#             else:
#                 for item in v2[:-2]:
#                     datalist[i2].append(item)
#         c=c+1
#         print(c,'chunks of data written so far')
#     else:
#         print('***Problem with retrieval***, response code',response.status_code)
        
# dlrows=len(datalist)
# dlitems=sum(len(x) for x in datalist)
# dlbyrow=dlitems / dlrows

# print('Done - retrieved', dlrows, 'records and', dlitems,'data points with', dlbyrow, 'data points for each record')
        
# with open(jsonpath, 'w') as f:
#     json.dump(datalist, f)
# print('Data dumped to json file')


## Process Data
Create a new GEOID2 column, replace footnotes with nulls, replace census variable names with database variable names.

In [None]:
# Reload the dumped API rows; the first row supplies the column names
with open(jsonpath, 'r') as f:
    jsondata = json.load(f)
alldata = pd.DataFrame(jsondata[1:], columns=jsondata[0])
# Use the database names for the two identifier columns and index by GEOID
alldata = alldata.rename(columns={'GEO_ID': 'GEOID', 'NAME': 'GEOLABEL'})
alldata = alldata.set_index('GEOID')
alldata.shape
# Shape should be 1 row and 1 column less than previous count (excludes header row and index column) 

In [None]:
# Preview the first rows of the retrieved data
alldata.head()

In [None]:
# Re-read the geography worksheet, this time taking the GEOID column so the
# values can be matched against the GEOID index of the retrieved data
excelgeo = pd.read_excel(
    os.path.join('inputs', 'acs_variables.xlsx'),
    sheet_name=geogs,
)
geoids = excelgeo['GEOID'].tolist()
len(geoids)

In [None]:
# Keep only the geographies on our list (as an independent copy), ordered by GEOID
acsdata = alldata[alldata.index.isin(geoids)].copy()
acsdata = acsdata.sort_index()
acsdata.shape

In [None]:
# Preview the filtered, sorted data
acsdata.head()

In [None]:
# GEOID2 is the short geography code: everything after the 'US' separator of
# the full GEOID. A fixed 7-character slice only fits 7-digit codes such as
# PUMAs; for 5-digit ZCTAs it would keep the 'US' prefix as part of the code.
# A GEOID without 'US' is left unchanged.
acsdata.insert(loc=0, column='GEOID2',
               value=acsdata.index.str.replace(r'^.*?US', '', regex=True))
# Replace the footnote placeholder (in both its integer and decimal spellings) with nulls
acsdata.replace(['-888888888', '-888888888.0'], np.nan, inplace=True)
acsdata.head()

In [None]:
# Lookup from Census variable ID to database column name
cv_to_db = {
    census_id: db_name
    for census_id, db_name in zip(dfexcel['census_var'], dfexcel['db_var'])
}
cv_to_db

In [None]:
# Swap the Census variable IDs in the column headers for the database names
acsdata = acsdata.rename(columns=cv_to_db)
acsdata.head()

## Write to Database
Update list of variables and data types, build create table string, create datatable in temporary database.


In [None]:
# Translate the worksheet's type names into SQLite column types
dfexcel.replace({'dtype': {'int': 'INTEGER', 'float': 'REAL'}},inplace=True)
# Make the labels readable by replacing the '!!' separators. The result is
# assigned back to the column because an inplace replace on a column pulled
# out of the frame is not guaranteed to update the frame itself.
dfexcel['census_label'] = dfexcel['census_label'].replace({'!!': ' - '}, regex=True)
dfexcel.head()

In [None]:
# Map each database variable name to a list of that row's remaining worksheet values.
# Later cells read item 1 of each list for the metadata table and item 2 as the
# SQL column type, so this relies on the worksheet's column order - TODO confirm
vardict=dfexcel.set_index('db_var').T.to_dict('list')
vardict

In [None]:
# Connect to the SQLite database file and get a cursor for the statements that follow
# NOTE(review): the path is hard-coded here and the dbname variable from the
# Variables section is not used - confirm which file is intended
con = sqlite3.connect('test.sqlite') 
cur = con.cursor()

In [None]:
# Remove any earlier version of the table so this cell can be rerun
cur.execute('DROP TABLE IF EXISTS {}'.format(tabname))

# The identifier columns come first, followed by one column per database variable.
# Item 2 of each vardict value is used as that column's SQL type.
id_cols = ',\n'.join([
    'GEOID TEXT',
    'GEOID2 TEXT NOT NULL PRIMARY KEY',
    'GEOLABEL TEXT',
])
var_cols = ', \n'.join(k + ' ' + v[2] for k, v in vardict.items())

# Joining the definitions (instead of appending and then trimming the last
# three characters) keeps the statement valid even when vardict is empty
all_cols = id_cols + ',\n' + var_cols if var_cols else id_cols
dbstring = '\nCREATE TABLE {} (\n{});'.format(tabname, all_cols)
print(dbstring)

In [None]:
# Run the CREATE TABLE statement assembled in dbstring
cur.execute(dbstring)

In [None]:
# Append the processed rows to the existing table; index=True writes the GEOID index as a column
acsdata.to_sql(name=tabname, if_exists='append', index=True, con=con)

In [None]:
# Confirm how many rows ended up in the data table
rows = cur.execute(f'SELECT COUNT(*) FROM {tabname};').fetchone()
print(rows[0], 'records written to', tabname)

In [None]:
# Close the database connection now that the data table is written
con.close()

## Metadata Table
DO NOT RERUN THIS SECTION FOR MULTIPLE GEOGRAPHIES. In the NYC Geodatabase there is only one metadata table for all of the ACS tables (acs1 and acs2) for all geographies. For whichever geography is processed first, set action variable to 'create' and run this entire series of blocks for the acs1 table. For the acs2 table, set the action variable to 'append' and skip the table creation and identifier insertion blocks.

In [None]:
# Change the table name and specify an action:
#   'create' - build the table for the first time with the acs1 variables
#   'append' - add the acs2 variables to the existing table

metatab = 'acslookup2017'
action = 'create'  # modify to 'create' or 'append'

In [None]:
# Reconnect to the SQLite database file for the metadata steps
# NOTE(review): the path is hard-coded here and the dbname variable from the
# Variables section is not used - confirm which file is intended
con = sqlite3.connect('test.sqlite') 
cur = con.cursor()

In [None]:
# Only run this block when creating the initial table:
# it builds the empty three-column metadata table
if action!='create':
    print('Block not executed because "create" not selected as an action in earlier block')
else:
    mdstring="""
    CREATE TABLE {} (
    tabnum TEXT,
    est_id TEXT,
    est_value TEXT);
    """.format(metatab)
    cur.execute(mdstring)

In [None]:
# Only run this block when creating the initial table:
# it inserts the explanatory note and the three identifier rows
if action!='create':
    print('Block not executed because "create" not selected as an action in earlier block')
else:
    exstring="""
        INSERT INTO {} VALUES('both','NOTE','Each variable has 4 values that are identified by a particular suffix: E for estimate, M for margin of error for the estimate, PC for percent total, and PM for margin of error for the percent total');
        INSERT INTO {} VALUES('both','GEOID','Id');
        INSERT INTO {} VALUES('both','GEOID2','Id2');
        INSERT INTO {} VALUES('both','GEOLABEL','Geography');
        """.format(*[metatab] * 4)
    cur.executescript(exstring)
    con.commit()

In [None]:
# Run when creating the table or when appending records:
# one metadata row per database variable, tagged with the current worksheet name
if action not in ('create','append'):
    print('Block not executed because action not specified in earlier block')
else:
    for mk, mv in vardict.items():
        cur.execute(
            "INSERT INTO {} values(?,?,?)".format(metatab),
            (worksheet, mk, mv[1]),
        )
    con.commit()

In [None]:
# Confirm how many rows the metadata table now holds
rows = cur.execute(f'SELECT COUNT(*) FROM {metatab};').fetchone()
print(rows[0], 'records in', metatab)

In [None]:
# Clear the action so the metadata blocks do nothing if rerun by accident, then close the connection
action=''
con.close()