# Access & download US Census data 




In [1]:
# import libraries

import requests
import json
import pandas as pd
from pathlib import Path
import ast
from io import StringIO

## 1) Prep Census query
2020 data

In [2]:
def get_all_variable_table_df(year):
    variable_table_url = f'https://api.census.gov/data/{year}/acs/acs5/variables.html' #f'https://api.census.gov/data/{year}/acs/acs1/variables.html'
    v_table = pd.read_html(variable_table_url)
    return v_table

In [3]:
def get_variable_df(year, v_table):
    variable_df = pd.DataFrame(v_table[0])
    variable_df['Label'].replace({"!!": " ", ":": ""}, regex=True, inplace=True)

    return variable_df

In [4]:
def get_variable_index(variable_table, start_label, end_label):
    start_index = variable_table[variable_table['Label'] == start_label].index[0]
    
    if end_label == 'Estimate Total Hispanic or Latino':
        end_index = variable_table[variable_table['Label'] == end_label].index[1] # second instance of repeated label is the one we want
    else:
        end_index = variable_table[variable_table['Label'] == end_label].index[0]

    return start_index, end_index + 1

In [5]:
def get_variable_names(variable_table, indeces):
    variables_names = ",".join(variable_table.iloc[indeces[0]: indeces[1]]['Name'].values)
    return variables_names

In [6]:
def get_query_url(year, variables, st):
    # API Reference: https://www.census.gov/data/developers/guidance/api-user-guide.Example_API_Queries.html
    #https://api.census.gov/data/2020/acs/acs5/examples.html
    
    
    host = 'https://api.census.gov/data'
    year = f'/{year}'
    dataset_acronym = '/acs/acs5'
    g = '?get='
    #location = '&for=block%20group:*&in=state:'+ st+'&in=county:*&in=tract:*' #cbg
    location = '&for=tract:*&in=state:'+ st  #tract
    usr_key = f"&key=d80e6cde028d24646d48637958f273fa5bc19b3b" #api_key

    query_url = f"{host}{year}{dataset_acronym}{g}{variables}{location}{usr_key}"

    return query_url

In [7]:
def get_query_text(query_url):
    response = requests.get(query_url)
    return response.text

In [8]:
def get_values_from_response(response_text):
    #values = [int(i) for i in ast.literal_eval(response_text)[1][:-1]] 
    
    string_response = StringIO(response_text)
    df = pd.read_csv(string_response, sep=",")
    df = df.replace('[\[, \], "]','', regex=True)
    df.columns = df.columns.str.replace('[\[, \], "]','')
    
    return df

In [9]:
def get_labels(variable_df, indeces):
    labels  = []
    for idx in indeces:
        lab = [i.replace("!!", " ").replace(":", "") for i in variable_df.iloc[idx[0]:idx[1]]['Label'].values]
        labels.append(lab)
    flat_labels = [item for sublist in labels for item in sublist]
    
    return flat_labels

## 2) Variable Selection

#householder gender and dependency ratios
start_label = 'Estimate Total Male'
end_label = 'Estimate Total Female 85 years and over'

#num of units in structure B25024
'Estimate Total 1, detached'
'Estimate Total Mobile home'


#tenure B25008
'Estimate Total Owner occupied'
'Estimate Total Renter occupied'
    

#hispanic or lation or others: B03002 --> dont use #race B02001
'Estimate Total Not Hispanic or Latino White alone'
'Estimate Total Hispanic or Latino'

#year structure built B25034
'Estimate Total Built 2014 or later'
'Estimate Median year structure built'

#year moved in B25038
'Estimate Total Owner occupied Moved in 2019 or later'
'Estimate Median year householder moved into unit -- Total'

#num bedrooms B25041
'Estimate Total No bedroom'
'Estimate Total 5 or more bedrooms'

#num rooms B25017
'Estimate Total 1 room'
'Estimate Median number of rooms'

#primary heating fuel B25040
'Estimate Total Utility gas'
'Estimate Total No fuel used'


#num household members B11016
'Estimate Total Family households 2-person household' 
'Estimate Total Nonfamily households 7-or-more person household'
        

#hh income in past year B19001
'Estimate Total Less than $10,000'
'Estimate Total $200,000 or more'
'Estimate Median household income in the past 12 months (in 2020 inflation-adjusted dollars)'

In [10]:
#VARIABLE SELECTION
#check --> https://api.census.gov/data/2020/acs/acs5/variables.html

#householder gender and dependency ratios
#num of units in structure B25024
#tenure B03002
#hispanic or lation or others: B03002 --> dont use #race B02001
#year structure built B25034 --> do we need this #year moved in B25038? 
# num bedrooms B25041
#num rooms B25017
#primary heating fuel B25040
#num household members B11016
#hh income in past year B19001


start_label = ['Estimate Total Male', 'Estimate Total 1, detached', 'Estimate Total Owner occupied', 'Estimate Total Not Hispanic or Latino White alone',
               'Estimate Total Built 2014 or later',  'Estimate Total No bedroom', 'Estimate Total 1 room', 'Estimate Total Utility gas', 
               'Estimate Total Family households 2-person household','Estimate Total Less than $10,000',
               'Estimate Median household income in the past 12 months (in 2020 inflation-adjusted dollars)',
               'Estimate Total In households Householder Male'] #'Estimate Total Owner occupied Moved in 2019 or later',

end_label = ['Estimate Total Female 85 years and over', 'Estimate Total Mobile home', 'Estimate Total Renter occupied',
             'Estimate Total Hispanic or Latino', 'Estimate Median year structure built',  'Estimate Total 5 or more bedrooms',
             'Estimate Median number of rooms', 'Estimate Total No fuel used', 'Estimate Total Nonfamily households 7-or-more person household',
             'Estimate Total $200,000 or more', 'Estimate Median household income in the past 12 months (in 2020 inflation-adjusted dollars)',
             'Estimate Total In households Householder Female'] #'Estimate Median year householder moved into unit -- Total',


In [11]:
#prep variables for query
year = 2020
#html table for 2020 variables
v_table = get_all_variable_table_df(year)

In [12]:
#check --> https://api.census.gov/data/2020/acs/acs5/variables.html --> should have 27893 rows
var_df = get_variable_df(year, v_table)
#var_df.to_csv('../../data/in/acs5_2020_raw_ct/ACS5_2020_vb_code_labels.csv')

#get selected variable
#loop for all vbs
v_index = []
v_names = []
for start, end in zip(start_label, end_label):
    variable_indeces = get_variable_index(var_df, start, end)
    variables = get_variable_names(var_df, variable_indeces) 
    
    #save for dictionary
    v_index.append(variable_indeces)
    v_names.append(variables)

#convert to one string
v_names_all = ','.join(v_names)
#replace emptuy strings
v_names_all = v_names_all.replace(',,', ',')
#split to do 50 vb queris at a time (max num of queries)
v_names_separate_list = v_names_all.split(',')
name_chunks = [v_names_separate_list[x:x+50] for x in range(0, len(v_names_separate_list), 50)]

## 3) Query census

For all states and all CTs

In [13]:
#loop through states to get data
state_lst = ['01', '02', '04', '05', '06', '08', '09', '10','11', '12',
             '13', '15', '16', '17', '18', '19', '20', '21', '22',
            '23', '24', '25', '26', '27', '28', '29', '30', '31',
            '32', '33', '34', '35', '36', '37', '38', '39', '40',
            '41', '42', '44', '45', '46', '47', '48', '49', '50',
            '51', '53', '54', '55', '56', '72']


#launch query
for st in state_lst:
    df_summary = pd.DataFrame(columns=['state','county','tract'])
    for ch in name_chunks:
        #convert to one string
        vbs = ','.join(ch)
        print(st)
        
        #query
        query_url = get_query_url(year, vbs, st) 
        response_text = get_query_text(query_url)
        
        #save data into one df every 50 vbs
        vals = get_values_from_response(response_text)
        #labels = get_labels(var_df, variable_indeces)
        #dic_labels = dict(zip(v_names_separate_list, labels)) 
        #vals.rename(columns=dic_labels, inplace = True)
        #print(dic_labels)
        
        #merge dataframes
        df_summary = df_summary.merge(vals, on=['state','county','tract'], how='outer') 

    #rename variables from code to label, so that we can understand them
    labels = get_labels(var_df, v_index)
    dic_labels = dict(zip(v_names_all.split(','), labels)) #based on all variables dict
    df_summary.rename(columns=dic_labels, inplace = True)
    df_summary.to_csv('/global/scratch/users/cristina_crespo/p1_data/in_us_census/acs5_2020_raw_ct/ACS5_2020_'+st+'_ct.csv')
    


72


  df.columns = df.columns.str.replace('[\[, \], "]','')


72


  df.columns = df.columns.str.replace('[\[, \], "]','')


72


  df.columns = df.columns.str.replace('[\[, \], "]','')


In [14]:
len(df_summary.columns.to_list()) - 6 #state, county, tract % unnamed

140

## 4) Save all state outputs into one csv

2020 data

In [None]:
#Read census data 
#loop through states to get data
state_lst = ['01', '02', '04', '05', '06', '08', '09', '10','11', '12',
             '13', '15', '16', '17', '18', '19', '20', '21', '22',
            '23', '24', '25', '26', '27', '28', '29', '30', '31',
            '32', '33', '34', '35', '36', '37', '38', '39', '40',
            '41', '42', '44', '45', '46', '47', '48', '49', '50',
            '51', '53', '54', '55', '56', '72']

print(len(state_lst))
dfs= []
#launch query
for st in state_lst: # to do add other functions
    print(st)
    df_summary_raw = pd.read_csv('/global/scratch/users/cristina_crespo/p1_data/in_us_census/acs5_2020_raw_ct/ACS5_2020_'+st+'_ct.csv')
    df_summary = df_summary_raw.loc[:,~df_summary_raw.columns.str.startswith('Unnamed')]
    df_summary = refor.dep_ratios(df_summary)
    df_summary = refor.avg_hh_members(df_summary)
    df_summary = refor.avg_income(df_summary)
    df_summary = refor.hh_gender(df_summary)
    df_summary = refor.unit_str(df_summary)
    df_summary = refor.tenure_type(df_summary)
    df_summary = refor.race_ethn(df_summary)
    df_summary = refor.yr_built(df_summary)
    df_summary = refor.num_bedrooms(df_summary)
    df_summary = refor.num_rooms(df_summary)
    df_summary = refor.fuel_heat(df_summary)
    df_summary = refor.total_pop(df_summary)
    
    #subset to variables of interest in correct format
    df_summary = refor.subset_columns(df_summary)
    #append states
    dfs.append(df_summary)
    
    
data_f = pd.concat(dfs, ignore_index = True, sort=False)

In [None]:
data_f.to_csv(path +'in_us_census/acs5_2020_vbs_per_ct.csv')