In [1]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

import urllib3
from bs4 import BeautifulSoup
import lxml.html as lh

import pandas as pd
import numpy as np

import itertools
from itertools import islice, count

import json
from json import JSONDecodeError

import pickle

import census_config

### Census variables related to racial populations

There are more than 50 population estimates, each with a matching percentage estimate, for different racial and ethnic groups and combinations per state. I'm using BeautifulSoup to scrape the variables table on the Census Data API website into a dictionary that maps each variable name to its description. Two things changed after 2016: the variable numbering shifted (DP05_0028–DP05_0080 through 2016, DP05_0033–DP05_0085 from 2017 on), and so did the exact wording of each description.

In [2]:
def get_variables(year):
    """Scrape the DP05 race/ethnicity variable IDs and descriptions for one year.

    Downloads the ACS 1-year data-profile ``variables.html`` page for ``year``
    and reads the rows of the first HTML table on that page.

    Parameters
    ----------
    year : int
        Survey year. It is substituted into the URL and selects which range of
        DP05 variable numbers is kept (28-80 up to 2016, 33-85 afterwards).

    Returns
    -------
    dict
        Maps a variable ID such as ``'DP05_0028E'`` or ``'DP05_0028PE'`` to the
        description text found in the second cell of its table row. Only IDs in
        the selected range that actually appear in the table are included.

    Raises
    ------
    requests.HTTPError
        If the variables page responds with an error status.
    ValueError
        If the downloaded page contains no ``<table>`` element.
    """
    census_variables_url = 'https://api.census.gov/data/{}/acs/acs1/profile/variables.html'.format(year)

    # Fail fast on a bad response instead of parsing an error page, and never
    # hang indefinitely on a stalled connection.
    page = requests.get(census_variables_url, timeout=60)
    page.raise_for_status()

    variable_soup = BeautifulSoup(page.content, 'lxml')
    variable_table = variable_soup.find('table')
    if variable_table is None:
        raise ValueError('No variable table found at {}'.format(census_variables_url))

    # Same block of 53 variables in both cases; only the numbering differs.
    first_id, last_id = (28, 80) if year <= 2016 else (33, 85)
    name_ids = {
        'DP05_{:04}{}'.format(i, suffix)
        for i in range(first_id, last_id + 1)
        for suffix in ('E', 'PE')
    }

    id2variable_description = dict()

    for row in variable_table.find_all('tr'):
        row_items = row.find_all('td')
        if len(row_items) > 1:
            # get_text()'s first positional argument is the separator inserted
            # between text nodes, so it must not be a tag/attribute name.
            name_id = row_items[0].get_text(strip=True)
            description = row_items[1].get_text(' ', strip=True)

            if name_id in name_ids:
                id2variable_description[name_id] = description
    return id2variable_description

### U.S. territory IDs

I'm using this [list of territory IDs](https://www2.census.gov/geo/docs/reference/state.txt?#) in order to match U.S. states and territories in the API. Note that the IDs aren't just a list from 1 to 50 and that Washington, D.C. and Puerto Rico are included. Unfortunately, several of the territories listed in this reference do not have detailed data available yearly through the American Community Survey. From what I can tell, there is some longform census data available for several of these island territories, so I might approach that as a next step at some point. 

Thus, my `id2territory` variable really just has IDs for states plus Washington, D.C. and Puerto Rico. 

In [3]:
# Pipe-delimited reference list of state/territory IDs published by the Census Bureau.
census_territories_ids_url = 'https://www2.census.gov/geo/docs/reference/state.txt?#'

# Areas that appear in the reference list but are left out of the ID lookup.
areas_without_data = {
    'American Samoa',
    'Guam',
    'Northern Mariana Islands',
    'U.S. Minor Outlying Islands',
    'U.S. Virgin Islands',
}

In [4]:
# Build id2territory: territory ID (first field) -> territory name (third field).
# The reference file is pipe-delimited plain text, so it is split directly
# instead of being passed through an HTML parser.
page = requests.get(census_territories_ids_url, timeout=60)
page.raise_for_status()

# Skip the first line (column header) and any blank lines.
census_territories_ids = [line for line in page.text.splitlines()[1:] if line.strip()]

id2territory = dict()

for territory in census_territories_ids:
    items = [item.strip() for item in territory.split('|')]
    if len(items) < 3:
        # Not a complete record; skip it rather than fail with IndexError.
        continue
    if items[2] not in areas_without_data:
        id2territory[items[0]] = items[2]

### ACS API data

This loop takes about 8 hours to run. It's a somewhat naive implementation (I could instead get all territories with one API call), but I'm keeping it this way to make the structure that I want (states as rows and variable descriptions as columns) easier to obtain for now. I'm also saving a dictionary of DataFrames for each year so that I can fix issues with renaming columns outside of this already larger-than-I-want-it-to-be loop. 

In [5]:
# ---------------------------------------------------------------------------
# DISABLED CELL: download loop, kept for reference only.
#
# For each year in 2011-2018 and each entry in id2territory, this issues one
# request per variable to the ACS 1-year profile endpoint and stores the value
# at data[1][1] under the variable's description. One DataFrame per year is
# collected in detailed_race_by_year. The session retries failed connections
# up to 3 times with backoff.
#
# NOTE(review): get_variables(year) is called inside the territory loop even
#   though it depends only on `year`; hoisting it to the year loop would avoid
#   re-downloading the variables page for every territory.
# NOTE(review): detailed_race_by_year[year] is rebuilt after every territory;
#   building it once after the territory loop gives the same final result.
# NOTE(review): only JSONDecodeError is handled. A response that parses as
#   JSON but is not a two-row table would presumably fail at data[1][1].
#   TODO confirm.
# ---------------------------------------------------------------------------

# api_url = 'https://api.census.gov/data/{}/acs/acs1/profile?get=NAME,{}&for=state:{}&key={}'

# session = requests.Session()
# retry = Retry(connect=3, backoff_factor=1)
# adapter = HTTPAdapter(max_retries=retry)
# session.mount('http://', adapter)
# session.mount('https://', adapter)

# years = range(2011,2019)

# detailed_race_by_year = dict()

# for year in years:
#     detailed_race_list = []
#     for territory in id2territory.keys():
#         row = dict()
#         row['State'] = id2territory[territory]
#         row['Year'] = year

#         id2variable_description = get_variables(year)

#         for variable in id2variable_description.keys():
#             census_api = api_url.format(year, variable, territory, census_config.api_key)

#             try:
#                 data = json.loads(session.get(census_api).text)
#                 row[id2variable_description[variable]] = data[1][1]
#                 success = True
#             except JSONDecodeError as e:
#                 print('Error for {} in {} {}: {}'.format(variable, id2territory[territory], year, e))
#                 success = False

#             if not success:
#                 row[id2variable_description[variable]] = np.nan

#         detailed_race_list.append(row)

#         detailed_race_by_year[year] = pd.DataFrame(detailed_race_list)

### Saving progress

I don't want to run the above loop more often than I have to – I'll save the dictionary for later use. This is also after some manual inspection and retrying of any errors. 

In [7]:
# DISABLED CELL: pickles the in-memory detailed_race_by_year dict to
# 'detailed_race_by_year.pkl'. Left commented out so that running the notebook
# does not overwrite the saved file.
# with open('detailed_race_by_year.pkl', 'wb') as file:
#     pickle.dump(detailed_race_by_year, file)

In [6]:
# Restore the saved per-year dict from disk instead of re-running the download.
# Only unpickle files you produced yourself: unpickling can execute arbitrary code.
with open('detailed_race_by_year.pkl', mode='rb') as pkl_file:
    detailed_race_by_year = pickle.load(pkl_file)

### Manual column cleaning/renaming/standardizing

In [29]:
# DISABLED CELL: one-off column standardisation for the years 2011-2018.
# Per year it:
#   1. rewrites every column name containing
#      'Estimate!!RACE!!Total population!!Some other race' to the longer
#      'Race alone or in combination ...' path (a substring replace, so the
#      matching 'Percent Estimate...' columns are renamed as well);
#   2. keeps only the first of any columns that share a name after the rename;
#   3. drops the 'Percent Estimate!!RACE!!One race' column.
# NOTE(review): step 3 raises KeyError if that column is absent, so this cell
#   is presumably not safe to run on already-cleaned frames. TODO confirm.
# for year in range(2011,2019):
#     columns = detailed_race_by_year[year].columns
#     new_columns = {column: column.replace('Estimate!!RACE!!Total population!!Some other race', 'Estimate!!RACE!!Total population!!Race alone or in combination with one or more other races!!Total population!!Some other race') for column in columns}
#     detailed_race_by_year[year] = detailed_race_by_year[year].rename(columns=new_columns)
#     detailed_race_by_year[year] = detailed_race_by_year[year].loc[:, ~detailed_race_by_year[year].columns.duplicated()]
#     detailed_race_by_year[year] = detailed_race_by_year[year].drop('Percent Estimate!!RACE!!One race',axis=1)

In [30]:
# DISABLED CELL: sanity check that the 2013 and 2018 frames have the same
# column name at every position; the assertion message shows the first pair
# that differs.
# NOTE(review): only these two years are compared, and the loop length comes
#   from the 2013 frame, so extra trailing columns in 2018 would go unnoticed.
# for i in range(len(detailed_race_by_year[2013].columns)):
#     columns_string = '\n'+detailed_race_by_year[2013].columns[i] + '\ndoes not match \n' + detailed_race_by_year[2018].columns[i]
#     assert (detailed_race_by_year[2013].columns[i] == detailed_race_by_year[2018].columns[i]), columns_string

### Cleaned result

In [33]:
# DISABLED CELL: pickles the in-memory detailed_race_by_year dict to
# 'detailed_race_by_year_cleaned.pkl'. Left commented out so that running the
# notebook does not overwrite the saved file.
# with open('detailed_race_by_year_cleaned.pkl', 'wb') as file:
#     pickle.dump(detailed_race_by_year, file)

In [133]:
# Load the saved, cleaned per-year dict; this rebinds detailed_race_by_year.
# Only unpickle files you produced yourself: unpickling can execute arbitrary code.
with open('detailed_race_by_year_cleaned.pkl', mode='rb') as pkl_file:
    detailed_race_by_year = pickle.load(pkl_file)

### Let's do some visualization and analysis

In [31]:
# Stack the per-year frames, in the dict's iteration order, into one long table.
detailed_race_df = pd.concat(objs=[*detailed_race_by_year.values()])

In [38]:
# Replace the existing row labels with a fresh 0..n-1 index; drop=True discards
# the old labels instead of keeping them as a column.
detailed_race_df = detailed_race_df.reset_index(drop=True)

In [39]:
# Export the combined table to CSV in the working directory.
# NOTE(review): with the default index=True the row index is written as an
#   unnamed first column; pass index=False if readers should not see it.
detailed_race_df.to_csv('Detailed_Race_DP05_2011_2018.csv')

In [40]:
# Show the combined frame as the cell's output.
detailed_race_df

Unnamed: 0,State,Year,Estimate!!RACE!!Total population!!Total population,Percent Estimate!!RACE!!Total population!!Total population,Estimate!!RACE!!Total population!!One race,Percent Estimate!!RACE!!Total population!!One race,Estimate!!RACE!!Total population!!Two or more races,Percent Estimate!!RACE!!Total population!!Two or more races,Estimate!!RACE!!Total population!!One race!!White,Percent Estimate!!RACE!!Total population!!One race!!White,...,Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Native Hawaiian and Other Pacific Islander alone,Percent Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Native Hawaiian and Other Pacific Islander alone,Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Some other race alone,Percent Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Some other race alone,Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Two or more races,Percent Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Two or more races,Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Two or more races!!Two races including Some other race,Percent Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Two or more races!!Two races including Some other race,"Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Two or more races!!Two races excluding Some other race, and Three or more races","Percent Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino!!Two or more races!!Two races excluding Some other race, and Three or more races"
0,Alabama,2011,4802740,4802740,4738290,98.7,64450,1.3,3318110,69.1,...,1902,0.0,3880,0.1,57563,1.2,2546,0.1,55017,1.1
1,Alaska,2011,722718,722718,662489,91.7,60229,8.3,482698,66.8,...,7557,1.0,529,0.1,53596,7.4,273,0.0,53323,7.4
2,Arizona,2011,6482505,6482505,6288212,97.0,194293,3.0,5141027,79.3,...,10983,0.2,6376,0.1,115084,1.8,3675,0.1,111409,1.7
3,Arkansas,2011,2937979,2937979,2880750,98.1,57229,1.9,2296248,78.2,...,6967,0.2,1425,0.0,48524,1.7,914,0.0,47610,1.6
4,California,2011,37691912,37691912,36097905,95.8,1594007,4.2,23698393,62.9,...,136245,0.4,73698,0.2,995621,2.6,51890,0.1,943731,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,Washington,2018,7535591,7535591,7082958,94.0,452633,6.0,5633263,74.8,...,52145,0.7,14078,0.2,369024,4.9,8582,0.1,360442,4.8
412,West Virginia,2018,1805832,1805832,1771421,98.1,34411,1.9,1679773,93.0,...,451,0.0,2286,0.1,31865,1.8,335,0.0,31530,1.7
413,Wisconsin,2018,5813568,5813568,5667055,97.5,146513,2.5,4957243,85.3,...,1800,0.0,6629,0.1,119584,2.1,2648,0.0,116936,2.0
414,Wyoming,2018,577737,577737,564021,97.6,13716,2.4,528453,91.5,...,1065,0.2,537,0.1,10194,1.8,0,0.0,10194,1.8
