# Load American Community Survey (ACS) census data
This notebook queries ACS census counts from census.gov, aggregates them by 
groupings defined in the configuration file data/acs_variables.csv, and exports 
a custom .csv file with one record per geographic unit (tract or ZCTA) per year
and one column per configured field.

> Pre-requisites
> - Add ACS concepts of interest and field aggregations to data/acs_variables.csv

In [None]:
import numpy as np
import os
import pandas as pd

import censusdata
import geopandas as gpd
import matplotlib.pyplot as plt
from us import states

In [None]:
def get_va_acs(level: str, year: int, acs_vars: pd.DataFrame) -> pd.DataFrame:
    """Download ACS 5-year data for Virginia and aggregate it into fields.

    Args:
        level: Geographic level to query; either 'tract' or
            'zip code tabulation area'.
        year: Year passed to ``censusdata.download``.
        acs_vars: Configuration table with at least a ``variable`` column
            (ACS variable names to download) and a ``field`` column (name of
            the output column each variable is summed into).

    Returns:
        A DataFrame indexed by geographic identifier (index name 'IX') with
        one column per distinct ``field``; the raw ACS variable columns are
        dropped.

    Raises:
        ValueError: If ``level`` is not one of the supported options.
    """
    if level not in ('tract', 'zip code tabulation area'):
        # Adjacent literals keep the message on one line; a triple-quoted
        # string would embed the source indentation in the error text.
        raise ValueError(
            "Invalid level argument given. Options are 'tract' and "
            "'zip code tabulation area'."
        )

    geo = censusdata.censusgeo([('state', states.VA.fips), (level, '*')])
    # De-duplicate while keeping configuration order so the request is the
    # same on every run (a set would give an arbitrary order).
    variables = list(dict.fromkeys(acs_vars.variable))
    acs_data = censusdata.download(
        src='acs5', year=year, geo=geo, var=variables)

    # Replace the censusgeo index with a plain string identifier.
    if level == 'tract':
        # Concatenate the code of every component of the geography
        # (presumably state + county + tract -- confirm against censusdata).
        geo_ids = [''.join(part[1] for part in g.geo) for g in acs_data.index]
    else:
        # Use only the code of the last geography component.
        geo_ids = [g.geo[-1][1] for g in acs_data.index]
    acs_data['IX'] = geo_ids
    acs_data.set_index('IX', inplace=True)

    # Sum the variables belonging to each configured field into one column.
    for field, group in acs_vars.groupby('field'):
        acs_data[field] = acs_data[list(group.variable)].sum(axis=1)

    # Only the aggregated fields are returned.
    acs_data.drop(columns=variables, inplace=True)
    return acs_data

def verify_aggregation(acs_data: pd.DataFrame, acs_vars: pd.DataFrame) -> None:
    """Verify that every category's columns sum to the 'total' column.

    For each distinct value in ``acs_vars['category']``, all columns of
    ``acs_data`` whose name starts with that value are summed row-wise and
    compared with ``acs_data['total']``.

    Args:
        acs_data: Aggregated ACS table; must contain a 'total' column.
        acs_vars: Configuration table with a ``category`` column.

    Raises:
        ValueError: If ``acs_data`` has no 'total' column.
        AssertionError: If any row of any category does not equal the total.
    """
    if 'total' not in acs_data:
        raise ValueError("Input acs_data must contain a 'total' column.")

    total_sum = np.asarray(acs_data['total'])
    # unique() keeps configuration order, so checks run in a stable order;
    # rows without a category cannot be matched to columns and are skipped.
    categories = acs_vars['category'].dropna().unique()
    for cat in categories:
        print(f'Checking {cat}...')
        cols = [c for c in acs_data if c.startswith(cat)]
        cat_sum = np.asarray(acs_data[cols]).sum(axis=1)
        # Raised explicitly (not via `assert`) so the check still runs when
        # Python is started with -O.
        if not np.all(total_sum == cat_sum):
            raise AssertionError(f'{cat} does not sum to total')

def export_va_acs(level: str, start_year: int, end_year: int,
                  acs_vars: pd.DataFrame) -> pd.DataFrame:
    """Load, verify, and export ACS data across multiple years.

    Calls ``get_va_acs`` once per year in ``start_year..end_year``
    (inclusive), tags each result with a 'year' column, stacks the results,
    checks them with ``verify_aggregation``, and writes the combined table to
    ``out/acs_<agg>_buckets_year.csv``, where ``<agg>`` is 'zcta' for the
    'zip code tabulation area' level and the level name otherwise.

    Args:
        level: Geographic level forwarded to ``get_va_acs``.
        start_year: First year to query.
        end_year: Last year to query (inclusive).
        acs_vars: Variable configuration forwarded to ``get_va_acs`` and
            ``verify_aggregation``.

    Returns:
        The combined DataFrame that was written to disk; its index is named
        after the aggregation level.

    Raises:
        ValueError: If ``start_year`` is greater than ``end_year``.
    """
    # Fail early with a clear message instead of pandas' "No objects to
    # concatenate" error on an empty year range.
    if start_year > end_year:
        raise ValueError(
            f'start_year ({start_year}) must not exceed '
            f'end_year ({end_year}).')

    # Query ACS data one year at a time.
    yearly_frames = []
    for year in range(start_year, end_year + 1):
        year_df = get_va_acs(level=level, year=year, acs_vars=acs_vars)
        year_df.insert(0, 'year', year)
        yearly_frames.append(year_df)

    # Stack all years and make sure the aggregations are consistent.
    acs_data = pd.concat(yearly_frames)
    verify_aggregation(acs_data, acs_vars)

    agg = 'zcta' if level == 'zip code tabulation area' else level
    acs_data.index.name = agg

    # to_csv does not create missing directories.
    os.makedirs('out', exist_ok=True)
    acs_data.to_csv(f'out/acs_{agg}_buckets_year.csv', index=True)
    return acs_data

In [None]:
# Read the variable/field configuration that drives the aggregation.
acs_vars = pd.read_csv('data/acs_variables.csv')

# Build and export the tract-level table for 2015 through 2021.
export_va_acs(
    level='tract',
    start_year=2015,
    end_year=2021,
    acs_vars=acs_vars,
)
# ZCTA-level alternative:
# export_va_acs('zip code tabulation area', 2015, 2021, acs_vars)
