# Demographic and economic data #1

The demographic information comes from "SELECTED ECONOMIC CHARACTERISTICS  
2012-2016 American Community Survey 5-Year Estimates", obtained at this URL: https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_16_5YR_DP03&prodType=table.

We want some basic economic and demographic data in order to quantify the impact that unaffordable water prices can have.

A companion notebook, `demo-2`, applies a very similar procedure to the household income data, which I later realized was not included in the first table I exported.

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np

import cawc

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Directory holding the exported ACS table, below the project's data path.
PATH = cawc.PATH_DATA / 'sec-2016'

# NOTE(review): GEO.id2 appears to be parsed as a number here (the sampled
# rows show 6037533901 where GEO.id ends in 06037533901), so the leading zero
# of the identifier is lost — confirm that downstream joins on this column
# expect that form.
data = pd.read_csv(PATH / 'ACS_16_5YR_DP03.csv')
# Show 10 randomly chosen rows as a sanity check (no seed, so they vary per run).
data.sample(10)

Unnamed: 0,GEO.id,GEO.id2,GEO.display-label,HC03_VC03,HC04_VC03,HC03_VC04,HC04_VC04,HC03_VC05,HC04_VC05,HC03_VC06,...,HC03_VC176,HC04_VC176,HC03_VC177,HC04_VC177,HC03_VC178,HC04_VC178,HC03_VC179,HC04_VC179,HC03_VC180,HC04_VC180
2775,1400000US06037533901,6037533901,"Census Tract 5339.01, Los Angeles County, Cali...",4517,,66.2,3.9,66.2,3.9,59.1,...,25.8,6.8,25.9,7.3,24.7,15.8,30.4,7.9,45.2,15.2
1976,1400000US06037240700,6037240700,"Census Tract 2407, Los Angeles County, California",4646,,58.0,3.8,58.0,3.8,52.1,...,17.7,4.8,17.8,5.2,17.0,9.3,19.9,6.3,41.4,13.0
6573,1400000US06077004404,6077004404,"Census Tract 44.04, San Joaquin County, Califo...",2564,,63.4,4.2,63.4,4.2,56.5,...,29.9,7.5,31.7,7.3,16.7,17.2,32.0,8.9,40.1,9.9
3225,1400000US06037650001,6037650001,"Census Tract 6500.01, Los Angeles County, Cali...",4970,,57.5,3.8,57.5,3.8,55.1,...,11.3,5.0,11.1,5.5,11.9,10.1,6.9,6.1,40.2,14.3
5346,1400000US06071001308,6071001308,"Census Tract 13.08, San Bernardino County, Cal...",3683,,68.5,4.1,68.5,4.1,58.1,...,18.4,6.0,18.3,6.4,19.4,15.6,22.4,9.2,38.2,14.8
5835,1400000US06073008358,6073008358,"Census Tract 83.58, San Diego County, California",6488,,70.9,4.0,70.2,4.2,64.5,...,7.2,5.0,8.6,6.0,1.4,3.3,6.5,6.4,17.4,11.8
4298,1400000US06059099217,6059099217,"Census Tract 992.17, Orange County, California",2061,,65.5,3.5,65.5,3.5,63.2,...,4.4,2.2,5.6,2.8,0.4,1.2,1.3,1.6,15.1,6.6
5222,1400000US06067009309,6067009309,"Census Tract 93.09, Sacramento County, California",2028,,53.1,4.1,52.6,4.1,45.2,...,8.3,2.8,8.1,3.7,8.7,5.1,6.7,3.7,18.9,8.7
7945,1400000US06111006700,6111006700,"Census Tract 67, Ventura County, California",2669,,67.1,6.9,66.8,6.9,60.7,...,10.8,7.4,11.3,9.3,9.0,6.9,3.3,4.0,30.6,18.7
4762,1400000US06065043308,6065043308,"Census Tract 433.08, Riverside County, California",2127,,58.7,5.5,58.7,5.5,42.3,...,25.1,6.7,26.7,6.7,13.7,11.3,25.2,8.7,33.5,10.7


The data annotation table is inconvenient to browse; I tried to rearrange the format, but the intrinsic nested, hierarchical structure is poorly matched to a flat table.

In [3]:
def get_kind(c):
    """Return the measure kind of each annotation string.

    The kind is whatever precedes the first ``'; '`` separator (the whole
    string when the separator is absent).

    Parameters
    ----------
    c : string Series of annotation texts.

    Returns
    -------
    Series with the same index as ``c``.
    """
    pieces = c.str.split('; ')
    return pieces.str[0]

def get_description(c):
    """Break the description part of each annotation string into levels.

    The description is the second ``'; '``-separated field; it is split on
    ``' - '`` into one column per hierarchy level.

    Parameters
    ----------
    c : string Series of annotation texts.

    Returns
    -------
    DataFrame with integer column labels (0, 1, ...), one per level. Rows
    with fewer levels, or without a description field, hold missing values.
    """
    return (c.str.split('; ')
            .str[1]
            .str.split(' - ', expand=True))

def assign_description_columns(df):
    """Add one ``descr_<n>`` column per description level to ``df``.

    The levels are obtained by running ``get_description`` on the
    ``'fulltext'`` column; level ``n`` is stored as ``descr_n``.

    Parameters
    ----------
    df : DataFrame with a ``'fulltext'`` column of annotation strings.

    Returns
    -------
    A new DataFrame; ``df`` itself is not modified.
    """
    levels = get_description(df['fulltext']).add_prefix('descr_')
    new_columns = {name: levels[name] for name in levels.columns}
    return df.assign(**new_columns)

def process_ann(df):
    """Turn the raw annotation row into a per-column lookup table.

    Parameters
    ----------
    df : DataFrame whose row labelled ``0`` holds the annotation text of
        each column.

    Returns
    -------
    DataFrame indexed by the original column names, with a ``kind`` column
    and the ``descr_<n>`` level columns, all of categorical dtype. Missing
    levels are filled with ``'-'``.
    """
    # One row per original column; the single annotation row becomes 'fulltext'.
    labelled = df.T.rename(columns={0: 'fulltext'})
    labelled = labelled.assign(kind=get_kind(labelled.fulltext))
    described = assign_description_columns(labelled)
    # The raw text is redundant once it has been split into kind + levels.
    tidy = described.drop(columns=['fulltext']).fillna('-')
    return tidy.astype('category')

In [4]:
# Only the first data row of the annotation file is read (nrows=1);
# process_ann treats that row as the full-text label of each column.
ANN = (pd.read_csv(PATH / 'ACS_16_5YR_DP03_ann.csv', nrows=1)
       .pipe(process_ann)
      )

# Display the annotation lookup table, indexed by column code.
ANN

Unnamed: 0,kind,descr_0,descr_1,descr_2,descr_3,descr_4,descr_5
GEO.id,Id,-,-,-,-,-,-
GEO.id2,Id2,-,-,-,-,-,-
GEO.display-label,Geography,-,-,-,-,-,-
HC03_VC03,Percent,EMPLOYMENT STATUS,Population 16 years and over,-,-,-,-
HC04_VC03,Percent Margin of Error,EMPLOYMENT STATUS,Population 16 years and over,-,-,-,-
HC03_VC04,Percent,EMPLOYMENT STATUS,Population 16 years and over,In labor force,-,-,-
HC04_VC04,Percent Margin of Error,EMPLOYMENT STATUS,Population 16 years and over,In labor force,-,-,-
HC03_VC05,Percent,EMPLOYMENT STATUS,Population 16 years and over,In labor force,Civilian labor force,-,-
HC04_VC05,Percent Margin of Error,EMPLOYMENT STATUS,Population 16 years and over,In labor force,Civilian labor force,-,-
HC03_VC06,Percent,EMPLOYMENT STATUS,Population 16 years and over,In labor force,Civilian labor force,Employed,-


In [13]:
# Distinct values of the first description level across all columns.
ANN['descr_0'].cat.categories

Index(['-', 'CLASS OF WORKER', 'COMMUTING TO WORK', 'EMPLOYMENT STATUS',
       'HEALTH INSURANCE COVERAGE',
       'INCOME AND BENEFITS (IN 2016 INFLATION-ADJUSTED DOLLARS)', 'INDUSTRY',
       'OCCUPATION',
       'PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL'],
      dtype='object')

In [15]:
# Annotation rows whose first description level mentions income and benefits.
ANN[ANN.descr_0.str.contains('income and benefits', case=False)]

Unnamed: 0,kind,descr_0,descr_1,descr_2,descr_3,descr_4,descr_5
HC03_VC74,Percent,INCOME AND BENEFITS (IN 2016 INFLATION-ADJUSTE...,Total households,-,-,-,-
HC04_VC74,Percent Margin of Error,INCOME AND BENEFITS (IN 2016 INFLATION-ADJUSTE...,Total households,-,-,-,-
HC03_VC75,Percent,INCOME AND BENEFITS (IN 2016 INFLATION-ADJUSTE...,Total households,"Less than $10,000",-,-,-
HC04_VC75,Percent Margin of Error,INCOME AND BENEFITS (IN 2016 INFLATION-ADJUSTE...,Total households,"Less than $10,000",-,-,-
HC03_VC76,Percent,INCOME AND BENEFITS (IN 2016 INFLATION-ADJUSTE...,Total households,"$10,000 to $14,999",-,-,-
HC04_VC76,Percent Margin of Error,INCOME AND BENEFITS (IN 2016 INFLATION-ADJUSTE...,Total households,"$10,000 to $14,999",-,-,-
HC03_VC77,Percent,INCOME AND BENEFITS (IN 2016 INFLATION-ADJUSTE...,Total households,"$15,000 to $24,999",-,-,-
HC04_VC77,Percent Margin of Error,INCOME AND BENEFITS (IN 2016 INFLATION-ADJUSTE...,Total households,"$15,000 to $24,999",-,-,-
HC03_VC78,Percent,INCOME AND BENEFITS (IN 2016 INFLATION-ADJUSTE...,Total households,"$25,000 to $34,999",-,-,-
HC04_VC78,Percent Margin of Error,INCOME AND BENEFITS (IN 2016 INFLATION-ADJUSTE...,Total households,"$25,000 to $34,999",-,-,-


In [5]:
# Annotation rows whose first description level mentions health.
ANN[ANN.descr_0.str.contains('health', case=False)]

Unnamed: 0,kind,descr_0,descr_1,descr_2,descr_3,descr_4,descr_5
HC03_VC130,Percent,HEALTH INSURANCE COVERAGE,Civilian noninstitutionalized population,-,-,-,-
HC04_VC130,Percent Margin of Error,HEALTH INSURANCE COVERAGE,Civilian noninstitutionalized population,-,-,-,-
HC03_VC131,Percent,HEALTH INSURANCE COVERAGE,Civilian noninstitutionalized population,With health insurance coverage,-,-,-
HC04_VC131,Percent Margin of Error,HEALTH INSURANCE COVERAGE,Civilian noninstitutionalized population,With health insurance coverage,-,-,-
HC03_VC132,Percent,HEALTH INSURANCE COVERAGE,Civilian noninstitutionalized population,With health insurance coverage,With private health insurance,-,-
HC04_VC132,Percent Margin of Error,HEALTH INSURANCE COVERAGE,Civilian noninstitutionalized population,With health insurance coverage,With private health insurance,-,-
HC03_VC133,Percent,HEALTH INSURANCE COVERAGE,Civilian noninstitutionalized population,With health insurance coverage,With public coverage,-,-
HC04_VC133,Percent Margin of Error,HEALTH INSURANCE COVERAGE,Civilian noninstitutionalized population,With health insurance coverage,With public coverage,-,-
HC03_VC134,Percent,HEALTH INSURANCE COVERAGE,Civilian noninstitutionalized population,No health insurance coverage,-,-,-
HC04_VC134,Percent Margin of Error,HEALTH INSURANCE COVERAGE,Civilian noninstitutionalized population,No health insurance coverage,-,-,-


In [6]:
# Annotation rows whose first description level mentions poverty.
ANN[ANN.descr_0.str.contains('poverty', case=False)]

Unnamed: 0,kind,descr_0,descr_1,descr_2,descr_3,descr_4,descr_5
HC03_VC161,Percent,PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME...,All families,-,-,-,-
HC04_VC161,Percent Margin of Error,PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME...,All families,-,-,-,-
HC03_VC162,Percent,PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME...,All families,With related children of the householder under...,-,-,-
HC04_VC162,Percent Margin of Error,PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME...,All families,With related children of the householder under...,-,-,-
HC03_VC163,Percent,PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME...,All families,With related children of the householder under...,With related children of the householder under...,-,-
HC04_VC163,Percent Margin of Error,PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME...,All families,With related children of the householder under...,With related children of the householder under...,-,-
HC03_VC164,Percent,PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME...,Married couple families,-,-,-,-
HC04_VC164,Percent Margin of Error,PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME...,Married couple families,-,-,-,-
HC03_VC165,Percent,PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME...,Married couple families,With related children of the householder under...,-,-,-
HC04_VC165,Percent Margin of Error,PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME...,Married couple families,With related children of the householder under...,-,-,-


After manually looking up the column codes of the desired variables in the annotation table, we build a subtable containing only those columns, renamed to the labels used in our application.

In [7]:
# Maps raw column codes of the ACS table to the names used in our application.
# The dict order sets the column order of the subset built by get_data_subset.
# The HC03_* codes shown in the annotation table are 'Percent' values; the
# specific VC codes were picked by hand — NOTE(review): confirm each one
# against the annotation table (ANN).
COLUMNS = {
    'GEO.id2': 'geoid',
    'GEO.display-label': 'label',
    'HC03_VC12': 'rate_unemployed',
    'HC03_VC179': 'rate_underpovlimit_families',
    'HC03_VC157': 'rate_no_health_insurance_unemployed',
}

def get_data_subset(df, col_map):
    """Select the columns named in ``col_map`` and rename them.

    Parameters
    ----------
    df : DataFrame containing every key of ``col_map`` as a column.
    col_map : mapping of existing column name -> new column name; its key
        order determines the column order of the result.

    Returns
    -------
    A new DataFrame holding only the selected columns under their new names.
    The list of selected source columns is printed as a side effect.
    """
    wanted = [*col_map.keys()]
    print(wanted)
    subset = df[wanted]
    return subset.rename(columns=col_map)

In [8]:
# The 20 rows with the highest 'rate_no_health_insurance_unemployed' value.
get_data_subset(data, COLUMNS).nlargest(
    20, 'rate_no_health_insurance_unemployed'
)

['GEO.id2', 'GEO.display-label', 'HC03_VC12', 'HC03_VC179', 'HC03_VC157']


Unnamed: 0,geoid,label,rate_unemployed,rate_underpovlimit_families,rate_no_health_insurance_unemployed
1733,6037208904,"Census Tract 2089.04, Los Angeles County, Cali...",8.2,69.7,71.6
6480,6075980600,"Census Tract 9806, San Francisco County, Calif...",14.1,42.5,70.5
4602,6065041500,"Census Tract 415, Riverside County, California",5.3,23.6,68.3
1631,6037194402,"Census Tract 1944.02, Los Angeles County, Cali...",4.1,5.2,67.2
1734,6037209102,"Census Tract 2091.02, Los Angeles County, Cali...",5.2,55.3,64.5
4484,6061022300,"Census Tract 223, Placer County, California",0.0,5.9,62.8
5745,6073003902,"Census Tract 39.02, San Diego County, California",17.7,37.7,60.2
1763,6037211804,"Census Tract 2118.04, Los Angeles County, Cali...",10.2,35.4,60.0
1727,6037208710,"Census Tract 2087.10, Los Angeles County, Cali...",6.2,18.2,58.0
1772,6037212303,"Census Tract 2123.03, Los Angeles County, Cali...",9.1,39.6,57.5


And now we add that as a dataset:

In [9]:
from cawc import datasets as ds

# Display the project's DEMO_ECON dataset entry — presumably the one backed
# by the subset built above; confirm in cawc.datasets.
ds.DEMO_ECON

- `sea2016-A`: _from American Factfinder Selected Economic Characteristics 2012-2016 (CT level)_

<cawc.tools.Dataset object at 0x7fafe33317b8>