# Demographic data by geography

In [1]:
%%html
<!-- Float rendered tables to the left of the output area. -->
<style>
table {float:left}
</style>

In [2]:
import glob
import math
import os
import re

import pandas as pd

# Los Angeles County Continuum of Care

Los Angeles County comprises four CoCs. Thanks to [this crosswalk data](https://github.com/tomhbyrne/HUD-CoC-Geography-Crosswalk), I'm able to use demographic data for the subset of the county that exists within CoC ID CA-600.

In [3]:
# Collect every CSV in the LA DP05 input folder; the list is sorted in the next cell.
la_files = glob.glob('../01_inputs/USCB/DP05/LA/*.csv')

In [4]:
# Put the file paths in lexicographic order so they are read in a stable sequence.
la_files = sorted(la_files)

## Normalize

The column names are slightly different for each of the ACS 5-Year datasets I plan to use. Based on previous experience, column IDs can also vary among years, so I'm going the inelegant route of listing the full column names, because they are more immediately legible to me.

In [5]:
# Columns that read_dp05 loads for files whose name maps to '2020'.
# Unlike the 2015 list, the race-in-combination headers here carry no 'RACE!!' segment.
usecols_2020 = [
    'id',
    'Geographic Area Name',
    'Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino',
    'Margin of Error!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino',
    'Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)',
    'Margin of Error!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)',
    'Estimate!!Race alone or in combination with one or more other races!!Total population',
    'Margin of Error!!Race alone or in combination with one or more other races!!Total population',
    'Estimate!!Race alone or in combination with one or more other races!!Total population!!White',
    'Margin of Error!!Race alone or in combination with one or more other races!!Total population!!White',
    'Estimate!!Race alone or in combination with one or more other races!!Total population!!Black or African American',
    'Margin of Error!!Race alone or in combination with one or more other races!!Total population!!Black or African American',
    'Estimate!!Race alone or in combination with one or more other races!!Total population!!American Indian and Alaska Native',
    'Margin of Error!!Race alone or in combination with one or more other races!!Total population!!American Indian and Alaska Native',
    'Estimate!!Race alone or in combination with one or more other races!!Total population!!Asian',
    'Margin of Error!!Race alone or in combination with one or more other races!!Total population!!Asian',
    'Estimate!!Race alone or in combination with one or more other races!!Total population!!Native Hawaiian and Other Pacific Islander',
    'Margin of Error!!Race alone or in combination with one or more other races!!Total population!!Native Hawaiian and Other Pacific Islander',
    'Estimate!!Race alone or in combination with one or more other races!!Total population!!Some other race',
    'Margin of Error!!Race alone or in combination with one or more other races!!Total population!!Some other race',
]

In [6]:
# Columns that read_dp05 loads for files whose name maps to '2015'.
# The race-in-combination headers here include a leading 'RACE!!' segment.
usecols_2015 = [
    'id',
    'Geographic Area Name',
    'Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino',
    'Margin of Error!!HISPANIC OR LATINO AND RACE!!Total population!!Not Hispanic or Latino',
    'Estimate!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)',
    'Margin of Error!!HISPANIC OR LATINO AND RACE!!Total population!!Hispanic or Latino (of any race)',
    'Estimate!!RACE!!Race alone or in combination with one or more other races!!Total population',
    'Margin of Error!!RACE!!Race alone or in combination with one or more other races!!Total population',
    'Estimate!!RACE!!Race alone or in combination with one or more other races!!Total population!!White',
    'Margin of Error!!RACE!!Race alone or in combination with one or more other races!!Total population!!White',
    'Estimate!!RACE!!Race alone or in combination with one or more other races!!Total population!!Black or African American',
    'Margin of Error!!RACE!!Race alone or in combination with one or more other races!!Total population!!Black or African American',
    'Estimate!!RACE!!Race alone or in combination with one or more other races!!Total population!!American Indian and Alaska Native',
    'Margin of Error!!RACE!!Race alone or in combination with one or more other races!!Total population!!American Indian and Alaska Native',
    'Estimate!!RACE!!Race alone or in combination with one or more other races!!Total population!!Asian',
    'Margin of Error!!RACE!!Race alone or in combination with one or more other races!!Total population!!Asian',
    'Estimate!!RACE!!Race alone or in combination with one or more other races!!Total population!!Native Hawaiian and Other Pacific Islander',
    'Margin of Error!!RACE!!Race alone or in combination with one or more other races!!Total population!!Native Hawaiian and Other Pacific Islander',
    'Estimate!!RACE!!Race alone or in combination with one or more other races!!Total population!!Some other race',
    'Margin of Error!!RACE!!Race alone or in combination with one or more other races!!Total population!!Some other race',
]

In [7]:
# Columns that read_dp05 loads for files whose name maps to '2010'.
# Margin columns are labelled 'Estimate Margin of Error' here, and most headers
# have no 'Total population!!' segment; read_dp05 normalises both differences.
usecols_2010 = [
    'id',
    'Geographic Area Name',
    'Estimate!!HISPANIC OR LATINO AND RACE!!Not Hispanic or Latino',
    'Estimate Margin of Error!!HISPANIC OR LATINO AND RACE!!Not Hispanic or Latino',
    'Estimate!!HISPANIC OR LATINO AND RACE!!Hispanic or Latino (of any race)',
    'Estimate Margin of Error!!HISPANIC OR LATINO AND RACE!!Hispanic or Latino (of any race)',
    'Estimate!!RACE!!Race alone or in combination with one or more other races!!Total population',
    'Estimate Margin of Error!!RACE!!Race alone or in combination with one or more other races!!Total population',
    'Estimate!!RACE!!White',
    'Estimate Margin of Error!!RACE!!White',
    'Estimate!!RACE!!Black or African American',
    'Estimate Margin of Error!!RACE!!Black or African American',
    'Estimate!!RACE!!American Indian and Alaska Native',
    'Estimate Margin of Error!!RACE!!American Indian and Alaska Native',
    'Estimate!!RACE!!Asian',
    'Estimate Margin of Error!!RACE!!Asian',
    'Estimate!!RACE!!Native Hawaiian and Other Pacific Islander',
    'Estimate Margin of Error!!RACE!!Native Hawaiian and Other Pacific Islander',
    'Estimate!!RACE!!Some other race',
    'Estimate Margin of Error!!RACE!!Some other race',
]

In [8]:
def read_dp05(file):
    """Load one tract-level DP05 CSV and shorten its column headers.

    The census year is taken from characters 7-10 of the file's base name and
    selects which ``usecols_*`` list is read. Columns are returned in the order
    of that list, with the year-specific header fragments removed so that the
    three vintages share the same column names, plus a ``'census year'`` column.

    Parameters
    ----------
    file : str
        Path to a DP05 CSV whose base name carries the year at positions 7-10.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If the year parsed from the file name is not '2010', '2015' or '2020'.
    """
    columns_by_year = {
        '2010': usecols_2010,
        '2015': usecols_2015,
        '2020': usecols_2020,
    }
    year = os.path.basename(file)[7:11]
    wanted = columns_by_year[year]
    # skiprows=1 discards the file's first line, so the second line supplies the
    # headers (presumably the first line holds column IDs -- TODO confirm).
    frame = pd.read_csv(file, skiprows=1, usecols=wanted)[wanted]
    # Fragments are stripped in this fixed order from every header.
    redundant_fragments = (
        'Race alone or in combination with one or more other races!!',
        'RACE!!',
        'Total population!!',
        'HISPANIC OR LATINO AND ',
    )
    short_names = []
    for name in frame.columns:
        for fragment in redundant_fragments:
            name = name.replace(fragment, '')
        # The 2010 files label margins as 'Estimate Margin of Error'.
        short_names.append(name.replace('Estimate Margin', 'Margin'))
    frame.columns = short_names
    frame['census year'] = year
    return frame

In [9]:
# Stack the per-year tract tables into one wide frame.
df = pd.concat(list(map(read_dp05, la_files)))

In [10]:
# Wide -> long: one row per tract, year and original column header.
reshaped_df = pd.melt(df, id_vars=['census year', 'id', 'Geographic Area Name'])

In [11]:
# Split each header at the first '!!' into measure and demographic.
# `n` must be passed by keyword: pandas >= 2.0 rejects it as a positional argument.
reshaped_df[['measure', 'demographic']] = reshaped_df['variable'].str.split(
    '!!', n=1, expand=True)

In [12]:
# Preview the long-format frame with the new measure/demographic columns.
reshaped_df.head()

Unnamed: 0,census year,id,Geographic Area Name,variable,value,measure,demographic
0,2010,1400000US06037101110,"Census Tract 1011.10, Los Angeles County, Cali...",Estimate!!Not Hispanic or Latino,3932,Estimate,Not Hispanic or Latino
1,2010,1400000US06037101122,"Census Tract 1011.22, Los Angeles County, Cali...",Estimate!!Not Hispanic or Latino,3212,Estimate,Not Hispanic or Latino
2,2010,1400000US06037101210,"Census Tract 1012.10, Los Angeles County, Cali...",Estimate!!Not Hispanic or Latino,4009,Estimate,Not Hispanic or Latino
3,2010,1400000US06037101220,"Census Tract 1012.20, Los Angeles County, Cali...",Estimate!!Not Hispanic or Latino,2041,Estimate,Not Hispanic or Latino
4,2010,1400000US06037101300,"Census Tract 1013, Los Angeles County, California",Estimate!!Not Hispanic or Latino,3385,Estimate,Not Hispanic or Latino


## Tract/CoC Crosswalk

[source](https://github.com/tomhbyrne/HUD-CoC-Geography-Crosswalk)

In [13]:
# Load the tract-to-CoC crosswalk. tract_fips is read as text so it can be
# concatenated with the '1400000US' prefix and compared with the DP05 `id` strings.
tract_crosswalk_df = pd.read_csv(
    '../01_inputs/USCB/tract_coc_match.csv',
    encoding='ISO-8859-1',
    dtype={'tract_fips': str},
)

In [14]:
# Keep only the crosswalk rows assigned to CoC CA-600.
la_tracts = tract_crosswalk_df.loc[
    tract_crosswalk_df['coc_number'].eq('CA-600')
].copy()

In [15]:
# Prefix the FIPS codes so they match the format of the DP05 `id` column.
# NOTE(review): re-running this cell prepends the prefix a second time.
la_tracts = la_tracts.assign(tract_fips='1400000US' + la_tracts['tract_fips'])

In [16]:
# Unique tract ids belonging to the CoC, used for membership tests below.
coc_tracts = {*la_tracts['tract_fips']}

In [17]:
# Rows for tracts inside the CoC, leaving out the 'Total population' rows.
coc_df = reshaped_df.loc[
    reshaped_df['id'].isin(coc_tracts)
    & reshaped_df['demographic'].ne('Total population')
].copy()

In [18]:
# Total-population estimates (not their margins) for tracts inside the CoC.
coc_total_df = reshaped_df.loc[
    reshaped_df['id'].isin(coc_tracts)
    & reshaped_df['variable'].eq('Estimate!!Total population')
].copy()

Totals to be used in percent calculation [below](#Combine-LA-CoC-and-county-data)

In [19]:
# Sum tract totals per census year.
# The string 'sum' replaces the builtin `sum`, which pandas >= 2.1 warns about
# because a future version will call the builtin directly instead of GroupBy.sum.
coc_totals = coc_total_df.groupby(['census year']).agg(
    total_population=('value', 'sum')).reset_index()

## Aggregate estimates

In [20]:
# Estimate rows only; margin-of-error rows are handled separately.
coc_est_df = coc_df.loc[coc_df['measure'].eq('Estimate')].copy()

In [21]:
# Spot-check the tract-level estimates for one small demographic group.
coc_est_df.loc[
    coc_est_df['demographic'].eq('Native Hawaiian and Other Pacific Islander')
]

Unnamed: 0,census year,id,Geographic Area Name,variable,value,measure,demographic
100660,2010,1400000US06037101110,"Census Tract 1011.10, Los Angeles County, Cali...",Estimate!!Native Hawaiian and Other Pacific Is...,0,Estimate,Native Hawaiian and Other Pacific Islander
100661,2010,1400000US06037101122,"Census Tract 1011.22, Los Angeles County, Cali...",Estimate!!Native Hawaiian and Other Pacific Is...,0,Estimate,Native Hawaiian and Other Pacific Islander
100662,2010,1400000US06037101210,"Census Tract 1012.10, Los Angeles County, Cali...",Estimate!!Native Hawaiian and Other Pacific Is...,0,Estimate,Native Hawaiian and Other Pacific Islander
100663,2010,1400000US06037101220,"Census Tract 1012.20, Los Angeles County, Cali...",Estimate!!Native Hawaiian and Other Pacific Is...,8,Estimate,Native Hawaiian and Other Pacific Islander
100664,2010,1400000US06037101300,"Census Tract 1013, Los Angeles County, California",Estimate!!Native Hawaiian and Other Pacific Is...,0,Estimate,Native Hawaiian and Other Pacific Islander
...,...,...,...,...,...,...,...
107835,2020,1400000US06037980025,"Census Tract 9800.25, Los Angeles County, Cali...",Estimate!!Native Hawaiian and Other Pacific Is...,15,Estimate,Native Hawaiian and Other Pacific Islander
107836,2020,1400000US06037980026,"Census Tract 9800.26, Los Angeles County, Cali...",Estimate!!Native Hawaiian and Other Pacific Is...,0,Estimate,Native Hawaiian and Other Pacific Islander
107837,2020,1400000US06037980028,"Census Tract 9800.28, Los Angeles County, Cali...",Estimate!!Native Hawaiian and Other Pacific Is...,0,Estimate,Native Hawaiian and Other Pacific Islander
107838,2020,1400000US06037980030,"Census Tract 9800.30, Los Angeles County, Cali...",Estimate!!Native Hawaiian and Other Pacific Is...,0,Estimate,Native Hawaiian and Other Pacific Islander


In [22]:
# Aggregate estimate per census year and demographic: the sum over CoC tracts.
# The string 'sum' replaces the builtin `sum`, which pandas >= 2.1 warns about
# because a future version will call the builtin directly instead of GroupBy.sum.
agg_est_df = coc_est_df.groupby(['census year', 'demographic']).agg(
    Estimate=('value', 'sum')).reset_index()

In [23]:
# Preview the aggregated estimates.
agg_est_df.head()

Unnamed: 0,census year,demographic,Estimate
0,2010,American Indian and Alaska Native,102527
1,2010,Asian,1326930
2,2010,Black or African American,844211
3,2010,Hispanic or Latino (of any race),4331506
4,2010,Native Hawaiian and Other Pacific Islander,36805


## Calculate error for aggregation

Because I'm deriving my own estimates (summing census tracts within CA-600), I want to confirm that the demographic data I'm using is still reliable by calculating coefficients of variation for these estimates as well. For this I consulted ["Calculating Measures Of Error For Derived Estimates"](https://web.archive.org/web/20220119121038/https://www.census.gov/content/dam/Census/library/publications/2020/acs/acs_general_handbook_2020_ch08.pdf) (specifically page 60), a chapter of the American Community Survey handbook.

In [24]:
# Margin-of-error rows for the CoC tracts.
coc_moe_df = coc_df.loc[coc_df['measure'].eq('Margin of Error')].copy()

In [25]:
# In this frame the melted `value` column holds margins of error; name it so.
coc_moe_df = coc_moe_df.rename(columns={'value': 'MOE'})

The American Community Survey reports 90% confidence level margins of error (p.3, [Worked Examples for Approximating Standard Errors
Using American Community Survey Data](https://web.archive.org/web/20220120042207/https://www2.census.gov/programs-surveys/acs/tech_docs/accuracy/2020_ACS_Accuracy_Document_Worked_Examples.pdf)).

For each component estimate:

$$\mathrm{SE = \frac{MOE}{1.645}}$$

In [26]:
# SE = MOE / 1.645, per the formula in the markdown above this cell.
coc_moe_df['SE'] = coc_moe_df['MOE'].div(1.645)

### Example

#### Estimate

For the aggregate estimate $\mathrm{{\color{blue} \hat{X}}}$:

$~~~~\mathrm{{\color{blue} \hat{X}} = \hat{X}_{1} + \hat{X}_{2} + \cdots + \hat{X}_{n}}$

For `Hispanic or Latino (of any race)` in `2010`:


In [27]:
# Select the worked-example row by its labels rather than by a hard-coded
# position, so the cell still shows the intended row if the row order changes.
agg_est_df[
    (agg_est_df['census year'] == '2010')
    & (agg_est_df['demographic'] == 'Hispanic or Latino (of any race)')
]

Unnamed: 0,census year,demographic,Estimate
3,2010,Hispanic or Latino (of any race),4331506


$\mathrm{{\color{blue} \hat{X}} = 4331506}$

#### Error

So, for the standard error $\mathrm{{\color{red} SE}}$ of the aggregate estimate:
    
$~~~~\mathrm{{\color{red} SE}({\color{blue}4331506}) = \sqrt{[{\color{red} SE}(\hat{X}_{1})]^{2} + [{\color{red} SE}(\hat{X}_{2})]^{2} + \cdots + [{\color{red} SE}(\hat{X}_{n})]^{2}}}$

For `Hispanic or Latino (of any race)` in `2010`:


| census year | demographic                      | Census Tract | SE  |
|-------------|----------------------------------|--------------|-----|
| 2010        | Hispanic or Latino (of any race) | 1            | 237 |
| 2010        | Hispanic or Latino (of any race) | 2            | 102 |
| ...         | ...                              | ...          | ... |
| 2010        | Hispanic or Latino (of any race) | 2158         | 310 |

$~~~~\mathrm{{\color{red} SE}({\color{blue}4331506}) = \sqrt{(237)^{2} + (102)^{2} + \cdots + (310)^{2}}}$

In [28]:
def aggregate_error(values):
    """Return the square root of the sum of squares of ``values``.

    Applied to the standard errors of component estimates, this gives the
    standard error of their sum, following the formula shown in the markdown
    above this cell.

    Parameters
    ----------
    values : iterable of numbers
        Component standard errors.

    Returns
    -------
    float
        ``sqrt(v1**2 + v2**2 + ...)``; ``0.0`` for an empty iterable.
    """
    running_total = 0
    for component in values:
        running_total += component ** 2
    return math.sqrt(running_total)

In [29]:
# Combine the tract-level SEs into one SE per census year and demographic.
agg_error_df = (
    coc_moe_df
    .groupby(['census year', 'demographic'])
    .agg(SE=('SE', aggregate_error))
    .reset_index()
)

In [30]:
# Select the worked-example row by its labels rather than by a hard-coded
# position, so the cell still shows the intended row if the row order changes.
agg_error_df[
    (agg_error_df['census year'] == '2010')
    & (agg_error_df['demographic'] == 'Hispanic or Latino (of any race)')
]

Unnamed: 0,census year,demographic,SE
3,2010,Hispanic or Latino (of any race),11629.458223


$~~~~\mathrm{{\color{red} SE} = 11629}$

#### Reliability

$\mathrm{CV = \frac{\color{red} SE}{\color{blue} Estimate}}$

In [31]:
# Join SEs to estimates on the two columns the frames share.
agg_df = agg_error_df.merge(agg_est_df, on=['census year', 'demographic'])

In [32]:
# Coefficient of variation: standard error relative to the estimate.
agg_df['CV'] = agg_df['SE'].div(agg_df['Estimate'])

In [33]:
# Select the worked-example row by its labels rather than by a hard-coded
# position, so the cell still shows the intended row if the row order changes.
agg_df[
    (agg_df['census year'] == '2010')
    & (agg_df['demographic'] == 'Hispanic or Latino (of any race)')
]

Unnamed: 0,census year,demographic,SE,Estimate,CV
3,2010,Hispanic or Latino (of any race),11629.458223,4331506,0.002685


In this case, the coefficient of variation suggests the estimate is reliable, because the <span style="color:red">error</span> is quite small compared to the <span style="color:blue">estimate</span>:

$\mathrm{CV = \frac{\color{red} \sim11629}{\color{blue} 4331506}} = 0.27\%$

Stepping back, it should be no surprise that estimates of the `Hispanic or Latino (of any race)` population in census tracts amounting to >90% of the population of Los Angeles County would be reliable.

##### Least reliable

We should check the values for which the coefficient of variation is highest:

In [34]:
# Row with the highest CV. idxmax() returns an index *label*, so look it up with
# .loc; .iloc only gave the same row because the index is the default 0..n-1 range.
agg_df.loc[agg_df['CV'].idxmax()]

census year                                          2010
demographic    Native Hawaiian and Other Pacific Islander
SE                                            3448.653007
Estimate                                            36805
CV                                               0.093701
Name: 4, dtype: object

Here, the CV is ~9%. The `Native Hawaiian and Other Pacific Islander` population estimated to have lived in the census tracts within the Los Angeles County Continuum of Care in 2010 is much smaller than the `Hispanic or Latino (of any race)` population.

Per p.16 of [Understanding and Using American Community Survey Data: What Journalists Need to Know](https://web.archive.org/web/2/https://www.census.gov/content/dam/Census/library/publications/2020/acs/acs_journalist_handbook_2020.pdf):
> [J]ournalists often wonder “How big an error margin is too big?” Statistics teachers say, “It depends.” They are right, but that is less than helpful. Consider a simpler measure [...] called the coefficient of variation (CV) [. ...] A CV of 10 percent does not seem too wobbly, but one of 50 percent probably is for most purposes.

$$\ast\ast\ast$$

# All other counties

In [35]:
# CSVs directly inside the DP05 folder. Sorted so the concatenation order, and
# therefore the row order of the exported CSVs, does not depend on the order in
# which the filesystem happens to list the files.
files = sorted(glob.glob('../01_inputs/USCB/DP05/*.csv'))

In [36]:
def reshape(df):
    """Turn a wide DP05 extract into one row per geography and census label.

    Every column other than ``'census'`` is expected to be named
    ``'<geography>, <state>!!<measure>'``. The frame is melted, the header is
    split into its three parts, and the measures are pivoted back into columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'census'`` column plus one column per geography/measure.

    Returns
    -------
    pandas.DataFrame
        Columns ``'geography'``, ``'state'``, ``'census'`` followed by one
        column per distinct measure, with an unnamed column axis.
    """
    long_df = df.melt(id_vars=['census'])
    # `n` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
    header_parts = long_df['variable'].str.split('!!', n=1, expand=True)
    # Split only at the first ', ' so any later comma stays within the state part.
    long_df[['geography', 'state']] = header_parts[0].str.split(
        ', ', n=1, expand=True)
    long_df['measure'] = header_parts[1]
    long_df = long_df.drop(columns=['variable'])
    index_cols = ['geography', 'state', 'census']
    final_df = long_df.pivot(
        index=index_cols, columns='measure', values='value').reset_index()
    # Drop the leftover 'measure' label from the column axis.
    return final_df.rename_axis(None, axis=1)

In [37]:
def read_dp05(file):
    """Load one county/place-level DP05 CSV and return its race and ethnicity rows.

    NOTE: this reuses the name of the tract-level ``read_dp05`` defined earlier
    in the notebook, so once this cell runs the earlier version is shadowed.

    The census year is taken from characters 7-10 of the file's base name. Rows
    are picked by fixed position; the 2020 layout sits five rows lower than the
    layout used for other years.

    Parameters
    ----------
    file : str
        Path to a DP05 CSV containing a ``'Label (Grouping)'`` column.

    Returns
    -------
    pandas.DataFrame
        Output of ``reshape`` for the race rows followed by the ethnicity rows,
        with ``'variable'`` ('race'/'ethnicity') and ``'census year'`` added
        and non-breaking spaces removed from ``'census'``.
    """
    year = os.path.basename(file)[7:11]
    raw = pd.read_csv(file).rename(columns={'Label (Grouping)': 'census'})

    # Row offsets of the two ethnicity rows and of the race block.
    if year == '2020':
        first_eth, second_eth, race_start = 74, 79, 65
    else:
        first_eth, second_eth, race_start = 69, 74, 60

    ethnicity_rows = pd.concat([
        raw.iloc[first_eth:first_eth + 1],
        raw.iloc[second_eth:second_eth + 1],
    ])
    race_rows = raw.iloc[race_start:race_start + 7]

    ethnicity_long = reshape(ethnicity_rows)
    ethnicity_long['variable'] = 'ethnicity'
    race_long = reshape(race_rows)
    race_long['variable'] = 'race'

    combined = pd.concat([race_long, ethnicity_long])
    # Remove the non-breaking spaces found in the label text.
    combined['census'] = combined['census'].str.replace('\xa0', '')
    combined['census year'] = year
    return combined

In [38]:
# Stack the reshaped tables from every county/place-level file.
dp05 = pd.concat(list(map(read_dp05, files)))

In [39]:
# Preview; Estimate and Percent are still text at this point.
dp05.head()

Unnamed: 0,geography,state,census,Estimate,Percent,variable,census year
0,Alameda County,California,Total population,1661584,1661584,race,2020
1,Alameda County,California,American Indian and Alaska Native,28833,1.7%,race,2020
2,Alameda County,California,Asian,578885,34.8%,race,2020
3,Alameda County,California,Black or African American,204606,12.3%,race,2020
4,Alameda County,California,Native Hawaiian and Other Pacific Islander,25897,1.6%,race,2020


In [40]:
# Remove thousands separators and percent signs so both columns can be parsed
# as numbers. This uses the vectorised string method instead of the deprecated
# DataFrame.applymap, and a raw-string character class instead of the original
# '\,|\%' pattern, whose backslashes are invalid escapes in a normal string.
dp05[['Percent', 'Estimate']] = dp05[['Percent', 'Estimate']].apply(
    lambda col: col.str.replace(r'[,%]', '', regex=True))

In [41]:
# 'Total population' rows repeat the count in Percent (see the preview above);
# set them to 100 so the later division by 100 yields 1.0.
dp05.loc[dp05['census'].eq('Total population'), 'Percent'] = 100

In [42]:
# Parse the cleaned estimate strings as numbers.
dp05['Estimate'] = pd.to_numeric(dp05['Estimate'])

In [43]:
# Parse percentages and rescale them to fractions.
dp05['Percent'] = pd.to_numeric(dp05['Percent']).div(100)

In [44]:
# Use the same column name as the LA CoC frame.
dp05 = dp05.rename(columns={'census': 'demographic'})

In [45]:
# Pull out the total-population rows to use as denominators.
dp05_total = dp05.loc[dp05['demographic'].eq('Total population')].copy()

In [46]:
# Check which columns are available before renaming.
dp05_total.columns

Index(['geography', 'state', 'demographic', 'Estimate', 'Percent', 'variable',
       'census year'],
      dtype='object')

In [47]:
# The estimate on a total row is the population total itself.
dp05_total = dp05_total.rename(columns={'Estimate': 'total_population'})

In [48]:
# Drop the total rows now that they live in dp05_total.
dp05 = dp05.loc[dp05['demographic'].ne('Total population')].copy()

In [49]:
# Attach total_population to every demographic row, joining on the columns the
# two frames have in common (spelled out here instead of left implicit).
# NOTE(review): this is an inner join on `variable` too, so rows whose
# `variable` value has no matching total row are dropped -- confirm intended.
dp05_df = dp05.merge(
    dp05_total[['geography', 'state', 'total_population',
                'variable', 'census year']],
    on=['geography', 'state', 'variable', 'census year'],
)

In [50]:
# Confirm total_population was added by the merge.
dp05_df.columns

Index(['geography', 'state', 'demographic', 'Estimate', 'Percent', 'variable',
       'census year', 'total_population'],
      dtype='object')

## Combine LA CoC and county data

### Percent calculation for LA CoC

In [51]:
# Tag demographics whose label contains 'Hispanic' as ethnicity rows; the
# remaining rows are filled with 'race' in the next cell.
agg_df.loc[agg_df['demographic'].str.contains(
    'Hispanic'), 'variable'] = 'ethnicity'

In [52]:
# Everything not tagged as ethnicity is a race row. The result is assigned back
# because an in-place fillna on a selected column is a chained assignment, which
# pandas warns about and which does not update agg_df under Copy-on-Write.
agg_df['variable'] = agg_df['variable'].fillna('race')

In [53]:
# Per-year CoC population totals used as the denominators below.
coc_totals

Unnamed: 0,census year,total_population
0,2010,8963473
1,2015,9228489
2,2020,7659776


In [54]:
# Attach each year's CoC total to the aggregated estimates; 'census year' is the
# only column the two frames share.
la_coc_df = agg_df[['census year', 'demographic', 'Estimate', 'variable']].merge(
    coc_totals, on='census year')

In [55]:
# Share of the CoC population, as a fraction like dp05's Percent column.
la_coc_df['Percent'] = la_coc_df['Estimate'].div(la_coc_df['total_population'])

In [56]:
# The asterisk distinguishes these CoC-subset rows from 'Los Angeles County',
# which the county export below filters out by exact name.
la_coc_df['geography'] = 'Los Angeles County*'

In [57]:
# Fill in the state column so the frame lines up with dp05_df.
la_coc_df['state'] = 'California'

In [58]:
# Stack the LA CoC rows on top of the other geographies with a fresh 0..n-1 index.
combined_dp05_df = pd.concat([la_coc_df, dp05_df], ignore_index=True)

In [59]:
# County export: every geography containing 'County' except the row named exactly
# 'Los Angeles County'; the CoC-based 'Los Angeles County*' rows are kept.
combined_dp05_df.loc[
    combined_dp05_df['geography'].str.contains('County')
    & combined_dp05_df['geography'].ne('Los Angeles County')
].to_csv('../04_outputs/c01_USCB-ACS5Y-DP05-County.csv', index=False)

In [60]:
# City export: every geography whose name does not contain 'County'.
combined_dp05_df.loc[
    ~combined_dp05_df['geography'].str.contains('County')
].to_csv('../04_outputs/c01_USCB-ACS5Y-DP05-City.csv', index=False)