# Downloads Demographic data from the American Community Survey

**[Work in progress]**

This notebook downloads demographic data estimates (DP05) from the American Community Survey 5-Year Data (2009-2018).

Data source: [American Community Survey 5-Year Data (2009-2018)](https://www.census.gov/data/developers/data-sets/acs-5year.html)

Authors: Peter Rose (pwrose@ucsd.edu), Ilya Zaslavsky (zaslavsk@sdsc.edu)

In [1]:
import os
import pandas as pd
from pathlib import Path

In [2]:
# Lift pandas' display limits so frames are rendered without truncation.
pd.set_option("display.max_rows", None)  # display all rows
pd.set_option("display.max_columns", None)  # display all columns

In [3]:
# Resolve the Neo4j import directory from the NEO4J_IMPORT environment variable.
# Path(None) would raise an opaque TypeError, so fail early with an actionable
# message when the variable is not set.
neo4j_import_dir = os.getenv('NEO4J_IMPORT')
if neo4j_import_dir is None:
    raise RuntimeError(
        "Environment variable NEO4J_IMPORT is not set; "
        "point it at the Neo4j import directory before running this notebook."
    )
NEO4J_IMPORT = Path(neo4j_import_dir)
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-19636412-9e74-4bac-8a4c-c6c8b49bb9d3/installation-4.1.0/import


## Download selected fields

[List of variables as HTML](https://api.census.gov/data/2018/acs/acs5/profile/groups/DP05.html) or [JSON](https://api.census.gov/data/2018/acs/acs5/profile/groups/DP05/)

[Example URLs for API](https://api.census.gov/data/2018/acs/acs5/profile/examples.html)

### Add variables and names here (names must use Neo4j property naming conventions)

The numbers in each subgroup below add up to the total population.

In [4]:
# Census DP05 variable code -> property name used for the output columns.
# Insertion order matters: it defines both the request order and the column order.
variables = dict([
    ('DP05_0001E', 'totalPopulation'),

    # sex
    ('DP05_0002E', 'male'),
    ('DP05_0003E', 'female'),

    # age
    ('DP05_0005E', 'age0_4'),
    ('DP05_0006E', 'age5_9'),
    ('DP05_0007E', 'age10_14'),
    ('DP05_0008E', 'age15_19'),
    ('DP05_0009E', 'age20_24'),
    ('DP05_0010E', 'age25_34'),
    ('DP05_0011E', 'age35_44'),
    ('DP05_0012E', 'age45_54'),
    ('DP05_0013E', 'age55_59'),
    ('DP05_0014E', 'age60_64'),
    ('DP05_0015E', 'age65_74'),
    ('DP05_0016E', 'age75_84'),
    ('DP05_0017E', 'age85_'),

    # race
    ('DP05_0037E', 'white'),
    ('DP05_0038E', 'blackOrAfricanAmerican'),
    ('DP05_0039E', 'americanIndianAndAlaskaNative'),
    ('DP05_0044E', 'asian'),
    ('DP05_0052E', 'nativeHawaiianAndOtherPacificIslander'),
    ('DP05_0057E', 'otherRace'),
    ('DP05_0058E', 'twoOrMoreRaces'),

    # hispanic or latino
    ('DP05_0071E', 'hispanicOrLatino'),
    ('DP05_0076E', 'notHispanicOrLatino'),
])

In [5]:
# Comma-separated variable codes, in dictionary order, for the API `get=` parameter.
fields = ",".join(variables)

## Download county-level data using US Census API

In [6]:
# Request URL: all selected DP05 fields for every county.
url_county = 'https://api.census.gov/data/2018/acs/acs5/profile?get={fields}&for=county:*'.format(fields=fields)

In [7]:
# Fetch the county table (string dtype requested) and replace missing values with ''.
df = pd.read_json(url_county, dtype='str').fillna('')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
0,DP05_0001E,DP05_0002E,DP05_0003E,DP05_0005E,DP05_0006E,DP05_0007E,DP05_0008E,DP05_0009E,DP05_0010E,DP05_0011E,DP05_0012E,DP05_0013E,DP05_0014E,DP05_0015E,DP05_0016E,DP05_0017E,DP05_0037E,DP05_0038E,DP05_0039E,DP05_0044E,DP05_0052E,DP05_0057E,DP05_0058E,DP05_0071E,DP05_0076E,state,county
1,47086,22018,25068,3614,3107,3554,3152,3351,5844,5236,5760,3224,3291,4282,1961,710,12097,33982,56,282,0,391,278,729,46357,28,151
2,12028,5845,6183,728,776,806,849,671,1279,1562,1602,786,897,1247,589,236,9475,2368,0,17,0,16,152,181,11847,28,111
3,8321,4174,4147,458,564,486,598,416,866,921,1092,536,644,979,547,214,5631,2596,23,15,8,13,35,33,8288,28,019
4,23480,11821,11659,1262,1439,1506,1919,1670,2753,2830,3142,1519,1435,2250,1181,574,21346,1681,75,57,7,170,144,360,23120,28,057


##### Add column names

In [8]:
# The Census API returns its header (variable codes followed by the geography
# labels) as the first data row. Promote it to column names only while it is
# still present, so re-running this cell cannot silently discard a real county,
# and refuse to label columns by position if the header is not the expected one.
api_header = list(variables.keys()) + ['state', 'county']
columns = list(variables.values())
columns.append('stateFips')
columns.append('countyFips')
if not df.empty and list(df.iloc[0]) == api_header:
    df = df[1:].copy()  # skip first row of labels
    df.columns = columns
elif list(df.columns[:len(columns)]) != columns:
    # Neither a fresh download nor an already-renamed frame: do not guess.
    found_header = [] if df.empty else list(df.iloc[0])
    raise ValueError(
        f'Unexpected county response header: expected {api_header}, found {found_header}'
    )

In [9]:
# Preview the first five renamed county rows.
df.head(n=5)

Unnamed: 0,totalPopulation,male,female,age0_4,age5_9,age10_14,age15_19,age20_24,age25_34,age35_44,age45_54,age55_59,age60_64,age65_74,age75_84,age85_,white,blackOrAfricanAmerican,americanIndianAndAlaskaNative,asian,nativeHawaiianAndOtherPacificIslander,otherRace,twoOrMoreRaces,hispanicOrLatino,notHispanicOrLatino,stateFips,countyFips
1,47086,22018,25068,3614,3107,3554,3152,3351,5844,5236,5760,3224,3291,4282,1961,710,12097,33982,56,282,0,391,278,729,46357,28,151
2,12028,5845,6183,728,776,806,849,671,1279,1562,1602,786,897,1247,589,236,9475,2368,0,17,0,16,152,181,11847,28,111
3,8321,4174,4147,458,564,486,598,416,866,921,1092,536,644,979,547,214,5631,2596,23,15,8,13,35,33,8288,28,19
4,23480,11821,11659,1262,1439,1506,1919,1670,2753,2830,3142,1519,1435,2250,1181,574,21346,1681,75,57,7,170,144,360,23120,28,57
5,10129,5116,5013,455,467,740,803,416,812,1212,1390,962,623,1353,726,170,6523,3503,6,0,0,0,97,29,10100,28,15


In [10]:
# Example data: look up a single county by its state and county FIPS codes
df.query("stateFips == '06' and countyFips == '073'")

Unnamed: 0,totalPopulation,male,female,age0_4,age5_9,age10_14,age15_19,age20_24,age25_34,age35_44,age45_54,age55_59,age60_64,age65_74,age75_84,age85_,white,blackOrAfricanAmerican,americanIndianAndAlaskaNative,asian,nativeHawaiianAndOtherPacificIslander,otherRace,twoOrMoreRaces,hispanicOrLatino,notHispanicOrLatino,stateFips,countyFips
1869,3302833,1661931,1640902,211969,198148,197726,209496,262118,541385,436855,420221,201666,183654,251516,127904,60175,2335447,166412,20980,390418,13903,205307,170366,1106925,2195908,6,73


In [11]:
# Tag every county row with its data source and aggregation level.
df = df.assign(
    source='American Community Survey 5 year',
    aggregationLevel='Admin2',
)

In [21]:
# Sorted list of the distinct state FIPS codes present in the county table.
states = sorted(df['stateFips'].unique())

In [27]:
# All rows whose state FIPS code is '06'.
df.query("stateFips == '06'")

Unnamed: 0,totalPopulation,male,female,age0_4,age5_9,age10_14,age15_19,age20_24,age25_34,age35_44,age45_54,age55_59,age60_64,age65_74,age75_84,age85_,white,blackOrAfricanAmerican,americanIndianAndAlaskaNative,asian,nativeHawaiianAndOtherPacificIslander,otherRace,twoOrMoreRaces,hispanicOrLatino,notHispanicOrLatino,stateFips,countyFips
1855,64148,31879,32269,3683,3828,3496,3516,3231,6973,6706,8097,5137,5540,8591,3975,1375,49463,1562,2426,661,30,8048,1958,12830,51318,6,33
1856,17540,8920,8620,762,761,751,804,803,1885,1614,2284,1503,1683,2778,1556,356,15579,166,397,243,54,396,705,1909,15631,6,43
1857,75493,38468,37025,6126,5796,5815,4955,5600,11930,9193,8410,4592,4001,5647,2311,1117,56344,2432,1079,5104,343,4126,6065,20990,54503,6,115
1858,1133247,553701,579546,65505,73025,76328,72857,68013,143793,151683,164203,78858,69200,99537,48565,21680,648325,97333,5529,185065,5585,112976,78434,288101,845146,6,13
1859,31185,20671,10514,1384,1272,1477,1444,3218,6287,4353,3899,1803,1891,2504,1250,403,25672,2738,877,437,205,590,666,5834,25351,6,35
1860,443738,222208,221530,28356,27457,27240,37445,49425,58974,50124,49960,25886,24096,35041,19841,9893,344026,8626,4389,23863,814,42024,19996,200060,243678,6,83
1861,501317,245196,256121,25736,28607,28944,29922,30140,65417,61766,66429,35686,37573,55520,23956,11621,375519,7960,4854,20087,1636,63422,27839,132985,368332,6,97
1862,180216,92250,87966,15092,13588,14616,13576,14006,26210,21833,20565,9536,8748,12428,6979,3039,115817,4591,1801,2684,419,47362,7542,151019,29197,6,25
1863,14174,7341,6833,464,786,911,630,1341,2264,1563,1964,1314,907,1385,568,77,12017,105,527,339,0,801,385,3866,10308,6,51
1864,1643700,807171,836529,97506,96428,95792,93865,103705,271867,244557,224507,106455,93101,126820,60948,28149,681725,177135,10712,486434,13768,169771,104155,369061,1274639,6,1


### Save data

In [12]:
# Write the county table, without the index column, into the Neo4j import directory.
df.to_csv(NEO4J_IMPORT.joinpath("03a-USCensusDP05Admin2.csv"), index=False)

## Download zip-level data using US Census API

In [13]:
# Request URL: all selected DP05 fields for every ZIP code tabulation area.
url_zip = 'https://api.census.gov/data/2018/acs/acs5/profile?get={fields}&for=zip%20code%20tabulation%20area:*'.format(fields=fields)

In [14]:
# Fetch the ZIP-level table (string dtype requested) and replace missing values with ''.
df = pd.read_json(url_zip, dtype='str').fillna('')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,DP05_0001E,DP05_0002E,DP05_0003E,DP05_0005E,DP05_0006E,DP05_0007E,DP05_0008E,DP05_0009E,DP05_0010E,DP05_0011E,DP05_0012E,DP05_0013E,DP05_0014E,DP05_0015E,DP05_0016E,DP05_0017E,DP05_0037E,DP05_0038E,DP05_0039E,DP05_0044E,DP05_0052E,DP05_0057E,DP05_0058E,DP05_0071E,DP05_0076E,zip code tabulation area
1,8642,4312,4330,567,430,453,532,344,901,975,1226,843,745,916,522,188,8226,256,2,21,0,0,137,27,8615,43964
2,51116,23286,27830,3438,3367,3204,4094,3740,7553,8183,7607,2628,2225,2958,1577,542,13583,32929,25,1054,0,2274,1251,4077,47039,28216
3,71605,33421,38184,4222,5769,5767,4563,2944,8540,11786,11186,5117,4079,5063,1842,727,51272,6346,200,10533,0,978,2276,5172,66433,28277
4,27286,12870,14416,2277,1988,1861,1812,1339,4140,4488,3950,1806,1336,1593,533,163,14515,8672,7,1709,53,590,1740,2870,24416,28278


##### Add column names

In [15]:
# The Census API returns its header (variable codes followed by the geography
# label) as the first data row. Promote it to column names only while it is
# still present, so re-running this cell cannot silently discard a real ZIP
# code, and refuse to label columns by position if the header is not the expected one.
api_header = list(variables.keys()) + ['zip code tabulation area']
columns = list(variables.values())
columns.append('postalCode')
if not df.empty and list(df.iloc[0]) == api_header:
    df = df[1:].copy()  # skip first row
    df.columns = columns
elif list(df.columns[:len(columns)]) != columns:
    # Neither a fresh download nor an already-renamed frame: do not guess.
    found_header = [] if df.empty else list(df.iloc[0])
    raise ValueError(
        f'Unexpected ZIP response header: expected {api_header}, found {found_header}'
    )

In [16]:
# Preview the first five renamed ZIP-level rows.
df.head(n=5)

Unnamed: 0,totalPopulation,male,female,age0_4,age5_9,age10_14,age15_19,age20_24,age25_34,age35_44,age45_54,age55_59,age60_64,age65_74,age75_84,age85_,white,blackOrAfricanAmerican,americanIndianAndAlaskaNative,asian,nativeHawaiianAndOtherPacificIslander,otherRace,twoOrMoreRaces,hispanicOrLatino,notHispanicOrLatino,postalCode
1,8642,4312,4330,567,430,453,532,344,901,975,1226,843,745,916,522,188,8226,256,2,21,0,0,137,27,8615,43964
2,51116,23286,27830,3438,3367,3204,4094,3740,7553,8183,7607,2628,2225,2958,1577,542,13583,32929,25,1054,0,2274,1251,4077,47039,28216
3,71605,33421,38184,4222,5769,5767,4563,2944,8540,11786,11186,5117,4079,5063,1842,727,51272,6346,200,10533,0,978,2276,5172,66433,28277
4,27286,12870,14416,2277,1988,1861,1812,1339,4140,4488,3950,1806,1336,1593,533,163,14515,8672,7,1709,53,590,1740,2870,24416,28278
5,29414,13958,15456,2144,1784,1542,1504,2318,5357,3591,2955,1842,1673,2610,1406,688,14358,11486,187,1185,132,345,1721,3084,26330,28303


In [17]:
# Example data: look up a single postal code
df[df['postalCode'] == '92130']

Unnamed: 0,totalPopulation,male,female,age0_4,age5_9,age10_14,age15_19,age20_24,age25_34,age35_44,age45_54,age55_59,age60_64,age65_74,age75_84,age85_,white,blackOrAfricanAmerican,americanIndianAndAlaskaNative,asian,nativeHawaiianAndOtherPacificIslander,otherRace,twoOrMoreRaces,hispanicOrLatino,notHispanicOrLatino,postalCode
30726,55316,27583,27733,3310,4310,4405,4428,2178,5868,9178,10243,4149,2557,3166,1250,274,33966,1382,177,16573,9,623,2586,3999,51317,92130


In [18]:
# Tag every ZIP-level row with its data source and aggregation level.
df = df.assign(
    source='American Community Survey 5 year',
    aggregationLevel='PostalCode',
)

### Save data

In [19]:
# Write the ZIP-level table, without the index column, into the Neo4j import directory.
df.to_csv(NEO4J_IMPORT.joinpath("03a-USCensusDP05Zip.csv"), index=False)