In [1]:
import math
import numpy as np
import pandas as pd
import json
import requests
import urllib

In [2]:
from censusAPI import myAPI

## User-defined parameters

In [3]:
#Variables and predicates for both start/end years
#Please refer to this website for the list of variables
#https://api.census.gov/data/2018/acs/acs5/variables.html
dsource = 'acs/acs5/'
cols = f'GEO_ID,B01001_001E,B01001_001M'  
year = '2019'

### Utility Functions

In [4]:
def get_moe(m):
    result = math.sqrt(sum(map(lambda x: x**2, m)))
    return result

def get_cv(e, m): 
    if e == 0:
        return np.nan
    else:
        return np.absolute(m/1.645/e*100)
    
def get_pct(e,agg_e):
    if agg_e == 0:
        return np.nan
    else:
        return e/agg_e

def get_pctmoe(e,m,agg_e,agg_m): #check to make sure this doesn't break 
    if agg_e == 0:
        return np.nan
    else: 
        return (1/agg_e)*math.sqrt((m**2)-(((e/agg_e)**2)*(agg_m**2)))
    
def clean_data(df,var):
    dff = df[var].copy()
    var_num = var[1:]
    for col in var_num:
        dff[col] = dff[col].astype(float)
    dff = dff.replace([999999999, 555555555, 333333333, 222222222,\
                    666666666, 888888888, -999999999, -555555555,\
                    -333333333, -222222222, -666666666, -888888888], np.nan)
    return dff

## Median HHI and Average HH Size - All Zip Codes in U.S.

In [5]:
base_url = f'https://api.census.gov/data/{year}/{dsource}'

In [6]:
data_url = f'{base_url}?get={cols}&for=zip%20code%20tabulation%20area:*&key={myAPI}'
df = pd.read_json(data_url)
resp = requests.request('GET', data_url).content
df = pd.DataFrame(json.loads(resp)[1:])
df.columns = json.loads(resp)[0]
df.head()

Unnamed: 0,GEO_ID,B01001_001E,B01001_001M,state,zip code tabulation area
0,8600000US25245,600,266,54,25245
1,8600000US25268,964,282,54,25268
2,8600000US25286,1700,435,54,25286
3,8600000US25303,6764,551,54,25303
4,8600000US25311,10964,630,54,25311


In [7]:
df.shape

(33120, 5)

In [8]:
dff = clean_data(df,list(df))
dff.head()

Unnamed: 0,GEO_ID,B01001_001E,B01001_001M,state,zip code tabulation area
0,8600000US25245,600.0,266.0,54.0,25245.0
1,8600000US25268,964.0,282.0,54.0,25268.0
2,8600000US25286,1700.0,435.0,54.0,25286.0
3,8600000US25303,6764.0,551.0,54.0,25303.0
4,8600000US25311,10964.0,630.0,54.0,25311.0


In [9]:
dff['B01001_001C'] = dff.apply(lambda x: (get_cv(x['B01001_001E'],x['B01001_001M'])),axis=1)
dff.head()

Unnamed: 0,GEO_ID,B01001_001E,B01001_001M,state,zip code tabulation area,B01001_001C
0,8600000US25245,600.0,266.0,54.0,25245.0,26.950355
1,8600000US25268,964.0,282.0,54.0,25268.0,17.783047
2,8600000US25286,1700.0,435.0,54.0,25286.0,15.555158
3,8600000US25303,6764.0,551.0,54.0,25303.0,4.952017
4,8600000US25311,10964.0,630.0,54.0,25311.0,3.493057


## Export to Excel

In [10]:
dff.to_excel('../output/US_zips_population_19.xlsx')

FileCreateError: [Errno 2] No such file or directory: '../output/US_zips_population_19.xlsx'