In [1]:
import pandas as pd
import json
import requests
import math
import numpy as np
import urllib

In [2]:
import utilcalcs as calc
import geo_agg as geo
from censusAPI import myAPI
from county_codes import stco

## Create a table of all U.S. counties with HHI & Educational Attainment
### 2014 - 2018 5-Year Average

In [3]:
# Query settings shared by every Census API request in this notebook
y1 = "2018"  # release year requested from the API (the 2014-2018 5-year average)
# y0 = '2010'  -> earlier comparison year, currently unused

# Request whole variable groups rather than listing individual columns
cols_1 = "group(DP03)"
cols_2 = "group(DP02)"

# bsource = 'acs/acs5'  -> alternative dataset path, currently unused
dsource = "acs/acs5/profile"

In [4]:
def _fetch_census_table(url):
    """Download one Census API response and return it as a DataFrame.

    The response body is parsed as a JSON array of rows whose first row holds
    the column names; the remaining rows become the data.

    Raises:
        ValueError: if the server answers with a non-200 status, or the body
            is not valid JSON (json.JSONDecodeError is a ValueError subclass).
        requests.exceptions.RequestException: on connection errors/timeouts.
    """
    # (connect, read) timeouts so a stalled connection cannot hang the kernel forever.
    resp = requests.get(url, timeout=(10, 300))
    if resp.status_code != 200:
        # The URL is deliberately left out of the message: it embeds the API key.
        raise ValueError(
            f"Census API request failed with HTTP status {resp.status_code}"
        )
    rows = json.loads(resp.content)  # parse once, reuse for header and data
    # Passing the header via `columns=` also yields a well-formed empty frame
    # when the API returns a header row with no data rows.
    return pd.DataFrame(rows[1:], columns=rows[0])


def get_data(year,source,cols):
    """Fetch `cols` for every county plus the U.S. total and stack the results.

    Args:
        year: dataset year, inserted into the API path.
        source: dataset path below the year (e.g. the value of `dsource`).
        cols: value of the API's `get=` parameter (variable list or group).

    Returns:
        DataFrame with the county rows followed by the national row(s).
        Columns are the sorted union of both responses (`sort=True`), so
        geography columns missing from one response are filled with NaN.
        The original per-response row index is kept (not reset).
    """
    base = f"https://api.census.gov/data/{year}/{source}?get={cols}"
    df_co = _fetch_census_table(f"{base}&for=county:*&in=state:*&key={myAPI}")
    df_us = _fetch_census_table(f"{base}&for=us:*&key={myAPI}")
    return pd.concat([df_co,df_us],sort=True)

def clean_data(df,var):
    """Return a numeric, placeholder-free copy of the columns listed in `var`.

    Args:
        df: source DataFrame; it is not modified.
        var: column names to keep. The first entry is left as-is (it is the
            identifier column in this notebook); every other entry is cast
            to float.

    Returns:
        A new DataFrame restricted to `var`, in which any cell equal to one of
        the placeholder magnitudes below has been replaced by 0.
    """
    # Repeated-digit placeholders, both signs (presumably Census annotation
    # codes standing in for unavailable values -- confirm against the API docs).
    placeholder_values = [999999999, 555555555, 333333333, 222222222,
                          666666666, 888888888, -999999999, -555555555,
                          -333333333, -222222222, -666666666, -888888888]

    numeric_cols = var[1:]
    subset = df[var].copy().astype({name: float for name in numeric_cols})
    return subset.replace(placeholder_values, 0)

In [5]:
#Grouping variables/columns into lists to run calculations for a new table
#Suffix convention: ...E = estimate columns, ...M = margin-of-error columns
#NOTE(review): original note said these can be used for all ACS years; variable
#codes can shift between releases, so verify the codes before changing y1

#Household Income Bands
HHI_E = ['DP03_0051E']
HHI_M = ['DP03_0051M']
HHI_U25E = ['DP03_0052E','DP03_0053E','DP03_0054E']
HHI_U25M = ['DP03_0052M','DP03_0053M','DP03_0054M']
HHI_2549E = ['DP03_0055E','DP03_0056E']
HHI_2549M = ['DP03_0055M','DP03_0056M']
HHI_5074E = ['DP03_0057E']
HHI_5074M = ['DP03_0057M']
HHI_7599E = ['DP03_0058E']
HHI_7599M = ['DP03_0058M']
HHI_100149E = ['DP03_0059E']
HHI_100149M = ['DP03_0059M']
HHI_O150E = ['DP03_0060E','DP03_0061E']
HHI_O150M = ['DP03_0060M','DP03_0061M']

#Median household income. These must stay in var_data: the aggregation cell
#reads df['DP03_0062E'] / df['DP03_0062M'] to build HHI_MdE / HHI_MdM, and
#clean_data() keeps only the columns listed in var_data.
HHI_MdE = ['DP03_0062E']
HHI_MdM = ['DP03_0062M']

HHIE = HHI_E + HHI_U25E + HHI_2549E + HHI_5074E + HHI_7599E \
        + HHI_100149E + HHI_O150E + HHI_MdE

HHIM = HHI_M + HHI_U25M + HHI_2549M + HHI_5074M + HHI_7599M \
        + HHI_100149M + HHI_O150M + HHI_MdM

#Educational Attainment Age 25+
Ed_E = ['DP02_0058E']
Ed_M = ['DP02_0058M']
Ed_UHSE = ['DP02_0059E','DP02_0060E']
Ed_UHSM = ['DP02_0059M','DP02_0060M']
Ed_HSE = ['DP02_0061E']
Ed_HSM = ['DP02_0061M']
Ed_ColE = ['DP02_0062E','DP02_0063E']
Ed_ColM = ['DP02_0062M','DP02_0063M']
Ed_BachE = ['DP02_0064E']
Ed_BachM = ['DP02_0064M']
Ed_GradE = ['DP02_0065E']
Ed_GradM = ['DP02_0065M']
Ed_OBachE = ['DP02_0067E']
Ed_OBachM = ['DP02_0067M']

EdE = Ed_E + Ed_UHSE + Ed_HSE + Ed_ColE + Ed_BachE + Ed_GradE + Ed_OBachE
EdM = Ed_M + Ed_UHSM + Ed_HSM + Ed_ColM + Ed_BachM + Ed_GradM + Ed_OBachM


#List of all variables used for calculation; the group totals (HHI_E, Ed_E)
#are included for spot checking aggregation. GEO_ID must stay first because
#clean_data() casts every entry after the first to float.
var_data = ['GEO_ID'] + HHIE + HHIM + EdE + EdM

In [6]:
# Download both variable groups (county rows plus the national row), join them
# on GEO_ID, then keep only the var_data columns in cleaned numeric form.
df1 = get_data(year=y1, source=dsource, cols=cols_1)
df2 = get_data(year=y1, source=dsource, cols=cols_2)
df = clean_data(df1.merge(df2, how='left', on='GEO_ID'), var_data)
df.head()

Unnamed: 0,GEO_ID,DP03_0051E,DP03_0052E,DP03_0053E,DP03_0054E,DP03_0055E,DP03_0056E,DP03_0057E,DP03_0058E,DP03_0059E,...,DP02_0067E,DP02_0058M,DP02_0059M,DP02_0060M,DP02_0061M,DP02_0062M,DP02_0063M,DP02_0064M,DP02_0065M,DP02_0067M
0,0500000US28151,18299.0,2766.0,1845.0,3056.0,2480.0,2153.0,2891.0,1326.0,1022.0,...,5696.0,54.0,277.0,412.0,602.0,394.0,330.0,368.0,302.0,437.0
1,0500000US28111,4563.0,421.0,410.0,757.0,628.0,548.0,714.0,581.0,240.0,...,898.0,79.0,142.0,198.0,313.0,240.0,181.0,199.0,71.0,216.0
2,0500000US28019,3164.0,286.0,352.0,515.0,340.0,410.0,541.0,392.0,227.0,...,1024.0,60.0,150.0,175.0,242.0,189.0,118.0,134.0,119.0,173.0
3,0500000US28057,8706.0,585.0,612.0,1321.0,1248.0,1217.0,1816.0,827.0,717.0,...,2109.0,113.0,206.0,380.0,409.0,350.0,214.0,249.0,194.0,321.0
4,0500000US28015,3658.0,377.0,192.0,531.0,408.0,515.0,660.0,350.0,420.0,...,1051.0,181.0,161.0,253.0,358.0,380.0,209.0,173.0,141.0,223.0


In [7]:
#Calculate all of the new aggregations
# Each entry: (output prefix, estimate source, margin-of-error source).
#   - a single column name  -> estimate and MOE are copied straight across
#   - a list of columns     -> estimates are summed, MOEs go through calc.get_moe
# Every entry then gets a <prefix>C column from calc.get_cv(estimate, moe).
# Entries are processed in order, so the output column order matches the list.
# HHI_Md reads DP03_0062E/M, so those columns must be present in df.
band_specs = [
    ('HHI_',       'DP03_0051E', 'DP03_0051M'),
    ('HHI_U25',    HHI_U25E,     HHI_U25M),
    ('HHI_2549',   HHI_2549E,    HHI_2549M),
    ('HHI_5074',   'DP03_0057E', 'DP03_0057M'),
    ('HHI_7599',   'DP03_0058E', 'DP03_0058M'),
    ('HHI_100149', 'DP03_0059E', 'DP03_0059M'),
    ('HHI_O150',   HHI_O150E,    HHI_O150M),
    ('HHI_Md',     'DP03_0062E', 'DP03_0062M'),
    ('Ed_',        'DP02_0058E', 'DP02_0058M'),
    ('Ed_UHS',     Ed_UHSE,      Ed_UHSM),
    ('Ed_HS',      'DP02_0061E', 'DP02_0061M'),
    ('Ed_Col',     Ed_ColE,      Ed_ColM),
    ('Ed_Bach',    'DP02_0064E', 'DP02_0064M'),
    ('Ed_Grad',    'DP02_0065E', 'DP02_0065M'),
    ('Ed_OBach',   'DP02_0067E', 'DP02_0067M'),
]

for prefix, est_src, moe_src in band_specs:
    est_col, moe_col, cv_col = prefix + 'E', prefix + 'M', prefix + 'C'
    if isinstance(est_src, str):
        # Single source variable: carry estimate and MOE over unchanged.
        df[est_col] = df[est_src]
        df[moe_col] = df[moe_src]
    else:
        # Several source variables: add the estimates, combine the MOEs row by row.
        df[est_col] = df.loc[:, est_src].sum(axis=1)
        df[moe_col] = df.apply(lambda row: calc.get_moe(row[moe_src]), axis=1)
    df[cv_col] = df.apply(lambda row: calc.get_cv(row[est_col], row[moe_col]), axis=1)
    
#df.head()

In [8]:
# Discard the raw Census input columns now that the derived E/M/C columns
# exist; GEO_ID (the first var_data entry) is kept.
df = df.drop(columns=var_data[1:])
df.head()

Unnamed: 0,GEO_ID,HHI_E,HHI_M,HHI_C,HHI_U25E,HHI_U25M,HHI_U25C,HHI_2549E,HHI_2549M,HHI_2549C,...,Ed_ColC,Ed_BachE,Ed_BachM,Ed_BachC,Ed_GradE,Ed_GradM,Ed_GradC,Ed_OBachE,Ed_OBachM,Ed_OBachC
0,0500000US28151,18299.0,333.0,1.106244,7667.0,491.522126,3.897191,4633.0,459.678148,6.031505,...,3.536237,3476.0,368.0,6.435794,2220.0,302.0,8.269668,5696.0,437.0,4.663861
1,0500000US28111,4563.0,209.0,2.784389,1588.0,226.790652,8.681779,1176.0,198.698264,10.271192,...,7.386266,694.0,199.0,17.431217,204.0,71.0,21.157399,898.0,216.0,14.622159
2,0500000US28019,3164.0,150.0,2.881966,1153.0,189.359447,9.983706,750.0,159.138305,12.898748,...,9.406095,625.0,134.0,13.033435,399.0,119.0,18.130432,1024.0,173.0,10.270232
3,0500000US28057,8706.0,254.0,1.773573,2518.0,303.588537,7.329321,2465.0,333.997006,8.236823,...,4.570009,1337.0,249.0,11.32145,772.0,194.0,15.276312,2109.0,321.0,9.252574
4,0500000US28015,3658.0,252.0,4.187848,1100.0,250.962149,13.869143,923.0,225.816297,14.872627,...,10.756308,692.0,173.0,15.197568,359.0,141.0,23.875846,1051.0,223.0,12.898412


In [11]:
# Write the complete table (every county plus the national row) to an Excel
# workbook in the current working directory.
df.to_excel(excel_writer='HHI_EduAttain_county_FullUS.xlsx')

# Grab NYC Metro Region counties only

In [12]:
# Keep only the rows whose GEO_ID appears in the imported `stco` collection.
# .copy() makes df_31cr an independent frame: a column is assigned to it
# further down, and doing that on a bare filtered slice raises pandas'
# SettingWithCopyWarning.
df_31cr = df[df['GEO_ID'].isin(stco)].copy()
df_31cr.shape

(31, 46)

In [13]:
# Write the region-only county table to an Excel workbook in the current
# working directory.
df_31cr.to_excel(excel_writer='HHI_EduAttain_county_31cr.xlsx')

## Subregion Calc

In [14]:
# Join key for the crosswalk: the last five characters of GEO_ID
# (e.g. '0500000US34025' -> '34025').
# assign() returns a new frame rather than writing into a filtered slice, so
# no SettingWithCopyWarning is raised and the cell can be re-run safely.
df_31cr = df_31cr.assign(stco=df_31cr['GEO_ID'].str[-5:])
df_31cr.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,GEO_ID,HHI_E,HHI_M,HHI_C,HHI_U25E,HHI_U25M,HHI_U25C,HHI_2549E,HHI_2549M,HHI_2549C,...,Ed_BachE,Ed_BachM,Ed_BachC,Ed_GradE,Ed_GradM,Ed_GradC,Ed_OBachE,Ed_OBachM,Ed_OBachC,stco
277,0500000US34025,233874.0,1154.0,0.299956,30485.0,1189.963865,2.372912,33151.0,1285.317082,2.356936,...,117766.0,2210.0,1.140792,77607.0,1601.0,1.254078,195373.0,2144.0,0.667105,34025
278,0500000US34037,53361.0,533.0,0.607208,5285.0,435.03678,5.003974,7707.0,511.250428,4.032575,...,23865.0,780.0,1.98686,11392.0,621.0,3.313796,35257.0,919.0,1.584544,34037
280,0500000US34013,282502.0,1195.0,0.257146,67957.0,1795.2543,1.605927,56018.0,1676.226715,1.819027,...,110505.0,2029.0,1.11618,74587.0,1689.0,1.376577,185092.0,2138.0,0.702189,34013
281,0500000US34029,225270.0,1319.0,0.355939,37705.0,1373.102691,2.213799,46527.0,1392.711384,1.81966,...,78983.0,2005.0,1.543174,39715.0,1281.0,1.960779,118698.0,2344.0,1.200462,34029
284,0500000US34031,163670.0,890.0,0.330564,34238.0,1286.204883,2.283683,31204.0,1176.098635,2.291224,...,62770.0,1631.0,1.579559,30716.0,1170.0,2.315556,93486.0,1696.0,1.102842,34031


In [15]:
# Load the county-to-subregion crosswalk, then render its 'stco' values as
# text left-padded with zeros to a width of five so they match the string
# codes in df_31cr['stco'].
geo_xwalk = pd.read_excel('31CR_CoxSub.xlsx')
geo_xwalk['stco'] = geo_xwalk['stco'].map('{0:0>5}'.format)

In [16]:
# Attach the crosswalk's subregion label to every county row (inner join on
# 'stco'), then remove the identifier/label columns so that only 'subreg1'
# and the measure columns are left for aggregation.
id_columns = ['stco','st','co','stco_int','subreg2','reg','stco_lbl','co_lbl','GEO_ID']
df_subreg = pd.merge(geo_xwalk, df_31cr, on='stco').drop(columns=id_columns)
df_subreg.head()

Unnamed: 0,subreg1,HHI_E,HHI_M,HHI_C,HHI_U25E,HHI_U25M,HHI_U25C,HHI_2549E,HHI_2549M,HHI_2549C,...,Ed_ColC,Ed_BachE,Ed_BachM,Ed_BachC,Ed_GradE,Ed_GradM,Ed_GradC,Ed_OBachE,Ed_OBachM,Ed_OBachC
0,CT,340491.0,1330.0,0.237454,47030.0,1354.387315,1.750661,52177.0,1460.319486,1.701386,...,1.021525,169677.0,2504.0,0.897109,134241.0,2581.0,1.168791,303918.0,3000.0,0.600066
1,CT,73987.0,840.0,0.690173,9766.0,580.572993,3.613884,13369.0,736.387126,3.348431,...,1.865906,27363.0,1096.0,2.434899,19907.0,862.0,2.632301,47270.0,1206.0,1.550943
2,CT,329857.0,1775.0,0.32712,61325.0,1745.295963,1.730078,64792.0,1749.018296,1.640994,...,1.129048,111839.0,2109.0,1.14635,96127.0,2018.0,1.276174,207966.0,2685.0,0.784849
3,NJ In,338249.0,1241.0,0.223033,43482.0,1515.336266,2.118525,47246.0,1468.487657,1.889467,...,1.271165,197262.0,2629.0,0.81018,118845.0,2386.0,1.22046,316107.0,3255.0,0.625966
4,NJ In,282502.0,1195.0,0.257146,67957.0,1795.2543,1.605927,56018.0,1676.226715,1.819027,...,1.130523,110505.0,2029.0,1.11618,74587.0,1689.0,1.376577,185092.0,2138.0,0.702189


In [17]:
# Collapse the county rows to one row per 'subreg1' value using the project
# helper geo.calculate_sumgeo (implementation not shown in this notebook).
# This cell overwrites df_subreg with the aggregated result, so re-running it
# without re-running the merge cell above feeds already-aggregated data back in.
# NOTE(review): the HHI_MdE values displayed below (e.g. 238411 for CT) look
# like county medians added together; medians are not additive -- confirm how
# calculate_sumgeo treats the HHI_Md* columns before using them.
df_subreg = geo.calculate_sumgeo(df_subreg,'subreg1')
df_subreg

Unnamed: 0,Ed_BachC,Ed_BachE,Ed_BachM,Ed_C,Ed_ColC,Ed_ColE,Ed_ColM,Ed_E,Ed_GradC,Ed_GradE,...,HHI_MdC,HHI_MdE,HHI_MdM,HHI_O150C,HHI_O150E,HHI_O150M,HHI_U25C,HHI_U25E,HHI_U25M,subreg1
0,0.679466,308879.0,3452.406842,0.00789,0.709482,316292.0,3691.434816,1372394.0,0.822867,250275.0,...,0.631653,238411.0,2477.254327,0.815566,172610.0,2315.745236,1.175541,118121.0,2284.182129,CT
1,0.386761,889092.0,5656.598713,0.004052,0.503002,732401.0,6060.171285,3511442.0,0.515846,583075.0,...,0.351748,674041.0,3900.171919,0.524693,443658.0,3829.299153,0.788509,290430.0,3767.160469,NJ In
2,0.679002,316930.0,3539.974294,0.011939,0.643788,344839.0,3651.952218,1354643.0,0.805294,206356.0,...,0.491145,524910.0,4240.923366,0.833473,170784.0,2341.557601,1.279613,104887.0,2207.83378,NJ Out
3,0.39341,1292814.0,8366.572835,0.003309,0.412765,1195418.0,8116.861462,5923498.0,0.482471,924431.0,...,0.379421,320813.0,2002.346374,0.561181,542993.0,5012.596632,0.508288,772160.0,6456.287323,NYC
4,1.098613,111253.0,2010.584492,0.016441,0.865885,185174.0,2637.583553,628416.0,1.228729,86654.0,...,0.826286,274348.0,3729.048404,1.467882,56389.0,1361.60567,1.762669,56086.0,1626.264124,Mid Hud
5,0.554621,433167.0,3952.004555,0.005391,0.555459,492970.0,4504.417609,1969430.0,0.691822,358268.0,...,0.431612,207915.0,1476.200867,0.676551,293941.0,3271.345442,1.365515,99663.0,2238.702973,LI
6,0.777908,221026.0,2828.379218,0.01181,0.929868,202387.0,3095.778739,939765.0,0.859734,210530.0,...,0.750246,286052.0,3530.325906,0.890908,145393.0,2130.797738,1.730627,68044.0,1937.132159,Low Hud


In [19]:
# Write the subregion table to an Excel workbook in the current working
# directory.
# NOTE(review): this file name spells 'EduAttai' while the other two exports
# use 'EduAttain'; left unchanged so anything that reads this file keeps
# working -- confirm before renaming.
df_subreg.to_excel(excel_writer='HHI_EduAttai_subregion_31cr.xlsx')