## Pop & Housing variables pulled from 2010, and 2020 Censuses (using Cen API) for urban villages,  City of Phoenix and U.S.
### added Maricopa County, State of Arizona, and comparable cities

Total Housing Units, Population, Population by Ethncity, Total Occupied Housing Units,  

Note: to update list of comp cities, adjust dictionary in getters script. also note Indianapolis is a consolidated city.

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import json

In [2]:
import getters as get

In [34]:
#import block data
blk_10 = pd.read_csv('../data/geo/blk_vil_10_reduced.csv')
blk_20 = pd.read_csv('../data/geo/blk_vil_20_reduced.csv')
#for df in [blk_10,blk_20]: df.geoid = df.geoid.apply(lambda x: '{0:0>15}'.format(x))

drop_cols = ['aland','awater','lat','lon','land_acre']
for df in [blk_10,blk_20]: df.drop(drop_cols,axis=1,inplace=True)
for df in [blk_10,blk_20]: df.rename({'geoid':'GEO_ID'},axis=1,inplace=True)

In [4]:
## set sources, define variable lists by Decennial Census year
#SF1 Dec in 2010, Redistricting file in 2020 (until 2020 tables are released)

source_dec = 'dec/sf1'
source_red = 'dec/pl'

#variables for each Census
years = {'2010':'P001001,P005003,P005004,P005005,P005006,P005007,P005008,P005009,P005010,H003001,H003002,H003003',\
        '2020':'P1_001N,P2_002N,P2_005N,P2_006N,P2_007N,P2_008N,P2_009N,P2_010N,P2_011N,H1_001N,H1_002N,H1_003N'}

#rename 2010 columns to group
col_10_rename={'P001001':'Pop_10E','P005003':'P_Wh_10E','P005004':'P_Bl_10E','P005006':'P_As_10E','P005010':'P_Hi_10E',\
               'P005005':'P_Ot_10E','P005007':'P_Ot_10E','P005008':'P_Ot_10E','P005009':'P_Ot_10E',\
              'H003001':'Hou_10E','H003002':'Hou_O_10E','H003003':'Hou_V_10E'}

#rename 2020 columns to group
col_20_rename = {'P1_001N':'Pop_20E','P2_002N':'P_Hi_20E',\
              'P2_005N':'P_Wh_20E','P2_006N':'P_Bl_20E',\
              'P2_007N':'P_Ot_20E','P2_008N':'P_As_20E','P2_009N':'P_Ot_20E',\
              'P2_010N':'P_Ot_20E','P2_011N':'P_Ot_20E','H1_001N':'Hou_20E',\
              'H1_002N':'Hou_O_20E','H1_003N':'Hou_V_20E'}

### 2010 and 2020 all blocks

In [5]:
census_key = os.getenv('Census_API')
base_url = f'https://api.census.gov/data/'

def get_blk(source,year,col):
    url = f'{base_url}{year}/{source}?get={col}&for=block:*&in=state:04&in=county:013&in=tract*&key={census_key}'
    resp = requests.request('GET',url).content
    df = pd.DataFrame(json.loads(resp)[1:])
    df.columns = json.loads(resp)[0]
    df['GEO_ID'] = df.state + df.county + df.tract + df.block
    df = df.drop(['state','county','tract','block'],axis=1)
    return df

In [6]:
bk10 = get_blk(source_dec,list(years.keys())[0],years.get(list(years.keys())[0]))
bk20 = get_blk(source_red,list(years.keys())[1],years.get(list(years.keys())[1]))

for df in [bk10,bk20]:
    for col in df.columns[2:]:
        df[col] = df[col].astype(int)
    
bk10.rename(columns=col_10_rename,inplace=True)
bk20.rename(columns=col_20_rename,inplace=True)

bk10 = bk10.groupby(bk10.columns,axis=1).sum()
bk20 = bk20.groupby(bk20.columns,axis=1).sum()

In [14]:
bk10[f'P_NWh_10E'] = bk10[f'Pop_10E'] - bk10[f'P_Wh_10E']
bk20[f'P_NWh_20E'] = bk20[f'Pop_20E'] - bk20[f'P_Wh_20E']

In [10]:
xwalk = pd.read_csv('../data/geo/nhgis_blk2010_blk2020_04013.csv')

In [15]:
df = pd.merge(bk10,xwalk,how='left',left_on='GEO_ID',right_on='GEOID10')

In [16]:
adj_vars = ['Hou','Hou_O','Hou_V','P_Bl','P_Hi','P_Ot','P_Wh','Pop','P_NWh']
for v in adj_vars:
    df[f'{v}_10E'] = df[f'{v}_10E']*df['WEIGHT'] 

In [17]:
df.head()

Unnamed: 0,GEO_ID,Hou_10E,Hou_O_10E,Hou_V_10E,P_As_10E,P_Bl_10E,P_Hi_10E,P_Ot_10E,P_Wh_10E,Pop_10E,P_NWh_10E,GEOID10,GEOID20,WEIGHT,PAREA
0,40130101011001,5.0,5.0,0.0,0,0.0,2.0,0.0,10.0,12.0,2.0,40130101011001,40130101031005,1.0,1.0
1,40130101011068,12.0,10.0,2.0,0,0.0,1.0,0.0,23.0,24.0,1.0,40130101011068,40130101032073,1.0,1.0
2,40130101011072,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,40130101011072,40130101032070,1.0,1.0
3,40130101011002,1.596883,1.596883,0.0,0,0.0,0.0,0.0,4.258354,4.258354,0.0,40130101011002,40130101031001,0.532294,0.543901
4,40130101011002,1.403117,1.403117,0.0,0,0.0,0.0,0.0,3.741646,3.741646,0.0,40130101011002,40130101031002,0.467706,0.456099


In [18]:
df.shape

(85341, 15)

In [19]:
dff = df.copy().dropna().drop(['GEOID10','GEO_ID','WEIGHT',\
                               'PAREA'],axis=1).groupby(['GEOID20']).sum().reset_index()

In [26]:
dff = pd.merge(bk20,dff,how='left',left_on='GEO_ID',right_on='GEOID20')
dff.head()

Unnamed: 0,GEO_ID,Hou_20E,Hou_O_20E,Hou_V_20E,P_As_20E,P_Bl_20E,P_Hi_20E,P_Ot_20E,P_Wh_20E,Pop_20E,...,Hou_10E,Hou_O_10E,Hou_V_10E,P_As_10E,P_Bl_10E,P_Hi_10E,P_Ot_10E,P_Wh_10E,Pop_10E,P_NWh_10E
0,40131069002011,9,7,2,0,0,30.0,0,11,41.0,...,9.0,8.0,1.0,1,0.0,18.0,0.0,12.0,31.0,19.0
1,40131075003004,18,18,0,2,3,12.0,3,21,41.0,...,17.0,16.0,1.0,0,0.0,5.0,2.0,27.0,34.0,7.0
2,40130830001001,26,26,0,0,12,54.0,4,14,84.0,...,0.898272,0.898272,0.0,0,0.0,1.347409,0.0,0.449136,1.796545,1.347409
3,40131032122003,18,18,0,0,0,4.0,2,32,38.0,...,18.0,18.0,0.0,0,0.0,2.0,0.0,42.0,44.0,2.0
4,40131042264002,20,20,0,0,6,3.0,2,49,60.0,...,20.0,19.0,1.0,0,0.0,0.0,0.0,49.0,49.0,0.0


In [35]:
test = pd.merge(blk_20,dff,how='left',on='GEO_ID')

In [36]:
test.head(20)

Unnamed: 0,GEO_ID,name,Hou_20E,Hou_O_20E,Hou_V_20E,P_As_20E,P_Bl_20E,P_Hi_20E,P_Ot_20E,P_Wh_20E,...,Hou_10E,Hou_O_10E,Hou_V_10E,P_As_10E,P_Bl_10E,P_Hi_10E,P_Ot_10E,P_Wh_10E,Pop_10E,P_NWh_10E
0,40136147001059,Deer Valley,0,0,0,0,0,0.0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,40131167283000,Ahwatukee Foothills,210,205,5,40,10,35.0,27,456,...,157.398496,154.095217,3.303279,72,10.845681,30.003351,22.824266,397.578489,504.732857,107.154368
2,40131042054007,North Mountain,0,0,0,0,0,0.0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,40131171001000,Encanto,0,0,0,0,0,0.0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
4,40136122002000,Desert View,768,738,30,184,65,185.0,106,1275,...,312.340508,233.139153,79.201356,106,12.590985,42.61608,18.43411,432.785441,554.824213,122.038772
5,40131086022007,Camelback East,195,183,12,3,118,117.0,12,105,...,196.0,154.0,42.0,4,127.0,105.0,15.0,77.0,328.0,251.0
6,40131086023008,Camelback East,30,30,0,0,4,15.0,0,33,...,30.0,25.0,5.0,0,0.0,21.0,0.0,23.0,44.0,21.0
7,40131100022014,Maryvale,40,39,1,0,0,147.0,0,15,...,40.0,35.0,5.0,0,1.0,124.0,2.0,12.0,139.0,127.0
8,40131167182006,Ahwatukee Foothills,0,0,0,0,0,0.0,0,0,...,0.0,0.0,0.0,6,0.0,0.0,0.0,0.0,0.0,0.0
9,40131078002004,Camelback East,424,329,95,6,7,29.0,8,430,...,457.132468,329.139528,127.99294,14,0.437969,29.080354,8.875939,450.585006,495.855207,45.270201


In [37]:
for v in adj_vars:
    test[f'{v}_1020']=test[f'{v}_20E']-test[f'{v}_10E']

In [38]:
test.head()

Unnamed: 0,GEO_ID,name,Hou_20E,Hou_O_20E,Hou_V_20E,P_As_20E,P_Bl_20E,P_Hi_20E,P_Ot_20E,P_Wh_20E,...,P_NWh_10E,Hou_1020,Hou_O_1020,Hou_V_1020,P_Bl_1020,P_Hi_1020,P_Ot_1020,P_Wh_1020,Pop_1020,P_NWh_1020
0,40136147001059,Deer Valley,0,0,0,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,40131167283000,Ahwatukee Foothills,210,205,5,40,10,35.0,27,456,...,107.154368,52.601504,50.904783,1.696721,-0.845681,4.996649,4.175734,58.421511,63.267143,4.845632
2,40131042054007,North Mountain,0,0,0,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40131171001000,Encanto,0,0,0,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,40136122002000,Desert View,768,738,30,184,65,185.0,106,1275,...,122.038772,455.659492,504.860847,-49.201356,52.409015,142.38392,87.56589,842.214559,1260.175787,417.961228


In [40]:
test.to_csv('test.csv',index=False)