## I. Extract data

In [22]:
import pandas as pd
import requests
import json

import warnings
warnings.filterwarnings('ignore')

# Import API key
from config import key

In [23]:
state_county = pd.read_csv("Resources/all-geocodes-v2017.csv", dtype={'County Code (FIPS)': object, 'State Code (FIPS)': object})
state_county

state_county.columns


Index(['Summary Level', 'State Code (FIPS)', 'County Code (FIPS)',
       'County Subdivision Code (FIPS)', 'Place Code (FIPS)',
       'Consolidtated City Code (FIPS)',
       'Area Name (including legal/statistical area description)'],
      dtype='object')

In [4]:
# import diversity_index data and state_code data
diversity = pd.read_csv("Resources/diversityindex.csv")
state_code = pd.read_csv("Resources/state_code.csv")

In [5]:
# import unemployment data (already cleaned, no further transformation needed)
unemployment= pd.read_csv("Resources/umemployment_bystate.csv")

## II. Transform: data cleaning/transformation

### Diversity data

In [7]:
diversity.head()

Unnamed: 0,Location,Diversity-Index,"Black or African American alone, percent, 2013","American Indian and Alaska Native alone, percent, 2013","Asian alone, percent, 2013","Native Hawaiian and Other Pacific Islander alone, percent,","Two or More Races, percent, 2013","Hispanic or Latino, percent, 2013","White alone, not Hispanic or Latino, percent, 2013"
0,"Aleutians West Census Area, AK",0.769346,7.4,13.8,31.1,2.3,4.8,14.6,29.2
1,"Queens County, NY",0.742224,20.9,1.3,25.2,0.2,2.7,28.0,26.7
2,"Maui County, HI",0.740757,0.8,0.6,28.8,10.6,23.3,10.7,31.5
3,"Alameda County, CA",0.740399,12.4,1.2,28.2,1.0,5.2,22.7,33.2
4,"Aleutians East Borough, AK",0.738867,7.7,21.8,41.4,0.7,3.7,13.5,12.9


In [8]:
state_code.head()

Unnamed: 0,State,Abbrev,Code
0,Alabama,Ala.,AL
1,Alaska,Alaska,AK
2,Arizona,Ariz.,AZ
3,Arkansas,Ark.,AR
4,California,Calif.,CA


In [9]:
# select only "State" and "Code" columns
code_df = state_code[["Code","State"]]
code_df.head()

Unnamed: 0,Code,State
0,AL,Alabama
1,AK,Alaska
2,AZ,Arizona
3,AR,Arkansas
4,CA,California


In [10]:
diversity.columns

Index(['Location', 'Diversity-Index',
       'Black or African American alone, percent, 2013',
       'American Indian and Alaska Native alone, percent, 2013',
       'Asian alone, percent, 2013',
       'Native Hawaiian and Other Pacific Islander alone, percent,',
       'Two or More Races, percent, 2013', 'Hispanic or Latino, percent, 2013',
       'White alone, not Hispanic or Latino, percent, 2013'],
      dtype='object')

In [11]:
# rename the columns
diversity = diversity.rename(columns = {"Black or African American alone, percent, 2013":"BLK_Percent",
                                        "American Indian and Alaska Native alone, percent, 2013" : "Amrican_indian/Alk_native_Percent",
                                        "Asian alone, percent, 2013":"Asian_percent",
                                        "Native Hawaiian and Other Pacific Islander alone, percent,":"Hawaiian/Pacific_Islander_Percent",
                                        "Two or More Races, percent, 2013":"Two_or_more_percent",
                                        "Hispanic or Latino, percent, 2013":"His/Latino_Percent",
                                        "White alone, not Hispanic or Latino, percent, 2013":"white_percent"
                                       })
diversity.head()                      

Unnamed: 0,Location,Diversity-Index,BLK_Percent,Amrican_indian/Alk_native_Percent,Asian_percent,Hawaiian/Pacific_Islander_Percent,Two_or_more_percent,His/Latino_Percent,white_percent
0,"Aleutians West Census Area, AK",0.769346,7.4,13.8,31.1,2.3,4.8,14.6,29.2
1,"Queens County, NY",0.742224,20.9,1.3,25.2,0.2,2.7,28.0,26.7
2,"Maui County, HI",0.740757,0.8,0.6,28.8,10.6,23.3,10.7,31.5
3,"Alameda County, CA",0.740399,12.4,1.2,28.2,1.0,5.2,22.7,33.2
4,"Aleutians East Borough, AK",0.738867,7.7,21.8,41.4,0.7,3.7,13.5,12.9


In [12]:
# split column "Location" by comma so we got both "County" and "State_code" column
diversity[["County","Code"]] = diversity["Location"].str.split("," , n = 1, expand = True)
diversity = diversity.drop(["Location"], axis=1)
diversity.head()

Unnamed: 0,Diversity-Index,BLK_Percent,Amrican_indian/Alk_native_Percent,Asian_percent,Hawaiian/Pacific_Islander_Percent,Two_or_more_percent,His/Latino_Percent,white_percent,County,Code
0,0.769346,7.4,13.8,31.1,2.3,4.8,14.6,29.2,Aleutians West Census Area,AK
1,0.742224,20.9,1.3,25.2,0.2,2.7,28.0,26.7,Queens County,NY
2,0.740757,0.8,0.6,28.8,10.6,23.3,10.7,31.5,Maui County,HI
3,0.740399,12.4,1.2,28.2,1.0,5.2,22.7,33.2,Alameda County,CA
4,0.738867,7.7,21.8,41.4,0.7,3.7,13.5,12.9,Aleutians East Borough,AK


In [13]:
# add a year column filled with 2013
diversity = diversity.assign(Year='2013')
diversity.head()

Unnamed: 0,Diversity-Index,BLK_Percent,Amrican_indian/Alk_native_Percent,Asian_percent,Hawaiian/Pacific_Islander_Percent,Two_or_more_percent,His/Latino_Percent,white_percent,County,Code,Year
0,0.769346,7.4,13.8,31.1,2.3,4.8,14.6,29.2,Aleutians West Census Area,AK,2013
1,0.742224,20.9,1.3,25.2,0.2,2.7,28.0,26.7,Queens County,NY,2013
2,0.740757,0.8,0.6,28.8,10.6,23.3,10.7,31.5,Maui County,HI,2013
3,0.740399,12.4,1.2,28.2,1.0,5.2,22.7,33.2,Alameda County,CA,2013
4,0.738867,7.7,21.8,41.4,0.7,3.7,13.5,12.9,Aleutians East Borough,AK,2013


In [14]:
# there is a space in the "Code" column
diversity["Code"].unique()

array([' AK', ' NY', ' HI', ' CA', ' TX', None, ' NC', ' GA', ' VA',
       ' MD', ' IL', ' NJ', ' KS', ' FL', ' NV', ' MA', ' PA', ' NM',
       ' OK', ' WI', ' SC', ' DC', ' AZ', ' LA', ' CO', ' IN', ' MS',
       ' MI', ' NE', ' MN', ' AR', ' TN', ' MO', ' DE', ' WA', ' AL',
       ' OR', ' UT', ' SD', ' OH', ' CT', ' MT', ' ND', ' ID', ' IA',
       ' RI', ' KY', ' WY', ' WV', ' NH', ' VT', ' ME'], dtype=object)

In [None]:
# get rid of space in the "Code" column
diversity.Code = diversity.Code.str.replace(' ', '')

In [None]:
merged_df = pd.merge(diversity,code_df, how = "left", on="Code")
merged_df

### Income Data

While we had access to a state-level median income file from my (Richa's) previous group project, we decided to do an API pull for this project to make the process more automated. Realizing that every public datafile contains either the name, or the FIPS code of a place, we first downloaded the FIPS codes csv from census.gov. This file had too much information, so we cleaned it down to just state and county level information.

In [24]:
## State and county

state_county_clean = state_county.loc[(state_county['County Subdivision Code (FIPS)'] == 0) & 
                                      (state_county['Place Code (FIPS)'] == 0) & 
                                      (state_county['Consolidtated City Code (FIPS)'] == 0)]

state_county_clean.head()


Unnamed: 0,Summary Level,State Code (FIPS),County Code (FIPS),County Subdivision Code (FIPS),Place Code (FIPS),Consolidtated City Code (FIPS),Area Name (including legal/statistical area description)
0,40,1,0,0,0,0,Alabama
1,50,1,1,0,0,0,Autauga County
2,50,1,3,0,0,0,Baldwin County
3,50,1,5,0,0,0,Barbour County
4,50,1,7,0,0,0,Bibb County


In [25]:
## States only

state_clean = state_county.loc[(state_county['County Code (FIPS)'] == '000') & 
                               (state_county['County Subdivision Code (FIPS)'] == 0) & 
                               (state_county['Place Code (FIPS)'] == 0) & 
                               (state_county['Consolidtated City Code (FIPS)'] == 0)]

state_clean.head()


Unnamed: 0,Summary Level,State Code (FIPS),County Code (FIPS),County Subdivision Code (FIPS),Place Code (FIPS),Consolidtated City Code (FIPS),Area Name (including legal/statistical area description)
0,40,1,0,0,0,0,Alabama
529,40,2,0,0,0,0,Alaska
707,40,4,0,0,0,0,Arizona
814,40,5,0,0,0,0,Arkansas
1391,40,6,0,0,0,0,California


In [26]:
## Counties only

county_clean = state_county.loc[(state_county['County Code (FIPS)'] != '000') & 
                                (state_county['County Subdivision Code (FIPS)'] == 0) & 
                                (state_county['Place Code (FIPS)'] == 0) & 
                                (state_county['Consolidtated City Code (FIPS)'] == 0)]

county_clean.head()


Unnamed: 0,Summary Level,State Code (FIPS),County Code (FIPS),County Subdivision Code (FIPS),Place Code (FIPS),Consolidtated City Code (FIPS),Area Name (including legal/statistical area description)
1,50,1,1,0,0,0,Autauga County
2,50,1,3,0,0,0,Baldwin County
3,50,1,5,0,0,0,Barbour County
4,50,1,7,0,0,0,Bibb County
5,50,1,9,0,0,0,Blount County


I then pulled the income data using the Census API. From the documentation I found out that the median household income variable is coded "DP03_0062E," and the names can be pulled along with the variable if specified in the API URL.

In [27]:
response_list = []

for index, row in county_clean.iterrows():
    state_code = row['State Code (FIPS)']
    county_code = row['County Code (FIPS)']
    
    print(state_code)
    print(county_code)
    
    url = f"https://api.census.gov/data/2018/acs/acs5/profile?get=DP03_0062E,NAME&for=county:{county_code}&in=state:{state_code}&key={key}"
   
    try:
        r = requests.get(url)
        response = requests.get(url)
        data = response.json()
        
        index = data[0]
        value = data[1]
        
        df_dict = {'Index':index,'Value':value}
        response_list.append(df_dict)
        
    except Exception as e:
        print(e)
        
        
response_list


01
001
01
003
01
005
01
007
01
009
01
011
01
013
01
015
01
017
01
019
01
021
01
023
01
025
01
027
01
029
01
031
01
033
01
035
01
037
01
039
01
041
01
043
01
045
01
047
01
049
01
051
01
053
01
055
01
057
01
059
01
061
01
063
01
065
01
067
01
069
01
071
01
073
01
075
01
077
01
079
01
081
01
083
01
085
01
087
01
089
01
091
01
093
01
095
01
097
01
099
01
101
01
103
01
105
01
107
01
109
01
111
01
113
01
115
01
117
01
119
01
121
01
123
01
125
01
127
01
129
01
131
01
133
02
013
02
016
02
020
02
050
02
060
02
068
02
070
02
090
02
100
02
105
02
110
02
122
02
130
02
150
02
158
02
164
02
170
02
180
02
185
02
188
02
195
02
198
02
220
02
230
02
240
02
261
02
275
02
282
02
290
04
001
04
003
04
005
04
007
04
009
04
011
04
012
04
013
04
015
04
017
04
019
04
021
04
023
04
025
04
027
05
001
05
003
05
005
05
007
05
009
05
011
05
013
05
015
05
017
05
019
05
021
05
023
05
025
05
027
05
029
05
031
05
033
05
035
05
037
05
039
05
041
05
043
05
045
05
047
05
049
05
051
05
053
05
055
05
057
05
059
05
061
05
063

22
117
22
119
22
121
22
123
22
125
22
127
23
001
23
003
23
005
23
007
23
009
23
011
23
013
23
015
23
017
23
019
23
021
23
023
23
025
23
027
23
029
23
031
24
001
24
003
24
005
24
009
24
011
24
013
24
015
24
017
24
019
24
021
24
023
24
025
24
027
24
029
24
031
24
033
24
035
24
037
24
039
24
041
24
043
24
045
24
047
24
510
25
001
25
003
25
005
25
007
25
009
25
011
25
013
25
015
25
017
25
019
25
021
25
023
25
025
25
027
26
001
26
003
26
005
26
007
26
009
26
011
26
013
26
015
26
017
26
019
26
021
26
023
26
025
26
027
26
029
26
031
26
033
26
035
26
037
26
039
26
041
26
043
26
045
26
047
26
049
26
051
26
053
26
055
26
057
26
059
26
061
26
063
26
065
26
067
26
069
26
071
26
073
26
075
26
077
26
079
26
081
26
083
26
085
26
087
26
089
26
091
26
093
26
095
26
097
26
099
26
101
26
103
26
105
26
107
26
109
26
111
26
113
26
115
26
117
26
119
26
121
26
123
26
125
26
127
26
129
26
131
26
133
26
135
26
137
26
139
26
141
26
143
26
145
26
147
26
149
26
151
26
153
26
155
26
157
26
159
26
161
26
163
26
165

42
115
42
117
42
119
42
121
42
123
42
125
42
127
42
129
42
131
42
133
44
001
44
003
44
005
44
007
44
009
45
001
45
003
45
005
45
007
45
009
45
011
45
013
45
015
45
017
45
019
45
021
45
023
45
025
45
027
45
029
45
031
45
033
45
035
45
037
45
039
45
041
45
043
45
045
45
047
45
049
45
051
45
053
45
055
45
057
45
059
45
061
45
063
45
065
45
067
45
069
45
071
45
073
45
075
45
077
45
079
45
081
45
083
45
085
45
087
45
089
45
091
46
003
46
005
46
007
46
009
46
011
46
013
46
015
46
017
46
019
46
021
46
023
46
025
46
027
46
029
46
031
46
033
46
035
46
037
46
039
46
041
46
043
46
045
46
047
46
049
46
051
46
053
46
055
46
057
46
059
46
061
46
063
46
065
46
067
46
069
46
071
46
073
46
075
46
077
46
079
46
081
46
083
46
085
46
087
46
089
46
091
46
093
46
095
46
097
46
099
46
101
46
102
46
103
46
105
46
107
46
109
46
111
46
115
46
117
46
119
46
121
46
123
46
125
46
127
46
129
46
135
46
137
47
001
47
003
47
005
47
007
47
009
47
011
47
013
47
015
47
017
47
019
47
021
47
023
47
025
47
027
47
029
47
031

[{'Index': ['DP03_0062E', 'NAME', 'state', 'county'],
  'Value': ['58786', 'Autauga County, Alabama', '01', '001']},
 {'Index': ['DP03_0062E', 'NAME', 'state', 'county'],
  'Value': ['55962', 'Baldwin County, Alabama', '01', '003']},
 {'Index': ['DP03_0062E', 'NAME', 'state', 'county'],
  'Value': ['34186', 'Barbour County, Alabama', '01', '005']},
 {'Index': ['DP03_0062E', 'NAME', 'state', 'county'],
  'Value': ['45340', 'Bibb County, Alabama', '01', '007']},
 {'Index': ['DP03_0062E', 'NAME', 'state', 'county'],
  'Value': ['48695', 'Blount County, Alabama', '01', '009']},
 {'Index': ['DP03_0062E', 'NAME', 'state', 'county'],
  'Value': ['32152', 'Bullock County, Alabama', '01', '011']},
 {'Index': ['DP03_0062E', 'NAME', 'state', 'county'],
  'Value': ['39109', 'Butler County, Alabama', '01', '013']},
 {'Index': ['DP03_0062E', 'NAME', 'state', 'county'],
  'Value': ['45197', 'Calhoun County, Alabama', '01', '015']},
 {'Index': ['DP03_0062E', 'NAME', 'state', 'county'],
  'Value': ['39

In [28]:
income_counties_df1 = pd.DataFrame(response_list[0])

income_counties_df1_transposed = income_counties_df1.T
income_counties_df1_transposed.columns = income_counties_df1_transposed.iloc[0]
income_counties_df1_transposed = income_counties_df1_transposed.iloc[1:]

dicts = range(1,len(response_list))

for d in dicts:
    temp_df = pd.DataFrame.from_dict(response_list[d])
    temp_df_transposed = temp_df.T
    temp_df_transposed.columns = temp_df_transposed.iloc[0]
    temp_df_transposed = temp_df_transposed.iloc[1:]
    
    income_counties_df1_transposed = income_counties_df1_transposed.append(temp_df_transposed)

income_counties_df1_transposed.head()


Index,DP03_0062E,NAME,state,county
Value,58786,"Autauga County, Alabama",1,1
Value,55962,"Baldwin County, Alabama",1,3
Value,34186,"Barbour County, Alabama",1,5
Value,45340,"Bibb County, Alabama",1,7
Value,48695,"Blount County, Alabama",1,9


In [29]:
income_counties_df1_transposed = income_counties_df1_transposed[['DP03_0062E', 'NAME', 'state', 'county']]

income_counties_df1_transposed = income_counties_df1_transposed.rename(columns={"DP03_0062E": "Median_HHI", "state": "state_code", "county": "county_code"})

income_counties_df1_transposed.to_csv('Resources/median_hhi_counties_api.csv')

income_counties_df1_transposed.head()

For safety purposes (and this presentation), I also downloaded the county-level information from the census website, which can be loaded more quickly. This file had A LOT of information, from which I extracted median household income estimates (instead of median household income margin of error, or percent estimate, or percent estimate margin of error).

In [30]:
income_counties_df = pd.read_csv("Archive/median_hhi_raw.csv")

income_counties_df.columns = income_counties_df.iloc[0]

income_counties_df = income_counties_df.iloc[1:]

income_counties_df = income_counties_df.set_index('id')

income_counties_df = income_counties_df.filter(regex="Median household income")

income_counties_df = income_counties_df.filter(regex="Estimate!!")

income_counties_df = income_counties_df.reset_index()

income_counties_df.columns = ["id", "Median_HHI", "Median_HHI_Perc"]

income_counties_df = income_counties_df[['id', 'Median_HHI']]

income_counties_df.head()


Unnamed: 0,id,Median_HHI
0,0500000US01001,58786
1,0500000US01003,55962
2,0500000US01005,34186
3,0500000US01007,45340
4,0500000US01009,48695


In [19]:
income_counties_df['state_code'] = income_counties_df.id.str[9:11]

income_counties_df.head()


Unnamed: 0,id,Median_HHI,state_code
0,0500000US01001,58786,1
1,0500000US01003,55962,1
2,0500000US01005,34186,1
3,0500000US01007,45340,1
4,0500000US01009,48695,1


In [20]:
income_counties_df['county_code'] = income_counties_df.id.str[11:14]

income_counties_df.head()


Unnamed: 0,id,Median_HHI,state_code,county_code
0,0500000US01001,58786,1,1
1,0500000US01003,55962,1,3
2,0500000US01005,34186,1,5
3,0500000US01007,45340,1,7
4,0500000US01009,48695,1,9


In [21]:
income_counties_df = income_counties_df.astype({"state_code": int, "county_code": int})

income_counties_df.to_csv("Resources/median_hhi_counties.csv")

income_counties_df.head()

Unnamed: 0,id,Median_HHI,state_code,county_code
0,0500000US01001,58786,1,1
1,0500000US01003,55962,1,3
2,0500000US01005,34186,1,5
3,0500000US01007,45340,1,7
4,0500000US01009,48695,1,9


From my previous group project, I already had a state-level income file (credits to Julia Leonoff), which I cleaned up to contain the most-recent information for queries that need state-level income only.

In [16]:
income_df = pd.read_csv("https://raw.githubusercontent.com/RichaG7/Commitment-Issues/master/Resources/U.S._Census_income_data/household_median_income_2017.csv")

median_hhi_2017_state = income_df[["State", "2017"]]

median_hhi_2017_state.columns = ['State', 'Median HHI']

median_hhi_2017_state["Year"] = 2017

median_hhi_2017_state

median_hhi_2017_state.to_csv('Resources/median_hhi_2017_state.csv')




## III. Load the final database