## I. Extract data

In [1]:
import pandas as pd
import requests
import json

import warnings
warnings.filterwarnings('ignore')

# Import API key
from config import key

In [None]:
# Load the FIPS geocode table. The state and county code columns are read as
# object dtype so they stay strings (later cells compare the county code
# against the string '000').
state_county = pd.read_csv(
    "all-geocodes-v2017.csv",
    dtype={
        'County Code (FIPS)': object,
        'State Code (FIPS)': object,
    },
)

# Show the column names available in the geocode table.
state_county.columns


In [2]:
# Load the two lookup inputs for the diversity section:
#   state_code - state name / abbreviation / two-letter code table
#   diversity  - diversity index and race/ethnicity percentages per location
state_code = pd.read_csv(filepath_or_buffer="state_code.csv")
diversity = pd.read_csv(filepath_or_buffer="Resources/diversityindex.csv")

In [3]:
# Load the state-level unemployment table. Per the notebook author it is
# already cleaned, so it is used as-is with no further transformation.
unemployment = pd.read_csv(filepath_or_buffer="umemployment_bystate.csv")

## II. Transform: data cleaning/transformation

### Diversity data

In [4]:
# Preview the first five rows of the raw diversity table.
diversity.head(n=5)

Unnamed: 0,Location,Diversity-Index,"Black or African American alone, percent, 2013","American Indian and Alaska Native alone, percent, 2013","Asian alone, percent, 2013","Native Hawaiian and Other Pacific Islander alone, percent,","Two or More Races, percent, 2013","Hispanic or Latino, percent, 2013","White alone, not Hispanic or Latino, percent, 2013"
0,"Aleutians West Census Area, AK",0.769346,7.4,13.8,31.1,2.3,4.8,14.6,29.2
1,"Queens County, NY",0.742224,20.9,1.3,25.2,0.2,2.7,28.0,26.7
2,"Maui County, HI",0.740757,0.8,0.6,28.8,10.6,23.3,10.7,31.5
3,"Alameda County, CA",0.740399,12.4,1.2,28.2,1.0,5.2,22.7,33.2
4,"Aleutians East Borough, AK",0.738867,7.7,21.8,41.4,0.7,3.7,13.5,12.9


In [5]:
# Preview the first five rows of the state lookup table.
state_code.head(n=5)

Unnamed: 0,State,Abbrev,Code
0,Alabama,Ala.,AL
1,Alaska,Alaska,AK
2,Arizona,Ariz.,AZ
3,Arkansas,Ark.,AR
4,California,Calif.,CA


In [6]:
# Keep only the two-letter code and the full state name; the "Abbrev" column
# is not carried forward.
code_df = state_code.loc[:, ["Code", "State"]]
code_df.head(n=5)

Unnamed: 0,Code,State
0,AL,Alabama
1,AK,Alaska
2,AZ,Arizona
3,AR,Arkansas
4,CA,California


In [7]:
# List the raw column names so the rename step below can reference them exactly.
diversity.columns

Index(['Location', 'Diversity-Index',
       'Black or African American alone, percent, 2013',
       'American Indian and Alaska Native alone, percent, 2013',
       'Asian alone, percent, 2013',
       'Native Hawaiian and Other Pacific Islander alone, percent,',
       'Two or More Races, percent, 2013', 'Hispanic or Latino, percent, 2013',
       'White alone, not Hispanic or Latino, percent, 2013'],
      dtype='object')

In [8]:
# Replace the verbose Census column headers with compact names.
# NOTE(review): "Amrican_indian/Alk_native_Percent" looks like a typo for
# "American..."; it is left unchanged here because later steps may depend on
# the exact column name - confirm before renaming.
column_renames = {
    "Black or African American alone, percent, 2013": "BLK_Percent",
    "American Indian and Alaska Native alone, percent, 2013": "Amrican_indian/Alk_native_Percent",
    "Asian alone, percent, 2013": "Asian_percent",
    "Native Hawaiian and Other Pacific Islander alone, percent,": "Hawaiian/Pacific_Islander_Percent",
    "Two or More Races, percent, 2013": "Two_or_more_percent",
    "Hispanic or Latino, percent, 2013": "His/Latino_Percent",
    "White alone, not Hispanic or Latino, percent, 2013": "white_percent",
}
diversity = diversity.rename(columns=column_renames)
diversity.head()

Unnamed: 0,Location,Diversity-Index,BLK_Percent,Amrican_indian/Alk_native_Percent,Asian_percent,Hawaiian/Pacific_Islander_Percent,Two_or_more_percent,His/Latino_Percent,white_percent
0,"Aleutians West Census Area, AK",0.769346,7.4,13.8,31.1,2.3,4.8,14.6,29.2
1,"Queens County, NY",0.742224,20.9,1.3,25.2,0.2,2.7,28.0,26.7
2,"Maui County, HI",0.740757,0.8,0.6,28.8,10.6,23.3,10.7,31.5
3,"Alameda County, CA",0.740399,12.4,1.2,28.2,1.0,5.2,22.7,33.2
4,"Aleutians East Borough, AK",0.738867,7.7,21.8,41.4,0.7,3.7,13.5,12.9


In [9]:
# Break "Location" at its first comma into two new columns, "County" (text
# before the comma) and "Code" (text after it), then discard "Location".
location_parts = diversity["Location"].str.split(",", n=1, expand=True)
diversity[["County", "Code"]] = location_parts
diversity = diversity.drop(columns=["Location"])
diversity.head()

Unnamed: 0,Diversity-Index,BLK_Percent,Amrican_indian/Alk_native_Percent,Asian_percent,Hawaiian/Pacific_Islander_Percent,Two_or_more_percent,His/Latino_Percent,white_percent,County,Code
0,0.769346,7.4,13.8,31.1,2.3,4.8,14.6,29.2,Aleutians West Census Area,AK
1,0.742224,20.9,1.3,25.2,0.2,2.7,28.0,26.7,Queens County,NY
2,0.740757,0.8,0.6,28.8,10.6,23.3,10.7,31.5,Maui County,HI
3,0.740399,12.4,1.2,28.2,1.0,5.2,22.7,33.2,Alameda County,CA
4,0.738867,7.7,21.8,41.4,0.7,3.7,13.5,12.9,Aleutians East Borough,AK


In [10]:
# Tag every row with the data year. The value is stored as the string '2013'.
diversity = diversity.copy()
diversity["Year"] = '2013'
diversity.head()

Unnamed: 0,Diversity-Index,BLK_Percent,Amrican_indian/Alk_native_Percent,Asian_percent,Hawaiian/Pacific_Islander_Percent,Two_or_more_percent,His/Latino_Percent,white_percent,County,Code,Year
0,0.769346,7.4,13.8,31.1,2.3,4.8,14.6,29.2,Aleutians West Census Area,AK,2013
1,0.742224,20.9,1.3,25.2,0.2,2.7,28.0,26.7,Queens County,NY,2013
2,0.740757,0.8,0.6,28.8,10.6,23.3,10.7,31.5,Maui County,HI,2013
3,0.740399,12.4,1.2,28.2,1.0,5.2,22.7,33.2,Alameda County,CA,2013
4,0.738867,7.7,21.8,41.4,0.7,3.7,13.5,12.9,Aleutians East Borough,AK,2013


In [11]:
# Inspect the distinct values of "Code": each code carries a leading space
# left over from splitting on the comma.
diversity.loc[:, "Code"].unique()

array([' AK', ' NY', ' HI', ' CA', ' TX', None, ' NC', ' GA', ' VA',
       ' MD', ' IL', ' NJ', ' KS', ' FL', ' NV', ' MA', ' PA', ' NM',
       ' OK', ' WI', ' SC', ' DC', ' AZ', ' LA', ' CO', ' IN', ' MS',
       ' MI', ' NE', ' MN', ' AR', ' TN', ' MO', ' DE', ' WA', ' AL',
       ' OR', ' UT', ' SD', ' OH', ' CT', ' MT', ' ND', ' ID', ' IA',
       ' RI', ' KY', ' WY', ' WV', ' NH', ' VT', ' ME'], dtype=object)

In [12]:
# Remove every space character from "Code" so it matches the lookup table.
diversity["Code"] = diversity["Code"].str.replace(' ', '')

In [13]:
# Attach the full state name to each diversity row. A left join keeps every
# diversity row even when its "Code" has no match in code_df.
merged_df = diversity.merge(code_df, on="Code", how="left")
merged_df

Unnamed: 0,Diversity-Index,BLK_Percent,Amrican_indian/Alk_native_Percent,Asian_percent,Hawaiian/Pacific_Islander_Percent,Two_or_more_percent,His/Latino_Percent,white_percent,County,Code,Year,State
0,0.769346,7.4,13.8,31.1,2.3,4.8,14.6,29.2,Aleutians West Census Area,AK,2013,Alaska
1,0.742224,20.9,1.3,25.2,0.2,2.7,28.0,26.7,Queens County,NY,2013,New York
2,0.740757,0.8,0.6,28.8,10.6,23.3,10.7,31.5,Maui County,HI,2013,Hawaii
3,0.740399,12.4,1.2,28.2,1.0,5.2,22.7,33.2,Alameda County,CA,2013,California
4,0.738867,7.7,21.8,41.4,0.7,3.7,13.5,12.9,Aleutians East Borough,AK,2013,Alaska
...,...,...,...,...,...,...,...,...,...,...,...,...
3190,0.037540,0.3,0.2,0.1,0.0,0.7,0.6,98.1,Osage County,MO,2013,Missouri
3191,0.035585,0.2,0.1,0.1,0.0,0.7,0.6,98.2,Lincoln County,WV,2013,West Virginia
3192,0.035581,0.4,0.1,0.2,0.0,0.7,0.5,98.2,Leslie County,KY,2013,Kentucky
3193,0.023784,0.2,0.0,0.0,0.0,0.8,0.2,98.8,Blaine County,NE,2013,Nebraska


### Income Data

While we had access to a state-level median income file from my (Richa's) previous group project, we decided to pull the data through an API for this project to make the process more automated. Because every public data file identifies a place by either its name or its FIPS code, we first downloaded the FIPS codes CSV from census.gov. That file contained far more detail than we needed, so we filtered it down to state- and county-level rows.

In [None]:
## State and county

# Keep rows whose county-subdivision, place and consolidated-city codes are
# all 0. NOTE(review): 'Consolidtated' is presumably spelled this way in the
# CSV header - confirm before correcting it.
no_subdivision = state_county['County Subdivision Code (FIPS)'].eq(0)
no_place = state_county['Place Code (FIPS)'].eq(0)
no_consolidated_city = state_county['Consolidtated City Code (FIPS)'].eq(0)

state_county_clean = state_county.loc[no_subdivision & no_place & no_consolidated_city]

state_county_clean.head()


In [None]:
## States only

# Same filter as the state-and-county selection, plus a county code of '000'
# (compared as a string because the column was loaded as object dtype).
is_state_row = (
    state_county['County Code (FIPS)'].eq('000')
    & state_county['County Subdivision Code (FIPS)'].eq(0)
    & state_county['Place Code (FIPS)'].eq(0)
    & state_county['Consolidtated City Code (FIPS)'].eq(0)
)

state_clean = state_county.loc[is_state_row]

state_clean.head()


In [None]:
## Counties only

# Rows with a county code other than '000' and with subdivision, place and
# consolidated-city codes all equal to 0.
is_county_row = (
    state_county['County Code (FIPS)'].ne('000')
    & state_county['County Subdivision Code (FIPS)'].eq(0)
    & state_county['Place Code (FIPS)'].eq(0)
    & state_county['Consolidtated City Code (FIPS)'].eq(0)
)

county_clean = state_county.loc[is_county_row]

county_clean.head()


I then pulled the income data using the Census API. According to the API documentation, the median household income variable is coded `DP03_0062E`, and place names are returned alongside it when `NAME` is included in the request URL.

In [None]:
# Query the Census ACS 5-year profile endpoint once per county and collect the
# header row and value row of each response as {'Index': ..., 'Value': ...}.
response_list = []

for _, county_row in county_clean.iterrows():
    # Use loop-local names that do not collide with other notebook variables:
    # reusing `state_code` here would overwrite the state lookup DataFrame
    # loaded earlier, and reusing `index` would shadow the loop variable.
    state_fips = county_row['State Code (FIPS)']
    county_fips = county_row['County Code (FIPS)']

    url = f"https://api.census.gov/data/2018/acs/acs5/profile?get=DP03_0062E,NAME&for=county:{county_fips}&in=state:{state_fips}&key={key}"

    try:
        # One request per county (the endpoint was previously hit twice), with
        # a timeout so a stalled connection cannot hang the whole loop.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()

        # data[0] is the header row and data[1] the single value row.
        header_row = data[0]
        value_row = data[1]
    except (requests.RequestException, ValueError, LookupError, TypeError) as e:
        # Best effort: report the failing county and move on. The exception
        # text itself is not printed because requests' error messages can
        # include the full URL, which contains the API key.
        print(f"Request failed for state {state_fips}, county {county_fips}: {type(e).__name__}")
        continue

    response_list.append({'Index': header_row, 'Value': value_row})


response_list


As a fallback (and for this presentation), I also downloaded the county-level file from the Census website, which can be loaded more quickly. That file contains A LOT of information; from it I kept only the median household income estimates (dropping the margin-of-error, percent-estimate, and percent-estimate margin-of-error columns).

In [None]:
# Read the county income export and promote its first data row to the header
# (that row holds the descriptive column labels the filters below match on).
raw_income = pd.read_csv("median_hhi_counties.csv")
raw_income.columns = raw_income.iloc[0]

# Drop the promoted row, then keep only the columns whose label mentions
# "Median household income" and "Estimate!!", with 'id' preserved as the key.
income_counties_df = (
    raw_income.iloc[1:]
    .set_index('id')
    .filter(regex="Median household income")
    .filter(regex="Estimate!!")
    .reset_index()
)

# Two estimate columns are expected to survive the filters; only the first is
# kept after renaming.
income_counties_df.columns = ["id", "Median HHI", "Median HHI Perc"]
income_counties_df = income_counties_df[['id', 'Median HHI']]

income_counties_df


In [None]:
# Characters at positions 9-10 of the geographic id are taken as the state code.
income_counties_df['state_code'] = income_counties_df['id'].str.slice(9, 11)
income_counties_df


In [None]:
# Characters at positions 11-13 of the geographic id are taken as the county code.
income_counties_df['county_code'] = income_counties_df['id'].str.slice(11, 14)
income_counties_df


In [None]:
# Store both codes as integers. Note that this drops any leading zeros from
# the sliced id text.
income_counties_df = income_counties_df.astype({"state_code": int, "county_code": int})

# Write the cleaned table to its own file. Saving to "median_hhi_counties.csv"
# would overwrite the raw download that this notebook reads earlier, so a
# second run from a fresh kernel would parse the cleaned output as if it were
# the raw file and fail.
income_counties_df.to_csv("median_hhi_counties_clean.csv")

income_counties_df

From my previous group project, I already had a state-level income file (credits to Julia Leonoff), which I trimmed down to the most recent year (2017) for queries that need only state-level income.

In [None]:
# Load the state-level median household income history.
income_df = pd.read_csv("https://raw.githubusercontent.com/gkmatt29/ETL-Enthusiasts/master/Resources/household_median_income_2017.csv")

# Keep only the 2017 figures, label them "Median HHI" and record the year.
# rename/assign build a new frame instead of assigning onto a slice of
# income_df, which avoids the chained-assignment pattern that pandas warns
# about (those warnings are silenced globally in this notebook).
median_hhi_2017_state = (
    income_df[["State", "2017"]]
    .rename(columns={"2017": "Median HHI"})
    .assign(Year=2017)
)

median_hhi_2017_state.to_csv('median_hhi_2017_state.csv')

# Display the result last so the cell actually renders it.
median_hhi_2017_state




## III. Load the final database