# Project 2: Extract, Transform, and Load
### Team 3: Chris Schultz and Glen Dagger

In [116]:
# Import dependencies
import pandas as pd
from census import Census
from config import api_key
from sqlalchemy import create_engine, inspect

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import county_web_scrape
import time

# Extract and Transform Data

## Beers

In [146]:
# Load the raw beers data, using the file's first column as the row index
beers_df = pd.read_csv('./Resources/beers.csv', index_col=[0])

# Keep only the columns that are exported and loaded into the database
beer_columns = ['id', 'name', 'style', 'brewery_id', 'abv']
beers_cleaned_df = beers_df.loc[:, beer_columns]

# Write the trimmed table to disk without the row index
beers_cleaned_df.to_csv('./CleanedCSVs/beers.csv', index=False)

# Preview the first 5 rows
beers_cleaned_df.head()

Unnamed: 0,id,name,style,brewery_id,abv
0,1436,Pub Beer,American Pale Lager,408,0.05
1,2265,Devil's Cup,American Pale Ale (APA),177,0.066
2,2264,Rise of the Phoenix,American IPA,177,0.071
3,2263,Sinister,American Double / Imperial IPA,177,0.09
4,2262,Sex and Candy,American IPA,177,0.075


## Breweries

In [161]:
# Read the raw breweries data
breweries_df = pd.read_csv('./Resources/breweries.csv')

# Trim surrounding whitespace from the state and name columns
for text_column in ('state', 'name'):
    breweries_df[text_column] = breweries_df[text_column].str.strip()

# Keep the descriptive columns and expose the row index as a brewery_id column
breweries_cleaned_df = (
    breweries_df[['name', 'city', 'state']]
    .reset_index()
    .rename(columns={'index': 'brewery_id'})
)

# Preview the first 5 rows
breweries_cleaned_df.head()

Unnamed: 0,brewery_id,name,city,state
0,0,NorthGate Brewing,Minneapolis,MN
1,1,Against the Grain Brewery,Louisville,KY
2,2,Jack's Abby Craft Lagers,Framingham,MA
3,3,Mike Hess Brewing Company,San Diego,CA
4,4,Fort Point Beer Company,San Francisco,CA


In [162]:
# Export cleaned breweries table
breweries_cleaned_df.to_csv('./CleanedCSVs/breweries_table.csv', index=False)

In [22]:
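# Run the web scraping module to find the county for each city and add it as a column to the dataframe.
# NOTE: the call below is left commented out; a later cell reads ./Resources/county_list_df.csv,
# which presumably holds the saved result of this scrape (confirm before re-running).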
# city_df = county_web_scrape.main()

record 0/401
0.25% complete
Currently scraping Abingdon, VA.
Success! This city is in Washington County
--------------------------------
record 1/401
0.5% complete
Currently scraping Abita Springs, LA.
Success! This city is in St. Tammany Parish
--------------------------------
record 2/401
0.75% complete
Currently scraping Ada, MI.
No county was found.
--------------------------------
record 3/401
1.0% complete
Currently scraping Afton, VA.
No county was found.
--------------------------------
record 4/401
1.25% complete
Currently scraping Airway Heights, WA.
Success! This city is in Spokane County
--------------------------------
record 5/401
1.5% complete
Currently scraping Albuquerque, NM.
Success! This city is in Bernalillo County
--------------------------------
record 6/401
1.75% complete
Currently scraping Alpine, TX.
Success! This city is in Brewster County
--------------------------------
record 7/401
2.0% complete
Currently scraping Anchorage, AK.
Success! This city is in An

In [163]:
# Load the city/state -> county lookup from CSV, using the file's first column as the index
city_df = pd.read_csv(
    './Resources/county_list_df.csv',
    index_col=[0],
)

# Preview the first 5 rows
city_df.head()

Unnamed: 0,city,state,county
0,Abingdon,VA,Washington County
1,Abita Springs,LA,St. Tammany Parish
2,Ada,MI,
3,Afton,VA,
4,Airway Heights,WA,Spokane County


In [164]:
# Left-join on city and state so every brewery row is kept,
# even when the lookup has no county for its city
breweries_counties_df = pd.merge(
    breweries_cleaned_df,
    city_df,
    how='left',
    on=['city', 'state'],
)

# Keep only the columns to be loaded, in this order
brewery_columns = ['brewery_id', 'name', 'city', 'state', 'county']
breweries_counties_df = breweries_counties_df[brewery_columns]

# Preview the first 5 rows
breweries_counties_df.head()

Unnamed: 0,brewery_id,name,city,state,county
0,0,NorthGate Brewing,Minneapolis,MN,Hennepin County
1,1,Against the Grain Brewery,Louisville,KY,Jefferson County
2,2,Jack's Abby Craft Lagers,Framingham,MA,Middlesex County
3,3,Mike Hess Brewing Company,San Diego,CA,San Diego County
4,4,Fort Point Beer Company,San Francisco,CA,San Francisco County


## Census Data

### County

In [165]:
# Create a Census API client for the selected year
c = Census(api_key, year=2020)

# Query the ACS 5-year dataset for every county: NAME plus the four variables
# that are renamed below (population, median household income,
# per-capita income, median age)
county_census_data = c.acs5.get(
    ("NAME", "B01003_001E", "B19013_001E", "B19301_001E", "B01002_001E"),
    {'for': 'county:*'},
)

# Convert the API response to a DataFrame
county_census_df = pd.DataFrame(county_census_data)

# Give the columns readable names. The API's own "state"/"county" columns are
# renamed to *_code so that "county" and "state" are free for the name columns
# produced from NAME below.
county_census_cleaned_df = county_census_df.rename(
    columns={
        "B01003_001E": "population",
        "B19013_001E": "med_household_income",
        "B19301_001E": "per_capita_income",
        "B01002_001E": "median_age",
        "state": "state_code",
        "NAME": "county",
        "county": "county_code",
    }
)

# Cast population column as integer
county_census_cleaned_df['population'] = county_census_cleaned_df['population'].astype(int)

# Split "<county>, <state>" into separate county and state columns.
# Split on the LAST ", " only (n=1): the result is always at most two columns,
# so a county name that itself contains ", " cannot break the two-column
# assignment. For names with a single ", " this is identical to a plain split.
county_census_cleaned_df[['county', 'state']] = (
    county_census_cleaned_df['county'].str.rsplit(', ', n=1, expand=True)
)

# Display first 5 rows
county_census_cleaned_df.head()


Unnamed: 0,county,population,med_household_income,per_capita_income,median_age,state_code,county_code,state
0,Autauga County,55639,57982.0,29804.0,38.6,1,1,Alabama
1,Baldwin County,218289,61756.0,33751.0,43.2,1,3,Alabama
2,Barbour County,25026,34990.0,20074.0,40.1,1,5,Alabama
3,Bibb County,22374,51721.0,22626.0,39.9,1,7,Alabama
4,Blount County,57755,48922.0,25457.0,41.0,1,9,Alabama


In [166]:
# Lookup table from full state/territory name to its two-letter abbreviation.
# Built in two parts (the 50 states, then DC and the territories) and merged
# so that the original entry order is preserved.
_us_states = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
    "Florida": "FL", "Georgia": "GA", "Hawaii": "HI", "Idaho": "ID",
    "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
    "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
    "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS",
    "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV",
    "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
    "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK",
    "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
    "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT",
    "Vermont": "VT", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV",
    "Wisconsin": "WI", "Wyoming": "WY",
}
_dc_and_territories = {
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}
state_abbreviations = {**_us_states, **_dc_and_territories}

In [167]:
# Swap full state names in the state column for their abbreviations
county_census_cleaned_df = county_census_cleaned_df.replace({"state": state_abbreviations})

# Remove the code columns, which are not part of the final table
county_census_cleaned_df = county_census_cleaned_df.drop(columns=['county_code', 'state_code'])

# Save a CSV copy so the final dataset can be inspected more easily
county_census_cleaned_df.to_csv('./Resources/county_census_data.csv')

# Preview the first 5 rows
county_census_cleaned_df.head()

Unnamed: 0,county,population,med_household_income,per_capita_income,median_age,state
0,Autauga County,55639,57982.0,29804.0,38.6,AL
1,Baldwin County,218289,61756.0,33751.0,43.2,AL
2,Barbour County,25026,34990.0,20074.0,40.1,AL
3,Bibb County,22374,51721.0,22626.0,39.9,AL
4,Blount County,57755,48922.0,25457.0,41.0,AL


### Create Census Table by State

In [168]:
# Create a Census API client for the selected year
c = Census(api_key, year=2020)

# Query the ACS 5-year dataset for every state (not county): NAME plus the
# four variables that are renamed below
state_census_fields = ("NAME", "B01003_001E", "B19013_001E","B19301_001E", "B01002_001E")
state_census_data = c.acs5.get(state_census_fields, {'for': 'state:*'})

# Convert the API response to a DataFrame
state_census_df = pd.DataFrame(state_census_data)

# Readable column names; the API's own "state" column becomes "state_code"
# so that NAME can take over the "state" column
state_column_names = {
    "B01003_001E": "population",
    "B19013_001E": "med_household_income",
    "B19301_001E": "per_capita_income",
    "B01002_001E": "median_age",
    "state": "state_code",
    "NAME": "state",
}

# Rename, then cast the population column to int
state_census_cleaned_df = (
    state_census_df
    .rename(columns=state_column_names)
    .astype({'population': int})
)

# Preview the first 5 rows
state_census_cleaned_df.head()

Unnamed: 0,state,population,med_household_income,per_capita_income,median_age,state_code
0,Pennsylvania,12794885,63627.0,35518.0,40.9,42
1,California,39346023,78672.0,38576.0,36.7,6
2,West Virginia,1807426,48037.0,27346.0,42.7,54
3,Utah,3151239,74197.0,30986.0,31.1,49
4,New York,19514849,71117.0,40898.0,39.0,36


In [169]:
# Swap full state names in the state column for their abbreviations
state_census_cleaned_df = state_census_cleaned_df.replace({"state": state_abbreviations})

# Remove the state_code column (the only code column in this table)
state_census_cleaned_df = state_census_cleaned_df.drop(columns='state_code')

# Save a CSV copy so the final dataset can be inspected more easily
state_census_cleaned_df.to_csv('./Resources/state_census_data.csv')

# Preview the first 5 rows
state_census_cleaned_df.head()

Unnamed: 0,state,population,med_household_income,per_capita_income,median_age
0,PA,12794885,63627.0,35518.0,40.9
1,CA,39346023,78672.0,38576.0,36.7
2,WV,1807426,48037.0,27346.0,42.7
3,UT,3151239,74197.0,30986.0,31.1
4,NY,19514849,71117.0,40898.0,39.0


# Load

### Connect to local database

In [156]:
# Connection settings for the PostgreSQL database
protocol, username, password = 'postgresql', 'postgres', 'postgres'
host, port = 'localhost', 5432
database_name = 'beer_db'

# Assemble the URL as protocol://username:password@host:port/database_name
# (note: despite the variable name, the host configured here is localhost)
rds_connection_string = '{}://{}:{}@{}:{}/{}'.format(
    protocol, username, password, host, port, database_name
)

# Create the SQLAlchemy engine used by the inspector and to_sql calls
engine = create_engine(rds_connection_string)

In [157]:
# Build an inspector for the engine and list the tables it reports
inspector = inspect(engine)
table_names = inspector.get_table_names()

# Show the table names
table_names

['state_census', 'county_census', 'breweries', 'beers']

In [180]:
# Append the state census rows to the 'state_census' table, omitting the DataFrame index
state_census_cleaned_df.to_sql(
    name='state_census',
    con=engine,
    if_exists='append',
    index=False,
)

In [181]:
# Append the county census rows to the 'county_census' table, omitting the DataFrame index
county_census_cleaned_df.to_sql(
    name='county_census',
    con=engine,
    if_exists='append',
    index=False,
)

In [182]:
# Append the brewery rows (with county) to the 'breweries' table, omitting the DataFrame index
breweries_counties_df.to_sql(
    name='breweries',
    con=engine,
    if_exists='append',
    index=False,
)

In [183]:
# Append the beer rows to the 'beers' table, omitting the DataFrame index
beers_cleaned_df.to_sql(
    name='beers',
    con=engine,
    if_exists='append',
    index=False,
)