# Imports

In [9]:
# Imports
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np
import requests
import plotly.express as px

# Python library for pulling data out of HTML and XML files
from bs4 import BeautifulSoup

# enable the automation of web browsers
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
import time

# Regular expression (re) is specially encoded text strings used as patterns for matching/searching for sets of strings
import re

# import preprocessing
from sklearn import preprocessing

# import k-means 
from sklearn.cluster import KMeans

# 1. Obtain List of Charlotte Breweries
NOTE: The results you get might be in a different order or return different venues.

In [10]:
# Foursquare credentials
# NOTE: these are placeholders - substitute your own values locally and do not commit real keys.

CLIENT_ID = 'your_client_id' # your Foursquare client ID
CLIENT_SECRET = 'your_client_secret' # your Foursquare client Secret
VERSION = '20210119' # value sent as the `v` parameter of every Foursquare request

In [11]:
def getNearbyVenues(city_state, latitude, longitude, radius, category):
    """
    Search Foursquare for venues of one category around a location.

    The request is authenticated with the module-level CLIENT_ID, CLIENT_SECRET
    and VERSION values.

    Parameters
    ----------
    city_state : str
        city and state location (i.e. 'Charlotte, North Carolina')
    latitude : float
        latitude of the location (i.e. 35.2271)
    longitude : float
        longitude of the location (i.e. -80.8431)
    radius : int
        limits the results to venues within this many meters of the specified location
    category : str
        A category to limit results to a specific category. https://developer.foursquare.com/docs/build-with-foursquare/categories/

    Returns
    -------
    pandas.core.frame.DataFrame
        One row per venue with the columns 'City', 'City Latitude', 'City Longitude',
        'Brewery', 'Brewery Latitude', 'Brewery Longitude' and 'Venue Category'.
        When the search returns no venue the frame is empty but keeps these columns.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status (e.g. invalid credentials).

    """
    # Set the limit of the results to return
    LIMIT = 50

    columns = ['City',
               'City Latitude',
               'City Longitude',
               'Brewery',
               'Brewery Latitude',
               'Brewery Longitude',
               'Venue Category']

    # Let requests build and escape the query string instead of formatting it by hand
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(latitude, longitude),
        'radius': radius,
        'categoryId': category,
        'limit': LIMIT,
    }

    # Make the GET request; the timeout keeps the notebook from hanging on a dead connection
    response = requests.get('https://api.foursquare.com/v2/venues/search', params=params, timeout=30)
    # Fail with a clear HTTP error rather than a KeyError on the missing 'venues' key
    response.raise_for_status()
    results = response.json()['response']['venues']

    # Keep only the relevant information for each nearby venue.
    # A venue may come back without any category, so guard the [0] lookup.
    rows = [(
        city_state,
        latitude,
        longitude,
        v['name'],
        v['location']['lat'],
        v['location']['lng'],
        v['categories'][0]['name'] if v.get('categories') else None) for v in results]

    # Passing the column names to the constructor keeps the schema even when `rows` is empty
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [12]:
# Search inputs handed to getNearbyVenues

# Label of the search location
clt_nc = 'Charlotte, North Carolina'
# Geo-coordinates of the search centre
clt_lat, clt_lng = 35.2271, -80.8431
# Search radius around the centre, in meters
radius = 10000

# Foursquare ID of the Brewery category
category_id = '50327c8591d4c4b30a586d5d'

In [13]:
# Query Foursquare for Charlotte breweries and keep the resulting dataframe
CLT_Brew = getNearbyVenues(
    city_state=clt_nc,
    latitude=clt_lat,
    longitude=clt_lng,
    radius=radius,
    category=category_id,
)

In [None]:
# Preview the top 5 breweries returned by the search
CLT_Brew.head(5)

In [41]:
# Write to CSV for easier retrieval later
#CLT_Brew.to_csv('data/charlotte_breweries_raw.csv', index=False)

## 1.1 - Confirm Brewery list with Axios Charlotte and Untappd.com
* https://charlotte.axios.com/31429/breweries-in-charlotte/
* https://untappd.com/

In [4]:
# Reload the raw brewery list from disk.
# NOTE(review): the `to_csv` call that would write this file is commented out in the previous cell,
# so the file must already exist for this cell to run.
CLT_Brew = pd.read_csv('data/charlotte_breweries_raw.csv')

In [None]:
# List the breweries alphabetically to spot the entries that should be dropped.

CLT_Brew.sort_values('Brewery')

## 1.2 - Drop invalid breweries 
NOTE: The results you have might be in a different order or return different venues. Make sure to spot check before running the code in this section

1. 16 HG Brewer - doesn't exist
2. Bold Missy Brewery - out of business
3. Donut Taco Palace - doesn't exist
4. Doug's Desk - doesn't exist
5. Free Range Brewing Camp North End - duplicate
6. Furphy's Log Palace - doesn't exist
7. GoodRoad CiderWorks - doesn't sell beer
8. Heist Brewery and Barrel Arts - duplicate
9. Jack Shack - doesn't exist
10. Legion Brewing Southpark - duplicate
11. Red Clay Ciderworks - doesn't sell beers
12. Scott M*otherf*ckin' Stallings Brewery - doesn't exist
13. Summit Seltzery - doesn't sell beers
14. Sunstead Brewing - doesn't exist
15. The Chamber By Wooden Robot - Duplicate
16. Thirsty Nomad Brewing Co. - out of business
17. Three Spirits Brewery - out of business
18. Toucan Louie's Gold District - doesn't exist

In [44]:
# Protagonist appears twice in the search results, so one of its rows can be reused.
# Per Protagonist's website and the geo-coordinates, the row at position 8 is the Noda taproom location.
# Armored Cow Brewing Co. is missing from the search results, so that row is overwritten with its values.

duplicate_row = CLT_Brew.index[8]

CLT_Brew.at[duplicate_row, 'Brewery'] = 'Armored Cow Brewing Co.'
CLT_Brew.at[duplicate_row, 'Brewery Latitude'] = 35.313514
CLT_Brew.at[duplicate_row, 'Brewery Longitude'] = -80.753259

In [62]:
# Names of the venues that must be removed from the brewery list
brews_to_drop = [
    '16 HG Brewer',
    'Bold Missy Brewery',
    'Brewers at 4001 Yancey',
    'Donut Taco Palace',
    "Doug's Desk",
    'Free Range Brewing Camp North End',
    "Furphy's Log Palace",
    'GoodRoad CiderWorks',
    'Heist Brewery and Barrel Arts',
    'Jack Shack',
    'Legion Brewing Southpark',
    'Red Clay Ciderworks',
    "Scott M*otherf*ckin' Stallings Brewery",
    'Summit Seltzery',
    'Sunstead Brewing',
    'The Chamber By Wooden Robot',
    'Thirsty Nomad Brewing Co.',
    'Three Spirits Brewery',
    "Toucan Louie's Gold District",
]


In [63]:
# Locate the rows whose brewery name is on the drop list and remove them in place
rows_to_remove = CLT_Brew.index[CLT_Brew['Brewery'].isin(brews_to_drop)]
CLT_Brew.drop(index=rows_to_remove, inplace=True)

# Renumber the remaining rows from 0
CLT_Brew.reset_index(inplace=True, drop=True)

In [2]:
# Write to CSV for easier retrieval later
#CLT_Brew.to_csv('data/charlotte_breweries_clean.csv', index=False)

# 2. View the breweries on a map

In [None]:
# Reload the cleaned brewery list so this section can be run on its own
CLT_Brew = pd.read_csv('data/charlotte_breweries_clean.csv')

In [30]:
# Placeholder for the Mapbox access token
mapbox_token = 'your_mapbox_token'

# Charlotte, NC geo-coordinates in the lat/lon mapping used to centre the map
clt_geo = dict(lat=clt_lat, lon=clt_lng)

In [None]:
# Register the Mapbox access token with plotly express
px.set_mapbox_access_token(mapbox_token)

# Plot every brewery at its coordinates, with the map centred on Charlotte
CLT_Brew_Map = px.scatter_mapbox(
    data_frame=CLT_Brew,
    lat=CLT_Brew['Brewery Latitude'],
    lon=CLT_Brew['Brewery Longitude'],
    hover_name=CLT_Brew['Brewery'],
    center=clt_geo,
    zoom=10,
)

# Render the map
CLT_Brew_Map.show()

# 3. Scrape the Beer list for each Brewery from Untappd.com

## 3.1 - Steps to get entire page contents for each Brewery
1. Create a list of Breweries sorted by Alphabetical Order (Section 3.1.1)
2. Create a list of URLs for each "Beers" page of each Brewery. Sorted by Alphabetical Order (Section 3.1.2)
3. Define function that creates a BeautifulSoup object (Section 3.1.3)
4. Initialize a Selenium Webdriver (Chrome) and login to your Untappd Account (Section 3.1.4)
5. Create a list of BeautifulSoup objects for each url  (Section 3.1.5)

In [None]:
# Reload the cleaned brewery list so the scraping section can be run on its own
CLT_Brew = pd.read_csv('data/charlotte_breweries_clean.csv')

### 3.1.1 - Create list of the breweries sorted by alphabetical order

In [None]:
# Brewery names in alphabetical order
brewery_list = CLT_Brew['Brewery'].tolist()
brewery_list.sort()

len(brewery_list)

### 3.1.2 - Create a list of URLs for each "Beers" page of each Brewery. Sorted by Alphabetical Order

In [None]:
# Untappd profile slug of each brewery, listed in the same order as the brewery names

untappd_slugs = [
    'Armored_Cow_Brewing_',       # Armored Cow Brewing Co.
    'BirdsongBrewingCo',          # Birdsong Brewing Co.
    'BlueBlazeBrewing',           # Blue Blaze Brewing Co
    'catawbavalleybc',            # Catawba Brewing Charlotte
    'DevilsLogicBrewing',         # Devil's Logic Brewing
    'DivineBarrelBrewing',        # Divine Barrel Brewing
    'Edge_City_Brewery',          # Edge City Brewery
    'FontaFloraBrewery',          # Fonta Flora Brewery - Optimist Hall
    'freerangebrewing',           # Free Range Brewing
    'HeistBreweryNC',             # Heist Brewery
    'LegionBrewingCompany',       # Legion Brewing
    'LennyBoyBrewing',            # Lenny Boy Brewing Co.
    'LLBrewCo',                   # Lower Left Brewing Co.
    'nodabrewing',                # NoDa Brewing Company North End
    'oldemeckbrew',               # Olde Mecklenburg Brewery
    'PettyThievesBrewingCo',      # Petty Thieves Brewing Company
    'PilotBrewingCompany',        # Pilot Brewing
    'ProtagonistBeer',            # Protagonist
    'Resident_Culture',           # Resident Culture Brewing Co.
    'SaltyParrotBrewing',         # Salty Parrot Brewing Company
    'SaludCerveceria',            # Salud Cerveceria
    'SugarCreekBrewingCompany',   # Sugar Creek Brewing Company
    'SycamoreBrewing',            # Sycamore Brewing
    'SuffolkPunchBrewing',        # The Suffolk Punch
    'TownBrewing',                # Town Brewing Company
    'triplecbrewing',             # Triple C Brewing Company
    'TheUnkn',                    # Unknown Brewing Co.
    'WoodenRobot',                # Wooden Robot Brewery
]

# Address of the "Beers" page of each brewery on untappd.com
url_list = ['https://untappd.com/' + slug + '/beer' for slug in untappd_slugs]

len(url_list)

### 3.1.3 - Define function that creates a BeautifulSoup object

In [75]:
def make_soup(drive):
    """
    Parse the page currently loaded in a webdriver into a BeautifulSoup object.

    Parameters
    ----------
    drive : selenium.webdriver.chrome.webdriver.WebDriver
        driver whose current page source is parsed

    Returns
    -------
    bs4.BeautifulSoup
       BeautifulSoup object built from the driver's page source with the 'html.parser' parser
       (i.e. the page at 'https://untappd.com/BirdsongBrewingCo/beer')

    """
    return BeautifulSoup(drive.page_source, 'html.parser')

### 3.1.4 - Initialize a Selenium Webdriver (Chrome) and login to your Untappd Account
* Download the Chrome webdriver from https://sites.google.com/a/chromium.org/chromedriver/ and place it in your directory

In [74]:
# Create the driver
# NOTE(review): the chromedriver path below is specific to the original author's machine - point it at your own download.
# NOTE(review): `executable_path` is deprecated in Selenium 4 (a `Service` object replaces it) - confirm against the installed version.
driver = webdriver.Chrome(executable_path='/Users/dillondearmond/Desktop/Coursera/IBM_Data_Science_Professional/chromedriver')

# Load the Untappd home page. Note: this will open up a Chrome browser
# (log in to your Untappd account in that window before running the scraping loop)
driver.get('https://untappd.com/')

### 3.1.5 - Create a list of BeautifulSoup Objects for each url
* Note: Sit back and watch the magic happen. You will need to wait for "Complete" to be printed and the Chrome browser to close before proceeding. 

In [None]:
# Create empty list to store the BeautifulSoup objects
soup_list = []

try:
    # Loop through each url in the url_list
    for url in url_list:
        # Call driver to navigate to the url
        driver.get(url)
        # Sleep for 2 seconds to allow the page to load
        time.sleep(2)
        # Keep clicking the 'Show More' button until it is gone
        while True:
            try:
                # Find the 'Show More' button (the By-based lookup works on Selenium 3 and 4)
                showMoreButton = driver.find_element(By.XPATH, '//a[contains(@data-href,":more_beer")]')
            except NoSuchElementException:
                # No button left: the full beer list is loaded, which is the normal way out
                break
            try:
                time.sleep(2)
                # Click the 'Show More' button
                showMoreButton.click()
                # Sleep for 2 seconds to allow the extra beers to load
                time.sleep(2)
            # Any other failure (e.g. the button is present but not clickable) is reported
            # and ends the paging for this brewery
            except Exception as e:
                print(e)
                break
        # Snapshot the fully expanded page as a BeautifulSoup object
        soup_list.append(make_soup(driver))

    # Print complete when finished
    print('Complete')
finally:
    # Close the browser even if the scrape failed part-way through
    driver.quit()

# 4. Create dataframe of Beer Details for each Brewery
Dataframe will contain the following attributes:
* Brewery
* Beer Name
* Beer Style
* Beer ABV
* Beer IBU
* Beer Rating
* Beer Total Ratings
* Beer Added Date

In [77]:
def _tag_texts(soup, tag, css_class, skip=0):
    """Return the text of every `tag` element carrying class `css_class`, ignoring the first `skip` matches."""
    return [element.text for element in soup.find_all(tag, {"class": css_class})[skip:]]


def get_beer_details(brew_list, soups):
    """
    Build a dataframe of beer details from the Untappd beer pages of several breweries.

    Parameters
    ----------
    brew_list : list of str
        Brewery names, in the same order as `soups`

    soups : list of bs4.BeautifulSoup
        list of BeautifulSoup objects from a brewery beer list url from Untappd.com (i.e. 'https://untappd.com/BirdsongBrewingCo/beer')

    Returns
    -------
    pandas.core.frame.DataFrame
        One row per beer with the columns 'Brewery', 'Beer Name', 'Beer Style', 'Beer ABV',
        'Beer IBU', 'Beer Rating', 'Beer Total Ratings' and 'Beer Added Date'. All values are
        the raw page text. The frame is empty (but keeps these columns) when no beer is found.

    Raises
    ------
    ValueError
        If `brew_list` and `soups` differ in length; pairing them anyway would attach
        beers to the wrong brewery.

    """
    columns = ['Brewery',
               'Beer Name',
               'Beer Style',
               'Beer ABV',
               'Beer IBU',
               'Beer Rating',
               'Beer Total Ratings',
               'Beer Added Date'
              ]

    if len(brew_list) != len(soups):
        raise ValueError(
            'brew_list has {} entries but soups has {}; they must be the same length'.format(
                len(brew_list), len(soups)))

    # One tuple per beer, across all breweries
    rows = []

    # Walk every brewery together with its BeautifulSoup object
    for brew_name, soup in zip(brew_list, soups):
        # The [1:]-style skips reproduce the original scrape: the first match of these
        # selectors is not a beer entry (presumably page-header markup - confirm against the page).
        beer_names = _tag_texts(soup, "p", "name")
        beer_styles = _tag_texts(soup, "p", "style", skip=1)
        beer_ratings = _tag_texts(soup, "span", "num", skip=1)

        # The first 25 beers on the page are rendered as <div class="details-item ...">,
        # the beers loaded afterwards as <p class="...">; concatenate both sets in page order.
        beer_abvs = _tag_texts(soup, "div", "details-item abv") + _tag_texts(soup, "p", "abv")
        beer_ibus = _tag_texts(soup, "div", "details-item ibu") + _tag_texts(soup, "p", "ibu")
        beer_rating_totals = (_tag_texts(soup, "div", "details-item raters")
                              + _tag_texts(soup, "p", "raters", skip=1))
        beer_added_dates = _tag_texts(soup, "div", "details-item date") + _tag_texts(soup, "p", "date")

        # Align the attributes position by position.
        # NOTE: zip stops at the shortest list, so a beer missing one attribute on the page
        # shortens the result for this brewery.
        for beer in zip(beer_names, beer_styles, beer_abvs, beer_ibus,
                        beer_ratings, beer_rating_totals, beer_added_dates):
            rows.append((brew_name,) + beer)

    # Naming the columns in the constructor keeps the schema when `rows` is empty
    df = pd.DataFrame(rows, columns=columns)
    return(df)

In [78]:
# Call the function and store the result in beer_df_raw
# (brewery_list and soup_list are paired by position, so they must be in the same order)
beer_df_raw = get_beer_details(brewery_list, soup_list)

In [79]:
# Save the raw scrape to csv (without the index) so it can be reloaded without scraping again
beer_df_raw.to_csv('data/beer_df_raw.csv', index=False)

In [110]:
# Preview the top 5 rows of the raw beer data
beer_df_raw.head(5)

# 5. Clean the dataframe
1. Replace unwanted characters (Section 5.1)
2. Replace blank values (white space) with NaN (Section 5.2)
3. Trim leading and trailing whitespaces (Section 5.3)
4. Drop beers with no ratings (Section 5.4)
5. Fill NaN values for ABV and IBU with averages (Section 5.5)

In [None]:
# Reload the raw scrape from disk so the cleaning section can be run on its own
beer_df_raw = pd.read_csv('data/beer_df_raw.csv')

## 5.1 - Replace unwanted characters
* Beer ABV - keep only numeric characters and decimals (Ex. "\n7.3% ABV" to "7.3" OR "\nN/A ABV" to "")
* Beer IBU - keep only numeric characters (Ex. "\n40 IBU" to "40" OR "\nN/A IBU" to "")
* Beer Rating - keep only numeric characters and decimals (Ex. "(4.02)" to "4.02" OR "(N/A)" to "")
* Beer Total Ratings - keep only numeric characters (Ex. "\n348 Ratings" to "348")
* Beer Added Date - keep only the date (Ex. "\nAdded 05/04/20" to "05/04/20")

In [114]:
# For each column, the pattern matching the characters that must be removed
cleanup_patterns = {
    'Beer ABV': r'[^\d.\d]',            # keep digits and the decimal point
    'Beer IBU': r'\D',                  # keep digits only
    'Beer Rating': r'[^\d.\d]',         # keep digits and the decimal point
    'Beer Total Ratings': r'\D',        # keep digits only
    'Beer Added Date': r'[^\d/\d]',     # keep digits and the date separator
}

for column_name, pattern in cleanup_patterns.items():
    remove_unwanted = re.compile(pattern).sub
    beer_df_raw[column_name] = [remove_unwanted('', raw_value) for raw_value in beer_df_raw[column_name]]

## 5.2 - Replace blank values (white space) with NaN

In [119]:
# Beer ABV, Beer IBU and Beer Rating hold numbers stored as text at this point.
# Turn blank values into NaN AND convert the rest to real numbers, so that the averages
# computed later (Section 5.5) are numeric means rather than operations on strings.
for numeric_column in ['Beer ABV', 'Beer IBU', 'Beer Rating']:
    blank_as_nan = beer_df_raw[numeric_column].replace(r'^\s*$', np.nan, regex=True)
    # errors='coerce': anything that still is not a number becomes NaN instead of raising
    beer_df_raw[numeric_column] = pd.to_numeric(blank_as_nan, errors='coerce')

## 5.3 - Trim leading and trailing whitespaces

In [120]:
# Remove leading and trailing whitespace from the text columns
for text_column in ('Beer Name', 'Beer Style', 'Beer Added Date'):
    beer_df_raw[text_column] = beer_df_raw[text_column].str.strip()

## 5.4 - Drop rows with beers that have no rating

In [122]:
# Keep only the beers that have a rating (rows whose 'Beer Rating' is NaN are removed),
# then renumber the remaining rows from 0
beer_df_raw = (
    beer_df_raw
    .dropna(subset=['Beer Rating'])
    .reset_index(drop=True)
)

In [None]:
# Count the rows that still have no rating; 0 confirms that none are left
int(beer_df_raw['Beer Rating'].isna().sum())

## 5.5 - Fill NaN values for ABV and IBU with averages

In [124]:
# Work on an independent copy so the fills below do not trigger a setting-on-a-view warning
beer_df_raw_copy = beer_df_raw.copy(deep=True)

In [125]:
# Average on numeric values; text that is not a number counts as missing
source_abv = pd.to_numeric(beer_df_raw['Beer ABV'], errors='coerce')

# Mean ABV of each beer's style, aligned row by row
style_mean_abv = source_abv.groupby(beer_df_raw['Beer Style']).transform('mean')

# Replace NaN values in the "Beer ABV" column with the mean ABV of the beer's style.
# A style may have no known ABV at all, so whatever is still NaN afterwards is filled
# with the mean of the entire Beer ABV column.
# The result is assigned back to the column: an in-place fillna on a selected column
# is not guaranteed to modify the dataframe.
beer_df_raw_copy['Beer ABV'] = (
    pd.to_numeric(beer_df_raw_copy['Beer ABV'], errors='coerce')
    .fillna(style_mean_abv)
    .fillna(source_abv.mean())
)

In [None]:
# Count the rows that still have no ABV; 0 confirms that none are left
int(beer_df_raw_copy['Beer ABV'].isna().sum())

In [127]:
# Average on numeric values; text that is not a number counts as missing
source_ibu = pd.to_numeric(beer_df_raw['Beer IBU'], errors='coerce')

# Mean IBU of each beer's style, aligned row by row
style_mean_ibu = source_ibu.groupby(beer_df_raw['Beer Style']).transform('mean')

# Replace NaN values in the "Beer IBU" column with the mean IBU of the beer's style.
# A style may have no known IBU at all, so whatever is still NaN afterwards is filled
# with the mean of the entire Beer IBU column.
# The result is assigned back to the column: an in-place fillna on a selected column
# is not guaranteed to modify the dataframe.
beer_df_raw_copy['Beer IBU'] = (
    pd.to_numeric(beer_df_raw_copy['Beer IBU'], errors='coerce')
    .fillna(style_mean_ibu)
    .fillna(source_ibu.mean())
)

In [None]:
# Count the rows that still have no IBU; 0 confirms that none are left
int(beer_df_raw_copy['Beer IBU'].isna().sum())

In [129]:
# Save the cleaned dataframe to csv (without the index); the later sections reload it from this file
beer_df_raw_copy.to_csv('data/beer_df_clean.csv', index=False)

# 6. Exploratory Data Analysis

In [2]:
# Load the cleaned beer data written at the end of Section 5
beer_df_clean = pd.read_csv('data/beer_df_clean.csv')

## 6.1 - Describe the dataframe

In [None]:
# Summary statistics for every column, numeric and non-numeric alike
beer_df_clean.describe(include='all')

## 6.2 - Top Beer Names
* Court Shoes Only (19)
* Black is Beautiful (16) 

In [None]:
# Court Shoes Only: beers whose name contains the literal text "Court Shoes"
court_shoes_beers = beer_df_clean[beer_df_clean['Beer Name'].str.contains("Court Shoes", regex=False)]

print(len(court_shoes_beers))

court_shoes_beers

In [None]:
# Black is Beautiful: beers whose name contains the literal text "Black Is"
black_is_beers = beer_df_clean[beer_df_clean['Beer Name'].str.contains("Black Is", regex=False)]

print(len(black_is_beers))

black_is_beers

## 6.3 - Beers with Max and Min ABV

In [None]:
# Beers with Max ABV
# The maximum is read from the data instead of being hard-coded, so the cell stays
# correct when the beer list is scraped again.
beer_df_clean.loc[beer_df_clean['Beer ABV'] == beer_df_clean['Beer ABV'].max()]

In [None]:
# Beers with Min ABV
# The minimum is read from the data instead of being hard-coded, so the cell stays
# correct when the beer list is scraped again.
beer_df_clean.loc[beer_df_clean['Beer ABV'] == beer_df_clean['Beer ABV'].min()]

## 6.4 - CLT Brewery Beer Ratings by Style

In [186]:
# Copy of the clean data with a constant 'CLT' column, giving the treemap a single root node
df_tree = beer_df_clean.assign(CLT='Charlotte Breweries')

In [None]:
# Mean rating weighted by the number of ratings; used as the midpoint of the colour scale
weighted_mean_rating = np.average(df_tree['Beer Rating'], weights=df_tree['Beer Total Ratings'])

# Treemap: Charlotte -> brewery -> beer style, sized by total ratings and coloured by rating
fig_tree_weights = px.treemap(
    data_frame=df_tree,
    path=['CLT', 'Brewery', 'Beer Style'],
    values='Beer Total Ratings',
    color='Beer Rating',
    color_continuous_scale='RdBu',
    color_continuous_midpoint=weighted_mean_rating,
    title='Charlotte Brewery Beer Ratings by Style',
)

fig_tree_weights.update_layout(
    autosize=True,
    hovermode='closest',
    margin={'t': 40, 'b': 10, 'l': 50, 'r': 50},
)

fig_tree_weights.show()

# 7. Data Pre-Processing

In [None]:
# Reload the cleaned beer data so the pre-processing section can be run on its own
beer_df_clean = pd.read_csv('data/beer_df_clean.csv')

## 7.1 - Calculate the average frequency of Beer Style occurrence by Brewery

In [19]:
# One-hot encode the beer style: one indicator column per style, named after the style itself
beer_styles_onehot = pd.get_dummies(beer_df_clean[['Beer Style']], prefix="", prefix_sep="")

# Put the brewery of each beer in front of the style indicators, as the first column
beer_styles_onehot.insert(0, 'Brewery', beer_df_clean['Brewery'])

In [None]:
# Preview the top 5 rows of the one-hot encoded styles
beer_styles_onehot.head(5)

In [20]:
# Per brewery, average the style indicators: the share of the brewery's beers in each style
beers_by_brewery = beer_styles_onehot.groupby('Brewery')
beer_styles_freq = beers_by_brewery.mean().reset_index()

In [None]:
# Preview the top 5 rows of the per-brewery style frequencies
beer_styles_freq.head(5)

## 7.2 - Obtain the 10 most common Beer Styles by Brewery 

In [134]:
# Define function that returns the most common beer styles for each brewery
def return_most_common_beer_styles(row, num_top_beer_styles):
    """
    Return the names of the highest-valued entries of a row, skipping its first entry.

    Parameters
    ----------
    row : pandas.Series
        Row whose first entry is a label (the brewery) and whose remaining entries
        are the values to rank, indexed by beer style
    num_top_beer_styles : int
        Maximum number of style names to return

    Returns
    -------
    numpy.ndarray
        Index labels of the ranked entries, highest value first
    """
    style_frequencies = row.iloc[1:]
    ranked_style_names = style_frequencies.sort_values(ascending=False).index.values

    return ranked_style_names[:num_top_beer_styles]

In [None]:
# Set number of common beer styles
num_top_beer_styles = 10


def _ordinal(n):
    """Return n followed by its English ordinal suffix (1st, 2nd, 3rd, 4th, ..., 11th, ..., 21st)."""
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '{}{}'.format(n, suffix)


# create columns according to number of top Beer Styles
columns = ['Brewery'] + ['{} Most Common Beer Style'.format(_ordinal(rank))
                         for rank in range(1, num_top_beer_styles + 1)]

# For every brewery, the style names ordered from most to least common
top_styles_per_brewery = [
    return_most_common_beer_styles(beer_styles_freq.iloc[position, :], num_top_beer_styles)
    for position in range(beer_styles_freq.shape[0])
]

# create a new dataframe sorted by 1st most to 10th most common beer style
brewery_beer_style_sorted = pd.DataFrame(top_styles_per_brewery,
                                         columns=columns[1:],
                                         index=beer_styles_freq.index)
brewery_beer_style_sorted.insert(0, 'Brewery', beer_styles_freq['Brewery'])

In [None]:
# Preview the top 5 rows of the most-common-styles table
brewery_beer_style_sorted.head(5)

## 7.3 - Define features and normalize data

In [None]:
# Define a function that takes in a dataframe and return the feature values as a numpy array
def cluster_data(df):
    """
    Return the feature values of a dataframe as a float numpy array.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Dataframe whose first column is a label and whose remaining columns are numeric features

    Returns
    -------
    numpy.ndarray
        2-D float array holding every column but the first, with NaN replaced by 0
    """
    # Drop the label column BEFORE converting: with the label included, `df.values` is an
    # object array, and np.nan_to_num returns object arrays unchanged (NaN would survive).
    X = df.iloc[:, 1:].to_numpy(dtype=float)
    X = np.nan_to_num(X)
    return X

In [138]:
# Extract the feature matrix (every column but the brewery name) from beer_styles_freq
Clus_dataSet_pre_processed = cluster_data(df=beer_styles_freq)

In [139]:
# Scale each brewery's feature vector (row) to unit L2 norm
Clus_dataSet = preprocessing.normalize(Clus_dataSet_pre_processed, norm='l2', axis=1)

# 8. Modeling
1. Elbow Method - Identify the optimal number of clusters  (Section 8.1)
2. K-Means Clustering (Section 8.2)
3. Apply Cluster labels (Section 8.3)

## 8.1 - Elbow Method - Identify the optimal number of clusters

In [None]:
# Fit K-Means for k = 1..9 and record the sum of squared distances (inertia) of each fit
list_k = list(range(1, 10))
sse = [
    KMeans(init="k-means++", n_clusters=k, n_init=60, random_state=10).fit(Clus_dataSet).inertia_
    for k in list_k
]

# Plot inertia against the number of clusters to look for the elbow
fig = px.line(
    x=list_k,
    y=sse,
    labels={'x': 'Number of Clusters', 'y': 'Sum of Squared Distance'},
    title='Optimal Number of Clusters using Elbow Method',
)

fig.show()

## 8.2 - K-Means Clustering Modeling 

In [150]:
# Number of clusters to fit
clusterNum = 5

# Fit K-Means on the normalized features (fit returns the fitted estimator itself)
k_means = KMeans(n_clusters=clusterNum, init="k-means++", n_init=60, random_state=10).fit(Clus_dataSet)

# Cluster number assigned to each brewery
labels = k_means.labels_

## 8.3 - Apply Cluster labels

In [152]:
# Independent copy of the style frequencies that will receive the cluster labels
cluster_df = beer_styles_freq.copy(deep=True)

In [153]:
# Create dictionary for the labels from the cluster numbers that K-Means actually produced
# (i.e. 0 --> "Cluster 1"), so it stays valid when the number of clusters changes
label_dict = {int(cluster_id): 'Cluster {}'.format(int(cluster_id) + 1)
              for cluster_id in np.unique(labels)}

# Add the named cluster of each brewery to the df.
# The column is assigned directly: an in-place replace on a selected column is not
# guaranteed to modify the dataframe.
cluster_df['Clusters'] = pd.Series(labels, index=cluster_df.index).map(label_dict)

In [None]:
# move Clusters column to the first column
# The column is selected by name, so re-running this cell leaves the order unchanged
# (moving "the last column" to the front would shift a different column on every re-run).
fixed_columns = ['Clusters'] + [col for col in cluster_df.columns if col != 'Clusters']
cluster_df = cluster_df[fixed_columns]
cluster_df.head()

# 9. Results
1. Visualize clusters on map (Section 9.1)
2. Examine Clusters (Section 9.2)

## 9.1 - Visualize Clusters on map

In [156]:
# Independent copy of the brewery list that will receive the cluster labels
CLT_Brew_Cluster = CLT_Brew.copy(deep=True)

In [None]:
# Attach each brewery's cluster label; breweries without a label keep NaN (left join)
brewery_clusters = cluster_df[['Brewery', 'Clusters']]
CLT_Brew_Cluster = CLT_Brew_Cluster.merge(brewery_clusters, how='left', on='Brewery')

CLT_Brew_Cluster.head()

In [None]:
# Placeholder for the Mapbox access token
mapbox_token = 'your_mapbox_token'

# Charlotte, NC geo-coordinates used to centre the map
clt_lat, clt_lng = 35.2271, -80.8431
clt_geo = dict(lat=clt_lat, lon=clt_lng)

import plotly.express as px
px.set_mapbox_access_token(mapbox_token)

# Plot every brewery at its coordinates, coloured by cluster
fig = px.scatter_mapbox(
    data_frame=CLT_Brew_Cluster,
    lat=CLT_Brew_Cluster['Brewery Latitude'],
    lon=CLT_Brew_Cluster['Brewery Longitude'],
    hover_name=CLT_Brew_Cluster['Brewery'],
    color=CLT_Brew_Cluster['Clusters'],
    center=clt_geo,
    zoom=10.1,
    title='Charlotte Brewery Clusters',
    # Fix the legend order of the clusters
    category_orders={'Clusters': ('Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5')},
)

fig.update_layout(
    autosize=True,
    hovermode='closest',
    margin={'t': 40, 'b': 30, 'l': 100, 'r': 50},
)

fig.show()

## 9.2 - Examine Clusters

In [None]:
# Attach each brewery's cluster label to its most common beer styles (left join on Brewery)
brewery_beer_style_sorted_cluster = brewery_beer_style_sorted.merge(
    cluster_df[['Brewery', 'Clusters']],
    how='left',
    on='Brewery',
)

### Cluster 1

In [None]:
# Breweries assigned to Cluster 1
cluster_1_breweries = brewery_beer_style_sorted_cluster[brewery_beer_style_sorted_cluster['Clusters'] == 'Cluster 1']

# Number of breweries in this cluster
print(cluster_1_breweries.shape[0])

# Show the cluster 1 breweries
cluster_1_breweries

In [None]:
# Parallel categories chart linking each Cluster 1 brewery to its four most common beer styles
cluster_1_rows = brewery_beer_style_sorted_cluster[brewery_beer_style_sorted_cluster['Clusters'] == 'Cluster 1']
cluster_1_dimensions = [
    'Brewery',
    '1st Most Common Beer Style',
    '2nd Most Common Beer Style',
    '3rd Most Common Beer Style',
    '4th Most Common Beer Style',
]

clus_1 = px.parallel_categories(cluster_1_rows, dimensions=cluster_1_dimensions, title='Cluster 1 Breweries')

clus_1.update_layout(
    autosize=True,
    hovermode='closest',
    margin={'t': 125, 'b': 20, 'l': 160, 'r': 120},
)

clus_1.show()

### Cluster 2

In [None]:
# Breweries assigned to Cluster 2
cluster_2_breweries = brewery_beer_style_sorted_cluster[brewery_beer_style_sorted_cluster['Clusters'] == 'Cluster 2']

# Number of breweries in this cluster
print(cluster_2_breweries.shape[0])

# Show the cluster 2 breweries
cluster_2_breweries

In [None]:
# Parallel categories chart linking each Cluster 2 brewery to its four most common beer styles
cluster_2_rows = brewery_beer_style_sorted_cluster[brewery_beer_style_sorted_cluster['Clusters'] == 'Cluster 2']
cluster_2_dimensions = [
    'Brewery',
    '1st Most Common Beer Style',
    '2nd Most Common Beer Style',
    '3rd Most Common Beer Style',
    '4th Most Common Beer Style',
]

clus_2 = px.parallel_categories(cluster_2_rows, dimensions=cluster_2_dimensions, title='Cluster 2 Breweries')

clus_2.update_layout(
    autosize=True,
    hovermode='closest',
    margin={'t': 125, 'b': 20, 'l': 130, 'r': 160},
)

clus_2.show()

### Cluster 3

In [None]:
# Breweries assigned to Cluster 3
cluster_3_breweries = brewery_beer_style_sorted_cluster[brewery_beer_style_sorted_cluster['Clusters'] == 'Cluster 3']

# Number of breweries in this cluster
print(cluster_3_breweries.shape[0])

# Show the cluster 3 breweries
cluster_3_breweries

In [None]:
# Parallel categories chart linking each Cluster 3 brewery to its four most common beer styles
cluster_3_rows = brewery_beer_style_sorted_cluster[brewery_beer_style_sorted_cluster['Clusters'] == 'Cluster 3']
cluster_3_dimensions = [
    'Brewery',
    '1st Most Common Beer Style',
    '2nd Most Common Beer Style',
    '3rd Most Common Beer Style',
    '4th Most Common Beer Style',
]

clus_3 = px.parallel_categories(cluster_3_rows, dimensions=cluster_3_dimensions, title='Cluster 3 Breweries')

clus_3.update_layout(
    autosize=True,
    hovermode='closest',
    margin={'t': 125, 'b': 20, 'l': 150, 'r': 130},
)

clus_3.show()

### Cluster 4

In [None]:
# Breweries assigned to Cluster 4
cluster_4_breweries = brewery_beer_style_sorted_cluster[brewery_beer_style_sorted_cluster['Clusters'] == 'Cluster 4']

# Number of breweries in this cluster
print(cluster_4_breweries.shape[0])

# Show the cluster 4 breweries
cluster_4_breweries

In [None]:
# Parallel categories chart linking each Cluster 4 brewery to its four most common beer styles
cluster_4_rows = brewery_beer_style_sorted_cluster[brewery_beer_style_sorted_cluster['Clusters'] == 'Cluster 4']
cluster_4_dimensions = [
    'Brewery',
    '1st Most Common Beer Style',
    '2nd Most Common Beer Style',
    '3rd Most Common Beer Style',
    '4th Most Common Beer Style',
]

clus_4 = px.parallel_categories(cluster_4_rows, dimensions=cluster_4_dimensions, title='Cluster 4 Breweries')

clus_4.update_layout(
    autosize=True,
    hovermode='closest',
    margin={'t': 125, 'b': 20, 'l': 160, 'r': 120},
)

clus_4.show()

### Cluster 5

In [None]:
# Breweries assigned to Cluster 5
cluster_5_breweries = brewery_beer_style_sorted_cluster[brewery_beer_style_sorted_cluster['Clusters'] == 'Cluster 5']

# Number of breweries in this cluster
print(cluster_5_breweries.shape[0])

# Show the cluster 5 breweries
cluster_5_breweries

In [None]:
# Parallel categories chart linking each Cluster 5 brewery to its four most common beer styles
cluster_5_rows = brewery_beer_style_sorted_cluster[brewery_beer_style_sorted_cluster['Clusters'] == 'Cluster 5']
cluster_5_dimensions = [
    'Brewery',
    '1st Most Common Beer Style',
    '2nd Most Common Beer Style',
    '3rd Most Common Beer Style',
    '4th Most Common Beer Style',
]

clus_5 = px.parallel_categories(cluster_5_rows, dimensions=cluster_5_dimensions, title='Cluster 5 Breweries')

clus_5.update_layout(
    autosize=True,
    hovermode='closest',
    margin={'t': 125, 'b': 20, 'l': 160, 'r': 120},
)

clus_5.show()