# 37.3 Final Capstone Prep
---


<br></br>
This notebook contains the code used to scrape and clean the data for the whisky database

In [817]:
from time import time
from time import sleep
from random import randint
from IPython.core.display import clear_output
import warnings
from warnings import warn
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import itertools
import regex as re

warnings.filterwarnings("ignore")

In [330]:
# HTML and category dictionary as it is being scraped by category per the Whisky Advocate dropdown
html1 = 'https://www.whiskyadvocate.com/ratings-reviews/?search=&submit=+&brand_id=0&rating=0&price=0&category=' 
html2 = '&styles_id=0&issue_id=0'
category_specifier = {'Scotch':'1%2C3%2C4%2C6%2C51', 
                     'Irish':'8%2C17%2C20%2C24%2C37%2C54%2C56%2C65',
                     'American':'5%2C7%2C15%2C18%2C33%2C34%2C40%2C42%2C73',
                     'Canadian':'10',
                     'Japanese':'11',
                     'World':'9%2C11%2C12%2C13%2C16%2C19%2C21%2C23%2C26%2C27%2C28%2C29%2C30%2C31%2C32%2C39%2C44%2C46%2C47%2C48%2C49%2C50%2C52%2C53%2C55%2C57%2C58%2C59%2C60%2C61%2C66%2C70%2C71%2C72',
                     'Flavored':'41',
                     'Other':'2%2C45%2C46',
                     # 'Bourbon/Tennessee' not included as it is part of the 'American' category
                     'Rye':'5',
                     'Single/Blended Malt':'1%2C2%2C3%2C16%2C18%2C21%2C23%2C59%2C61',
                     'Single/Blended Grain':'6%2C20%2C51%2C58',
                     'Blended':'3%2C4%2C17%2C42%2C47%2C54%2C55%2C57'
                    }

In [218]:
# Testing the viability of the html tags
response = requests.get(str(html1 + category_specifier['Rye'] + html2))

if (response.ok):
    data = response.text

soup = BeautifulSoup(data, 'html.parser')

response.ok

True

In [224]:
# Empty lists to build
#points = []
#labels = []
#categories = []
#prices = []
#reviews = []
#reviewers = []
#years = []

In [239]:
# A list of the lists
#scraped_lists = [points, labels, categories, prices, reviews, reviewers, years]

In [226]:
# Getting the scrape points
#def process_data(soup):
#    points.append([point.get_text() for point in soup.select('.review-top > h2 > span')])
#    labels.append([label.get_text() for label in soup.select('.printable-section > h1')])
#    categories.append([category.get_text() for category in soup.select('.entry-meta > span > span:nth-of-type(1)')])
#    prices.append([price.get_text() for price in soup.select('.entry-meta > span > span:nth-of-type(3)')])
#    reviews.append([review.get_text() for review in soup.select('.printable-section > div:nth-of-type(1) > p')])
#    reviewers.append([reviewer.get_text() for reviewer in soup.select('.printable-section > div:nth-of-type(2) > p > span')])
    # Gets the last four characters, which is the year
#    years.append([year.get_text()[-4:] for year in soup.select('.printable-section > div:nth-of-type(2) > p > a')])

In [1008]:
# Getting the scrape points into dictionary
whiskies = []

def process_data2(soup, whiskies, i):
    article = soup.select('article')
    for whisky in article:
        point = whisky.select_one('.review-top > h2 > span').get_text()
        label = whisky.select_one('.printable-section > h1').get_text()
        subcategory = whisky.select_one('.entry-meta > span > span:nth-of-type(1)').get_text()
        category = str(i)
        price = whisky.select_one('.entry-meta > span > span:nth-of-type(3)').get_text()
        review = whisky.select_one('.printable-section > div:nth-of-type(1) > p').get_text()
        # Gets the last four characters, which is the year
        try:
            year = whisky.select_one('.printable-section > div:nth-of-type(2) > p > a').get_text()[-4:]
        except:
            year = np.nan
        try:
            reviewer = whisky.select_one('.printable-section > div:nth-of-type(2) > p > span').get_text()
        except:
            reviewer = np.nan

        whisky = {'point':point, 'label':label,'category':category,'subcategory':subcategory,'price':price,
                  'review':review,'reviewer':reviewer,'year':year}
        whiskies.append(whisky)

In [917]:
# No longer needed after scraping to dictionary
#def flatten_data(scraped_dict):
#    for i, j in enumerate(scraped_dict):
#        collapsed = list(itertools.chain.from_iterable(j))
#        scraped_lists[i] = collapsed

In [1010]:
# Building the scraper
start = time()

headers = {'user-agent':'whisky info scraper - educational project (dancassinatwork@gmail.com)'}

for i in category_specifier:
    clear_output(wait=True)

    response = requests.get(str(html1 + category_specifier[i] + html2), headers=headers)

    if (response.ok):
        data = response.text
        soup = BeautifulSoup(data, 'html.parser')
        process_data2(soup, whiskies, i)

    sleep(randint(1,5))

    elapsed_time = time() - start
    print(f'Categories: {len(whiskies)}, Frequency: {len(whiskies)/elapsed_time} requests/s')



print('Scraping Complete')

Categories: 8563, Frequency: 71.37116696058631 requests/s
Scraping Complete


In [1011]:
initial_df = pd.DataFrame(whiskies)
initial_df.head()

Unnamed: 0,point,label,category,subcategory,price,review,reviewer,year
0,97,"Black Bowmore 42 year old 1964 vintage, 40.5%",Scotch,Single Malt Scotch,4500,What impresses me most is how this whisky evol...,John Hansell,2008
1,97,"Bowmore 46 year old (distilled 1964), 42.9%",Scotch,Single Malt Scotch,13500,There have been some legendary Bowmores from t...,Dave Broom,2012
2,97,"Johnnie Walker Blue Label, 40%",Scotch,Blended Scotch Whisky,225,"Magnificently powerful and intense. Caramels, ...",Jonny McCormick,2018
3,96,"Glenlivet Cellar Collection 1969 vintage, 50.8%",Scotch,Single Malt Scotch,750,It’s great that Glenlivet releases whiskies un...,John Hansell,2007
4,96,The Macallan 29 year old 1976 Vintage (Cask #1...,Scotch,Single Malt Scotch,1500,Classic sherry cask-aged Macallan. Antique amb...,John Hansell,2008


In [1012]:
# Apparently some categories in the dropdown had overlapping whiskies
initial_df.duplicated(subset='review').sum()

3355

In [1013]:
initial_df.drop_duplicates(subset='review',inplace=True)

In [1014]:
initial_df.reset_index(drop=True, inplace=True)

In [1015]:
initial_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5208 entries, 0 to 5207
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   point        5208 non-null   object
 1   label        5208 non-null   object
 2   category     5208 non-null   object
 3   subcategory  5208 non-null   object
 4   price        5208 non-null   object
 5   review       5208 non-null   object
 6   reviewer     5207 non-null   object
 7   year         5207 non-null   object
dtypes: object(8)
memory usage: 325.6+ KB


In [1016]:
initial_df.tail()

Unnamed: 0,point,label,category,subcategory,price,review,reviewer,year
5203,80,"The Hilhaven Lodge, 40%",Other,Miscellaneous,50,Straw to light tawny. This blend of straight A...,Fred Minnick,2016
5204,79,"Orphan Barrel Whoop & Holler, 42%",Other,Miscellaneous,175,This composite includes 28 year old barrels th...,Fred Minnick,2017
5205,76,Buffalo Trace Experimental Collection Organic ...,Other,Miscellaneous,46,"Youth and rawness offer the first impression, ...",Fred Minnick,2017
5206,75,"Rebel Yell American Whiskey, 45%",Other,Miscellaneous,30,"A blend of bourbon and rye whiskeys, 2 years o...",Lew Bryson,2015
5207,73,"Virginia Black American Whiskey, 40%",Other,Miscellaneous,35,"Sourced whiskey from MGP, it’s fairly muted wi...",Fred Minnick,2017


# REGEX
---
<br></br>
Using regex to clean columns and scrape values from one column to create new columns

In [1017]:
# Remove ' ABV' from end of label
initial_df['label'] = initial_df.label.replace(to_replace=['[A][B][V]'], value='', regex=True).str.strip()
# Remove ',' from end of label
initial_df['label'] = initial_df.label.replace('\,$', '', regex=True)
# Removing characters from price column and replacing with space or no space depending on character
initial_df['price'] = initial_df.price.replace(['\.', '\/', '\$'],' ', regex=True)
initial_df['price'] = initial_df.price.replace('\,','', regex=True)

In [1018]:
# Using regex to create a new column for the ABV
initial_df['abv'] = initial_df.label.str.extract(r'([0-9][0-9]\.[0-9][0-9]%$|[0-9][0-9]%$|[0-9][0-9]\.[0-9]%$)')
# Remove abv percents
initial_df['abv'] = initial_df.abv.replace('%','', regex=True)
# ABV to float
initial_df['abv'] = pd.to_numeric(initial_df['abv'], downcast="float")

In [1041]:
# Future update remove [$€£] from reviews and the numbers that follow them
initial_df['review'] = initial_df.review.replace(r'((\$[0-9]+)|(€[0-9]+)|(£[0-9]+))', '', regex=True)

In [1020]:
pd.options.display.float_format = '{:,.2f}'.format

In [1021]:
initial_df.head()

Unnamed: 0,point,label,category,subcategory,price,review,reviewer,year,abv
0,97,"Black Bowmore 42 year old 1964 vintage, 40.5%",Scotch,Single Malt Scotch,4500,What impresses me most is how this whisky evol...,John Hansell,2008,40.5
1,97,"Bowmore 46 year old (distilled 1964), 42.9%",Scotch,Single Malt Scotch,13500,There have been some legendary Bowmores from t...,Dave Broom,2012,42.9
2,97,"Johnnie Walker Blue Label, 40%",Scotch,Blended Scotch Whisky,225,"Magnificently powerful and intense. Caramels, ...",Jonny McCormick,2018,40.0
3,96,"Glenlivet Cellar Collection 1969 vintage, 50.8%",Scotch,Single Malt Scotch,750,It’s great that Glenlivet releases whiskies un...,John Hansell,2007,50.8
4,96,The Macallan 29 year old 1976 Vintage (Cask #1...,Scotch,Single Malt Scotch,1500,Classic sherry cask-aged Macallan. Antique amb...,John Hansell,2008,45.4


In [1022]:
# Creating a new price_float column and turning it into float dtype
initial_df['price_float'] = initial_df.price.str.split('\s',1).str[0].str.strip()
initial_df['price_float'] = pd.to_numeric(initial_df['price_float'], downcast="float")

In [1023]:
# extracting the age of the whisky by finding " year" and getting the preceding numbers
#whisky['age'] = whisky.label.str.extract(r'(\d+)(?=( year)|( Year))')
initial_df['age'] = initial_df.label.str.extract('(\d+(?= year| Year))', expand=True)

In [1024]:
# extracting the vintage of the whisky
initial_df['vintage'] = initial_df.label.str.extract(r'([2][0][01][0-9]|[1][9][4-9][0-9])')

In [1025]:
initial_df.head()

Unnamed: 0,point,label,category,subcategory,price,review,reviewer,year,abv,price_float,age,vintage
0,97,"Black Bowmore 42 year old 1964 vintage, 40.5%",Scotch,Single Malt Scotch,4500,What impresses me most is how this whisky evol...,John Hansell,2008,40.5,4500.0,42.0,1964.0
1,97,"Bowmore 46 year old (distilled 1964), 42.9%",Scotch,Single Malt Scotch,13500,There have been some legendary Bowmores from t...,Dave Broom,2012,42.9,13500.0,46.0,1964.0
2,97,"Johnnie Walker Blue Label, 40%",Scotch,Blended Scotch Whisky,225,"Magnificently powerful and intense. Caramels, ...",Jonny McCormick,2018,40.0,225.0,,
3,96,"Glenlivet Cellar Collection 1969 vintage, 50.8%",Scotch,Single Malt Scotch,750,It’s great that Glenlivet releases whiskies un...,John Hansell,2007,50.8,750.0,,1969.0
4,96,The Macallan 29 year old 1976 Vintage (Cask #1...,Scotch,Single Malt Scotch,1500,Classic sherry cask-aged Macallan. Antique amb...,John Hansell,2008,45.4,1500.0,29.0,1976.0


In [1027]:
initial_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5208 entries, 0 to 5207
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   point        5208 non-null   object 
 1   label        5208 non-null   object 
 2   category     5208 non-null   object 
 3   subcategory  5208 non-null   object 
 4   price        5208 non-null   object 
 5   review       5208 non-null   object 
 6   reviewer     5207 non-null   object 
 7   year         5207 non-null   object 
 8   abv          5182 non-null   float32
 9   price_float  5207 non-null   float32
 10  age          1954 non-null   object 
 11  vintage      1164 non-null   object 
dtypes: float32(2), object(10)
memory usage: 447.7+ KB


In [1048]:
initial_df[initial_df.reviewer.isna()]

Unnamed: 0,point,label,category,subcategory,price,review,reviewer,year,abv,price_float,age,vintage
1036,88,"Cardhu, 1997 vintage, 57.3%",Scotch,Single Malt Scotch,406 00,Matured in a bourbon cask. Cardhu has always b...,,,57.3,406.0,,1997


In [1052]:
# Beautiful Soup coding must have missed these as they are apparent from searching on the website. 
initial_df.reviewer[1036] = 'John Hansell'
initial_df.year[1036] = 2010
initial_df.price_float[5148] = 130

In [1054]:
# Inspecting rows with NaN values to see if anything was missed by regex
initial_df[initial_df.isna().any(axis=1)]

Unnamed: 0,point,label,category,subcategory,price,review,reviewer,year,abv,price_float,age,vintage
2,97,"Johnnie Walker Blue Label, 40%",Scotch,Blended Scotch Whisky,225,"Magnificently powerful and intense. Caramels, ...",Jonny McCormick,2018,40.00,225.00,,
3,96,"Glenlivet Cellar Collection 1969 vintage, 50.8%",Scotch,Single Malt Scotch,750,It’s great that Glenlivet releases whiskies un...,John Hansell,2007,50.80,750.00,,1969
5,96,"The Dalmore 50 year old, 52.8%",Scotch,Single Malt Scotch,1500,The Dalmore is one of a handful of whiskies th...,John Hansell,2009,52.80,1500.00,50,
6,96,"Gold Bowmore 1964 vintage, 42.4%",Scotch,Single Malt Scotch,6250,Deep gold color. Surprisingly lively on the no...,John Hansell,2009,42.40,6250.00,,1964
7,96,"Bowmore 40 year old, 44.8%",Scotch,Single Malt Scotch,11000,"Definitely showing its age, but not in a bad w...",John Hansell,2011,44.80,11000.00,40,
...,...,...,...,...,...,...,...,...,...,...,...,...
5203,80,"The Hilhaven Lodge, 40%",Other,Miscellaneous,50,Straw to light tawny. This blend of straight A...,Fred Minnick,2016,40.00,50.00,,
5204,79,"Orphan Barrel Whoop & Holler, 42%",Other,Miscellaneous,175,This composite includes 28 year old barrels th...,Fred Minnick,2017,42.00,175.00,,
5205,76,Buffalo Trace Experimental Collection Organic ...,Other,Miscellaneous,46,"Youth and rawness offer the first impression, ...",Fred Minnick,2017,45.00,46.00,,
5206,75,"Rebel Yell American Whiskey, 45%",Other,Miscellaneous,30,"A blend of bourbon and rye whiskeys, 2 years o...",Lew Bryson,2015,45.00,30.00,,


In [1059]:
final_df = initial_df[['point', 'label', 'subcategory','category', 'review', 'reviewer', 'year',
       'abv', 'price_float', 'age', 'vintage']]

final_df.rename(columns={'price_float':'price', 'point':'rating'}, inplace=True)

final_df.head()

Unnamed: 0,rating,label,subcategory,category,review,reviewer,year,abv,price,age,vintage
0,97,"Black Bowmore 42 year old 1964 vintage, 40.5%",Single Malt Scotch,Scotch,What impresses me most is how this whisky evol...,John Hansell,2008,40.5,4500.0,42.0,1964.0
1,97,"Bowmore 46 year old (distilled 1964), 42.9%",Single Malt Scotch,Scotch,There have been some legendary Bowmores from t...,Dave Broom,2012,42.9,13500.0,46.0,1964.0
2,97,"Johnnie Walker Blue Label, 40%",Blended Scotch Whisky,Scotch,"Magnificently powerful and intense. Caramels, ...",Jonny McCormick,2018,40.0,225.0,,
3,96,"Glenlivet Cellar Collection 1969 vintage, 50.8%",Single Malt Scotch,Scotch,It’s great that Glenlivet releases whiskies un...,John Hansell,2007,50.8,750.0,,1969.0
4,96,The Macallan 29 year old 1976 Vintage (Cask #1...,Single Malt Scotch,Scotch,Classic sherry cask-aged Macallan. Antique amb...,John Hansell,2008,45.4,1500.0,29.0,1976.0


In [1060]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5208 entries, 0 to 5207
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   rating       5208 non-null   object 
 1   label        5208 non-null   object 
 2   subcategory  5208 non-null   object 
 3   category     5208 non-null   object 
 4   review       5208 non-null   object 
 5   reviewer     5208 non-null   object 
 6   year         5208 non-null   object 
 7   abv          5182 non-null   float32
 8   price        5208 non-null   float32
 9   age          1954 non-null   object 
 10  vintage      1164 non-null   object 
dtypes: float32(2), object(9)
memory usage: 407.0+ KB


In [1061]:
final_df.to_csv('whisky1.csv')